# Chatbot pentru a răspunde la întrebări frecvente



## Instalarea bibliotecilor necesare

In [22]:
# run these only if the packages are not installed yet;
# otherwise keep them commented out
# %pip install numpy
# %pip install scikit-learn
# %pip install scipy
# %pip install nltk
# %pip install pandas
# %pip install matplotlib
# %pip install spacy
# %pip install sentence-transformers

## Prelucrarea datelor

1. Setul de date

In [23]:
import pandas as pd

# Read the tab-separated question/answer dataset.
df = pd.read_csv("all_questions.txt", sep="\t")
print(df.columns)

# Only complete question/answer pairs are useful for the chatbot,
# so rows missing either field are discarded.
qa_columns = ['Question', 'Answer']
data = df[qa_columns].dropna()

print(data)

Index(['ArticleTitle', 'Question', 'Answer', 'DifficultyFromQuestioner',
       'DifficultyFromAnswerer', 'ArticleFile'],
      dtype='object')
                                               Question  \
0     Was Abraham Lincoln the sixteenth President of...   
1     Was Abraham Lincoln the sixteenth President of...   
2     Did Lincoln sign the National Banking Act of 1...   
3     Did Lincoln sign the National Banking Act of 1...   
4                      Did his mother die of pneumonia?   
...                                                 ...   
3992          What areas do the Grevy's Zebras inhabit?   
3994  Which species of zebra is known as the common ...   
3995  Which species of zebra is known as the common ...   
3996                     At what age can a zebra breed?   
3997                     At what age can a zebra breed?   

                                                 Answer  
0                                                   yes  
1                              

2. Prelucrarea textului

Constă în:
* eliminarea duplicatelor,
* transformarea din uppercase în lowercase,
* eliminarea caracterelor care nu sunt nici alfanumerice, nici spații albe (de exemplu, semnele de punctuație),
* eliminarea cifrelor din text.

In [24]:
# Row count before any duplicated question has been removed.
print(f"Train:{len(data)}")

# Keep only the first row for every distinct question text.
data = data.drop_duplicates(subset="Question", keep="first")

# Normalise the question text in three passes:
#   1. lowercase,
#   2. strip every character that is neither a word character nor whitespace,
#   3. strip digits.
cleaned_questions = (
    data["Question"]
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'\d', '', regex=True)
)
data = data.assign(Question=cleaned_questions)

print("Text curatat:")
print(data["Question"])

Train:3420
Text curatat:
0       was abraham lincoln the sixteenth president of...
2           did lincoln sign the national banking act of 
4                         did his mother die of pneumonia
6             how many long was lincolns formal education
8             when did lincoln begin his political career
                              ...                        
3988                                   what do zebras eat
3990                           what are zebras hunted for
3992              what areas do the grevys zebras inhabit
3994    which species of zebra is known as the common ...
3996                        at what age can a zebra breed
Name: Question, Length: 2203, dtype: object


3. Curățarea textului - tokenizare, eliminarea stopwords și lematizare

In [25]:
# tokenizare

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Split each cleaned question into word tokens (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')

data["Question"] = data["Question"].apply(
    lambda text: tokenizer.tokenize(text) if isinstance(text, str) else text
)

# Load the English stop-word list. On a fresh environment the corpus is not
# installed and NLTK raises LookupError; fetch it once and retry so the
# notebook runs top to bottom without manually uncommenting a download line.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Drop the stop words from every token list.
data["Question"] = data["Question"].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

print("Text Prelucrat")
print(data["Question"])

Text Prelucrat
0       [abraham, lincoln, sixteenth, president, unite...
2                 [lincoln, sign, national, banking, act]
4                                [mother, die, pneumonia]
6               [many, long, lincolns, formal, education]
8                     [lincoln, begin, political, career]
                              ...                        
3988                                        [zebras, eat]
3990                                     [zebras, hunted]
3992                     [areas, grevys, zebras, inhabit]
3994               [species, zebra, known, common, zebra]
3996                                  [age, zebra, breed]
Name: Question, Length: 2203, dtype: object


In [26]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# these NLTK resource downloads may need to be uncommented
# nltk.download('averaged_perceptron_tagger_eng')
# nltk.download('wordnet')

# lemmatization
# create the lemmatizer instance used by lemmatize_tokens below
lemmatizer = WordNetLemmatizer()

# helper that lemmatizes a list of tokens
def lemmatize_tokens(tokens):
    """Return the lemma of every token in ``tokens`` as a new list.

    Each token is part-of-speech tagged on its own (as a one-word
    sentence) with ``nltk.pos_tag``. The first letter of that tag selects
    the WordNet part of speech handed to the module-level ``lemmatizer``;
    tags with any other first letter fall back to noun.
    """
    # First letter of the tag returned by nltk.pos_tag -> WordNet POS constant.
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }

    lemmas = []
    for word in tokens:
        tag = nltk.pos_tag([word])[0][1]
        wordnet_pos = pos_by_initial.get(tag[0].upper(), wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))

    return lemmas

# Swap every token list for its lemmatized counterpart.
lemmatized_questions = data["Question"].apply(lemmatize_tokens)
data["Question"] = lemmatized_questions

print("Text dupa Lemmatizare:")
print(data["Question"])

Text dupa Lemmatizare:
0       [abraham, lincoln, sixteenth, president, unite...
2                 [lincoln, sign, national, banking, act]
4                                [mother, die, pneumonia]
6                [many, long, lincoln, formal, education]
8                     [lincoln, begin, political, career]
                              ...                        
3988                                         [zebra, eat]
3990                                        [zebra, hunt]
3992                       [area, grevys, zebra, inhabit]
3994                 [specie, zebra, know, common, zebra]
3996                                  [age, zebra, breed]
Name: Question, Length: 2203, dtype: object


4. Word Embedding

In [27]:
from sentence_transformers import SentenceTransformer, util

# Rebuild each question as one space-separated string from its token list.
data["QuestionText"] = data["Question"].map(" ".join)

# Sentence-embedding model used for both the stored and the user questions.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Parallel lists: questions[i] is answered by answers[i].
questions = data["QuestionText"].to_list()
answers = data["Answer"].to_list()

# One embedding row per stored question, returned as a tensor.
question_embeddings = model.encode(questions, convert_to_tensor=True)

print(question_embeddings)


tensor([[-0.0072,  0.0577, -0.0110,  ..., -0.0146, -0.0587,  0.0282],
        [-0.0099, -0.0138, -0.0911,  ..., -0.0242, -0.0420, -0.0109],
        [ 0.0005, -0.0069,  0.0078,  ..., -0.0506,  0.0968,  0.0091],
        ...,
        [-0.0427,  0.0400, -0.0233,  ...,  0.0435,  0.0533,  0.0033],
        [-0.0986,  0.0236, -0.0243,  ...,  0.0413,  0.0767,  0.0150],
        [-0.1055,  0.0927, -0.0029,  ...,  0.0076,  0.1137, -0.0036]])


5. Calcularea gradului de similaritate

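Se testează mai multe funcții de similaritate, pentru a vedea care găsește cea mai potrivită întrebare din setul de date:

- similaritatea cosinus (cosine_similarity)
- distanța euclidiană (transformată într-un scor care crește odată cu similaritatea)
- produsul scalar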

In [32]:
import numpy as np
from scipy.spatial.distance import cosine, euclidean

# distances are turned into scores that grow with similarity

def cosine_sim(a, b):
    """Return the cosine similarity of two 1-D vectors.

    Computed as ``1 - scipy.spatial.distance.cosine(a, b)``.

    A vector with no non-zero component has no direction, so its cosine
    similarity is undefined and the scipy call would yield NaN. ``0.0`` is
    returned in that case so a NaN cannot corrupt a ranking of scores.
    """
    a = np.asarray(a)
    b = np.asarray(b)

    # Guard the undefined case before delegating to scipy.
    if not a.any() or not b.any():
        return 0.0

    return 1 - cosine(a, b)

def euclidean_distance(a, b):
    """Return a similarity score derived from the Euclidean distance.

    Despite the name, the result is ``1 / (1 + d)`` where ``d`` is the
    Euclidean distance between ``a`` and ``b``: identical vectors score 1
    and the score shrinks towards 0 as the vectors move apart.
    """
    distance = euclidean(a, b)
    return 1 / (distance + 1)

def prod_vec(a, b):
    """Return the dot (scalar) product of ``a`` and ``b``.

    Both arguments are converted with ``np.asarray`` before being passed
    to ``np.dot``.
    """
    left, right = (np.asarray(vec) for vec in (a, b))
    return np.dot(left, right)

6. Funcție best_match pentru o întrebare scrisă de utilizator și dicționarul de întrebări

Trebuie făcute exact aceleași operații de prelucrare și curățare a textului ca cele aplicate întrebărilor din setul de date.

In [29]:
import re

def best_match(u_question, method="cosine"):
    """Find the stored question most similar to ``u_question``.

    The user question goes through the same steps applied to the dataset
    questions (lowercasing, removal of punctuation and digits,
    tokenization, stop-word removal, lemmatization), is embedded with
    ``model`` and compared against every row of ``question_embeddings``.

    Args:
        u_question: The question typed by the user.
        method: Similarity function to use: ``"cosine"``, ``"euclidean"``
            or ``"prodvec"``.

    Returns:
        A dict with the keys ``"Input"``, ``"Matched Question"``,
        ``"Answer"``, ``"Score"`` and ``"Method"``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    sim_funcs = {
        "cosine": cosine_sim,
        "euclidean": euclidean_distance,
        "prodvec": prod_vec,
    }
    # Reject an unknown method immediately, before any expensive work,
    # instead of failing later when a None "function" gets called.
    if method not in sim_funcs:
        raise ValueError(
            f"Unknown similarity method {method!r}; "
            f"expected one of {sorted(sim_funcs)}"
        )
    sim_func = sim_funcs[method]

    # Same cleaning steps as the dataset questions.
    text = u_question.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d', '', text)
    tokens = tokenizer.tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    tokens = lemmatize_tokens(tokens)
    clean_text = " ".join(tokens)

    u_vec = model.encode(clean_text)

    # The stored embeddings may be a torch tensor, possibly living on an
    # accelerator, which the numpy/scipy similarity functions cannot
    # convert row by row. Move them to a CPU NumPy array once.
    corpus_vecs = question_embeddings
    if hasattr(corpus_vecs, "detach"):
        corpus_vecs = corpus_vecs.detach().cpu().numpy()

    scores = [sim_func(u_vec, q_vec) for q_vec in corpus_vecs]
    idx = int(np.argmax(scores))

    return {
        "Input": u_question,
        "Matched Question": questions[idx],
        "Answer": answers[idx],
        "Score": float(scores[idx]),
        "Method": method
    }

In [30]:
# How old was Lincoln in 1816? seven

print(best_match("How old is Abraham Lincoln?", method="cosine"))
print(best_match("How old is Abraham Lincoln?", method="euclidean"))
print(best_match("How old is Abraham Lincoln?", method="prodvec"))

# How has Canada helped UN peacekeeping efforts? 

print(best_match("Did Canada help UN peacekeeping?", method="cosine"))
print(best_match("Did Canada help UN peacekeeping?", method="euclidean"))
print(best_match("Did Canada help UN peacekeeping?", method="prodvec"))

{'Input': 'How old is Abraham Lincoln?', 'Matched Question': 'old lincoln', 'Answer': 'seven', 'Score': 0.9362116456031799, 'Method': 'cosine'}
{'Input': 'How old is Abraham Lincoln?', 'Matched Question': 'old lincoln', 'Answer': 'seven', 'Score': 0.7368225062582663, 'Method': 'euclidean'}
{'Input': 'How old is Abraham Lincoln?', 'Matched Question': 'old lincoln', 'Answer': 'seven', 'Score': 0.9362116456031799, 'Method': 'prodvec'}
{'Input': 'Did Canada help UN peacekeeping?', 'Matched Question': 'canada help un peacekeeping effort', 'Answer': 'During the Suez Crisis of 1956, Lester B. Pearson eased tensions by proposing the inception of the United Nations Peacekeeping Force. Canada has since served in 50 peacekeeping missions, including every UN peacekeeping effort until 1989', 'Score': 0.9868932962417603, 'Method': 'cosine'}
{'Input': 'Did Canada help UN peacekeeping?', 'Matched Question': 'canada help un peacekeeping effort', 'Answer': 'During the Suez Crisis of 1956, Lester B. Pear

7. Input de la User

In [None]:
# Interactive loop: answer questions until the user types "exit".
while True:
    # Strip surrounding whitespace so inputs such as "exit " still quit.
    user_input = input("Intrebare:").strip()

    if user_input.lower() == "exit":
        break

    # A blank line carries no question; ask again instead of returning
    # an arbitrary match for an empty string.
    if not user_input:
        continue

    answer = best_match(user_input)
    print(answer["Answer"])