## Importing libraries 

In [1]:
import time
import statistics
import pandas as pd
import numpy as np
import copy as cp
import spacy
import gensim.downloader
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from nltk.tokenize import word_tokenize
from sklearn.neighbors import NearestNeighbors

ModuleNotFoundError: No module named 'spacy'

## Model importing

In [None]:
# List the names of all pretrained models registered in gensim-data.

print("Models:")
print(list(gensim.downloader.info()['models'].keys()),"\n")

# Load the 'glove-twitter-25' entry of gensim-data; the rest of the notebook
# looks up word vectors through this module-level `model`.
model = gensim.downloader.load('glove-twitter-25')
# Optional: persist the loaded model locally.
# model.save("word2vec.model")

# Alternative (disabled): use the 'word2vec-google-news-300' model instead.
# print("Model download:")
# model = gensim.load('word2vec-google-news-300')

## Importing datasets

In [None]:
# Question/answer base: tab-separated file whose first column is used as the
# DataFrame index.
qa_base = pd.read_csv('./insurance_qna_dataset.csv',sep='\t',index_col=0)
# Number of rows (questions) in the base.
questions_number = qa_base.shape[0]

# Test set: later cells read the columns "original question" and
# "modified question – light/medium/heavy" from it.
qm_base = pd.read_csv("Test questions dataset.csv")
# Number of rows (test questions).
test_questions_number = qm_base.shape[0]

# qa_base.head()
# Show the first 21 rows of the test set.
qm_base.head(21)

## Word tokenization

In [None]:
# Tokenize every question of the base: split it into word tokens, drop a
# trailing question mark and lower-case what remains.
qa_base_tokenized = []

# Iterate over the column values in row order instead of label-indexing with
# 0..n-1: the frame is loaded with index_col=0, so its labels are not
# guaranteed to be a contiguous zero-based range.
for sentence in qa_base["Question"]:
    token_words = word_tokenize(sentence)
    # Check for tokens first so an empty question cannot raise IndexError.
    if token_words and token_words[-1] == '?':
        token_words.pop()
    token_words_lower = [word.lower() for word in token_words]
    qa_base_tokenized.append(token_words_lower)

# Tokenized test questions: originals plus the lightly, medium and heavily
# modified variants, one token list per test question.
oq_tokenized = []
qlm_tokenized = []
qmm_tokenized = []
qhm_tokenized = []

# Pair every source column of the test set with the list it fills.
columns_and_targets = (
    ("original question", oq_tokenized),
    ("modified question – light", qlm_tokenized),
    ("modified question – medium", qmm_tokenized),
    ("modified question – heavy", qhm_tokenized),
)

for column_name, tokenized_questions in columns_and_targets:
    for question in qm_base[column_name]:
        token_words = word_tokenize(question)
        # Remove only a trailing question mark, exactly like the tokenization
        # of the question base; an unconditional pop() would silently drop
        # the last real word of a question that does not end with '?'.
        if token_words and token_words[-1] == '?':
            token_words.pop()
        tokenized_questions.append([word.lower() for word in token_words])

## Word vectorization

In [None]:
# Load spaCy's "en_core_web_sm" pipeline; vectorizer() uses it to read the
# part-of-speech tag of each word.
nlp = spacy.load("en_core_web_sm")

def vectorizer(sentence):
    """Turn a tokenized sentence into a list of POS-weighted word vectors.

    Each word is looked up in the module-level ``model`` and its vector is
    scaled by a coefficient chosen from the part of speech that the
    module-level spaCy pipeline ``nlp`` assigns to the word on its own:
    2 for nouns, proper nouns and adjectives, 1 for verbs, 0 for numerals
    and 0.5 for everything else.

    Args:
        sentence: Sequence of word tokens (strings).

    Returns:
        A list with one weighted vector (list of floats) per word that is
        present in ``model``. Words missing from the model are skipped, so
        the result may be shorter than ``sentence`` or even empty.
    """
    pos_coefficients = {"NOUN": 2, "PROPN": 2, "ADJ": 2, "VERB": 1, "NUM": 0}
    vectorized_sentence = []
    for word in sentence:
        try:
            word_vectorized = model[word].tolist()
        except KeyError:
            # Out-of-vocabulary word: leave it out of the sentence. Only this
            # expected failure is swallowed; anything else should surface.
            continue
        word_spacy = nlp(word)
        # Only the first token's tag is used. If the pipeline produced no
        # token at all, fall back to the default weight instead of reusing a
        # tag left over from a previous word.
        pos = word_spacy[0].pos_ if len(word_spacy) else None
        pos_coef = pos_coefficients.get(pos, 0.5)
        vectorized_sentence.append(
            [pos_coef*element for element in word_vectorized])
    return vectorized_sentence
            

def token_sum(sentence):
    """Return the element-wise sum of the word vectors of a sentence.

    Args:
        sentence: List of equally long word vectors (lists of numbers).

    Returns:
        A list as long as the first word vector, holding the sum of each
        component over all words. An empty sentence yields an empty list
        instead of raising IndexError.
    """
    if not sentence:
        return []
    vector_length = len(sentence[0])
    return [sum(word[i] for word in sentence) for i in range(vector_length)]
    
def token_mean(sentence):
    """Return the element-wise mean of the word vectors of a sentence.

    Args:
        sentence: List of equally long word vectors (lists of numbers).

    Returns:
        A list as long as the first word vector, holding the mean of each
        component over all words (computed with ``np.mean``). An empty
        sentence yields an empty list instead of raising IndexError.
    """
    if not sentence:
        return []
    vector_length = len(sentence[0])
    return [np.mean([word[i] for word in sentence])
            for i in range(vector_length)]

# Sentence vectors of every question in the base, built once by summing and
# once by averaging the weighted word vectors.
qa_base_vectorized_sum = []
qa_base_vectorized_mean = []

# Vectorize all questions. The debugging print and the early break that
# stopped after the first question are removed, so the lists now hold one
# entry per base question. Entries stay in row order because later cells map
# neighbour indices back to qa_base by position (iloc).
for tokens in qa_base_tokenized:
    sentence = vectorizer(tokens)
    sentence_sum = token_sum(sentence)
    sentence_mean = token_mean(sentence)
    qa_base_vectorized_sum.append(sentence_sum)
    qa_base_vectorized_mean.append(sentence_mean)
    
# Sentence vectors of the modified test questions (light / medium / heavy),
# for both the "sum" and the "mean" arithmetics.
qlm_vectorized_sum = []
qlm_vectorized_mean = []
qmm_vectorized_sum = []
qmm_vectorized_mean = []
qhm_vectorized_sum = []
qhm_vectorized_mean = []

# The evaluation cells index these lists per test question, so they must be
# filled here; with this loop disabled every lookup raised IndexError.
vectorization_targets = (
    (qlm_tokenized, qlm_vectorized_sum, qlm_vectorized_mean),
    (qmm_tokenized, qmm_vectorized_sum, qmm_vectorized_mean),
    (qhm_tokenized, qhm_vectorized_sum, qhm_vectorized_mean),
)

for tokenized_questions, vectorized_sum, vectorized_mean in vectorization_targets:
    for tokens in tokenized_questions:
        sentence = vectorizer(tokens)
        vectorized_sum.append(token_sum(sentence))
        vectorized_mean.append(token_mean(sentence))

## Nearest Neighbors space creation

In [None]:
# Upper bound on the neighbours returned per query. It is capped at the
# number of indexed questions because kneighbors rejects an n_neighbors value
# larger than the number of fitted samples.
n_neighbors = min(25000, len(qa_base_vectorized_sum))

# One search index per distance metric, fitted on the "sum" sentence vectors.
nbrs_euclidean_sum = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean").fit(qa_base_vectorized_sum)
nbrs_manhattan_sum = NearestNeighbors(n_neighbors=n_neighbors, metric="manhattan").fit(qa_base_vectorized_sum)
nbrs_cosine_sum = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine").fit(qa_base_vectorized_sum)

# Same three metrics, fitted on the "mean" sentence vectors.
nbrs_euclidean_mean = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean").fit(qa_base_vectorized_mean)
nbrs_manhattan_mean = NearestNeighbors(n_neighbors=n_neighbors, metric="manhattan").fit(qa_base_vectorized_mean)
nbrs_cosine_mean = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine").fit(qa_base_vectorized_mean)

## Recording results

In [None]:
# Empty table that collects one row per (arithmetics, modification, metric)
# combination evaluated in the cells below.
result_columns = [
    'arithmetics',
    'questions modification',
    'metric',
    'average sentence rank',
    'accuracy [%]',
    'average execution time [s]',
]
results = pd.DataFrame(columns=result_columns)

## Lightly modified questions - sum arithmetics

In [None]:
# Evaluate the lightly modified questions with "sum" sentence vectors: for
# each metric, record the rank of the original question among the neighbours
# of its modified version.
euclidean_ranks = []
manhattan_ranks = []
cos_ranks = []

# Number of test questions whose original was ranked within the top 300.
euclidean_target_counter = 0
manhattan_target_counter = 0
cos_target_counter = 0

start = time.time()

for i in range(test_questions_number):

    original_question = qm_base["original question"][i]
    # kneighbors expects a 2-D array, so the vector becomes a single row.
    modified_question = np.reshape(qlm_vectorized_sum[i],(1,-1))

    distances_euclidean, indices_euclidean = nbrs_euclidean_sum.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan_sum.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine_sum.kneighbors(modified_question)

    # Neighbour indices are positional, hence iloc.
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()

    # 1-based rank; list.index raises ValueError when the original question
    # is not among the returned neighbours.
    euclidean_rank = nbrs_list_euclidean.index(original_question)+1
    manhattan_rank = nbrs_list_manhattan.index(original_question)+1
    cos_rank = nbrs_list_cosine.index(original_question)+1

    euclidean_ranks.append(euclidean_rank)
    manhattan_ranks.append(manhattan_rank)
    cos_ranks.append(cos_rank)

    if euclidean_rank <= 300:
        euclidean_target_counter += 1
    if manhattan_rank <= 300:
        manhattan_target_counter += 1
    if cos_rank <= 300:
        cos_target_counter += 1

end = time.time()
# The timer spans the queries of all three metrics, so this is the combined
# time per test question and is reported unchanged for every metric.
t = (end-start)/test_questions_number

print("Light modifications - euclidian ranks:", euclidean_ranks)
print("Light modifications - manhattan ranks:", manhattan_ranks)
print("Light modifications - cos ranks:", cos_ranks)

# One result row per metric. The average is taken over the whole list of
# ranks; averaging the loop's scalar would only report the last question.
# Rows are added with pd.concat because DataFrame.append was removed in
# pandas 2.0.
for metric_name, metric_ranks, metric_target_counter in (
        ('euclidean', euclidean_ranks, euclidean_target_counter),
        ('manhattan', manhattan_ranks, manhattan_target_counter),
        ('cosine', cos_ranks, cos_target_counter)):
    data = {'arithmetics':'sum',
            'questions modification':'light',
            'metric':metric_name,
            'average sentence rank':round(np.mean(metric_ranks)),
            'accuracy [%]':round((metric_target_counter/test_questions_number)*100,2),
            'average execution time [s]':round(t,4)}
    results = pd.concat([results, pd.DataFrame([data])], ignore_index=True)

## Lightly modified questions - mean arithmetics

In [None]:
# Evaluate the lightly modified questions with "mean" sentence vectors: for
# each metric, record the rank of the original question among the neighbours
# of its modified version.
euclidean_ranks = []
manhattan_ranks = []
cos_ranks = []

# Number of test questions whose original was ranked within the top 300.
euclidean_target_counter = 0
manhattan_target_counter = 0
cos_target_counter = 0

start = time.time()

for i in range(test_questions_number):

    original_question = qm_base["original question"][i]
    # kneighbors expects a 2-D array, so the vector becomes a single row.
    modified_question = np.reshape(qlm_vectorized_mean[i],(1,-1))

    distances_euclidean, indices_euclidean = nbrs_euclidean_mean.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan_mean.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine_mean.kneighbors(modified_question)

    # Neighbour indices are positional, hence iloc.
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()

    # 1-based rank; list.index raises ValueError when the original question
    # is not among the returned neighbours.
    euclidean_rank = nbrs_list_euclidean.index(original_question)+1
    manhattan_rank = nbrs_list_manhattan.index(original_question)+1
    cos_rank = nbrs_list_cosine.index(original_question)+1

    euclidean_ranks.append(euclidean_rank)
    manhattan_ranks.append(manhattan_rank)
    cos_ranks.append(cos_rank)

    if euclidean_rank <= 300:
        euclidean_target_counter += 1
    if manhattan_rank <= 300:
        manhattan_target_counter += 1
    if cos_rank <= 300:
        cos_target_counter += 1

end = time.time()
# The timer spans the queries of all three metrics, so this is the combined
# time per test question and is reported unchanged for every metric.
t = (end-start)/test_questions_number

print("Light modifications - euclidian ranks:", euclidean_ranks)
print("Light modifications - manhattan ranks:", manhattan_ranks)
print("Light modifications - cos ranks:", cos_ranks)

# One result row per metric. The average is taken over the whole list of
# ranks; averaging the loop's scalar would only report the last question.
# Rows are added with pd.concat because DataFrame.append was removed in
# pandas 2.0.
for metric_name, metric_ranks, metric_target_counter in (
        ('euclidean', euclidean_ranks, euclidean_target_counter),
        ('manhattan', manhattan_ranks, manhattan_target_counter),
        ('cosine', cos_ranks, cos_target_counter)):
    data = {'arithmetics':'mean',
            'questions modification':'light',
            'metric':metric_name,
            'average sentence rank':round(np.mean(metric_ranks)),
            'accuracy [%]':round((metric_target_counter/test_questions_number)*100,2),
            'average execution time [s]':round(t,4)}
    results = pd.concat([results, pd.DataFrame([data])], ignore_index=True)

## Medium modified questions - sum arithmetics

In [None]:
# Evaluate the medium modified questions with "sum" sentence vectors: for
# each metric, record the rank of the original question among the neighbours
# of its modified version.
euclidean_ranks = []
manhattan_ranks = []
cos_ranks = []

# Number of test questions whose original was ranked within the top 300.
euclidean_target_counter = 0
manhattan_target_counter = 0
cos_target_counter = 0

start = time.time()

for i in range(test_questions_number):

    original_question = qm_base["original question"][i]
    # kneighbors expects a 2-D array, so the vector becomes a single row.
    modified_question = np.reshape(qmm_vectorized_sum[i],(1,-1))

    distances_euclidean, indices_euclidean = nbrs_euclidean_sum.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan_sum.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine_sum.kneighbors(modified_question)

    # Neighbour indices are positional, hence iloc.
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()

    # 1-based rank; list.index raises ValueError when the original question
    # is not among the returned neighbours.
    euclidean_rank = nbrs_list_euclidean.index(original_question)+1
    manhattan_rank = nbrs_list_manhattan.index(original_question)+1
    cos_rank = nbrs_list_cosine.index(original_question)+1

    euclidean_ranks.append(euclidean_rank)
    manhattan_ranks.append(manhattan_rank)
    cos_ranks.append(cos_rank)

    if euclidean_rank <= 300:
        euclidean_target_counter += 1
    if manhattan_rank <= 300:
        manhattan_target_counter += 1
    if cos_rank <= 300:
        cos_target_counter += 1

end = time.time()
# The timer spans the queries of all three metrics, so this is the combined
# time per test question and is reported unchanged for every metric.
t = (end-start)/test_questions_number

print("Medium modifications - euclidian ranks:", euclidean_ranks)
print("Medium modifications - manhattan ranks:", manhattan_ranks)
print("Medium modifications - cos ranks:", cos_ranks)

# One result row per metric. The average is taken over the whole list of
# ranks; averaging the loop's scalar would only report the last question.
# Rows are added with pd.concat because DataFrame.append was removed in
# pandas 2.0.
for metric_name, metric_ranks, metric_target_counter in (
        ('euclidean', euclidean_ranks, euclidean_target_counter),
        ('manhattan', manhattan_ranks, manhattan_target_counter),
        ('cosine', cos_ranks, cos_target_counter)):
    data = {'arithmetics':'sum',
            'questions modification':'medium',
            'metric':metric_name,
            'average sentence rank':round(np.mean(metric_ranks)),
            'accuracy [%]':round((metric_target_counter/test_questions_number)*100,2),
            'average execution time [s]':round(t,4)}
    results = pd.concat([results, pd.DataFrame([data])], ignore_index=True)

## Medium modified questions - mean arithmetics

In [None]:
# Evaluate the medium modified questions with "mean" sentence vectors: for
# each metric, record the rank of the original question among the neighbours
# of its modified version.
euclidean_ranks = []
manhattan_ranks = []
cos_ranks = []

# Number of test questions whose original was ranked within the top 300.
euclidean_target_counter = 0
manhattan_target_counter = 0
cos_target_counter = 0

start = time.time()

for i in range(test_questions_number):

    original_question = qm_base["original question"][i]
    # kneighbors expects a 2-D array, so the vector becomes a single row.
    modified_question = np.reshape(qmm_vectorized_mean[i],(1,-1))

    distances_euclidean, indices_euclidean = nbrs_euclidean_mean.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan_mean.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine_mean.kneighbors(modified_question)

    # Neighbour indices are positional, hence iloc.
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()

    # 1-based rank; list.index raises ValueError when the original question
    # is not among the returned neighbours.
    euclidean_rank = nbrs_list_euclidean.index(original_question)+1
    manhattan_rank = nbrs_list_manhattan.index(original_question)+1
    cos_rank = nbrs_list_cosine.index(original_question)+1

    euclidean_ranks.append(euclidean_rank)
    manhattan_ranks.append(manhattan_rank)
    cos_ranks.append(cos_rank)

    if euclidean_rank <= 300:
        euclidean_target_counter += 1
    if manhattan_rank <= 300:
        manhattan_target_counter += 1
    if cos_rank <= 300:
        cos_target_counter += 1

end = time.time()
# The timer spans the queries of all three metrics, so this is the combined
# time per test question and is reported unchanged for every metric.
t = (end-start)/test_questions_number

print("Medium modifications - euclidian ranks:", euclidean_ranks)
print("Medium modifications - manhattan ranks:", manhattan_ranks)
print("Medium modifications - cos ranks:", cos_ranks)

# One result row per metric. The average is taken over the whole list of
# ranks; averaging the loop's scalar would only report the last question.
# Rows are added with pd.concat because DataFrame.append was removed in
# pandas 2.0.
for metric_name, metric_ranks, metric_target_counter in (
        ('euclidean', euclidean_ranks, euclidean_target_counter),
        ('manhattan', manhattan_ranks, manhattan_target_counter),
        ('cosine', cos_ranks, cos_target_counter)):
    data = {'arithmetics':'mean',
            'questions modification':'medium',
            'metric':metric_name,
            'average sentence rank':round(np.mean(metric_ranks)),
            'accuracy [%]':round((metric_target_counter/test_questions_number)*100,2),
            'average execution time [s]':round(t,4)}
    results = pd.concat([results, pd.DataFrame([data])], ignore_index=True)

## Heavily modified questions - sum arithmetics

In [None]:
# Evaluate the heavily modified questions with "sum" sentence vectors: for
# each metric, record the rank of the original question among the neighbours
# of its modified version.
euclidean_ranks = []
manhattan_ranks = []
cos_ranks = []

# Number of test questions whose original was ranked within the top 300.
euclidean_target_counter = 0
manhattan_target_counter = 0
cos_target_counter = 0

start = time.time()

for i in range(test_questions_number):

    original_question = qm_base["original question"][i]
    # kneighbors expects a 2-D array, so the vector becomes a single row.
    modified_question = np.reshape(qhm_vectorized_sum[i],(1,-1))

    distances_euclidean, indices_euclidean = nbrs_euclidean_sum.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan_sum.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine_sum.kneighbors(modified_question)

    # Neighbour indices are positional, hence iloc.
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()

    # 1-based rank; list.index raises ValueError when the original question
    # is not among the returned neighbours.
    euclidean_rank = nbrs_list_euclidean.index(original_question)+1
    manhattan_rank = nbrs_list_manhattan.index(original_question)+1
    cos_rank = nbrs_list_cosine.index(original_question)+1

    euclidean_ranks.append(euclidean_rank)
    manhattan_ranks.append(manhattan_rank)
    cos_ranks.append(cos_rank)

    if euclidean_rank <= 300:
        euclidean_target_counter += 1
    if manhattan_rank <= 300:
        manhattan_target_counter += 1
    if cos_rank <= 300:
        cos_target_counter += 1

end = time.time()
# The timer spans the queries of all three metrics, so this is the combined
# time per test question and is reported unchanged for every metric.
t = (end-start)/test_questions_number

print("Heavy modifications - euclidian ranks:", euclidean_ranks)
print("Heavy modifications - manhattan ranks:", manhattan_ranks)
print("Heavy modifications - cos ranks:", cos_ranks)

# One result row per metric. The average is taken over the whole list of
# ranks; averaging the loop's scalar would only report the last question.
# Rows are added with pd.concat because DataFrame.append was removed in
# pandas 2.0.
for metric_name, metric_ranks, metric_target_counter in (
        ('euclidean', euclidean_ranks, euclidean_target_counter),
        ('manhattan', manhattan_ranks, manhattan_target_counter),
        ('cosine', cos_ranks, cos_target_counter)):
    data = {'arithmetics':'sum',
            'questions modification':'heavy',
            'metric':metric_name,
            'average sentence rank':round(np.mean(metric_ranks)),
            'accuracy [%]':round((metric_target_counter/test_questions_number)*100,2),
            'average execution time [s]':round(t,4)}
    results = pd.concat([results, pd.DataFrame([data])], ignore_index=True)

## Heavily modified questions - mean arithmetics

In [None]:
# Evaluate the heavily modified questions with "mean" sentence vectors: for
# each metric, record the rank of the original question among the neighbours
# of its modified version.
euclidean_ranks = []
manhattan_ranks = []
cos_ranks = []

# Number of test questions whose original was ranked within the top 300.
euclidean_target_counter = 0
manhattan_target_counter = 0
cos_target_counter = 0

start = time.time()

for i in range(test_questions_number):

    original_question = qm_base["original question"][i]
    # kneighbors expects a 2-D array, so the vector becomes a single row.
    modified_question = np.reshape(qhm_vectorized_mean[i],(1,-1))

    distances_euclidean, indices_euclidean = nbrs_euclidean_mean.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan_mean.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine_mean.kneighbors(modified_question)

    # Neighbour indices are positional, hence iloc.
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()

    # 1-based rank; list.index raises ValueError when the original question
    # is not among the returned neighbours.
    euclidean_rank = nbrs_list_euclidean.index(original_question)+1
    manhattan_rank = nbrs_list_manhattan.index(original_question)+1
    cos_rank = nbrs_list_cosine.index(original_question)+1

    euclidean_ranks.append(euclidean_rank)
    manhattan_ranks.append(manhattan_rank)
    cos_ranks.append(cos_rank)

    if euclidean_rank <= 300:
        euclidean_target_counter += 1
    if manhattan_rank <= 300:
        manhattan_target_counter += 1
    if cos_rank <= 300:
        cos_target_counter += 1

end = time.time()
# The timer spans the queries of all three metrics, so this is the combined
# time per test question and is reported unchanged for every metric.
t = (end-start)/test_questions_number

print("Heavy modifications - euclidian ranks:", euclidean_ranks)
print("Heavy modifications - manhattan ranks:", manhattan_ranks)
print("Heavy modifications - cos ranks:", cos_ranks)

# One result row per metric. The average is taken over the whole list of
# ranks; averaging the loop's scalar would only report the last question.
# Rows are added with pd.concat because DataFrame.append was removed in
# pandas 2.0.
for metric_name, metric_ranks, metric_target_counter in (
        ('euclidean', euclidean_ranks, euclidean_target_counter),
        ('manhattan', manhattan_ranks, manhattan_target_counter),
        ('cosine', cos_ranks, cos_target_counter)):
    data = {'arithmetics':'mean',
            'questions modification':'heavy',
            'metric':metric_name,
            'average sentence rank':round(np.mean(metric_ranks)),
            'accuracy [%]':round((metric_target_counter/test_questions_number)*100,2),
            'average execution time [s]':round(t,4)}
    results = pd.concat([results, pd.DataFrame([data])], ignore_index=True)

## View of results

In [None]:
# Write the collected results to a CSV file, without the row index.
results.to_csv("results - word vectorization - POS.csv",index=False)

# Display up to the first 60 result rows.
results.head(60)