## Importing libraries 

In [1]:
import time
import pandas as pd
import numpy as np
import copy as cp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.neighbors import NearestNeighbors

## Bases importing

In [2]:
qa_base = pd.read_csv('./insurance_qna_dataset.csv',sep='\t',index_col=0)
questions_number = qa_base.shape[0]

qm_base = pd.read_csv("Test questions dataset.csv")
test_questions_number = qm_base.shape[0]

# qa_base.head()
qm_base.head(21)

Unnamed: 0,original question,modified question – light,modified question – medium,modified question – heavy
0,How Many People Go Without Health Insurance?,How Many People Live Without Health Insurance?,How Many People Lack Health Coverage?,How Many Individuals Lack Good Health Coverage?
1,What Is The Purpose Of A Life Insurance Policy?,What Is The Point Of A Life Insurance Policy?,What Serves As A Life Insurance Policy's Objec...,What Do Life Insurance Policies Serve?
2,How Much Is A Typical Homeowners Insurance?,How Much Is An Average Homeowners Insurance?,How Much Does The Average Homeowners Insurance...,What Is the Average Cost Of Homeowners Insurance?
3,Is Disability Insurance Worth Having?,Is Disability Insurance Good?,Is Disability Insurance Worth Enough To Have It?,What Is The Value of Disability Insurance?
4,Is Vision Covered Under Medicare?,Is Vision Not Covered Under Medicare?,Does Medicare Cover Vision Care?,Is Medicare Going To Cover Vision Care?
5,Does Life Insurance Cover Both Spouses?,Does Life Insurance Cover Both Spouse?,Does Life Insurance Cover Both Partners?,Are Both Spouses Covered By Life Insurance?
6,Can Bad Credit Affect Car Insurance?,Can Bad Credit Affect Cars Insurance?,Can Bad Credits Affects Cars Insurance?,Can A Poor Credit History Affect Auto Insurance?
7,How Much Should I Get Life Insurance For?,How Much Should I I I Get Life Insurance For?,How Long Should I Get Life Insurance For?,How Much Life Insurance Should I Purchase?
8,Who Underwrites Hsbc Life Insurance?,Who Underwrite Hsbc Life Insurances?,Who Underwrites Hsbbcc Life Insurances?,Who Drafts The Hsbc Life Insurance Policy?
9,How Much Is Home Insurance Per Month?,How Much Is Home Insurance Per Months?,How Much Is House Insurance Per Month?,How Much Does Monthly Home Insurance Cost?


## Recording results

In [3]:
results = pd.DataFrame(columns = ['method',
                                  'word simplification',
                                  'questions modification',
                                  'metric',
                                  'average sentence rank',
                                  'average execution time [s]'])

## TF-IDF vectorization - loop method

In [4]:
vectorizer = TfidfVectorizer(ngram_range = (1,2))
qa_base_vectorized = vectorizer.fit_transform(qa_base["Question"])

qlm_vectorized = vectorizer.transform(qm_base["modified question – light"])
qmm_vectorized = vectorizer.transform(qm_base["modified question – medium"])
qhm_vectorized = vectorizer.transform(qm_base["modified question – heavy"])

### Lightly modified questions - loop method

In [5]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):

    original_question = qm_base["original question"][i]
    modified_question = qlm_vectorized[i]
    
    distance_euclidean = pairwise_distances(modified_question, qa_base_vectorized, metric='euclidean')
    distance_manhattan = pairwise_distances(modified_question, qa_base_vectorized, metric='manhattan')
    cos_similarity = cosine_similarity(modified_question, qa_base_vectorized)
    
    distance_euclidean_indexes = np.argsort(distance_euclidean)
    distance_manhattan_indexes = np.argsort(distance_manhattan)
    cos_similarity_indexes = np.argsort(cos_similarity)
  
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_euclidean_indexes[0][j]]
        if original_question == ranked_question:
            euclidean_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_manhattan_indexes[0][j]]
        if original_question == ranked_question:
            manhattan_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][cos_similarity_indexes[0][::-1][j]]
        if original_question == ranked_question:
            cos_rank.append(j+1)
            break
            
end = time.time()
t = (end-start)/test_questions_number
            
print("Light modifications - euclidean rank:", euclidean_rank)
print("Light modifications - manhattan rank:", manhattan_rank)
print("Light modifications - cos rank:", cos_rank)

data = {'method':'loop',
         'word simplification':'/',
         'questions modification':'light',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'/',
         'questions modification':'light',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'/',
         'questions modification':'light',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Light modifications - euclidean rank: [1, 7, 59, 76, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 21, 1, 5, 1, 252]
Light modifications - manhattan rank: [1, 5, 9, 77, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 10, 1, 7, 1, 64]
Light modifications - cos rank: [1, 7, 59, 76, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 21, 1, 5, 1, 252]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


### Medium modified questions - loop method

In [6]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):

    original_question = qm_base["original question"][i]
    modified_question = qmm_vectorized[i]
    
    distance_euclidean = pairwise_distances(modified_question, qa_base_vectorized, metric='euclidean')
    distance_manhattan = pairwise_distances(modified_question, qa_base_vectorized, metric='manhattan')
    cos_similarity = cosine_similarity(modified_question, qa_base_vectorized)
    
    distance_euclidean_indexes = np.argsort(distance_euclidean)
    distance_manhattan_indexes = np.argsort(distance_manhattan)
    cos_similarity_indexes = np.argsort(cos_similarity)
  
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_euclidean_indexes[0][j]]
        if original_question == ranked_question:
            euclidean_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_manhattan_indexes[0][j]]
        if original_question == ranked_question:
            manhattan_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][cos_similarity_indexes[0][::-1][j]]
        if original_question == ranked_question:
            cos_rank.append(j+1)
            break
            
end = time.time()
t = (end-start)/test_questions_number
            
print("Medium modifications - euclidian rank:", euclidean_rank)
print("Medium modifications - manhattan rank:", manhattan_rank)
print("Medium modifications - cos rank:", cos_rank)

data = {'method':'loop',
         'word simplification':'/',
         'questions modification':'medium',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'/',
         'questions modification':'medium',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'/',
         'questions modification':'medium',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Medium modifications - euclidian rank: [14, 133, 739, 5, 189, 1, 1, 3, 2, 8, 1, 37, 1, 1, 5, 1, 1, 1, 119, 1, 169]
Medium modifications - manhattan rank: [11, 543, 346, 6, 395, 1, 6, 1, 2, 8, 1, 19, 1, 1, 1, 1, 1, 1, 64, 15, 70]
Medium modifications - cos rank: [14, 133, 739, 5, 189, 1, 1, 3, 2, 8, 1, 37, 1, 1, 5, 1, 1, 1, 119, 1, 169]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


### Heavily modified questions - loop method

In [7]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):

    original_question = qm_base["original question"][i]
    modified_question = qhm_vectorized[i]
    
    distance_euclidean = pairwise_distances(modified_question, qa_base_vectorized, metric='euclidean')
    distance_manhattan = pairwise_distances(modified_question, qa_base_vectorized, metric='manhattan')
    cos_similarity = cosine_similarity(modified_question, qa_base_vectorized)
    
    distance_euclidean_indexes = np.argsort(distance_euclidean)
    distance_manhattan_indexes = np.argsort(distance_manhattan)
    cos_similarity_indexes = np.argsort(cos_similarity)
  
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_euclidean_indexes[0][j]]
        if original_question == ranked_question:
            euclidean_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_manhattan_indexes[0][j]]
        if original_question == ranked_question:
            manhattan_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][cos_similarity_indexes[0][::-1][j]]
        if original_question == ranked_question:
            cos_rank.append(j+1)
            break
            
end = time.time()
t = (end-start)/test_questions_number

print("Heavy modifications - euclidian rank:", euclidean_rank)
print("Heavy modifications - manhattan rank:", manhattan_rank)
print("Heavy modifications - cos rank:", cos_rank)

data = {'method':'loop',
         'word simplification':'/',
         'questions modification':'heavy',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'/',
         'questions modification':'heavy',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'/',
         'questions modification':'heavy',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Heavy modifications - euclidian rank: [30, 1704, 1229, 1071, 36, 1, 62, 205, 1, 582, 45, 647, 11, 9, 705, 14, 1, 1, 18, 1, 92]
Heavy modifications - manhattan rank: [23, 7716, 971, 688, 104, 1, 58, 131, 1, 555, 429, 767, 15, 4, 1572, 5, 1, 1, 2, 1, 48]
Heavy modifications - cos rank: [30, 1704, 1229, 1071, 36, 1, 62, 205, 1, 582, 45, 647, 11, 9, 705, 14, 1, 1, 18, 1, 92]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


## TF-IDF vectorization with stemming - loop method

In [8]:
qa_base_stem = cp.deepcopy(qa_base["Question"])
ps = PorterStemmer()

for i in range(questions_number):
    sentence = qa_base_stem[i]
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(ps.stem(word))
        stem_sentence.append(" ")
        stemmed_sentence = "".join(stem_sentence)
    qa_base_stem.iloc[i] = stemmed_sentence

vectorizer = TfidfVectorizer(ngram_range = (1,2))
qa_base_vectorized = vectorizer.fit_transform(qa_base_stem)

qlm_vectorized = vectorizer.transform(qm_base["modified question – light"])
qmm_vectorized = vectorizer.transform(qm_base["modified question – medium"])
qhm_vectorized = vectorizer.transform(qm_base["modified question – heavy"])

### Lightly modified questions with stemming - loop method

In [9]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):
    original_question = qm_base["original question"][i]
    modified_question = qlm_vectorized[i]
    
    distance_euclidean = pairwise_distances(modified_question, qa_base_vectorized, metric='euclidean')
    distance_manhattan = pairwise_distances(modified_question, qa_base_vectorized, metric='manhattan')
    cos_similarity = cosine_similarity(modified_question, qa_base_vectorized)
    
    distance_euclidean_indexes = np.argsort(distance_euclidean)
    distance_manhattan_indexes = np.argsort(distance_manhattan)
    cos_similarity_indexes = np.argsort(cos_similarity)
  
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_euclidean_indexes[0][j]]
        if original_question == ranked_question:
            euclidean_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_manhattan_indexes[0][j]]
        if original_question == ranked_question:
            manhattan_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][cos_similarity_indexes[0][::-1][j]]
        if original_question == ranked_question:
            cos_rank.append(j+1)
            break
            
end = time.time()
t = (end-start)/test_questions_number

print("Light modifications - euclidean rank:", euclidean_rank)
print("Light modifications - manhattan rank:", manhattan_rank)
print("Light modifications - cos rank:", cos_rank)

data = {'method':'loop',
         'word simplification':'stemming',
         'questions modification':'light',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'stemming',
         'questions modification':'light',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'stemming',
         'questions modification':'light',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Light modifications - euclidean rank: [4, 22, 275, 1208, 1, 1, 1, 1, 1, 2, 2, 1, 44, 146, 1, 1, 57, 1, 71, 12, 976]
Light modifications - manhattan rank: [11, 14, 166, 1573, 1, 1, 1, 1, 1, 4, 7, 1, 22, 235, 5, 1, 49, 1, 57, 182, 397]
Light modifications - cos rank: [4, 22, 275, 1208, 1, 1, 1, 1, 1, 2, 2, 1, 44, 146, 1, 1, 57, 1, 71, 12, 976]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


### Medium modified questions with stemming - loop method

In [10]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):

    original_question = qm_base["original question"][i]
    modified_question = qmm_vectorized[i]
    
    distance_euclidean = pairwise_distances(modified_question, qa_base_vectorized, metric='euclidean')
    distance_manhattan = pairwise_distances(modified_question, qa_base_vectorized, metric='manhattan')
    cos_similarity = cosine_similarity(modified_question, qa_base_vectorized)
    
    distance_euclidean_indexes = np.argsort(distance_euclidean)
    distance_manhattan_indexes = np.argsort(distance_manhattan)
    cos_similarity_indexes = np.argsort(cos_similarity)
  
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_euclidean_indexes[0][j]]
        if original_question == ranked_question:
            euclidean_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_manhattan_indexes[0][j]]
        if original_question == ranked_question:
            manhattan_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][cos_similarity_indexes[0][::-1][j]]
        if original_question == ranked_question:
            cos_rank.append(j+1)
            break
            
end = time.time()
t = (end-start)/test_questions_number
            
print("Medium modifications - euclidian rank:", euclidean_rank)
print("Medium modifications - manhattan rank:", manhattan_rank)
print("Medium modifications - cos rank:", cos_rank)

data = {'method':'loop',
         'word simplification':'stemming',
         'questions modification':'medium',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'stemming',
         'questions modification':'medium',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'stemming',
         'questions modification':'medium',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Medium modifications - euclidian rank: [447, 865, 1147, 115, 8, 1, 1, 5, 592, 9, 2, 1, 243, 225, 4, 1, 1, 1, 621, 193, 694]
Medium modifications - manhattan rank: [5844, 9857, 493, 29, 13, 1, 7, 2, 264, 9, 7, 1, 170, 3139, 43, 1, 1, 72, 990, 8161, 275]
Medium modifications - cos rank: [447, 865, 1147, 115, 8, 1, 1, 5, 592, 9, 2, 1, 243, 225, 4, 1, 1, 1, 621, 193, 694]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


### Heavily modified questions with stemming - loop method

In [11]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):

    original_question = qm_base["original question"][i]
    modified_question = qhm_vectorized[i]
    
    distance_euclidean = pairwise_distances(modified_question, qa_base_vectorized, metric='euclidean')
    distance_manhattan = pairwise_distances(modified_question, qa_base_vectorized, metric='manhattan')
    cos_similarity = cosine_similarity(modified_question, qa_base_vectorized)
    
    distance_euclidean_indexes = np.argsort(distance_euclidean)
    distance_manhattan_indexes = np.argsort(distance_manhattan)
    cos_similarity_indexes = np.argsort(cos_similarity)
  
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_euclidean_indexes[0][j]]
        if original_question == ranked_question:
            euclidean_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_manhattan_indexes[0][j]]
        if original_question == ranked_question:
            manhattan_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][cos_similarity_indexes[0][::-1][j]]
        if original_question == ranked_question:
            cos_rank.append(j+1)
            break
            
end = time.time()
t = (end-start)/test_questions_number

print("Heavy modifications - euclidian rank:", euclidean_rank)
print("Heavy modifications - manhattan rank:", manhattan_rank)
print("Heavy modifications - cos rank:", cos_rank)

data = {'method':'loop',
         'word simplification':'stemming',
         'questions modification':'heavy',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'stemming',
         'questions modification':'heavy',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'stemming',
         'questions modification':'heavy',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Heavy modifications - euclidian rank: [1007, 2265, 8486, 6479, 10, 11, 40, 91, 1, 346, 243, 419, 150, 146, 25, 20, 1, 1, 41, 8, 171]
Heavy modifications - manhattan rank: [6162, 10445, 6266, 3545, 14, 19, 40, 79, 1, 519, 4942, 543, 88, 235, 69, 25, 1, 1, 32, 21, 84]
Heavy modifications - cos rank: [1007, 2265, 8486, 6479, 10, 11, 40, 91, 1, 346, 243, 419, 150, 146, 25, 20, 1, 1, 41, 8, 171]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


## TF-IDF vectorization with lemmatization - loop method

In [12]:
qa_base_lem = cp.deepcopy(qa_base["Question"])
wnl = WordNetLemmatizer()

for i in range(questions_number):
    sentence = qa_base_lem[i]
    token_words=word_tokenize(sentence)
    lem_sentence=[]
    for word in token_words:
        lem_sentence.append(wnl.lemmatize(word))
        lem_sentence.append(" ")
        lematizated_sentence = "".join(lem_sentence)
    qa_base_lem.iloc[i] = lematizated_sentence

vectorizer = TfidfVectorizer(ngram_range = (1,2))
qa_base_vectorized = vectorizer.fit_transform(qa_base_lem)

qlm_vectorized = vectorizer.transform(qm_base["modified question – light"])
qmm_vectorized = vectorizer.transform(qm_base["modified question – medium"])
qhm_vectorized = vectorizer.transform(qm_base["modified question – heavy"])

### Lightly modified questions with lemmatization - loop method

In [13]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):
    
    original_question = qm_base["original question"][i]
    modified_question = qlm_vectorized[i]
    
    distance_euclidean = pairwise_distances(modified_question, qa_base_vectorized, metric='euclidean')
    distance_manhattan = pairwise_distances(modified_question, qa_base_vectorized, metric='manhattan')
    cos_similarity = cosine_similarity(modified_question, qa_base_vectorized)
    
    distance_euclidean_indexes = np.argsort(distance_euclidean)
    distance_manhattan_indexes = np.argsort(distance_manhattan)
    cos_similarity_indexes = np.argsort(cos_similarity)
  
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_euclidean_indexes[0][j]]
        if original_question == ranked_question:
            euclidean_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_manhattan_indexes[0][j]]
        if original_question == ranked_question:
            manhattan_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][cos_similarity_indexes[0][::-1][j]]
        if original_question == ranked_question:
            cos_rank.append(j+1)
            break
            
end = time.time()
t = (end-start)/test_questions_number

print("Light modifications - euclidean rank:", euclidean_rank)
print("Light modifications - manhattan rank:", manhattan_rank)
print("Light modifications - cos rank:", cos_rank)

data = {'method':'loop',
         'word simplification':'lemmatization',
         'questions modification':'light',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'lemmatization',
         'questions modification':'light',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'lemmatization',
         'questions modification':'light',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Light modifications - euclidean rank: [1, 7, 59, 76, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 21, 1, 5, 1, 248]
Light modifications - manhattan rank: [1, 5, 9, 77, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 10, 1, 7, 1, 64]
Light modifications - cos rank: [1, 7, 59, 76, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 21, 1, 5, 1, 248]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


### Medium modified questions with lemmatization - loop method

In [14]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):

    original_question = qm_base["original question"][i]
    modified_question = qmm_vectorized[i]
    
    distance_euclidean = pairwise_distances(modified_question, qa_base_vectorized, metric='euclidean')
    distance_manhattan = pairwise_distances(modified_question, qa_base_vectorized, metric='manhattan')
    cos_similarity = cosine_similarity(modified_question, qa_base_vectorized)
    
    distance_euclidean_indexes = np.argsort(distance_euclidean)
    distance_manhattan_indexes = np.argsort(distance_manhattan)
    cos_similarity_indexes = np.argsort(cos_similarity)
  
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_euclidean_indexes[0][j]]
        if original_question == ranked_question:
            euclidean_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_manhattan_indexes[0][j]]
        if original_question == ranked_question:
            manhattan_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][cos_similarity_indexes[0][::-1][j]]
        if original_question == ranked_question:
            cos_rank.append(j+1)
            break
            
end = time.time()
t = (end-start)/test_questions_number

print("Medium modifications - euclidian rank:", euclidean_rank)
print("Medium modifications - manhattan rank:", manhattan_rank)
print("Medium modifications - cos rank:", cos_rank)

data = {'method':'loop',
         'word simplification':'lemmatization',
         'questions modification':'medium',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'lemmatization',
         'questions modification':'medium',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'lemmatization',
         'questions modification':'medium',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Medium modifications - euclidian rank: [14, 123, 739, 5, 192, 1, 1, 3, 2, 5, 1, 37, 1, 1, 5, 1, 1, 1, 119, 1, 167]
Medium modifications - manhattan rank: [11, 541, 331, 6, 396, 1, 6, 1, 2, 8, 1, 17, 1, 1, 1, 1, 1, 1, 64, 25, 70]
Medium modifications - cos rank: [14, 123, 739, 5, 192, 1, 1, 3, 2, 5, 1, 37, 1, 1, 5, 1, 1, 1, 119, 1, 167]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


### Heavily modified questions with lemmatization - loop method

In [15]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):

    original_question = qm_base["original question"][i]
    modified_question = qhm_vectorized[i]
    
    distance_euclidean = pairwise_distances(modified_question, qa_base_vectorized, metric='euclidean')
    distance_manhattan = pairwise_distances(modified_question, qa_base_vectorized, metric='manhattan')
    cos_similarity = cosine_similarity(modified_question, qa_base_vectorized)
    
    distance_euclidean_indexes = np.argsort(distance_euclidean)
    distance_manhattan_indexes = np.argsort(distance_manhattan)
    cos_similarity_indexes = np.argsort(cos_similarity)
  
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_euclidean_indexes[0][j]]
        if original_question == ranked_question:
            euclidean_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][distance_manhattan_indexes[0][j]]
        if original_question == ranked_question:
            manhattan_rank.append(j+1)
            break
            
    for j in range(questions_number):
        ranked_question = qa_base["Question"][cos_similarity_indexes[0][::-1][j]]
        if original_question == ranked_question:
            cos_rank.append(j+1)
            break
            
end = time.time()
t = (end-start)/test_questions_number

print("Heavy modifications - euclidian rank:", euclidean_rank)
print("Heavy modifications - manhattan rank:", manhattan_rank)
print("Heavy modifications - cos rank:", cos_rank)

data = {'method':'loop',
         'word simplification':'lemmatization',
         'questions modification':'heavy',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'lemmatization',
         'questions modification':'heavy',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'loop',
         'word simplification':'lemmatization',
         'questions modification':'heavy',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Heavy modifications - euclidian rank: [30, 1661, 1228, 1071, 36, 1, 62, 205, 1, 573, 45, 643, 11, 9, 687, 14, 1, 1, 18, 1, 92]
Heavy modifications - manhattan rank: [23, 7717, 971, 689, 104, 1, 58, 131, 1, 540, 429, 751, 15, 4, 1600, 5, 1, 1, 2, 1, 48]
Heavy modifications - cos rank: [30, 1661, 1228, 1071, 36, 1, 62, 205, 1, 573, 45, 643, 11, 9, 687, 14, 1, 1, 18, 1, 92]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


## TF-IDF vectorization - nearest neighbors method

In [16]:
vectorizer = TfidfVectorizer(ngram_range = (1,2))
qa_base_vectorized = vectorizer.fit_transform(qa_base["Question"])

qlm_vectorized = vectorizer.transform(qm_base["modified question – light"])
qmm_vectorized = vectorizer.transform(qm_base["modified question – medium"])
qhm_vectorized = vectorizer.transform(qm_base["modified question – heavy"])

nbrs_euclidean = NearestNeighbors(n_neighbors=20000, metric="euclidean").fit(qa_base_vectorized)
nbrs_manhattan = NearestNeighbors(n_neighbors=20000, metric="manhattan").fit(qa_base_vectorized)
nbrs_cosine = NearestNeighbors(n_neighbors=20000, metric="cosine").fit(qa_base_vectorized)

### Lightly modified questions - nearest neighbors method

In [17]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):
    
    original_question = qm_base["original question"][i]
    modified_question = qlm_vectorized[i]
    
    distances_euclidean, indices_euclidean = nbrs_euclidean.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine.kneighbors(modified_question)
    
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()
    
    euclidean_rank.append(nbrs_list_euclidean.index(original_question)+1)
    manhattan_rank.append(nbrs_list_manhattan.index(original_question)+1)
    cos_rank.append(nbrs_list_cosine.index(original_question)+1)
    
end = time.time()
t = (end-start)/test_questions_number

print("Light modifications - euclidian rank:", euclidean_rank)
print("Light modifications - manhattan rank:", manhattan_rank)
print("Light modifications - cos rank:", cos_rank)

data = {'method':'nearest neighbors',
         'word simplification':'/',
         'questions modification':'light',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'/',
         'questions modification':'light',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'/',
         'questions modification':'light',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Light modifications - euclidian rank: [1, 7, 59, 76, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 21, 1, 5, 1, 252]
Light modifications - manhattan rank: [1, 5, 9, 77, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 10, 1, 7, 1, 64]
Light modifications - cos rank: [1, 7, 59, 76, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 21, 1, 5, 1, 252]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


### Medium modified questions - nearest neighbors method

In [18]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):
    
    original_question = qm_base["original question"][i]
    modified_question = qmm_vectorized[i]
    
    distances_euclidean, indices_euclidean = nbrs_euclidean.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine.kneighbors(modified_question)
    
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()
    
    euclidean_rank.append(nbrs_list_euclidean.index(original_question)+1)
    manhattan_rank.append(nbrs_list_manhattan.index(original_question)+1)
    cos_rank.append(nbrs_list_cosine.index(original_question)+1)
    
end = time.time()
t = (end-start)/test_questions_number

print("Medium modifications - euclidian rank:", euclidean_rank)
print("Medium modifications - manhattan rank:", manhattan_rank)
print("Medium modifications - cos rank:", cos_rank)

data = {'method':'nearest neighbors',
         'word simplification':'/',
         'questions modification':'medium',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'/',
         'questions modification':'medium',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'/',
         'questions modification':'medium',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Medium modifications - euclidian rank: [14, 133, 739, 5, 189, 1, 1, 3, 2, 8, 1, 37, 1, 1, 5, 1, 1, 1, 119, 1, 169]
Medium modifications - manhattan rank: [11, 543, 346, 6, 395, 1, 6, 1, 2, 8, 1, 19, 1, 1, 1, 1, 1, 1, 64, 15, 70]
Medium modifications - cos rank: [14, 133, 739, 5, 189, 1, 1, 3, 2, 8, 1, 37, 1, 1, 5, 1, 1, 1, 119, 1, 169]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


### Heavily modified questions - nearest neighbors method

In [19]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):
    
    original_question = qm_base["original question"][i]
    modified_question = qhm_vectorized[i]
    
    distances_euclidean, indices_euclidean = nbrs_euclidean.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine.kneighbors(modified_question)
    
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()
    
    euclidean_rank.append(nbrs_list_euclidean.index(original_question)+1)
    manhattan_rank.append(nbrs_list_manhattan.index(original_question)+1)
    cos_rank.append(nbrs_list_cosine.index(original_question)+1)
    
end = time.time()
t = (end-start)/test_questions_number

print("Heavy modifications - euclidian rank:", euclidean_rank)
print("Heavy modifications - manhattan rank:", manhattan_rank)
print("Heavy modifications - cos rank:", cos_rank)

data = {'method':'nearest neighbors',
         'word simplification':'/',
         'questions modification':'heavy',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'/',
         'questions modification':'heavy',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'/',
         'questions modification':'heavy',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Heavy modifications - euclidian rank: [30, 1704, 1229, 1071, 36, 1, 62, 205, 1, 582, 45, 647, 11, 9, 705, 14, 1, 1, 18, 1, 92]
Heavy modifications - manhattan rank: [23, 7716, 971, 688, 104, 1, 58, 131, 1, 555, 429, 767, 15, 4, 1572, 5, 1, 1, 2, 1, 48]
Heavy modifications - cos rank: [30, 1704, 1229, 1071, 36, 1, 62, 205, 1, 582, 45, 647, 11, 9, 705, 14, 1, 1, 18, 1, 92]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


## TF-IDF vectorization with stemming - nearest neighbors method

In [20]:
qa_base_stem = cp.deepcopy(qa_base["Question"])
ps = PorterStemmer()

for i in range(questions_number):
    sentence = qa_base_stem[i]
    token_words=word_tokenize(sentence)
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(ps.stem(word))
        stem_sentence.append(" ")
        stemmed_sentence = "".join(stem_sentence)
    qa_base_stem.iloc[i] = stemmed_sentence

vectorizer = TfidfVectorizer(ngram_range = (1,2))
qa_base_vectorized = vectorizer.fit_transform(qa_base_stem)

qlm_vectorized = vectorizer.transform(qm_base["modified question – light"])
qmm_vectorized = vectorizer.transform(qm_base["modified question – medium"])
qhm_vectorized = vectorizer.transform(qm_base["modified question – heavy"])

nbrs_euclidean = NearestNeighbors(n_neighbors=20000, metric="euclidean").fit(qa_base_vectorized)
nbrs_manhattan = NearestNeighbors(n_neighbors=20000, metric="manhattan").fit(qa_base_vectorized)
nbrs_cosine = NearestNeighbors(n_neighbors=20000, metric="cosine").fit(qa_base_vectorized)

### Lightly modified questions with stemming - nearest neighbors method

In [21]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):
    
    original_question = qm_base["original question"][i]
    modified_question = qlm_vectorized[i]
    
    distances_euclidean, indices_euclidean = nbrs_euclidean.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine.kneighbors(modified_question)
    
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()
    
    euclidean_rank.append(nbrs_list_euclidean.index(original_question)+1)
    manhattan_rank.append(nbrs_list_manhattan.index(original_question)+1)
    cos_rank.append(nbrs_list_cosine.index(original_question)+1)
    
end = time.time()
t = (end-start)/test_questions_number

print("Light modifications - euclidian rank:", euclidean_rank)
print("Light modifications - manhattan rank:", manhattan_rank)
print("Light modifications - cos rank:", cos_rank)

data = {'method':'nearest neighbors',
         'word simplification':'stemming',
         'questions modification':'light',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'stemming',
         'questions modification':'light',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'stemming',
         'questions modification':'light',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Light modifications - euclidian rank: [4, 22, 275, 1208, 1, 1, 1, 1, 1, 2, 2, 1, 44, 146, 1, 1, 57, 1, 71, 12, 976]
Light modifications - manhattan rank: [11, 14, 166, 1573, 1, 1, 1, 1, 1, 4, 7, 1, 22, 235, 5, 1, 49, 1, 57, 182, 397]
Light modifications - cos rank: [4, 22, 275, 1208, 1, 1, 1, 1, 1, 2, 2, 1, 44, 146, 1, 1, 57, 1, 71, 12, 976]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


### Medium modified questions with stemming - nearest neighbors method

In [22]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):
    
    original_question = qm_base["original question"][i]
    modified_question = qmm_vectorized[i]
    
    distances_euclidean, indices_euclidean = nbrs_euclidean.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine.kneighbors(modified_question)
    
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()
    
    euclidean_rank.append(nbrs_list_euclidean.index(original_question)+1)
    manhattan_rank.append(nbrs_list_manhattan.index(original_question)+1)
    cos_rank.append(nbrs_list_cosine.index(original_question)+1)
    
end = time.time()
t = (end-start)/test_questions_number

print("Medium modifications - euclidian rank:", euclidean_rank)
print("Medium modifications - manhattan rank:", manhattan_rank)
print("Medium modifications - cos rank:", cos_rank)

data = {'method':'nearest neighbors',
         'word simplification':'stemming',
         'questions modification':'medium',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'stemming',
         'questions modification':'medium',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'stemming',
         'questions modification':'medium',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Medium modifications - euclidian rank: [447, 865, 1147, 115, 8, 1, 1, 5, 592, 9, 2, 1, 243, 225, 4, 1, 1, 1, 621, 193, 694]
Medium modifications - manhattan rank: [5844, 9857, 493, 29, 13, 1, 7, 2, 264, 9, 7, 1, 170, 3139, 43, 1, 1, 72, 990, 8161, 275]
Medium modifications - cos rank: [447, 865, 1147, 115, 8, 1, 1, 5, 592, 9, 2, 1, 243, 225, 4, 1, 1, 1, 621, 193, 694]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


### Heavily modified questions with stemming - nearest neighbors method

In [23]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):
    
    original_question = qm_base["original question"][i]
    modified_question = qhm_vectorized[i]
    
    distances_euclidean, indices_euclidean = nbrs_euclidean.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine.kneighbors(modified_question)
    
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()
    
    euclidean_rank.append(nbrs_list_euclidean.index(original_question)+1)
    manhattan_rank.append(nbrs_list_manhattan.index(original_question)+1)
    cos_rank.append(nbrs_list_cosine.index(original_question)+1)
    
end = time.time()
t = (end-start)/test_questions_number

print("Heavy modifications - euclidian rank:", euclidean_rank)
print("Heavy modifications - manhattan rank:", manhattan_rank)
print("Heavy modifications - cos rank:", cos_rank)

data = {'method':'nearest neighbors',
         'word simplification':'stemming',
         'questions modification':'heavy',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'stemming',
         'questions modification':'heavy',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'stemming',
         'questions modification':'heavy',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Heavy modifications - euclidian rank: [1007, 2265, 8486, 6479, 10, 11, 40, 91, 1, 346, 243, 419, 150, 146, 25, 20, 1, 1, 41, 8, 171]
Heavy modifications - manhattan rank: [6162, 10445, 6266, 3545, 14, 19, 40, 79, 1, 519, 4942, 543, 88, 235, 69, 25, 1, 1, 32, 21, 84]
Heavy modifications - cos rank: [1007, 2265, 8486, 6479, 10, 11, 40, 91, 1, 346, 243, 419, 150, 146, 25, 20, 1, 1, 41, 8, 171]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


## TF-IDF vectorization with lemmatization - nearest neighbors method

In [24]:
qa_base_lem = cp.deepcopy(qa_base["Question"])
wnl = WordNetLemmatizer()

for i in range(questions_number):
    sentence = qa_base_lem[i]
    token_words=word_tokenize(sentence)
    lem_sentence=[]
    for word in token_words:
        lem_sentence.append(wnl.lemmatize(word))
        lem_sentence.append(" ")
        lematizated_sentence = "".join(lem_sentence)
    qa_base_lem.iloc[i] = lematizated_sentence

vectorizer = TfidfVectorizer(ngram_range = (1,2))
qa_base_vectorized = vectorizer.fit_transform(qa_base_lem)

qlm_vectorized = vectorizer.transform(qm_base["modified question – light"])
qmm_vectorized = vectorizer.transform(qm_base["modified question – medium"])
qhm_vectorized = vectorizer.transform(qm_base["modified question – heavy"])

nbrs_euclidean = NearestNeighbors(n_neighbors=20000, metric="euclidean").fit(qa_base_vectorized)
nbrs_manhattan = NearestNeighbors(n_neighbors=20000, metric="manhattan").fit(qa_base_vectorized)
nbrs_cosine = NearestNeighbors(n_neighbors=20000, metric="cosine").fit(qa_base_vectorized)

### Lightly modified questions with lemmatization - nearest neighbors method

In [25]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):
    
    original_question = qm_base["original question"][i]
    modified_question = qlm_vectorized[i]
    
    distances_euclidean, indices_euclidean = nbrs_euclidean.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine.kneighbors(modified_question)
    
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()
    
    euclidean_rank.append(nbrs_list_euclidean.index(original_question)+1)
    manhattan_rank.append(nbrs_list_manhattan.index(original_question)+1)
    cos_rank.append(nbrs_list_cosine.index(original_question)+1)
    
end = time.time()
t = (end-start)/test_questions_number

print("Light modifications - euclidian rank:", euclidean_rank)
print("Light modifications - manhattan rank:", manhattan_rank)
print("Light modifications - cos rank:", cos_rank)

data = {'method':'nearest neighbors',
         'word simplification':'lemmatization',
         'questions modification':'light',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'lemmatization',
         'questions modification':'light',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'lemmatization',
         'questions modification':'light',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Light modifications - euclidian rank: [1, 7, 59, 76, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 21, 1, 5, 1, 248]
Light modifications - manhattan rank: [1, 5, 9, 77, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 10, 1, 7, 1, 64]
Light modifications - cos rank: [1, 7, 59, 76, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 21, 1, 5, 1, 248]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


### Medium modified questions with lemmatization - nearest neighbors method

In [26]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):
    
    original_question = qm_base["original question"][i]
    modified_question = qmm_vectorized[i]
    
    distances_euclidean, indices_euclidean = nbrs_euclidean.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine.kneighbors(modified_question)
    
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()
    
    euclidean_rank.append(nbrs_list_euclidean.index(original_question)+1)
    manhattan_rank.append(nbrs_list_manhattan.index(original_question)+1)
    cos_rank.append(nbrs_list_cosine.index(original_question)+1)
    
end = time.time()
t = (end-start)/test_questions_number

print("Medium modifications - euclidian rank:", euclidean_rank)
print("Medium modifications - manhattan rank:", manhattan_rank)
print("Medium modifications - cos rank:", cos_rank)

data = {'method':'nearest neighbors',
         'word simplification':'lemmatization',
         'questions modification':'medium',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'lemmatization',
         'questions modification':'medium',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'lemmatization',
         'questions modification':'medium',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Medium modifications - euclidian rank: [14, 123, 739, 5, 192, 1, 1, 3, 2, 5, 1, 37, 1, 1, 5, 1, 1, 1, 119, 1, 167]
Medium modifications - manhattan rank: [11, 541, 331, 6, 396, 1, 6, 1, 2, 8, 1, 17, 1, 1, 1, 1, 1, 1, 64, 25, 70]
Medium modifications - cos rank: [14, 123, 739, 5, 192, 1, 1, 3, 2, 5, 1, 37, 1, 1, 5, 1, 1, 1, 119, 1, 167]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


### Heavily modified questions with lemmatization - nearest neighbors method

In [27]:
euclidean_rank = []
manhattan_rank = []
cos_rank = []

start = time.time()

for i in range(test_questions_number):
    
    original_question = qm_base["original question"][i]
    modified_question = qhm_vectorized[i]
    
    distances_euclidean, indices_euclidean = nbrs_euclidean.kneighbors(modified_question)
    distances_manhattan, indices_manhattan = nbrs_manhattan.kneighbors(modified_question)
    distances_cosine, indices_cosine = nbrs_cosine.kneighbors(modified_question)
    
    nbrs_list_euclidean = qa_base["Question"].iloc[indices_euclidean[0]].tolist()
    nbrs_list_manhattan = qa_base["Question"].iloc[indices_manhattan[0]].tolist()
    nbrs_list_cosine = qa_base["Question"].iloc[indices_cosine[0]].tolist()
    
    euclidean_rank.append(nbrs_list_euclidean.index(original_question)+1)
    manhattan_rank.append(nbrs_list_manhattan.index(original_question)+1)
    cos_rank.append(nbrs_list_cosine.index(original_question)+1)
    
end = time.time()
t = end-start
    
end = time.time()
t = (end-start)/test_questions_number

print("Heavy modifications - euclidian rank:", euclidean_rank)
print("Heavy modifications - manhattan rank:", manhattan_rank)
print("Heavy modifications - cos rank:", cos_rank)

data = {'method':'nearest neighbors',
         'word simplification':'lemmatization',
         'questions modification':'heavy',
         'metric':'euclidean',
         'average sentence rank':round(np.mean(euclidean_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'lemmatization',
         'questions modification':'heavy',
         'metric':'manhattan',
         'average sentence rank':round(np.mean(manhattan_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

data = {'method':'nearest neighbors',
         'word simplification':'lemmatization',
         'questions modification':'heavy',
         'metric':'cosine',
         'average sentence rank':round(np.mean(cos_rank)),
         'average execution time [s]':round(t,4)}

results = results.append(data, ignore_index=True)

Heavy modifications - euclidian rank: [30, 1661, 1228, 1071, 36, 1, 62, 205, 1, 573, 45, 643, 11, 9, 687, 14, 1, 1, 18, 1, 92]
Heavy modifications - manhattan rank: [23, 7717, 971, 689, 104, 1, 58, 131, 1, 540, 429, 751, 15, 4, 1600, 5, 1, 1, 2, 1, 48]
Heavy modifications - cos rank: [30, 1661, 1228, 1071, 36, 1, 62, 205, 1, 573, 45, 643, 11, 9, 687, 14, 1, 1, 18, 1, 92]


  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)
  results = results.append(data, ignore_index=True)


## View of results

In [28]:
results.to_csv("tf-idf - results - bigrams.csv",index=False)

results.head(60)

Unnamed: 0,method,word simplification,questions modification,metric,average sentence rank,average execution time [s]
0,loop,/,light,euclidean,21,0.0379
1,loop,/,light,manhattan,9,0.0379
2,loop,/,light,cosine,21,0.0379
3,loop,/,medium,euclidean,68,0.0429
4,loop,/,medium,manhattan,71,0.0429
5,loop,/,medium,cosine,68,0.0429
6,loop,/,heavy,euclidean,308,0.0469
7,loop,/,heavy,manhattan,623,0.0469
8,loop,/,heavy,cosine,308,0.0469
9,loop,stemming,light,euclidean,135,0.0363
