In [1]:
import pickle
import scipy
from scipy import sparse
import numpy as np
import pandas as pd

In [2]:
def get_cosine_similarity(sparse_matrix, query):
    """Return the dense matrix of pairwise dot products between two row-wise collections.

    Entry ``[i, j]`` is the dot product of row ``i`` of ``sparse_matrix`` with
    row ``j`` of ``query``. No normalisation is applied here, so the values are
    true cosine similarities only when the rows of both inputs are already
    L2-normalised; otherwise they are raw dot products.

    Parameters
    ----------
    sparse_matrix : scipy.sparse matrix/array, shape (n, d)
    query : scipy.sparse matrix/array or ndarray, shape (m, d)

    Returns
    -------
    numpy.ndarray, shape (n, m)
    """
    # `@` dispatches to SciPy's sparse matrix product. `np.dot` is not
    # sparse-aware and only works by accident for the legacy spmatrix classes.
    product = sparse_matrix @ query.T
    if sparse.issparse(product):
        return product.toarray()
    # sparse @ dense already yields a dense result (possibly np.matrix).
    return np.asarray(product)
def get_cosine_similarity2(sparse_matrix, query):
    """Dense counterpart of ``get_cosine_similarity``.

    Returns ``np.dot(sparse_matrix, query.T)`` unchanged (no ``.toarray()``
    call), so entry ``[i, j]`` is the dot product of row ``i`` of the first
    argument with row ``j`` of ``query``. Despite the parameter name, the
    caller in this notebook passes plain dense arrays. No normalisation is
    applied.
    """
    query_t = query.T
    return np.dot(sparse_matrix, query_t)

In [8]:
def count_metric(method):
    """Load the stored vectors for ``method`` and rank questions for every answer.

    For ``'corpus_ft'`` the vectors are read from two CSV files and only the
    columns whose name contains ``'word'`` are used; for every other method
    the vectors are read from ``.npz`` sparse matrices.

    Returns
    -------
    numpy.ndarray
        Row ``i`` lists question indices ordered by descending dot product
        with answer ``i``.
    """
    if method == 'corpus_ft':
        answers_matrix = pd.read_csv('../corpora/corpus_ft_answers.csv')
        questions_matrix = pd.read_csv('../corpora/corpus_ft_questions.csv')
        # Keep only the embedding components; other columns are not features.
        cols = [col for col in answers_matrix.columns if 'word' in col]
        similarity = get_cosine_similarity2(answers_matrix[cols].values, questions_matrix[cols].values)
    else:
        answers_matrix = sparse.load_npz(f'../corpora/{method}_answers.npz')
        questions_matrix = sparse.load_npz(f'../corpora/{method}_questions.npz')
        similarity = get_cosine_similarity(answers_matrix, questions_matrix)
    # Negate so that argsort's ascending order puts the highest scores first.
    return np.argsort(-similarity, axis=1)

def count_score(result, top_n):
    """Fraction of rows whose own index occurs among their first ``top_n`` entries.

    Parameters
    ----------
    result : sequence of index sequences
        Row ``i`` is a ranking of candidate indices, best first.
    top_n : int
        How many leading entries of each row to inspect.

    Returns
    -------
    float
        Number of rows ``i`` with ``i`` in ``result[i][:top_n]``, divided by
        ``len(result)``.
    """
    hits = sum(
        1
        for position, ranking in enumerate(result)
        if position in ranking[:top_n]
    )
    return hits / len(result)


# Top-5 hit rate for every stored vectorisation; keys double as file-name prefixes
# (see count_metric).
methods = ['count_vectorizer', 'tfidf_vectorizer', 'BM25', 'BERT', 'corpus_ft']
top_n = 5
dict_top5 = {}
for method in methods:
    # Rank, score and record in one step; print as soon as each method finishes.
    score = dict_top5[method] = count_score(count_metric(method), top_n)
    print(method, score)

count_vectorizer 0.0336
tfidf_vectorizer 0.0877
BM25 0.0685
BERT 0.0083
corpus_ft 0.0022


In [9]:
# Persist the scores as the dict's literal text so a later cell can parse them back.
serialized = str(dict_top5)
with open("./dict_top5.txt", "w") as file:
    file.write(serialized)

In [10]:
import ast

# Reload the saved scores. The context manager closes the handle as soon as
# the text has been read instead of leaving it open in the kernel.
with open("./dict_top5.txt", "r") as file:
    contents = file.read()
# literal_eval only evaluates Python literals, never arbitrary expressions.
dict_top5_saved = ast.literal_eval(contents)
print(dict_top5_saved)

{'count_vectorizer': 0.0336, 'tfidf_vectorizer': 0.0877, 'BM25': 0.0685, 'BERT': 0.0083, 'corpus_ft': 0.0022}


In [14]:
# One (method, score) tuple per line.
print('\n'.join(str(pair) for pair in dict_top5_saved.items()))

('count_vectorizer', 0.0336)
('tfidf_vectorizer', 0.0877)
('BM25', 0.0685)
('BERT', 0.0083)
('corpus_ft', 0.0022)


In [18]:
# Print "<method> --- <score>", showing the 'corpus_ft' entry under the name FastText.
for key in dict_top5_saved:
    label = 'FastText' if key == 'corpus_ft' else key
    print(f'{label} --- {dict_top5_saved[key]}')

count_vectorizer --- 0.0336
tfidf_vectorizer --- 0.0877
BM25 --- 0.0685
BERT --- 0.0083
FastText --- 0.0022


In [None]:
# Count-vectorizer evaluation, step 1: load both sparse matrices and rank.
anwers_matrix = sparse.load_npz('../corpora/count_vectorizer_answers.npz')
questions_matrix = sparse.load_npz('../corpora/count_vectorizer_questions.npz')

# Row i of res_mat holds the dot products of answer i with every question.
res_mat = get_cosine_similarity(anwers_matrix, questions_matrix)
# Negated so the highest scores come first in each row.
# NOTE(review): this rebinds the builtin `sorted` to an ndarray for the rest of
# the kernel session; the name is kept here only because the next cell reads it.
sorted = np.argsort(-res_mat, axis=1)

In [4]:
# Top-10 hit rate. `sorted` here is the ranking array built in the previous
# cell, not the builtin function.
hits = sum(1 for index, row in enumerate(sorted) if index in row[:10])
score = hits / len(sorted)
print(score)

0.0439


In [5]:
# TF-IDF evaluation: load, rank, and report the top-10 hit rate.
anwers_matrix = sparse.load_npz('../corpora/tfidf_vectorizer_answers.npz')
questions_matrix = sparse.load_npz('../corpora/tfidf_vectorizer_questions.npz')

# Row i of res_mat holds the dot products of answer i with every question.
res_mat = get_cosine_similarity(anwers_matrix, questions_matrix)
# Question indices per answer, best match first. Deliberately not named
# `sorted`: rebinding that name hides the builtin for the rest of the session.
ranked_idx = np.argsort(-res_mat, axis=1)

# Share of rows whose own index is among their ten best-scoring columns
# (assumes answer i and question i form a pair - TODO confirm).
hits = sum(1 for index, row in enumerate(ranked_idx) if index in row[:10])
score = hits / len(ranked_idx)
print(score)

0.1112


In [22]:
# Disabled cell: BM25 top-10 evaluation, same steps as the TF-IDF cell above.
# NOTE(review): the shape checks in the next two cells report 14900 vs 14888
# columns; if those were the BM25 matrices, the product below would fail.
# Confirm before re-enabling.
# anwers_matrix = sparse.load_npz('../corpora/BM25_answers.npz')
# questions_matrix = sparse.load_npz('../corpora/BM25_questions.npz')

# res_mat = get_cosine_similarity(anwers_matrix, questions_matrix)
# sorted = np.argsort(-res_mat, axis=1)

# score = 0
# for index, row in enumerate(sorted):
#         top_results = row[:10]
#         if index in top_results:
#             score += 1

# score = score/len(sorted)
# print(score)

In [7]:
# Sanity check: the column count must equal that of questions_matrix for
# get_cosine_similarity (answers times questions transposed) to be defined.
anwers_matrix.shape

(10000, 14900)

In [8]:
# Companion check to the answers shape above; the second dimensions must agree.
questions_matrix.shape

(10000, 14888)

In [10]:
# BERT evaluation: load, rank, and report the top-10 hit rate.
anwers_matrix = sparse.load_npz('../corpora/BERT_answers.npz')
questions_matrix = sparse.load_npz('../corpora/BERT_questions.npz')

# Row i of res_mat holds the dot products of answer i with every question.
res_mat = get_cosine_similarity(anwers_matrix, questions_matrix)
# Question indices per answer, best match first. Deliberately not named
# `sorted`: rebinding that name hides the builtin for the rest of the session.
ranked_idx = np.argsort(-res_mat, axis=1)

# Share of rows whose own index is among their ten best-scoring columns
# (assumes answer i and question i form a pair - TODO confirm).
hits = sum(1 for index, row in enumerate(ranked_idx) if index in row[:10])
score = hits / len(ranked_idx)
print(score)

0.0113


In [11]:
# BERT evaluation: load, rank, and report the top-10 hit rate.
# NOTE(review): this cell repeats the preceding BERT cell's steps verbatim.
anwers_matrix = sparse.load_npz('../corpora/BERT_answers.npz')
questions_matrix = sparse.load_npz('../corpora/BERT_questions.npz')

# Row i of res_mat holds the dot products of answer i with every question.
res_mat = get_cosine_similarity(anwers_matrix, questions_matrix)
# Question indices per answer, best match first. Deliberately not named
# `sorted`: rebinding that name hides the builtin for the rest of the session.
ranked_idx = np.argsort(-res_mat, axis=1)

# Share of rows whose own index is among their ten best-scoring columns
# (assumes answer i and question i form a pair - TODO confirm).
hits = sum(1 for index, row in enumerate(ranked_idx) if index in row[:10])
score = hits / len(ranked_idx)
print(score)

0.0112


In [31]:
# The 'corpus_ft' vectors are stored as CSV tables rather than sparse .npz files.
# NOTE(review): no index_col is given, so an extra 'Unnamed: 0' column appears
# (presumably the saved row index) and has to be excluded before scoring.
anwers_matrix = pd.read_csv('../corpora/corpus_ft_answers.csv')
questions_matrix = pd.read_csv('../corpora/corpus_ft_questions.csv')

In [32]:
# Inspect the header to see which columns hold vector components.
anwers_matrix.columns

Index(['Unnamed: 0', 'word0', 'word1', 'word2', 'word3', 'word4', 'word5',
       'word6', 'word7', 'word8',
       ...
       'word290', 'word291', 'word292', 'word293', 'word294', 'word295',
       'word296', 'word297', 'word298', 'word299'],
      dtype='object', length=301)

In [33]:
# Feature columns only: every column whose name contains 'word'.
cols = [name for name in anwers_matrix.columns if 'word' in name]

In [34]:
# Shape of the answers table restricted to the selected feature columns.
anwers_matrix[cols].values.shape

(10000, 300)

In [35]:
# Shape of the questions table restricted to the same feature columns.
questions_matrix[cols].values.shape

(10000, 300)

In [21]:
def get_cosine_similarity2(sparse_matrix, query):
    """Dense dot-product matrix: ``np.dot(sparse_matrix, query.T)``.

    Entry ``[i, j]`` is the dot product of row ``i`` of the first argument
    with row ``j`` of ``query``; no normalisation is applied.

    NOTE(review): this repeats the definition from the helper cell near the
    top of the notebook.
    """
    query_t = query.T
    return np.dot(sparse_matrix, query_t)
# 'corpus_ft' evaluation on the dense feature columns selected in `cols`.
res_mat = get_cosine_similarity2(anwers_matrix[cols].values, questions_matrix[cols].values)
# Question indices per answer, best match first. Deliberately not named
# `sorted`: rebinding that name hides the builtin for the rest of the session.
ranked_idx = np.argsort(-res_mat, axis=1)

# Share of rows whose own index is among their ten best-scoring columns
# (assumes answer i and question i form a pair - TODO confirm).
hits = sum(1 for index, row in enumerate(ranked_idx) if index in row[:10])
score = hits / len(ranked_idx)
print(score)

0.0042
