In [1]:
import nltk
from collections import defaultdict
from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import copy

from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from numpy.linalg import svd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Corpus preprocessing for simple English text: read the facts file, split it
# into sentences and rewrite every sentence as a string of Snowball stems.
textfile = "facts.txt"

stemmer = SnowballStemmer("english")

sentences = []       # stemmed sentences (rewritten in place in the loop below)
init_sentences = []  # lower-cased sentences before stemming, same order as `sentences`
words = []           # one token list per non-empty sentence, punctuation tokens removed
lexemes = []         # one stem list per non-empty sentence; flattened after the loop

# Only the read needs the file handle; everything else works on `text`.
with open(textfile, encoding='utf-8', mode='r') as f:
    text = f.read().lower()

# Kept from the original cell; the loaded object is not used anywhere in this cell.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# let's split the text into sentences first
sentences = tokenize.sent_tokenize(text)
init_sentences = copy.deepcopy(sentences)

# let's explode sentences to lexemes
for i, sentence in enumerate(sentences):
    if not sentence:
        continue
    s_words = [word for word
               in tokenize.word_tokenize(sentence)
               if word not in (',', '.', ':', '-', ';', '?', '!', '"', "``", "`", "''")
               ]
    s_lexemes = [stemmer.stem(word) for word in s_words]
    # Build the stemmed sentence from the stems themselves. Substituting with
    # str.replace() on the raw sentence works on substrings, so changing one
    # token could also rewrite part of a longer token that contains it
    # (and later substitutions could hit text written by earlier ones).
    sentences[i] = " ".join(s_lexemes)
    words.append(s_words)
    lexemes.append(s_lexemes)

# flattening lexemes list
lexemes = [item for sublist in lexemes for item in sublist]

In [3]:
# TF-IDF vectorization: fit the vectorizer on the stemmed sentences, then
# convert the resulting term-document matrix to a plain array with .toarray().
vectorizer = TfidfVectorizer()
tdm = vectorizer.fit_transform(sentences)
tdm = tdm.toarray()

In [4]:
# PCA: project the TF-IDF matrix onto its principal components
# (n_components=0.8 with the "full" SVD solver).
pca = PCA(svd_solver="full", n_components=0.8)
tdm_reduced = pca.fit_transform(tdm)

In [5]:
def search_query(query, facts_tdm, top_n=5):
    """Return the row indices of the facts most similar to ``query``.

    The query is lower-cased, tokenized, stripped of punctuation tokens and
    stemmed, then pushed through the module-level ``vectorizer`` and ``pca``
    before being compared with every row of ``facts_tdm`` by cosine
    similarity.

    Parameters
    ----------
    query : str
        Free-text query.
    facts_tdm : array-like
        Fact vectors to compare against; must live in the same space as the
        output of ``pca.transform`` (the notebook passes ``tdm_reduced``).
    top_n : int, optional
        Maximum number of indices to return (default 5, the previous
        hard-coded value).

    Returns
    -------
    numpy.ndarray
        Up to ``top_n`` row indices of ``facts_tdm``, most similar first.
    """
    query_words = [
        word for word in tokenize.word_tokenize(query.lower())
        if word not in (',', '.', ':', '-', ';', '?', '!', '"', "``", "`", "''")
    ]
    query_lexemes = [stemmer.stem(word) for word in query_words]
    # Join the stems instead of str.replace()-ing them into the raw query:
    # replace() matches substrings, so one token's stem could corrupt a
    # longer token that happens to contain it.
    stemmed_query = " ".join(query_lexemes)

    query_tdm = vectorizer.transform([stemmed_query]).toarray()
    query_tdm_reduced = pca.transform(query_tdm)
    # One similarity column per query; flatten it to one score per fact.
    similarity = cosine_similarity(facts_tdm, query_tdm_reduced).reshape(-1)
    # argsort is ascending, so reverse it to put the best match first.
    top_matches_idx = np.argsort(similarity)[::-1][:top_n]
    return top_matches_idx

In [6]:
def dcg(found_idx, relevant_idx):
    """Discounted cumulative gain of a ranked result list, binary relevance.

    Each item of ``found_idx`` (best match first) scores 1 if it is contained
    in ``relevant_idx`` and 0 otherwise; the score at 1-based rank ``r`` is
    divided by ``log2(r + 1)`` and all ranks are summed.
    """
    score = 0
    for rank, idx in enumerate(found_idx, start=1):
        gain = 1 if idx in relevant_idx else 0
        score += gain / np.log2(rank + 1)
    return score

In [7]:
def pFound(found_idx, relevant_idx, max_rel, p_break):
    """pFound score of a ranked result list.

    Every item of ``found_idx`` gets relevance ``max_rel`` when it is in
    ``relevant_idx`` and 0 otherwise. The probability of looking at the first
    item is 1; for each following item it is the previous look probability
    times ``(1 - previous relevance)`` times ``(1 - p_break)``. The result is
    the sum over all positions of look probability times relevance.
    """
    total = 0
    look = 1          # probability that the current position is looked at
    prev_rel = None   # relevance of the previous position (unused at position 0)
    for position, idx in enumerate(found_idx):
        rel = max_rel if idx in relevant_idx else 0
        if position > 0:
            look = look * (1 - prev_rel) * (1 - p_break)
        total += look * rel
        prev_rel = rel
    return total

In [8]:
# Load the evaluation set. queries.txt holds one query per line; the line with
# the same number in relevance.txt holds comma-separated integers, which the
# evaluation loop treats as the indices of the sentences relevant to that query.
queries = []
relevance = []
# Explicit UTF-8 (as for the facts file) so the result does not depend on the
# platform's default encoding.
with open('queries.txt', encoding='utf-8', mode='r') as f:
    queries = [line.strip() for line in f]
with open('relevance.txt', encoding='utf-8', mode='r') as f:
    for line in f:
        # Empty fields (blank line, trailing comma) are skipped rather than
        # crashing int(''); a blank line yields an empty list, so the entry
        # still lines up with the query on the same line number.
        relevance.append([int(field) for field in line.split(',') if field.strip()])

In [11]:
# Evaluate every query: print the retrieved indices next to the expected
# relevant indices, then report both ranking metrics for that query.
for i, query in enumerate(queries):
    matches_idx = search_query(query, tdm_reduced)
    print(matches_idx)
    print(relevance[i])
    _dcg = dcg(found_idx=matches_idx, relevant_idx=relevance[i])
    _pFound = pFound(
        found_idx=matches_idx,
        relevant_idx=relevance[i],
        max_rel=0.4,
        p_break=0.15,
    )
    print("Query: {}, DCG: {}, pFound: {}".format(query, _dcg, _pFound))

[157  38  15 143   2]
[2, 15, 38, 143, 144, 157]
Query: person, DCG: 2.9484591188793923, pFound: 0.7881612040000001
[133  90   9 139   1]
[1, 5, 9, 90, 133, 139]
Query: planet, DCG: 2.9484591188793923, pFound: 0.7881612040000001
[147  12  15   2  80]
[2, 12, 15, 80, 147]
Query: average, DCG: 2.9484591188793923, pFound: 0.7881612040000001
[ 26 155  22 143   9]
[9, 10, 22, 26, 31, 99, 118, 143, 155, 159]
Query: one, DCG: 2.9484591188793923, pFound: 0.7881612040000001
[ 22  46 139  19   3]
[3, 19, 22, 35, 46, 69, 79, 80, 103, 114, 118, 139]
Query: year, DCG: 2.9484591188793923, pFound: 0.7881612040000001
[ 74 165 149 126  83]
[9, 12, 74, 83, 126, 140, 149, 165, 166]
Query: human, DCG: 2.9484591188793923, pFound: 0.7881612040000001
[ 29 165  53   9 103]
[9, 29, 53, 103, 130, 142, 164, 165]
Query: every, DCG: 2.9484591188793923, pFound: 0.7881612040000001
[152 104 130   2  35]
[2, 35, 104, 130, 152]
Query: call, DCG: 2.9484591188793923, pFound: 0.7881612040000001
[ 11  78 165  89 137]
[10, 