In [22]:
import nltk
from collections import defaultdict
from nltk import tokenize
from nltk.stem.snowball import SnowballStemmer
import numpy as np
import copy

from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from numpy.linalg import svd
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
# try simple engish
textfile = "facts.txt"

stemmer = SnowballStemmer("english")

sentences = []
init_sentences = []
words = []
lexemes = []
with open(textfile, encoding='utf-8', mode='r') as f:
    text = f.read().lower()
    
    # lets split text for sentences first
    
    # these 2 parts are the same. Either complex one:
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = tokenize.sent_tokenize(text)
    init_sentences = copy.deepcopy(sentences)
    # let's explode sentences to lexemes
    for i, sentence in enumerate(sentences):
        if not sentence:
            continue
        s_words = [word for word
                    in tokenize.word_tokenize(sentence)
                    if word not in (',', '.', ':', '-', ';', '?', '!', '"', "``", "`", "''")
                ]
        s_lexemes = [stemmer.stem(word) for word in s_words]
        for j, word in enumerate(s_words):
            sentences[i] = sentences[i].replace(word, s_lexemes[j])
        words.append(s_words)
        lexemes.append(s_lexemes)
#flattening lexemes list
lexemes = [item for sublist in lexemes for item in sublist]
print(sentences)
with open("sentenced.txt", encoding='utf-8', mode='w+') as f:
    for s in sentences:
        f.write(s + '\n')

['\ufeff1.', 'if you somehow found a way to extract all of the gold from the bubbl core of our love littl planet, you would be abl to cover all of the land in a layer of gold up to your knee.', '2. mcdonald call frequent buyer of their food “heavi users.”\n3. the averag person spend 6 month of their lifetim wait on a red light to turn green.', '4. the largest record snowflak was in keogh, mt dure year 1887, and was 15 inch wide.', '5. you burn more calori sleep than you do watch televis.', '6. there are more lifeform live on your skin than there are peopl on the planet.', '7. southern sea otter have flap of skin under their foreleg that act as pocket.', 'when dive, they use these pouch to store rock and food.', '8. in 1386 a pig in franc was execut by public hang for the murder of a child.', '9. one in everi five adult believ that alien are hide in our planet disguis as human.', '10. if you believ that you’re truli one in a million, there are still approxim 7,184 more peopl out there j

In [3]:
#tf-idf vectorization
vectorizer = TfidfVectorizer()
tdm = vectorizer.fit_transform(sentences).toarray()

In [4]:
#PCA
pca = PCA(n_components = 0.8,  svd_solver="full")
tdm_reduced = pca.fit_transform(tdm)

In [5]:
def search_query(query, facts_tdm):
        query = query.lower()
        query_words = [word for word
                    in tokenize.word_tokenize(query)
                    if word not in (',', '.', ':', '-', ';', '?', '!', '"', "``", "`", "''")
                ]
        query_lexemes = [stemmer.stem(word) for word in query_words]
        for j, word in enumerate(query_words):
            query = query.replace(word, query_lexemes[j])
        queries_list = []
        queries_list.append(query)
        query_tdm = vectorizer.transform(queries_list).toarray()
        query_tdm_reduced = pca.transform(query_tdm)
        similarity = cosine_similarity(facts_tdm, query_tdm_reduced).reshape(-1)
        top_matches_idx = np.argsort(similarity)[::-1][:5]
        return [init_sentences[i] for i in top_matches_idx]    

In [12]:
def dcg(found_idx, relevant_idx):
    def is_relevant(idx):
        return int(idx in relevant_idx)
    
    dcg = 0
    for i, idx in enumerate(found_idx, 1):
        dcg += is_relevant(idx)/np.log2(i+1)
    return dcg

In [18]:
def pFound(found_idx, relevant_idx, max_rel, p_break):
    def compute_relevance():
        rel = []
        for idx in found_idx:
            rel.append(max_rel if idx in relevant_idx else 0)
        return rel
    
    def pLook(i):
        return p_look[i-1]*(1 - p_rel[i-1])*(1 - p_break)
    
    p_look = [1]
    p_rel = compute_relevance()
    pFound = 0
    
    for i, idx in enumerate(found_idx):
        if i != 0:
            p_look.append(pLook(i))
        pFound += p_look[i]*p_rel[i]
    return pFound

In [27]:
while True:
    query = input(">> Enter your query: ")
    matches = search_query(query, tdm_reduced)
    print("Found:")
    [print(match) for match in matches]
    repeat = input(">> Another query(any/n?): ")
    if repeat == 'n':
        break

>> Enter your query: pig
Found:
36. it is physically impossible for pigs to look up into the sky.
8. in 1386 a pig in france was executed by public hanging for the murder of a child.
﻿1.
145. it is impossible to sneeze with your eyes open.
81. under the code of hammurabi, bartenders who watered down beer were punished by execution.
>> Another query(any/n?): average
>> Enter your query: average
Found:
135. on average, 12 newborns will be given to the wrong parents daily.
12. a human will eat on average 70 assorted insects and 10 spiders while sleeping.
15. the average person walks the equivalent of three times around the world in a lifetime.
2. mcdonalds calls frequent buyers of their food “heavy users.”
3. the average person spends 6 months of their lifetime waiting on a red light to turn green.
76. you breathe on average about 8,409,600 times a year
77. more than 60,000 people are flying over the united states in an airplane right now.
>> Another query(any/n?): 
>> Enter your query: m

KeyboardInterrupt: 

In [13]:
print(dcg([1,2,3,4,5,6,7,8,9,10], [1,2,5,6,8]))

2.6894546246997506


In [19]:
print(pFound([1,2,3,4,5,6,7,8,9,10], [1,2,5,6,8], 0.4, 0.15))

0.7341237552565001


In [28]:
print(init_sentences[2])

2. mcdonalds calls frequent buyers of their food “heavy users.”
3. the average person spends 6 months of their lifetime waiting on a red light to turn green.


In [43]:
print(init_sentences[80])

76. you breathe on average about 8,409,600 times a year
77. more than 60,000 people are flying over the united states in an airplane right now.


In [None]:
with open('queires')