In [33]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import random

import sys
sys.path.append("../")

from models.vector_model import vector_model


random.seed(0)
np.random.seed(0)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
from preprocessors.preprocessor import Preprocessor

# Toy corpus: integer document id -> raw sentence.
docs = {
    54: "The cat sat on the mat",
    46: "The dog sat on the mat",
    45: "The cat and dog slept slept",
    12: "The mouse sat on the mat",
    50: "The duck sat on the mat",
}

# Raw sentences only, in the dict's insertion order.
docs_list = list(docs.values())

# Run the project preprocessor over the whole id -> text mapping.
preprocessor = Preprocessor()
tokenized_docs = preprocessor.preprocess(docs)

100%|██████████| 5/5 [00:00<00:00, 11881.88it/s]


In [35]:
# Reference TF-IDF from scikit-learn, fitted on the raw sentences with
# English stop words removed.
vectorizer = TfidfVectorizer(min_df=1, stop_words="english")
X = vectorizer.fit_transform(docs_list)

# Show the learned term -> column mapping, the vocabulary size, and the
# per-term idf weights for comparison with the project's own model.
print(vectorizer.vocabulary_)
print(len(vectorizer.vocabulary_))
print(vectorizer.idf_)


{'cat': 0, 'sat': 5, 'mat': 3, 'dog': 1, 'slept': 6, 'mouse': 4, 'duck': 2}
7
[1.69314718 1.69314718 2.09861229 1.18232156 2.09861229 1.18232156
 2.09861229]


In [36]:
# Fit the project's vector model on the preprocessed documents.
vm = vector_model(tokenized_docs, min_df=1)
vm.fit()

# Print the fitted state one item per line: vector norms, vocabulary,
# vocabulary size, and idf weights.
print(
    vm.vector_norms,
    vm.vocab,
    len(vm.vocab),
    vm.idf,
    sep="\n",
)

{'cat': 2, 'sat': 4, 'mat': 4, 'dog': 2, 'slept': 1, 'mous': 1, 'duck': 1}


5it [00:00, 49461.13it/s]

[0.96911031 0.96911031 1.73495958 1.64008429 1.64008429]
['cat', 'dog', 'duck', 'mat', 'mous', 'sat', 'slept']
7
{'cat': 0.9162907318741551, 'sat': 0.22314355131420976, 'mat': 0.22314355131420976, 'dog': 0.9162907318741551, 'slept': 1.6094379124341003, 'mous': 1.6094379124341003, 'duck': 1.6094379124341003}





In [37]:
# Query containing one term ("kitty") that does not occur in the corpus.
query = "The kitty sat on the mat"

# Cosine similarity of every fitted document row against the query, which is
# projected into the same TF-IDF space by the already-fitted vectorizer.
print(
    cosine_similarity(
        X,
        vectorizer.transform([query]),
    )
)

[[0.70266106]
 [0.70266106]
 [0.        ]
 [0.62314068]
 [0.62314068]]


In [38]:
# Run the same query through the project preprocessor so it can be fed to
# the project's vector model.
q_terms = preprocessor.preprocess_query(query)
# Bare last expression: the notebook displays the resulting terms as the cell output.
q_terms

['kitti', 'sat', 'mat']

In [39]:
# Ask the project's vector model for the 5 best matches to the preprocessed
# query; the returned value is displayed as the cell output.
# NOTE(review): the array printed before the result appears to come from
# inside find_similar -- confirm whether that debug print is intended.
vm.find_similar(q_terms, topn=5)

[0.         0.         0.         0.22314355 0.         0.22314355
 0.        ]


([(46, 0.32563128587164397),
  (54, 0.32563128587164397),
  (12, 0.19241244994255877),
  (50, 0.19241244994255877),
  (45, 0.0)],
 array([0.32563129, 0.32563129, 0.        , 0.19241245, 0.19241245]))