# Implementation of Simple NN based Ranking Model using Model Similarity feature of Word Embeddings.

```/* @Author: Jagan Kaartik */```

In [None]:
import os
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from collections import defaultdict

In [None]:
os.getcwd()

In [None]:
path = "./Dataset"

In [None]:
filelist = os.listdir(path)
filelist

In [None]:
# Drops the first entry of the directory listing before the files are loaded.
# NOTE(review): os.listdir() returns entries in arbitrary order, so index 0 is
# not guaranteed to be the same entry on every machine — confirm what this is
# meant to exclude (presumably a hidden file or a sub-directory).
filelist.pop(0)

In [None]:
# Read every file named in `filelist` into `corpus`, one string per document.
# Entries are opened relative to `path` (the directory that was listed) rather
# than the current working directory, so the listing and the reads always
# refer to the same location.
corpus = []
for name in filelist:
    file_path = os.path.join(path, name)
    # Sub-directories show up in os.listdir() too, but cannot be read as text.
    if os.path.isdir(file_path):
        continue
    with open(file_path) as f_input:
        corpus.append(f_input.read())

In [None]:
len(corpus)

In [None]:
# Check
corpus[0]

## Pre-Processing Corpus Data 

In [None]:
newpath = "./Dataset"
os.chdir(newpath)

In [None]:
# Replace line breaks in each document with a single space. Substituting the
# empty string would fuse the last word of one line with the first word of the
# next ("breast\ncancer" -> "breastcancer") and corrupt tokenization later on.
for idx, text in enumerate(corpus):
    corpus[idx] = re.sub('\n', ' ', text)

## Lemmatization and Removal of Stop Words from Corpus

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Tokenize each document, drop English stop words and lemmatize what remains,
# then store the result back as a single space-separated string.
# NLTK's English stop-word list is lower-case, so tokens are lower-cased for the
# membership test only; otherwise "The", "And", ... would slip through.
for idx, text in enumerate(corpus):
    kept = [tok for tok in nltk.word_tokenize(text) if tok.lower() not in stop_words]
    corpus[idx] = ' '.join(lemmatizer.lemmatize(tok) for tok in kept)

In [None]:
corpus[0]

### Writing Lemmatized Corpus to File for Efficient Retrieval

In [None]:
path = "./Dataset/Lemmatized Corpus/"
os.chdir(path)

In [None]:
# Persist each processed document as doc<N>.txt (numbered from 1) in the
# current working directory, with a trailing newline.
for doc_number, text in enumerate(corpus, start=1):
    with open('doc{}.txt'.format(doc_number), 'w') as f:
        f.write("%s\n" % text)

### Inverted - Index

In [None]:
class InvertedIndex:
    """Map each lower-cased token to the ids of the documents containing it.

    Documents receive consecutive integer ids, starting at 0, in the order in
    which they are added.
    """

    def __init__(self, tokenizer):
        """Create an empty index.

        Args:
            tokenizer: Callable that splits a document string into tokens.
        """
        self.tokenizer = tokenizer
        self.index = defaultdict(list)  # token -> list of document ids
        self.documents = {}             # document id -> original text
        self.unique_id = 0              # id given to the next added document

    def termLookup(self, word):
        """Return the documents that contain ``word`` (case-insensitive).

        An unknown term yields an empty list.
        """
        # .get() with a default avoids a TypeError for a missing term and does
        # not trigger the defaultdict side effect of inserting an empty list.
        posting = self.index.get(word.lower(), ())
        return [self.documents.get(doc_id) for doc_id in posting]

    def retIndex(self):
        """Return a view of the ``(token, document ids)`` pairs."""
        return self.index.items()

    def addTerm(self, document):
        """Tokenize ``document`` and record its id under every distinct token."""
        doc_id = self.unique_id
        # dict.fromkeys de-duplicates while keeping first-seen order, so each
        # posting list gets the id once without scanning the list per token.
        for token in dict.fromkeys(t.lower() for t in self.tokenizer(document)):
            self.index[token].append(doc_id)

        self.documents[doc_id] = document
        self.unique_id += 1

invObj = InvertedIndex(nltk.word_tokenize)

### Adding Corpus Data to Inverted Index

In [None]:
# Index at most the first 100 documents; slicing avoids an IndexError when the
# corpus holds fewer than 100.
for document in corpus[:100]:
    invObj.addTerm(document)

In [None]:
res = invObj.termLookup("cancer")
print(len(res))

### View Inverted Index

In [None]:
invIndex = invObj.retIndex()

# Dump every token together with its list of document ids.
for token, postings in invIndex:
    print("{} ---> {}".format(token, postings))

In [None]:
len(invIndex)

### Lookup for Terms in Inverted Index

In [None]:
term = "cancer"
lookupResult = invObj.termLookup(term)

In [None]:
print("No of Documents with Term {} is {} ".format(term,len(lookupResult)))

### Saving Inverted Index to CSV

In [None]:
import csv

# Write one "token, posting list" row per index entry. The context manager
# guarantees the file is flushed and closed; newline="" is what the csv module
# asks for so that no extra blank rows appear on platforms using \r\n.
with open("InvertedIndex.csv", "w", newline="") as csv_file:
    w = csv.writer(csv_file)
    for key, val in invIndex:
        w.writerow([key, val])

### Size of Inverted Index with 100 corpus documents.

In [None]:
print(len(invIndex))

### Load Lemmatized Corpus

In [None]:
path = "./Dataset/Lemmatized Corpus/"
os.chdir(path)

In [None]:
filelist = os.listdir(path)
filelist

In [None]:
# Reload the lemmatized documents into `corpus`, one string per file.
# Entries are opened relative to `path` (the directory that was listed) rather
# than the current working directory, so the listing and the reads always
# refer to the same location.
corpus = []
for name in filelist:
    file_path = os.path.join(path, name)
    # Sub-directories show up in os.listdir() too, but cannot be read as text.
    if os.path.isdir(file_path):
        continue
    with open(file_path) as f_input:
        corpus.append(f_input.read())

In [None]:
#Lemmatized Corpus
corpus[0]

### Convert Corpus to Vectors

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize 

In [None]:
def processCorpus(train):
    """Tokenize every document in ``train`` and return the token lists in order."""
    return [word_tokenize(document) for document in train]

In [None]:
train = corpus

In [None]:
traincorp = processCorpus(train)

In [None]:
modelW2V = Word2Vec(traincorp, min_count=1)

In [None]:
modelW2V.save('modelfull.bin')

* Refer : https://machinelearningmastery.com/develop-word-embeddings-python-gensim/ 
* Refer : https://www.geeksforgeeks.org/tokenize-text-using-nltk-python/
* Refer : https://stackoverflow.com/questions/55713132/how-to-tokenize-a-list-of-lists-in-python

### Model Based Similarity

##### Score of using our Model to compare Cancer with similar words such as Ovarian, Tumor

In [None]:
modelW2V.wv.similarity('cancer', 'tumor') 

In [None]:
modelW2V.wv.similarity('cancer','ovarian')

##### Score of using our Model to compare Cancer with dissimilar words such as Cloud

In [None]:
modelW2V.wv.similarity('cancer','cloud')

##### Finding top 10 most similar words to word Cancer in our dataset

In [None]:
v1 = "cancer"
modelW2V.wv.most_similar(positive=v1)

## Doc2Vec Model for Finding out document similarity

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [None]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(corpus)]

### Training the Model

In [None]:
max_epochs = 10
vec_size = 20 
alpha = 0.025

In [None]:
# `vector_size` is the current name of the dimensionality argument; the older
# `size` alias is deprecated and rejected by gensim 4.
model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=50, dm=1)

In [None]:
model.build_vocab(tagged_data)

In [None]:
# Train once for `max_epochs` passes over the data. gensim decays the learning
# rate linearly from model.alpha to model.min_alpha inside a single train()
# call; calling train() repeatedly while adjusting alpha by hand makes the rate
# drop and then jump back up, and multiplies the number of passes actually run.
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=max_epochs)

In [None]:
model.save("min50.model")

### Cosine Similarity between Documents

In [None]:
from scipy import spatial

In [None]:
vec1 = model.infer_vector(corpus[1].split())
vec2 = model.infer_vector(corpus[3].split())

In [None]:
# scipy's `cosine` is a *distance* (1 - cosine similarity), so convert it back
# to obtain the similarity score this section reports.
similairty = 1 - spatial.distance.cosine(vec1, vec2)
similairty

In [None]:
new_sentence = "i love dogs".split(" ") 

In [None]:
model.docvecs.most_similar(positive=[model.infer_vector(new_sentence)],topn=5)

In [None]:
corpus[5868]

In [None]:
model = Doc2Vec.load("d2v.model")

In [None]:
query = "i love dogs".split(" ") 

In [None]:
model.docvecs.most_similar(positive=[model.infer_vector(query)],topn=5)

In [None]:
# Document about euthanasia, animal health, veterinarians, etc.
# These words are semantically similar to "dogs".
corpus[638]

#### A benefit of semantic-similarity-based ranking is that documents semantically related to the query are returned. Using this to initially filter results (documents) would be ideal.

In [None]:
query = "Healthcare Research".split(" ") 

In [None]:
model.docvecs.most_similar(positive=[model.infer_vector(query)],topn=5)

In [None]:
# Doc with highest similarity
corpus[5508]

In [None]:
query = "safety".split(" ") 
model.docvecs.most_similar(positive=[model.infer_vector(query)],topn=5)

In [None]:
# Doc with highest similarity
corpus[107]

In [None]:
newmodel = Doc2Vec.load("d2vmin50.model")

In [None]:
query = "dog".split(" ")
# Infer the query vector with the same model whose document vectors are being
# searched; vectors produced by a different model are not comparable.
newmodel.docvecs.most_similar(positive=[newmodel.infer_vector(query)], topn=5)

In [None]:
# Doc with highest similarity
corpus[748]