In [47]:
import gensim
import csv
from nltk.tokenize import word_tokenize as tokenize
import codecs 
from operator import itemgetter

def readCSV():

    spellSet = {}

    m = codecs.open('../datasets/Spells.csv', "r",encoding='utf-8', errors='ignore')
    reader = csv.DictReader(m)

    for row in reader:
        for column, value in row.items():
            spellSet.setdefault(column, []).append(value)


    return spellSet


def collectDocs():
    
    spellSet = readCSV()
    
    spellIndex = spellSet['Index']
    spellDescriptions = spellSet['Spell Description']
    spellTitles = spellSet['Spell Title']
    print('Number of documents: ', len(spellDescriptions))
    
    return spellIndex, spellTitles, spellDescriptions


def tokenizeDocs():
    
    spellIndex, spellTitles, spellDescriptions = collectDocs()
    
    gen_docs = [[w.lower() for w in tokenize(text)] for text in spellDescriptions]
    
    return spellIndex, spellTitles, gen_docs


def main():
    spellIndex, spellTitles, docs = tokenizeDocs()
    dictionary = gensim.corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(docs) for docs in docs]
    tf_idf = gensim.models.TfidfModel(corpus)
    s = 0
    
    for i in corpus:
        s += len(i)
        
    sims = gensim.similarities.Similarity('.', tf_idf[corpus], num_features=len(dictionary))
    return spellIndex, spellTitles, dictionary, tf_idf, sims
    
    
def search(searchInput):
    spellIndex, spellTitles, dictionary, tf_idf, sims = main()
    query_doc = [w.lower() for w in tokenize(searchInput)]
    query_doc_bow = dictionary.doc2bow(query_doc)
    query_doc_tf_idf = tf_idf[query_doc_bow]
    
    similar = sims[query_doc_tf_idf]
    pairedSims = list(zip(spellIndex, spellTitles, similar))
    
    sortedSimilar = sorted(pairedSims, key=itemgetter(2))
    mostSimilar = [simSpell for simSpell in sortedSimilar if simSpell[2] > 0]
    print('Most similar: ', mostSimilar)
        

In [48]:
text = 'find something lost'
search(text)

Number of documents:  412
Most similar:  [('81', 'To Go to the Moment Where Souls Become Lost', 0.20118606), ('31', 'To Call for Your Powers Lost in Battle', 0.23942603), ('21', 'To Bring Back a Lost Sister', 0.2791768), ('110', 'Lost and Found Spell', 0.33092618), ('74', 'To Find the Place of Injury', 0.34465259), ('24', 'To Call a Lost Sister', 0.36387876), ('73', 'To Find Good Luck', 0.36875218), ('23', 'To Call a Lost Witch', 0.39057297), ('72', 'To Find a Person', 0.44830143), ('71', 'To Find a Muse', 0.46726951), ('70', 'To Find a Lost Love', 0.76301169)]


In [50]:
test = [('81', 'To Go to the Moment Where Souls Become Lost', 0.20118606), ('31', 'To Call for Your Powers Lost in Battle', 0.23942603), ('21', 'To Bring Back a Lost Sister', 0.2791768), ('110', 'Lost and Found Spell', 0.33092618), ('74', 'To Find the Place of Injury', 0.34465259), ('24', 'To Call a Lost Sister', 0.36387876), ('73', 'To Find Good Luck', 0.36875218), ('23', 'To Call a Lost Witch', 0.39057297), ('72', 'To Find a Person', 0.44830143), ('71', 'To Find a Muse', 0.46726951), ('70', 'To Find a Lost Love', 0.76301169)]
print(test[::-1])

[('70', 'To Find a Lost Love', 0.76301169), ('71', 'To Find a Muse', 0.46726951), ('72', 'To Find a Person', 0.44830143), ('23', 'To Call a Lost Witch', 0.39057297), ('73', 'To Find Good Luck', 0.36875218), ('24', 'To Call a Lost Sister', 0.36387876), ('74', 'To Find the Place of Injury', 0.34465259), ('110', 'Lost and Found Spell', 0.33092618), ('21', 'To Bring Back a Lost Sister', 0.2791768), ('31', 'To Call for Your Powers Lost in Battle', 0.23942603), ('81', 'To Go to the Moment Where Souls Become Lost', 0.20118606)]
