In [38]:
import gensim
import csv
from nltk.tokenize import word_tokenize as tokenize
import codecs 
from operator import itemgetter

def readCSV(path='../datasets/Spells.csv'):
    """Load a CSV file into a column-oriented dictionary.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file to read. Defaults to
        '../datasets/Spells.csv'.

    Returns
    -------
    dict
        Maps every header name to the list of that column's values, in
        row order. Empty when the file contains no data rows.
    """
    spellSet = {}

    # The context manager guarantees the handle is closed, even when parsing
    # raises. newline='' follows the csv module's guidance so that line
    # breaks inside quoted fields are left for the parser to interpret.
    # errors='ignore' silently drops bytes that are not valid UTF-8.
    with open(path, "r", encoding='utf-8', errors='ignore', newline='') as m:
        for row in csv.DictReader(m):
            for column, value in row.items():
                spellSet.setdefault(column, []).append(value)

    return spellSet


def collectDocs():
    """Extract the index, title and description columns of the spell table.

    Prints how many descriptions were loaded.

    Returns
    -------
    tuple
        ``(indices, titles, descriptions)``: the 'Index', 'Spell Title' and
        'Spell Description' column lists produced by ``readCSV``.
    """
    table = readCSV()

    # Look the columns up in one pass; a missing column raises KeyError
    # before anything is printed.
    indices, descriptions, titles = (
        table[column]
        for column in ('Index', 'Spell Description', 'Spell Title')
    )
    print('Number of documents: ', len(descriptions))

    return indices, titles, descriptions


def tokenizeDocs():
    """Tokenize every spell description into lower-cased tokens.

    Returns
    -------
    tuple
        ``(indices, titles, tokenized)`` where ``tokenized`` holds one list
        of lower-cased tokens per description, in the order returned by
        ``collectDocs``.
    """
    indices, titles, descriptions = collectDocs()

    tokenized = []
    for description in descriptions:
        tokenized.append([token.lower() for token in tokenize(description)])

    return indices, titles, tokenized


def main():
    """Build the TF-IDF model and similarity index for the spell descriptions.

    Returns
    -------
    tuple
        ``(spellIndex, spellTitles, dictionary, tf_idf, sims)``: the index
        and title columns, the gensim dictionary built from the tokenized
        descriptions, the TF-IDF model fitted on their bag-of-words corpus,
        and the similarity index over the TF-IDF-weighted corpus.
    """
    spellIndex, spellTitles, docs = tokenizeDocs()
    dictionary = gensim.corpora.Dictionary(docs)
    # A distinct loop variable avoids shadowing the `docs` list being iterated.
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    tf_idf = gensim.models.TfidfModel(corpus)

    # NOTE(review): the first argument is the output prefix gensim uses for
    # its shard files, so '.' presumably writes them next to the notebook;
    # confirm whether a temporary location would be preferable.
    sims = gensim.similarities.Similarity('.', tf_idf[corpus], num_features=len(dictionary))
    return spellIndex, spellTitles, dictionary, tf_idf, sims
    
    
def search(searchInput):
    """Print the five spells whose descriptions best match ``searchInput``.

    The query is tokenized and lower-cased the same way as the documents,
    weighted with the TF-IDF model and scored against the similarity index.
    The printed list is in ascending order of similarity, so the best match
    comes last. Nothing is returned.

    Note that ``main()`` is invoked on every call, so the model and index
    are rebuilt for each query.

    Parameters
    ----------
    searchInput : str
        Free-text query.
    """
    indices, titles, dictionary, tf_idf, index = main()

    queryTokens = []
    for token in tokenize(searchInput):
        queryTokens.append(token.lower())
    queryVector = tf_idf[dictionary.doc2bow(queryTokens)]

    # Attach each similarity score to its spell, then order by score.
    scored = list(zip(indices, titles, index[queryVector]))
    scored.sort(key=lambda entry: entry[2])

    print('Most similar: ', scored[-5:])
        

In [42]:
# Example query: prints the five best-matching spells for a free-text search.
text = 'find something lost'
search(text)

Number of documents:  412
Most similar:  [('83', 'To Find Good Luck', 0.27667516), ('84', 'To Find the Place of Injury', 0.281533), ('81', 'To Find a Muse', 0.34490076), ('82', 'To Find a Person', 0.40759471), ('80', 'To Find a Lost Love', 0.5246799)]


In [28]:
# Toy data to check how zip() combines three parallel lists into
# (index, title, score) tuples.
testIndex = [0, 1, 2, 3]
testTitle = ['mom', 'dad', 'boy', 'girl']
testScore = [4, 12, 3, 53]

testZipped = [
    (idx, title, score)
    for idx, title, score in zip(testIndex, testTitle, testScore)
]
print(testZipped)

[(0, 'mom', 4), (1, 'dad', 12), (2, 'boy', 3), (3, 'girl', 53)]


In [33]:
# Order the toy tuples by their third field (the score), ascending.
print(sorted(testZipped, key=lambda entry: entry[2]))

[(2, 'boy', 3), (0, 'mom', 4), (1, 'dad', 12), (3, 'girl', 53)]
