## Semantic search

### Using word2vec for query term expansion
Searching the IMDB dataset

In [2]:
!pip install whoosh

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting whoosh
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl (468 kB)
[K     |████████████████████████████████| 468 kB 5.0 MB/s 
[?25hInstalling collected packages: whoosh
Successfully installed whoosh-2.7.4


In [15]:
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, DATETIME
from whoosh.index import create_in
from whoosh.analysis import StemmingAnalyzer
from whoosh.qparser import MultifieldParser
import whoosh.index
import csv
from gensim.models import KeyedVectors

In [16]:
w2vec_model_path = "../model.bin"


def load_model(path):
    """Load pre-trained word2vec vectors stored in the binary word2vec format.

    Args:
        path: Location of the binary word2vec file to load.

    Returns:
        Whatever ``KeyedVectors.load_word2vec_format`` returns for that file.
    """
    # Use the argument, not the module-level default, so callers can load
    # a model from any location.
    return KeyedVectors.load_word2vec_format(path, binary=True)

In [17]:
# Path to the IMDB movie CSV that is read to build the index.
imdb_dataset_path = "IMDB-Movie-Data.csv"
# Directory in which the Whoosh index is created / from which it is opened.
search_engine_index_path = "whoosh_index"

In [18]:
class IMDBSearchEngine:
    """Keyword search over the IMDB movie CSV, backed by a Whoosh index.

    The engine either builds a fresh index from a CSV file or opens an index
    that already exists on disk. Queries are matched against the ``title``
    and ``description`` fields.
    """

    def __init__(self, index_path, imdb_path="", load_existing=False):
        """Build a new index from ``imdb_path`` or open the one at ``index_path``.

        Args:
            index_path: Directory holding (or that will hold) the Whoosh index.
            imdb_path: CSV file to index; required when ``load_existing`` is false.
            load_existing: When true, open the index at ``index_path`` instead
                of building one.

        Raises:
            ValueError: If neither ``load_existing`` nor ``imdb_path`` is given.
        """
        # Fail fast on unusable arguments before touching the file system.
        if not load_existing and not imdb_path:
            raise ValueError("You need to provide the index path, and either load_existing=True or the path to the data and load_existing=False")
        self.schema = self.create_schema()
        if load_existing:
            self.load_existing_index(index_path)
        else:
            self.data = self.read_in_csv(imdb_path)
            self.create_and_populate_index(index_path)

    def read_in_csv(self, csv_file):
        """Read ``csv_file`` and return all of its rows (header included) as lists of strings."""
        # newline='' lets the csv module handle line endings itself, which is
        # needed for quoted fields that contain embedded newlines.
        with open(csv_file, 'r', encoding='utf-8', newline='') as fp:
            return list(csv.reader(fp, delimiter=',', quotechar='"'))

    def create_and_populate_index(self, index_path):
        """Create an index in ``index_path`` and fill it from ``self.data``."""
        import os  # local import: `os` is not among this notebook's top-level imports

        # Make sure the target directory exists before the index is created in it.
        os.makedirs(index_path, exist_ok=True)
        self.index = create_in(index_path, self.schema)
        self.writer = self.index.writer()
        self.populate_index()

    def load_existing_index(self, index_path):
        """Open the index previously created in ``index_path``."""
        self.index = whoosh.index.open_dir(index_path)

    def create_schema(self):
        """Return the Whoosh schema describing one movie document.

        Only ``movie_id`` is declared with ``stored=True``; ``title`` and
        ``description`` use a stemming analyzer.
        """
        # NOTE(review): KEYWORD is used with its default settings; if the CSV
        # separates genres with commas, KEYWORD(commas=True) may be what is
        # wanted - confirm against the data.
        schema = whoosh.fields.Schema(movie_id=ID(stored=True),
                title=TEXT(analyzer=StemmingAnalyzer()),
                description=TEXT(analyzer=StemmingAnalyzer()),
                genre=KEYWORD,
                director=TEXT,
                actors=TEXT,
                year=DATETIME)
        return schema

    def populate_index(self):
        """Add every data row of ``self.data`` to the index and commit.

        The first row is treated as the header and skipped, as are completely
        empty rows (csv yields ``[]`` for blank lines). The first seven columns
        are mapped, in order, to movie_id, title, genre, description, director,
        actors and year. If adding a document fails, the writer is cancelled
        and the error is re-raised.

        Raises:
            IndexError: If a non-empty row has fewer than seven columns.
        """
        # Column position in the CSV -> schema field name.
        columns = ("movie_id", "title", "genre", "description",
                   "director", "actors", "year")
        try:
            for row in self.data[1:]:
                if not row:
                    continue
                document = {name: row[position] for position, name in enumerate(columns)}
                self.writer.add_document(**document)
        except Exception:
            # Abandon the half-written batch instead of leaving the writer open.
            self.writer.cancel()
            raise
        self.writer.commit()

    def query_engine(self, keywords):
        """Search ``title`` and ``description`` for ``keywords`` and return the results.

        The results summary and the first three hits are printed. The searcher
        is kept open on the instance so that the returned results can still be
        inspected by the caller; it is closed on the next call to this method
        or by :meth:`close`, after which earlier results should not be used.

        Args:
            keywords: Query string in Whoosh query syntax (e.g. ``"a OR b"``).

        Returns:
            The object returned by the Whoosh searcher's ``search`` call.
        """
        # Release the searcher left open by the previous query, if any.
        self.close()
        self.searcher = self.index.searcher()
        query = MultifieldParser(["title", "description"], self.index.schema).parse(keywords)
        results = self.searcher.search(query)
        print(results)
        print(results[0:3])
        return results

    def close(self):
        """Close the searcher kept open by the most recent query (no-op if none)."""
        open_searcher = getattr(self, "searcher", None)
        if open_searcher is not None:
            open_searcher.close()
            self.searcher = None

In [13]:
# Upload IMDB-Movie-Data.csv to the Colab runtime.
# NOTE: `google.colab` is imported in this cell, so the cell is meant to be run
# on Colab only; `uploaded` is not referenced by any later cell shown here.
from google.colab import files
uploaded = files.upload()

In [None]:
# If necessary, create the folder that will hold the index.
# Uses the same path that is handed to IMDBSearchEngine, and exist_ok=True so
# re-running this cell does not fail when the folder is already there.
import os
os.makedirs(search_engine_index_path, exist_ok=True)

In [33]:
# Build a fresh index from the CSV (load_existing=False); pass load_existing=True
# instead to open an index that was created earlier.
search_engine = IMDBSearchEngine(search_engine_index_path, imdb_dataset_path, load_existing=False)

In [29]:
# You need to download a model from http://vectors.nlpl.eu/repository/ for the
# next bit to work (see the video for an example). The call below looks for the
# file at `w2vec_model_path`.
model = load_model(w2vec_model_path)

FileNotFoundError: ignored

In [27]:
def get_similar_words(model, search_term, topn=5):
    """Return the words the model ranks as most similar to ``search_term``.

    Args:
        model: Object exposing ``most_similar(term, topn=...)`` that returns
            ``(word, score)`` pairs.
        search_term: Word to find neighbours for.
        topn: How many similar words to request (defaults to 5).

    Returns:
        List of similar words, scores dropped. Empty when the model raises
        ``KeyError`` for the term (i.e. the term is unknown to it), so query
        expansion can fall back to the original term alone.
    """
    try:
        similarity_list = model.most_similar(search_term, topn=topn)
    except KeyError:
        # Unknown term: there is nothing to expand the query with.
        return []
    return [word for word, _score in similarity_list]

In [None]:
# Pick a query word and ask the word2vec model for words similar to it.
search_term= "gigantic"
other_words = get_similar_words(model, search_term)

In [None]:
# Display the expansion terms found for the search term.
other_words

In [None]:
# Query expansion: OR the original term together with its similar words and
# run the combined query through the search engine.
results = search_engine.query_engine(" OR ".join([search_term] + other_words))

In [None]:
# Take the first hit. NOTE(review): presumably this raises if the query
# matched nothing - confirm and guard if empty results are possible.
top1 = results[0]

In [None]:
# Turn off Jedi-based tab completion in IPython.
%config Completer.use_jedi = False

### Useful gensim API methods

In [None]:
# Ask the model which word in the list does not belong with the others.
# Requires `model` to have been loaded by the earlier cell.
words = ['courgette', 'mushroom', 'onion', 'camera']
print(model.doesnt_match(words))

In [None]:
# Ask the model which entry of `words` is closest to "fungus".
print(model.most_similar_to_given("fungus", words))