In [12]:
import re

import gensim
import pandas as pd
import spacy
from gensim import corpora
from gensim.similarities import MatrixSimilarity

In [15]:
# Read the semicolon-separated export and keep only the company name and
# its English description.
company_df = pd.read_csv(
    "data/company_desc_translated.csv",
    sep=";",
).loc[:, ["company_name", "description_en"]]

In [16]:
# Preview the loaded frame (company name and English description).
company_df

Unnamed: 0,company_name,description_en
0,Le Fourgon,Le Fourgon delivers your stored drinks to your...
1,Comptoir des Vignes,Comptoir des Vignes is a brand of cellars spec...
2,Shin Sekai,Welcome to our Trustpilot page! Shin Sekai is ...
3,Nutri Naturel,"Nutri-Naturel.com, the leading online organic ..."
4,Maison Martin - Le Piment Français,Maison Martin - Le Piment Francais is the firs...
...,...,...
12991,Ljbautoparts,"Sale of auto body spare parts online: fender, ..."
12992,Aéroports de Paris,"Aeroports de Paris, with its three platforms, ..."
12993,Online SAS,"Shared hosting with unlimited traffic, domain ..."
12994,shopequitation,Online specialist in the sale of horse riding ...


In [8]:
# Load the spaCy English pipeline; it is used below to tokenise and lemmatise the descriptions.
nlp = spacy.load("en_core_web_sm") 

In [17]:
def cleaning(doc):
    """Lemmatise a spaCy ``Doc`` and strip out uninformative tokens.

    Stop words and tokens whose text is two characters or shorter are
    skipped. The remaining lemmas are joined with single spaces.

    Returns the joined string, or ``None`` when two or fewer lemmas
    survive the filtering.
    """
    lemmas = []
    for token in doc:
        if token.is_stop or len(token.text) <= 2:
            continue
        lemmas.append(token.lemma_)

    if len(lemmas) <= 2:
        # Too little content left to be useful.
        return None
    return ' '.join(lemmas)

In [18]:
# Lazily normalise each description: every run of characters other than
# ASCII letters and apostrophes becomes a single space, then the text is lower-cased.
# This is a generator, so it can only be consumed once.
brief_cleaning = (
    re.sub("[^A-Za-z']+", ' ', str(description)).lower()
    for description in company_df['description_en']
)

In [19]:
# Stream the normalised texts through spaCy in batches and clean each parsed Doc.
txt = list(map(cleaning, nlp.pipe(brief_cleaning, batch_size=5000)))

In [20]:
# Wrap the cleaned strings in a frame, then drop the rows where cleaning
# returned None as well as exact duplicates.
df_clean = pd.DataFrame({'clean': txt}).dropna().drop_duplicates()
df_clean.shape

(11365, 1)

In [21]:
# Inspect the cleaned descriptions; the index still carries the pre-filter row labels.
df_clean

Unnamed: 0,clean
0,fourgon deliver store drink home order place l...
1,comptoir des vigne brand cellar specialize win...
2,welcome trustpilot page shin sekai online figu...
3,nutri naturel com lead online organic grocery ...
4,maison martin piment francais brand artisanal ...
...,...
12991,sale auto body spare part online fender bumper...
12992,aeroport paris platform major connection point...
12993,share host unlimited traffic domain dedicated ...
12994,online specialist sale horse ride equipment sa...


In [None]:
#creating term dictionary
%time dictionary = corpora.Dictionary(movie_plot)

#filter out terms which occurs in less than 4 documents and more than 20% of the documents.
#NOTE: Since we have smaller dataset, we will keep this commented for now.

#dictionary.filter_extremes(no_below=4, no_above=0.2)

#list of few which which can be further removed
stoplist = set('hello and if this can would should could tell ask stop come go')
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)

In [None]:
# Encode every tokenised document as a bag-of-words vector via the dictionary.
corpus = [dictionary.doc2bow(tokens) for tokens in movie_plot]
# Readable (token, count) pairs for the first three documents, for spot-checking.
word_frequencies = [
    [(dictionary[token_id], count) for token_id, count in bow]
    for bow in corpus[:3]
]

In [None]:
# Weight the bag-of-words corpus with TF-IDF, then fit a 300-topic LSI model on the weighted corpus.
# Requires `import gensim` (and MatrixSimilarity) from the notebook's import cell.
movie_tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary)
movie_lsi_model = gensim.models.LsiModel(movie_tfidf_model[corpus], id2word=dictionary, num_topics=300)

# Serialize and store the transformed corpora locally for easy retrieval whenever required.
gensim.corpora.MmCorpus.serialize('movie_tfidf_model_mm', movie_tfidf_model[corpus])
gensim.corpora.MmCorpus.serialize('movie_lsi_model_mm',movie_lsi_model[movie_tfidf_model[corpus]])

# Load the serialized corpora back from disk.
movie_tfidf_corpus = gensim.corpora.MmCorpus('movie_tfidf_model_mm')
movie_lsi_corpus = gensim.corpora.MmCorpus('movie_lsi_model_mm')

# Build the similarity index over the LSI vectors.
movie_index = MatrixSimilarity(movie_lsi_corpus, num_features = movie_lsi_corpus.num_terms)

In [None]:
from operator import itemgetter

def search_similar_movies(search_term, num_best=5):
    """Return the indexed documents most similar to a free-text query.

    The query is tokenised with ``spacy_tokenizer``, turned into a
    bag-of-words vector with ``dictionary``, passed through
    ``movie_tfidf_model`` and ``movie_lsi_model``, and finally looked up in
    ``movie_index``.

    Parameters
    ----------
    search_term : str
        Free-text query.
    num_best : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``Relevance`` (similarity
        multiplied by 100 and rounded to two decimals), ``Movie Title`` and
        ``Movie Plot``, ordered by descending relevance.

    Notes
    -----
    Sets ``movie_index.num_best`` as a side effect.
    NOTE(review): ``spacy_tokenizer`` and ``df_movies`` are not defined in the
    visible cells; confirm they exist before calling this function.
    """
    query_bow = dictionary.doc2bow(spacy_tokenizer(search_term))
    query_tfidf = movie_tfidf_model[query_bow]
    query_lsi = movie_lsi_model[query_tfidf]

    # Ask the index for at most `num_best` hits.
    movie_index.num_best = num_best
    matches = sorted(movie_index[query_lsi], key=itemgetter(1), reverse=True)

    # NOTE(review): the lookups below are label-based, so they assume the index
    # of df_movies lines up with the document positions in the corpus; confirm.
    movie_names = [
        {
            'Relevance': round((similarity * 100), 2),
            'Movie Title': df_movies['title'][doc_id],
            'Movie Plot': df_movies['wiki_plot'][doc_id],
        }
        for doc_id, similarity in matches[:num_best]
    ]

    return pd.DataFrame(movie_names, columns=['Relevance','Movie Title','Movie Plot'])