In [44]:
import re
import os
import pandas as pd

from gensim.parsing.preprocessing import preprocess_string, STOPWORDS, remove_stopwords
from gensim.parsing.preprocessing import strip_tags, stem_text, strip_multiple_whitespaces 
from gensim.parsing.preprocessing import strip_non_alphanum, strip_punctuation
from gensim.models.phrases import Phraser, Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.utils import tokenize
from gensim import corpora, models, similarities
from gensim.similarities import Similarity

# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

from bs4 import BeautifulSoup as BSHTML

import pickle
from tqdm.notebook import tqdm

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     c:\Users\user\anaconda3\lib\nltk_data...


In [2]:
df = pd.read_json('../data/cleaned_data.json')

In [3]:
df.head()

Unnamed: 0,asin,original_category,category_list,category_string,main_cat,title,4_features_combined,5_features_combined,4_features_tokenized,5_features_tokenized,category_group_count
0,11300000,"[Electronics, Camera &amp; Photo, Video Survei...","[Electronics, Camera & Photo, Video Surveillan...","Electronics, Camera & Photo, Video Surveillanc...",Camera & Photo,Genuine Geovision 1 Channel 3rd Party NVR IP S...,Genuine Geovision 1 Channel 3rd Party NVR IP S...,Genuine Geovision 1 Channel 3rd Party NVR IP S...,"[genuin, geovis, channel, parti, nvr, softwar,...","[genuin, geovis, channel, parti, nvr, softwar,...",1392
1,43396828,"[Electronics, Camera &amp; Photo]","[Electronics, Camera & Photo]","Electronics, Camera & Photo",Camera & Photo,"Books ""Handbook of Astronomical Image Processi...","Books ""Handbook of Astronomical Image Processi...","Books ""Handbook of Astronomical Image Processi...","[book, handbook, astronom, imag, process, rom,...","[book, handbook, astronom, imag, process, rom,...",5779
2,60009810,"[Electronics, eBook Readers &amp; Accessories,...","[Electronics, eBook Readers & Accessories, eBo...","Electronics, eBook Readers & Accessories, eBoo...",Books,One Hot Summer,"One Hot Summer Electronics, eBook Readers & Ac...","One Hot Summer Electronics, eBook Readers & Ac...","[hot, summer, electron, ebook, reader, accesso...","[hot, summer, electron, ebook, reader, accesso...",399
3,60219602,"[Electronics, eBook Readers & Accessories, eBo...","[Electronics, eBook Readers & Accessories, eBo...","Electronics, eBook Readers & Accessories, eBoo...",Books,Hurray for Hattie Rabbit: Story and pictures (...,Hurray for Hattie Rabbit: Story and pictures (...,Hurray for Hattie Rabbit: Story and pictures (...,"[hurrai, hatti, rabbit, stori, pictur, earli, ...","[hurrai, hatti, rabbit, stori, pictur, earli, ...",399
4,60786817,"[Electronics, eBook Readers & Accessories, eBo...","[Electronics, eBook Readers & Accessories, eBo...","Electronics, eBook Readers & Accessories, eBoo...",Books,sex.lies.murder.fame.: A Novel,"sex.lies.murder.fame.: A Novel Electronics, eB...","sex.lies.murder.fame.: A Novel Electronics, eB...","[sex, li, murder, fame, novel, electron, ebook...","[sex, li, murder, fame, novel, electron, ebook...",399


In [35]:
def create_folders_to_save_to_file(file):
    filepath = "/".join(file.split("/")[:-1])
    if filepath:
        os.makedirs(filepath, exist_ok=True)

def train_lsi(df, num_topics, feature: str, save_model: str="../data/recommend/lsi_index/lsi_index_bigram"):
    """
    params:
    - df: Pandas DataFrame
    - feature: str value of the dataframe's column
    - save_model: 
    """
    tokenized = []

    bigram_model = Phrases(df[feature], connector_words = ENGLISH_CONNECTOR_WORDS)

    for text in tqdm(df[feature]):

        # apply the bigram model to the lemmatized text.
        # if applied correctly, bigrammed_tokens contains a list of unigrams and bigrams
        # generated from the lemmatized tokens
        bigrammed_tokens = bigram_model[text]

        # append to tokenized list
        tokenized.append(bigrammed_tokens)

    dictionary = corpora.Dictionary(tokenized)
    corpus = [dictionary.doc2bow(text) for text in tokenized]
    # word_counts = [[(dictionary[id], count) for id, count in line] for line in corpus]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=num_topics)

    #index = similarities.MatrixSimilarity(lsi[corpus])  # transform corpus to LSI space and index it
    create_folders_to_save_to_file(save_model)
    create_folders_to_save_to_file('../data/recommend/lsi_dictionary/lsi_dict_bigram')
    create_folders_to_save_to_file('../data/recommend/lsi_model/lsi_model_bigram')
    create_folders_to_save_to_file('../data/recommend/lsi_trained_bigram_model/trained_lsi_bigram_model')
    index = Similarity(save_model, lsi[corpus], num_features=len(dictionary))  # transform corpus to LSI space and index it
    # index = Similarity(save_model, lsi[corpus], num_features=num_topics)  # transform corpus to LSI space and index it
    dictionary.save('../data/recommend/lsi_dictionary/lsi_dict_bigram')
    lsi.save('../data/recommend/lsi_model/lsi_model_bigram')
    index.save(save_model)
    bigram_model.save('../data/recommend/lsi_trained_bigram_model/trained_lsi_bigram_model')
    return lsi, index, dictionary, bigram_model

In [36]:
# Create model
lsi, index, dictionary, lsi_bigram_model = train_lsi(df, num_topics=25, feature="4_features_tokenized", save_model="../data/recommend/lsi_index/lsi_index")

  0%|          | 0/786445 [00:00<?, ?it/s]

In [49]:
def return_categories(query, bigram_model, lsi_model, lsi_index, lsi_dictionary):
    # initialize Lemmatizer
    lemmatizer = WordNetLemmatizer()

    # use gensim preprocessing filters 
    custom_filters = [lambda x: x.lower(), 
                      strip_tags, 
                      strip_punctuation, 
                      strip_multiple_whitespaces, 
                      strip_non_alphanum,
                      remove_stopwords]

    # tokenize, strip tags, punctuation, multi_whitespace, non_alphanum, and remove stopwords
    tokenized_query = preprocess_string(query, custom_filters)

    # lemmatize
    lemmatized_query = [lemmatizer.lemmatize(word) for word in tokenized_query]

    # apply the bigram model to the lemmatized text.
    # if applied correctly, bigrammed_tokens contains a list of unigrams and bigrams
    # generated from the lemmatized tokens
    bigrammed_query = bigram_model[lemmatized_query]
    
    vec_bow = dictionary.doc2bow(bigrammed_query)
    vec_lsi = lsi[vec_bow]  # convert the query to LSI space
    print('vec_lsi:', vec_lsi)
    print("======================")
    sims = index[vec_lsi]  # perform a similarity query against the corpus
    #print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples

    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    for doc_position, doc_score in sims[:10]:
        print(doc_score, df.category_list[doc_position])
        print(df.title[doc_position])
        print("======================")

In [50]:
query_term = "computer speakers with microphone"
# query_term = "i want a kingston sd card"
# query_term = "i want a computer monitor"

return_categories(query_term, bigram_model=lsi_bigram_model, lsi_model=lsi, lsi_index=index, lsi_dictionary=dictionary)

vec_lsi: [(0, 0.028112193945198503), (1, -0.004929436654364791), (2, -0.08840807037051494), (3, -0.08636846081939248), (4, 0.012289085389350706), (5, -0.0240277300689112), (6, 0.11517578338712182), (7, 0.09660060956920022), (8, -0.11937939008714527), (9, -0.14937192974391741), (10, 0.04276714157336668), (11, 0.01093153732734963), (12, 0.2761933901486182), (13, 0.046466561694928105), (14, 0.10720872830710111), (15, -0.37333136070166006), (16, -0.03894470516760002), (17, 0.03776076499576643), (18, 0.06985652245435413), (19, -0.08094858637011085), (20, 0.19875782605740835), (21, 0.2715270252404035), (22, 0.060742546580165875), (23, -0.39267521398962735), (24, 0.35013350092131257)]
0.9118724 ['Electronics', 'Home Audio', 'Speakers', 'Outdoor Speakers']
Pa Horn Weatherproof Abs Speaker PA Loudspeaker Speaker Horn
0.9105098 ['Electronics', 'Home Audio', 'Speakers', 'Ceiling & In-Wall Speakers']
8 Ceiling Wall Mount Speakers - Pair of 2-Way Midbass Woofer Speaker Directable 1 Titanium Dome Tw