In [None]:
import pandas as pd
import numpy as np
import joblib
import scipy
import scipy.sparse as sp
import pickle
from tqdm.notebook import tqdm
import math

from rake_nltk import Rake

from sklearn.metrics.pairwise import cosine_similarity

from nltk.stem import WordNetLemmatizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from gensim.parsing.preprocessing import preprocess_string, STOPWORDS, remove_stopwords
from gensim.parsing.preprocessing import strip_tags, stem_text, strip_multiple_whitespaces 
from gensim.parsing.preprocessing import strip_non_alphanum, strip_punctuation
from gensim.models.phrases import Phraser, Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.utils import tokenize
from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import LsiModel

In [None]:
# Product metadata keyed by ASIN; further down it is only used to print each recommended product's 'title'.
product_dict = joblib.load('../data/recommend/product_dict.joblib') # Not needed for deployment, use DB
# Lookup handed to find_recommendations as `index_product`, i.e. product column position -> ASIN.
# NOTE: pickle/joblib files execute code while loading, so only load artefacts produced by this project.
index_asin = pd.read_pickle('../data/recommend/index_asin.pickle')
#category_index = joblib.load('../data/recommend/category_index.joblib')

In [None]:
# Top keywords: read the `keywords` column of the CSV straight into a plain Python list,
# so the name `keyword_list` is never bound to the intermediate DataFrame.
keyword_list = pd.read_csv('../data/recommend/keyword_list_1000.csv').keywords.tolist()

In [None]:
# Pre-trained LSI artefacts used to embed a query:
#   lsi_dictionary   -> turns a token list into a bag-of-words vector (doc2bow)
#   lsi_model        -> projects that bag-of-words vector into LSI topic space
#   lsi_bigram_model -> applied to lemmatized tokens in convert_reviews_to_tokens
lsi_dictionary = pd.read_pickle('../data/recommend/lsi_dictionary_10_components/lsi_dict_bigram')
lsi_model = LsiModel.load('../data/recommend/lsi_model_10_components/lsi_model_bigram')
lsi_bigram_model = joblib.load('../data/recommend/lsi_trained_bigram_model_10_components/trained_lsi_bigram_model')

In [None]:
# Review-by-feature matrix passed to get_similar_users as `input_mat`. Its columns have to line up
# with the query vector built there (one count per keyword followed by the LSI topic weights).
review_keyword_sparse_mat_with_lsi_and_keywords = joblib.load('../data/recommend/v2/review_keyword_sparse_mat_with_lsi_and_keywords.joblib')

In [None]:
import sys

# sys.getsizeof on its own only reports the small Python wrapper object, not the NumPy buffers
# that actually hold the sparse matrix, so the buffers are added explicitly.
# NOTE(review): 'data', 'indices' and 'indptr' are the buffers of CSR/CSC matrices; other sparse
# formats store their contents elsewhere and would be under-counted here - confirm the format.
_sparse_mat = review_keyword_sparse_mat_with_lsi_and_keywords
sys.getsizeof(_sparse_mat) + sum(
    getattr(_sparse_mat, buffer_name).nbytes
    for buffer_name in ('data', 'indices', 'indptr')
    if hasattr(_sparse_mat, buffer_name)
)

48

In [None]:
# Legacy matrix: nothing else in the visible part of this notebook reads it.
# NOTE(review): presumably safe to stop loading it to save memory - confirm no later cell depends on it.
user_keyword_sparse_mat_without_lsi = joblib.load('../data/recommend/user_keyword_sparse_mat_without_lsi.joblib') # Not used, it's legacy

In [None]:
# Maps each row index of the review matrix to a dict of metadata; the functions below read its
# 'category' and 'reviewerID' entries.
review_keyword_master_index = joblib.load('../data/recommend/v2/review_keyword_master_index.joblib')

In [None]:
#index_reviewer = joblib.load('../data/recommend/index_reviewer.joblib')
# reviewerID -> row position in the user-product matrices (used by find_recommendations).
reviewer_index = joblib.load('../data/recommend/reviewer_index.joblib')
# NOTE(review): the calls further down pass index_asin, not this object, as `index_product`.
index_product = joblib.load('../data/recommend/index_product.joblib')
#product_index = joblib.load('../data/recommend/product_index.joblib')
#common_product_dict = joblib.load('../data/recommend/common_product_dict.joblib')

# Category lookup (cat_idx -> category path). The query cells read it before the "Testing"
# section, so it is loaded with the other indices to keep a top-to-bottom run working.
index_category = pd.read_pickle('../data/recommend/v2/testing/M2_index_category.pickle')
#common_product_dict = joblib.load('../data/recommend/common_product_dict.joblib')

In [None]:
# One user-by-product matrix per ranking criterion; find_recommendations picks exactly one of
# them according to `selected_filter` ('top_quality', 'top_ratings', 'top_seller', 'top_value').
user_product_quality_mat = joblib.load('../data/recommend/v2/user_product_quality_mat.joblib')
user_product_rating_mat = joblib.load('../data/recommend/v2/user_product_ratings_mat.joblib')
user_product_seller_mat = joblib.load('../data/recommend/v2/user_product_seller_mat.joblib')
user_product_value_mat = joblib.load('../data/recommend/v2/user_product_value_mat.joblib')

## Filter Category and Find Similar User

In [None]:
def convert_reviews_to_tokens(text, lsi_bigram_model, lsi_model, lsi_dictionary):
    '''
    Turns one review (or query) string into lemmatized tokens run through the bigram model.

    The text is lower-cased, stripped of tags, punctuation, repeated whitespace and
    non-alphanumeric characters, and has its stopwords removed. Every remaining token is then
    lemmatized and the resulting token list is handed to the bigram model.

    Args:
        text: a single review text in string format
        lsi_bigram_model: gensim bigram model; applied by indexing it with the token list
        lsi_model: gensim lsi model (accepted for call-site symmetry, not used here)
        lsi_dictionary: gensim lsi dictionary (accepted for call-site symmetry, not used here)

    Returns:
        The result of indexing the bigram model with the lemmatized tokens, i.e. the
        unigrams and bigrams it produces for this text.
    '''
    def _lowercase(raw_text):
        return raw_text.lower()

    # gensim preprocessing filters, applied in exactly this order by preprocess_string
    cleaning_steps = [
        _lowercase,
        strip_tags,
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_non_alphanum,
        remove_stopwords,
    ]
    cleaned_tokens = preprocess_string(text, cleaning_steps)

    # reduce every token to its WordNet lemma
    wordnet = WordNetLemmatizer()
    lemmas = list(map(wordnet.lemmatize, cleaned_tokens))

    # let the bigram model merge the token pairs it knows into single bigram tokens
    return lsi_bigram_model[lemmas]

In [None]:
# Sanity check on a short query: show the tokens it is reduced to, then the bag-of-words
# those tokens map to under the LSI dictionary.
sample_tokens = convert_reviews_to_tokens('Sony Bundle and', lsi_bigram_model=lsi_bigram_model, lsi_model=lsi_model, lsi_dictionary=lsi_dictionary)
print(sample_tokens)
print(lsi_dictionary.doc2bow(sample_tokens))

['sony', 'bundle']
[]


In [None]:
def get_similar_users(input_mat, selected_subcategory, query, keyword_list, lsi_bigram_model, lsi_dictionary, lsi_model, master_index, num_users=1000):
    '''
    Ranks the reviews of one subcategory by cosine similarity to a free-text query.

    Rows of input_mat are first restricted to those whose master_index entry has the selected
    category. The query is then converted into a vector with the same column layout as
    input_mat (one count per entry of keyword_list, followed by the query's LSI topic weights)
    and compared against the remaining rows.

    Args:
        input_mat: scipy sparse matrix; one row per master_index key, with
            len(keyword_list) keyword columns followed by the LSI topic columns
        selected_subcategory: list of subcategory that user selected; compared with == against
            each master_index entry's 'category'
        query: user query in string format
        keyword_list: list of top keywords
        lsi_bigram_model: gensim bigram model used when tokenizing the query
        lsi_dictionary: gensim dictionary used to build the query's bag-of-words
        lsi_model: gensim lsi model used to project the bag-of-words into LSI space
        master_index: dictionary that links input_mat index to asin, reviewerID, and category
        num_users: maximum number of results to return

    Returns:
        List of (input_mat row index, similarity) tuples sorted by descending similarity and
        truncated to num_users entries. Empty list when no row matches the category.

    Raises:
        ValueError: if the LSI model yields a topic id that has no column in input_mat, or
            input_mat has fewer columns than keyword_list has entries.
    '''
    # filter input_mat by category selected; done first so that nothing else is computed
    # (and cosine_similarity is not called on zero rows) when the category has no reviews
    matching_rows = [row_idx for row_idx, meta in master_index.items()
                     if meta['category'] == selected_subcategory]
    if not matching_rows:
        return []

    # keyword -> column position; setdefault keeps the first occurrence, like list.index
    keyword_position = dict()
    for position, keyword in enumerate(keyword_list):
        keyword_position.setdefault(keyword, position)

    # count how often each known keyword is extracted from the query
    keyword_counts = np.zeros(len(keyword_list))
    rake = Rake()
    rake.extract_keywords_from_text(query)
    for phrase in rake.get_ranked_phrases():
        position = keyword_position.get(phrase)
        if position is not None:
            keyword_counts[position] += 1

    # use LSI to transform the query into LSI space
    bigrammed_query = convert_reviews_to_tokens(query, lsi_bigram_model=lsi_bigram_model, lsi_model=lsi_model, lsi_dictionary=lsi_dictionary)
    vec_lsi = lsi_model[lsi_dictionary.doc2bow(bigrammed_query)]

    # The LSI result is a sparse list of (topic_id, weight) pairs that can be empty or skip
    # topics, so each weight is written to its own topic column instead of being appended
    # positionally. The number of LSI columns is whatever input_mat has after the keywords.
    num_lsi_dims = input_mat.shape[1] - len(keyword_list)
    if num_lsi_dims < 0:
        raise ValueError('input_mat has {} columns but keyword_list has {} entries'.format(input_mat.shape[1], len(keyword_list)))
    lsi_weights = np.zeros(num_lsi_dims)
    for topic_id, weight in vec_lsi:
        if topic_id >= num_lsi_dims:
            raise ValueError('LSI topic {} has no matching column in input_mat ({} LSI columns)'.format(topic_id, num_lsi_dims))
        lsi_weights[topic_id] = weight

    query_vector = np.concatenate([keyword_counts, lsi_weights])

    # perform cosine similarity between every matching row and the query
    similarities = cosine_similarity(input_mat[matching_rows], query_vector.reshape(1, -1))

    # sort (row index, similarity) pairs in descending order of similarity
    ranked_reviews = sorted(zip(matching_rows, similarities[:, 0]), key=lambda pair: pair[1], reverse=True)

    # slicing already copes with fewer than num_users matches
    return ranked_reviews[:num_users]

## Collaborative Filtering on User-Product Matrices

In [None]:
def find_recommendations(user_product_value_mat, user_product_quality_mat, user_product_rating_mat, user_product_seller_mat, selected_filter, top_reviews, master_index, index_product, reviewer_index, top_n):
    '''
    Recommends products to the most similar reviewer via user-based collaborative filtering.

    The author of the first entry of top_reviews is the target user ("top user"). The authors
    of all entries of top_reviews form the neighbourhood. The target user's score for every
    product is predicted from the neighbours' mean-centred rows, weighted by their cosine
    similarity to the target user, and the best-scoring products that the target user has no
    entry for are returned.

    Args:
        user_product_value_mat: user-by-product sparse matrix used for 'top_value'
        user_product_quality_mat: user-by-product sparse matrix used for 'top_quality'
        user_product_rating_mat: user-by-product sparse matrix used for 'top_ratings'
        user_product_seller_mat: user-by-product sparse matrix used for 'top_seller'
        selected_filter: one of 'top_value', 'top_quality', 'top_ratings', 'top_seller'
        top_reviews: sequence of (review index, similarity) pairs, best match first
        master_index: maps a review index to a dict holding its 'reviewerID'
        index_product: maps a product column position to the product identifier returned
        reviewer_index: maps a reviewerID to its row in the user-product matrices
        top_n: maximum number of products to recommend

    Returns:
        Tuple (recommended_top_products, top_user, pred_values):
            recommended_top_products: list of up to top_n entries of index_product
            top_user: the target user's row of the selected matrix (still sparse)
            pred_values: dense 1 x n_products predictions for the target user

    Raises:
        ValueError: if selected_filter is not a known filter, or top_reviews is empty when
            the selected matrix has more than one row.
    '''
    matrices_by_filter = {
        'top_value': user_product_value_mat,
        'top_quality': user_product_quality_mat,
        'top_ratings': user_product_rating_mat,
        'top_seller': user_product_seller_mat,
    }
    if selected_filter not in matrices_by_filter:
        raise ValueError('unknown selected_filter {!r}; expected one of {}'.format(selected_filter, sorted(matrices_by_filter)))
    user_product_mat = matrices_by_filter[selected_filter]

    if user_product_mat.shape[0] == 1:
        # A single row leaves no neighbours to learn from: that row is the target user and
        # the prediction is simply its own values.
        top_user = user_product_mat[0]
        pred_values = user_product_mat.todense()
    else:
        if len(top_reviews) == 0:
            raise ValueError('top_reviews is empty; there is no similar reviewer to recommend for')

        # review index -> reviewerID -> row in the user-product matrix
        user_indices = [reviewer_index[master_index[review[0]]['reviewerID']] for review in top_reviews]

        top_user = user_product_mat[user_indices[0]]
        neighbour_mat = user_product_mat[user_indices]

        # centre every neighbour on its own mean so that only deviations are weighted
        neighbour_means = np.mean(neighbour_mat, axis=1)
        centred_neighbours = neighbour_mat - neighbour_means
        similarities = cosine_similarity(top_user, neighbour_mat)

        # similarity-weighted average deviation, shifted back by the target user's own mean
        pred_values = (similarities * centred_neighbours) / np.sum(similarities) + np.mean(top_user, axis=1)

    flattened_pred_values = np.asarray(pred_values).flatten()

    # Rank only products the target user has no entry for. The ranking is done on the subset,
    # so its positions are mapped back through `unseen_products` to real product columns.
    unseen_products = np.flatnonzero(top_user.toarray().flatten() == 0)
    ranking = np.argsort(-flattened_pred_values[unseen_products], kind='stable')
    recommended_product_indices = unseen_products[ranking]
    recommended_top_products = [index_product[i] for i in recommended_product_indices[:top_n]]

    return recommended_top_products, top_user, pred_values

In [None]:
%%timeit -n1 -r1 -o
#top_user = reviewer_index[top_review['reviewerID']]
#print(top_user)

#selected_subcategory = ['Electronics', 'Computers & Accessories', 'Computer Accessories & Peripherals', 'Memory Cards', 'Micro SD Cards']
#query = 'I want a SD card'
# query = 'I want a GPS antenna'
# query = 'FSK-12JL-W7-1 12W7, Wide'
# selected_subcategory = list(index_category[79])
query = 'Sony Bundle and'
selected_subcategory = list(index_category[285])
selected_filter = 'top_ratings'

print('query:', query)
print('selected_subcategory:', selected_subcategory)

top_reviews = get_similar_users(review_keyword_sparse_mat_with_lsi_and_keywords, selected_subcategory, query, keyword_list, lsi_bigram_model, lsi_dictionary=lsi_dictionary, lsi_model=lsi_model, master_index=review_keyword_master_index, num_users=800)

query: Sony Bundle and
selected_subcategory: ['Electronics', 'Camera & Photo', 'Digital Cameras', 'Mirrorless Cameras']
760 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


<TimeitResult : 760 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

In [None]:
# `_` is IPython's "output of the last executed cell", so this only captures the TimeitResult
# when it runs immediately after the timing cell; any cell executed in between changes `_`.
run_time = _

In [None]:
# Average time per loop, in seconds, of the captured timing run.
run_time.average

0.7865146999993158

In [None]:
%%timeit -n1 -r1 -o
top_recommendations, top_user, pred_values = find_recommendations(user_product_value_mat, user_product_quality_mat, user_product_rating_mat, user_product_seller_mat, selected_filter=selected_filter, top_reviews=top_reviews, master_index=review_keyword_master_index, index_product=index_asin, reviewer_index=reviewer_index, top_n=10)
# top_recommendations

top_user = top_user.toarray().flatten()
pred_values = np.array(pred_values).flatten()

rmse = np.sqrt(np.sum((top_user[top_user != 0] - pred_values[top_user != 0]) ** 2))
print(f"RMSE = {rmse}\n")

for product in top_recommendations:
    print(product, '-------', product_dict[product]['title'])

RMSE = 8.014466726311593

B00M55BS36 ------- SanDisk Extreme 32GB UHS-I/U3 Micro SDHC Memory Card Up To 60MB/s Read With Adapte-SDSDQXN-032G-G46A [Older Version]
B00KCHQDS4 ------- E-BLUE COBRA EKM066BKC COINBATANT-X Advanced Gaming USB Wired Keyboard (English packing)
B00EDBRE2Y ------- Travelwell P6315 Bellino Rolling Computer Backpack (Navy)
B01EIYOC9I ------- USB C Hub, F-color USB C to 2 Ports USB 3.0 Hub with 1 Gigabit Ethernet Port and 1 Full Speed Type-C Charging Port for Apple New MacBook 12 inch, Chrome Book Pixel and More, Silver, 1 Pack
B0012OKPBM ------- SanDisk 8 GB microSDHC Card with SD Adapter
B000UXDH4I ------- zCover echo for COWON D2, Original ICE
B00GHNHUC0 ------- Sonpre Samurai Max-C Pop-up SD Rechargeable Speaker for Smartphones/Laptops/Tablets/MP3s - Purple
B000XR87XC ------- HP VGA Cable GS567AA
B00CAB6PJM ------- [Newest 2018] Bluetooth Headphones w/ 12+ Hours Battery - Best Wireless Sport Earphones w/Mic - IPX7 Waterproof Music in-Ear Earbuds for Gym Running

<TimeitResult : 2.56 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

## Testing

In [None]:
# Test queries: one row per product with its category index (cat_idx), five query variants
# (query_3 ... query_7) and a label column; test_recommendations only evaluates label == 1 rows.
machine_generated_queries = pd.read_csv('../data/recommend/query_test_1000.csv')
# cat_idx -> category path; wrapped in list(...) before being compared with master_index categories.
# NOTE: pickle files execute code while loading, so only load artefacts produced by this project.
index_category = pd.read_pickle('../data/recommend/v2/testing/M2_index_category.pickle')

In [None]:
# Preview the test queries (pandas truncates the rendered table).
machine_generated_queries

Unnamed: 0,asin,cat_idx,query_3,query_4,query_5,query_6,query_7,label
0,69186.0,79.0,"FSK-12JL-W7-1 12W7, Wide",12W7 FSK-12JL-W7-1 Edge (Single),Repair Speaker (Single) JL Edge,"JL Repair Edge Kit, Extra 12W7,","Roll, Repair Extra Wide JL 12W7 12,",1
1,455186.0,620.0,4g Portable Sit,1900 and Mah 3gs,3g 4g Mobile for Cute,4g Kitty 3g and Touch Portable,Mah 3gs Mobile Hello Iphone Kitty Touch,1
2,205033.0,285.0,Sony Bundle and,Sony Bag Camera 32GB,with Gadget Sony (Black) and,SD Bundle 32GB Alpha and Sony,SD a6000 Bundle Alpha (Black) Mirrorless Gadget,1
3,29298.0,21.0,Ft HDMI White,HDMI to White HDMI,4K HDMI 10FT 10 Speed,to Resolution Ft 4K HDMI HDMI,"Cable, to HDMI Video Return Audio HDMI",1
4,360432.0,542.0,Protector E7440 It3,E7440 Screen It3 Guard,Anti E7440 14 Latitude It3,Latitude Guard for It3 Screen Screen,Anti Screen for Screen It3 14 Glare,1
...,...,...,...,...,...,...,...,...
995,-1.1,,Plush Dog Grriggles,Plush Grriggles Unstuffies Dog,Grriggles Plush Unstuffies Dog Toy,Grriggles Plush Unstuffies Dog Toy,Grriggles Plush Unstuffies Dog Toy,0
996,-1.1,,Pets MidWest 39&quot;,Tall Gate/Pet 39&quot; Pet,Gate/Pet Steel Homes Soft Graphite,for Soft Gate/Pet Gate; 39&quot; Textured,29&quot; &amp; Textured White MidWest in Pet,0
997,-1.1,,Heavy HUG 78,Navy/Light 300g 1200D Heavy,Heavy HUG Navy/Light Horse 1200D,300g Horse Heavy HUG Navy/Light Blue,Heavy Blue Horse Blanket 1200D 78 HUG,0
998,-1.1,,"Bag) Range, Bully",for No - Good,"Range, &hellip; Hand Good -","Range, 100% Dog Inspected Hand -",- USDA/FDA-Approved Natural or Bully Chemicals...,0


In [None]:
import time

def test_recommendations(machine_generated_queries, query_column, num_users, selected_filter='top_ratings'):
    '''
    Runs the full recommendation pipeline for every positive test query and scores it.

    For each row with label == 1 the query text in query_column is matched against the
    reviews of the row's category (get_similar_users) and recommendations are produced for
    the best match (find_recommendations). The error between the top user's known values and
    the predicted values is recorded together with the time the two calls took.

    Relies on the notebook-level objects loaded earlier (index_category, the review matrix,
    keyword_list, the LSI artefacts, the master index, the user-product matrices, index_asin
    and reviewer_index).

    Args:
        machine_generated_queries: DataFrame with 'label', 'cat_idx' and the query columns
        query_column: name of the column holding the query text to evaluate
        num_users: number of similar reviews requested from get_similar_users
        selected_filter: which user-product matrix find_recommendations should use; passed
            explicitly so the function no longer depends on a notebook global

    Returns:
        dict mapping each evaluated row's index to {'rmse': ..., 'duration': ...}, where
        'rmse' is NaN when the top user has no non-zero entry and 'duration' is CPU time in
        seconds (time.process_time), not wall-clock time.
    '''
    results = dict()

    is_positive = machine_generated_queries['label'] == 1
    test_queries = machine_generated_queries.loc[is_positive, ['cat_idx', query_column]]

    for idx, row in tqdm(test_queries.iterrows(), total=len(test_queries)):
        start_time = time.process_time()
        category = list(index_category[row.cat_idx])
        query = row[query_column]

        top_reviews = get_similar_users(review_keyword_sparse_mat_with_lsi_and_keywords, category, query, keyword_list, lsi_bigram_model, lsi_dictionary, lsi_model, review_keyword_master_index, num_users)
        top_recommendations, top_user, pred_values = find_recommendations(user_product_value_mat, user_product_quality_mat, user_product_rating_mat, user_product_seller_mat, selected_filter=selected_filter, top_reviews=top_reviews, master_index=review_keyword_master_index, index_product=index_asin, reviewer_index=reviewer_index, top_n=10)

        top_user = top_user.toarray().flatten()
        pred_values = np.array(pred_values).flatten()

        # track time to make predictions (scoring below is not part of the measurement)
        duration = time.process_time() - start_time

        # RMSE over the products the top user has a value for: average the squared errors
        # before taking the root, and avoid averaging an empty selection
        rated = top_user != 0
        if rated.any():
            rmse = np.sqrt(np.mean((top_user[rated] - pred_values[rated]) ** 2))
        else:
            rmse = float('nan')

        results[idx] = {'rmse' : rmse, 'duration' : duration}

    return results

In [None]:
# Evaluate every query column with num_users = 500 and store each column's results in its
# own joblib file.
num_users = 500
query_list = ['query_3', 'query_4', 'query_5', 'query_6', 'query_7']

for query_column in query_list:
    print('processing ', query_column)
    results_path = '../data/recommend/v2/testing/{}_{}_users_test_results.joblib'.format(query_column, str(num_users))
    test_result = test_recommendations(machine_generated_queries, query_column, num_users)
    joblib.dump(test_result, results_path)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
# Evaluate every query column with num_users = 1000 and store each column's results in its
# own joblib file.
num_users = 1000
query_list = ['query_3', 'query_4', 'query_5', 'query_6', 'query_7']

for query_column in query_list:
    print('processing ', query_column)
    results_path = '../data/recommend/v2/testing/{}_{}_users_test_results.joblib'.format(query_column, str(num_users))
    test_result = test_recommendations(machine_generated_queries, query_column, num_users)
    joblib.dump(test_result, results_path)

processing  query_3


  0%|          | 0/500 [00:00<?, ?it/s]

processing  query_4


  0%|          | 0/500 [00:00<?, ?it/s]

processing  query_5


  0%|          | 0/500 [00:00<?, ?it/s]

processing  query_6


  0%|          | 0/500 [00:00<?, ?it/s]

processing  query_7


  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
# Evaluate every query column with num_users = 1500 and store each column's results in its
# own joblib file.
num_users = 1500
query_list = ['query_3', 'query_4', 'query_5', 'query_6', 'query_7']

for query_column in query_list:
    print('processing ', query_column)
    results_path = '../data/recommend/v2/testing/{}_{}_users_test_results.joblib'.format(query_column, str(num_users))
    test_result = test_recommendations(machine_generated_queries, query_column, num_users)
    joblib.dump(test_result, results_path)

processing  query_3


  0%|          | 0/500 [00:00<?, ?it/s]

processing  query_4


  0%|          | 0/500 [00:00<?, ?it/s]

processing  query_5


  0%|          | 0/500 [00:00<?, ?it/s]

processing  query_6


  0%|          | 0/500 [00:00<?, ?it/s]

processing  query_7


  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
# Evaluate every query column with num_users = 2000 and store each column's results in its
# own joblib file.
num_users = 2000
query_list = ['query_3', 'query_4', 'query_5', 'query_6', 'query_7']

for query_column in query_list:
    print('processing ', query_column)
    results_path = '../data/recommend/v2/testing/{}_{}_users_test_results.joblib'.format(query_column, str(num_users))
    test_result = test_recommendations(machine_generated_queries, query_column, num_users)
    joblib.dump(test_result, results_path)