In [1]:
import pandas as pd
import numpy as np
import joblib

import scipy
import scipy.sparse as sp

from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity

from tqdm.notebook import tqdm
from itertools import islice

from nltk.stem import WordNetLemmatizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#nltk.download('wordnet')
#nltk.download('punkt')

import copy
import math
import pickle

from gensim.parsing.preprocessing import preprocess_string, STOPWORDS, remove_stopwords
from gensim.parsing.preprocessing import strip_tags, stem_text, strip_multiple_whitespaces 
from gensim.parsing.preprocessing import strip_non_alphanum, strip_punctuation
from gensim.models.phrases import Phraser, Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.utils import tokenize
from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import LsiModel

## Import required files

In [2]:
# Load pre-built lookup artifacts from ../data/recommend.
# NOTE(review): joblib.load and pd.read_pickle both unpickle data, which can execute arbitrary
# code; only load files that come from a trusted source.
# product_dict is indexed by asin further down (e.g. product_dict[asin]['category']).
product_dict = joblib.load('../data/recommend/product_dict.joblib')
index_asin = pd.read_pickle('../data/recommend/index_asin.pickle')
category_index = joblib.load('../data/recommend/category_index.joblib')

In [3]:
# Read the keyword CSV and keep only its `keywords` column as a plain Python list.
# The position of a keyword in this list is used as its column index when the
# review-keyword sparse matrix is built.
keyword_list = pd.read_csv('../data/recommend/keyword_list_1000.csv')
keyword_list = keyword_list.keywords.tolist()

In [4]:
# Load the LSI dictionary (used below via lsi_dictionary.doc2bow) and the saved gensim LsiModel.
# NOTE(review): read_pickle unpickles data; only load files that come from a trusted source.
lsi_dictionary = pd.read_pickle('../data/recommend/lsi_dictionary_10_components/lsi_dict_bigram')
lsi_model = LsiModel.load('../data/recommend/lsi_model_10_components/lsi_model_bigram')

In [3]:
# Load the two bigram models used in this notebook:
# - reviews_bigram_model is passed to create_master_index to score quality/value bigrams
# - lsi_bigram_model is applied by convert_reviews_to_tokens before the LSI transformation
reviews_bigram_model = joblib.load('../data/recommend/reviews_bigram_model.joblib')
lsi_bigram_model = joblib.load('../data/recommend/lsi_trained_bigram_model_10_components/trained_lsi_bigram_model')

In [4]:
# Load the per-review attribute index that create_user_product_mat iterates over
# (it reads 'asin', 'reviewerID', 'overall', 'value_score' and 'quality_score' from each entry).
review_keyword_master_index = joblib.load('../data/recommend/v2/review_keyword_master_index.joblib')

In [5]:
# Load the reviewer/product lookup tables.
# create_user_product_mat uses reviewer_index[reviewerID] as the matrix row and
# product_index[asin] as the matrix column.
# NOTE(review): index_reviewer / index_product are presumably the inverse mappings — confirm.
index_reviewer = joblib.load('../data/recommend/index_reviewer.joblib')
reviewer_index = joblib.load('../data/recommend/reviewer_index.joblib')
index_product = joblib.load('../data/recommend/index_product.joblib')
product_index = joblib.load('../data/recommend/product_index.joblib')
common_product_dict = joblib.load('../data/recommend/common_product_dict.joblib')

## Create Products Dictionary

In [8]:
def _parse_price(raw_price):
    '''
    Converts one raw price value from the product metadata into a number.

    Args:
        raw_price: price as found in the dataset, e.g. '$1,234' or ''

    Returns:
        0 for an empty string or a string longer than 7 characters, otherwise the price as a float

    Raises:
        TypeError, ValueError: if the value cannot be interpreted as a number (including NaN)
    '''
    if isinstance(raw_price, str):
        # empty strings and strings longer than 7 characters are stored as price 0
        if raw_price == '' or len(raw_price) > 7:
            return 0
        raw_price = raw_price.replace('$', '').replace(',', '')

    price = float(raw_price)
    if math.isnan(price):
        # a missing value must not leak into the dictionary as NaN
        raise ValueError('price is NaN')
    return price


def create_products_dict(file_path, chunksize):
    '''
    Extracts features from the Amazon products dataset.

    Args:
        file_path: file path to product dataset, should be in line-delimited json format
        chunksize: size of the chunks to read in json

    Returns:
        product_dict: dictionary where key is product asin and values is a dictionary of product title, category, price, and num_features.
                      price is always numeric: a price that is missing or cannot be parsed is printed and stored as 0.
    '''
    iter_df = pd.read_json(file_path, lines=True, chunksize=chunksize)

    product_dict = dict()

    for chunk in iter_df:
        for _, row in chunk.iterrows():
            asin = row['asin']
            title = row['title']
            num_features = len(row['feature'])

            # clean the category
            category = [cat.strip().replace('&amp;', '&') for cat in row['category']]

            # clean price: only parsing failures are handled here, and the stored value
            # falls back to 0 instead of keeping the raw, non-numeric value
            try:
                price = _parse_price(row['price'])
            except (TypeError, ValueError):
                print(asin, ':', row['price'])
                price = 0

            # a repeated asin overwrites the earlier entry (every key is replaced)
            product_dict[asin] = {'title' : title, 'category' : category, 'price' : price, 'num_features' : num_features}

    return product_dict

In [9]:
# product_dict = create_products_dict('../data/meta_Electronics.json', 10000)

In [10]:
# joblib.dump(product_dict, '../data/recommend/product_dict.joblib')

## Create Master Index

In [4]:
def convert_reviews_to_tokens(text, lsi_bigram_model, lsi_model, lsi_dictionary):
    '''
    Turns one raw review into the token sequence produced by the bigram model.

    The text is lower-cased and passed through gensim filters (tags, punctuation, repeated
    whitespace, non-alphanumeric characters and stopwords are stripped), every remaining token
    is lemmatized with WordNet, and the bigram model is applied to the lemmatized tokens.

    Args:
        text: a single review text in string format
        lsi_bigram_model: gensim bigram model, indexed with the lemmatized token list
        lsi_model: gensim lsi model (accepted but not used by this function)
        lsi_dictionary: gensim lsi dictionary (accepted but not used by this function)

    Returns:
        bigrammed_text: the result of applying lsi_bigram_model to the lemmatized tokens
    '''
    def to_lower(raw):
        return raw.lower()

    # the filters run in this order; preprocess_string returns the tokens of the filtered text
    cleaning_steps = [
        to_lower,
        strip_tags,
        strip_punctuation,
        strip_multiple_whitespaces,
        strip_non_alphanum,
        remove_stopwords,
    ]
    wordnet = WordNetLemmatizer()
    cleaned_tokens = preprocess_string(text, cleaning_steps)

    lemmas = list(map(wordnet.lemmatize, cleaned_tokens))

    # indexing the bigram model joins detected phrases into single tokens
    return lsi_bigram_model[lemmas]

In [5]:
# Bigram tokens that count as positive (+1) or negative (-1) quality signals.
# Built once as frozensets so the per-token lookups are O(1) instead of a list scan on every call.
_QUALITY_GOOD_TOKENS = frozenset([
    'good_quality', 'great_quality', 'high_quality', 'excellent_quality', 'quality_product', 'nice_quality',
    'better_quality', 'best_quality', 'highest_quality', 'amazing_quality', 'awesome_quality', 'top_quality',
    'quality_item', 'quality_made', 'quality_material', 'works_great', 'works_well', 'works_perfectly',
    'worked_great', 'work_great', 'work_well', 'worked_perfectly', 'works_flawlessly', 'worked_flawlessly',
    'worked_perfect', 'working_perfectly', 'works_wonderfully', 'works_amazingly', 'work_wonderfully',
    'work_excellent', 'work_awesome', 'worked_excellent', 'quality_construction', 'built_quality',
    'fantastic_quality', 'perfect_quality', 'superior_quality',
])

_QUALITY_BAD_TOKENS = frozenset([
    'poor_quality', 'low_quality', 'cheap_quality', 'bad_quality', 'quality_control', 'poor_build',
    'stopped_working', 'stop_working', 'never_worked', 'quit_working', 'nothing_works',
])


def quality_text_score(text):
    '''
    Scores a tokenized review by counting quality-related tokens.

    Args:
        text: iterable of string tokens (e.g. the bigrams extracted from one review)

    Returns:
        score: number of positive quality tokens minus number of negative quality tokens;
               every occurrence counts, and an empty input scores 0
    '''
    score = 0
    for token in text:
        # the two sets are disjoint, so a token can move the score in only one direction
        if token in _QUALITY_GOOD_TOKENS:
            score += 1
        elif token in _QUALITY_BAD_TOKENS:
            score -= 1

    return score

In [6]:
# Tokens that count as negative (-1) or positive (+1) price/value signals.
# Built once as frozensets so the per-token lookups are O(1) instead of a list scan on every call.
# NOTE(review): 'pricey' and 'overpriced' contain no '_'; create_master_index only passes tokens
# that contain '_', so these two entries can only match when another caller passes unigrams.
_PRICE_BAD_TOKENS = frozenset([
    'little_pricey', 'pricey', 'bit_pricey', 'overpriced', 'high_price',
    'way_overpriced', 'higher_price', 'premium_price',
])

_PRICE_GOOD_TOKENS = frozenset([
    'great_price', 'good_price', 'low_price', 'excellent_price',
    'best_price', 'cheap_price', 'affordable_price', 'awesome_price',
    'well_priced', 'amazing_price', 'discounted_price', 'fantastic_price',
    'priced_well', 'bargain_price', 'perfect_price', 'super_price',
    'great_prices', 'unbeatable_price', 'inexpensive_price', 'lowest_price',
    'incredible_price', 'terrific_price', 'wonderful_price', 'great_value',
    'good_value', 'excellent_value', 'best_value', 'better_value', 'fantastic_value',
    'amazing_value', 'outstanding_value', 'incredible_value', 'awesome_value',
])


def price_text_score(text):
    '''
    Scores a tokenized review by counting price/value-related tokens.

    Args:
        text: iterable of string tokens (e.g. the bigrams extracted from one review)

    Returns:
        score: number of positive price tokens minus number of negative price tokens;
               every occurrence counts, and an empty input scores 0
    '''
    score = 0
    for token in text:
        # the two sets are disjoint, so a token can move the score in only one direction
        if token in _PRICE_GOOD_TOKENS:
            score += 1
        elif token in _PRICE_BAD_TOKENS:
            score -= 1

    return score

In [7]:
def transform_quality_score(x):
    '''
    Maps a raw quality keyword score onto the 0-5 rating scale.

    Scores above 2 become 5; 2 -> 4.0, 1 -> 3.5, 0 -> 2.5, -1 -> 1; anything else becomes 0.
    '''
    if x > 2:
        return 5

    # (raw score, rating) pairs checked in order with ==
    rating_table = ((2, 4.0), (1, 3.5), (0, 2.5), (-1, 1))
    for raw_score, rating in rating_table:
        if x == raw_score:
            return rating

    return 0

In [8]:
def transform_price_score(x):
    '''
    Maps a raw price keyword score onto the 0-5 rating scale.

    Scores above 2 become 5; 2 -> 4.5, 1 -> 3.5, 0 -> 2.5, -1 -> 1; anything else becomes 0.
    '''
    if x > 2:
        return 5

    # (raw score, rating) pairs checked in order with ==
    rating_table = ((2, 4.5), (1, 3.5), (0, 2.5), (-1, 1))
    for raw_score, rating in rating_table:
        if x == raw_score:
            return rating

    return 0

In [9]:
def _extract_review_bigrams(review_text, reviews_bigram_model, stop_words):
    '''Returns the tokens containing '_' (bigrams) that the bigram model produces for one review text.'''
    tokens = [
        token for token in word_tokenize(review_text.lower())
        if not token.isnumeric()       # remove numbers
        and len(token) > 1             # remove single letter tokens
        and token not in stop_words    # remove stop words
    ]
    return [token for token in reviews_bigram_model[tokens] if '_' in token]


def create_master_index(reviews_file_path, product_dict, reviews_bigram_model, chunk_size, total_reviews=6739590):
    '''
    Constructs a dictionary that maps a running review index to the attributes of that review,
    including quality and value scores derived from the bigrams found in the review text.

    Only reviews whose asin exists in product_dict are indexed; the running index counts those
    reviews only.

    Args:
        reviews_file_path: file path to the line-delimited Amazon review json file
        product_dict: dictionary that, at the minimum, maps asin's to product categories.
        reviews_bigram_model: bigram model that is indexed with the cleaned token list of each review
        chunk_size: size of the chunks to iterate through
        total_reviews: number of reviews in the file, only used to size the progress bar

    Returns:
        master_index: dictionary that maps review index to reviewerID, asin, category, overall, quality_score, and value_score
    '''

    reviews_iter_df = pd.read_json(reviews_file_path, lines=True, chunksize=chunk_size)

    master_index = dict() # links the review index to asin, reviewerID, category and scores
    idx_tracker = 0 # since we're chunking, the index will reset to 0 each time we do iterrows(), so we track

    # build the stop word set once; it does not change between reviews
    stop_words = set(stopwords.words('english'))

    for chunk in tqdm(reviews_iter_df, total=int(math.ceil(total_reviews/chunk_size))):
        for _, row in chunk.iterrows():
            asin = row['asin']

            # interestingly, not all asin's in the reviews exist in the products
            # if the asin doesn't exist in products, ignore it
            if asin not in product_dict:
                continue

            review_text = row['reviewText']

            if not pd.isna(review_text):
                bigram_list = _extract_review_bigrams(review_text, reviews_bigram_model, stop_words)
                quality_score = transform_quality_score(quality_text_score(bigram_list))
                value_score = transform_price_score(price_text_score(bigram_list))
            else:
                # NOTE(review): a missing review text scores 0, while a text without any
                # matching bigram scores 2.5 — confirm that this difference is intended
                quality_score = 0
                value_score = 0

            # build the attributes of each review; idx_tracker only ever increases,
            # so every review gets a fresh key
            master_index[idx_tracker] = {
                'reviewerID' : row['reviewerID'],
                'asin' : asin,
                'category' : product_dict[asin]['category'],
                'overall' : row['overall'],
                'quality_score' : quality_score,
                'value_score' : value_score,
            }

            # increment idx_tracker
            idx_tracker += 1

    return master_index

In [10]:
# takes 72 minutes
# Build the per-review master index (reviewer, asin, category, stars, quality and value scores)
# from the line-delimited review file, reading it in chunks of 100,000 rows.
master_index = create_master_index('../data/recommend/reviews_with_sentiment_scores_line_delimited.json', product_dict=product_dict, reviews_bigram_model=reviews_bigram_model, chunk_size=100000)

  0%|          | 0/68 [00:00<?, ?it/s]

In [11]:
# Persist the master index so the 72-minute build does not have to be repeated.
# NOTE(review): this writes 'user_keyword_master_index.joblib', while the loading cell near the top
# reads 'v2/review_keyword_master_index.joblib' — confirm which file name is the current one.
joblib.dump(master_index, '../data/recommend/v2/user_keyword_master_index.joblib')

['../data/recommend/v2/user_keyword_master_index.joblib']

## Create user-keyword dictionary

Each (review_index, keyword_index) pair is mapped to the number of times that keyword showed up in the review. The function below writes these counts, together with each review's LSI components, directly into a sparse matrix.

In [13]:
def create_reviews_keyword_lsi_and_keywords(reviews_file_path, product_dict, keyword_list, lsi_dictionary, lsi_model, lsi_bigram_model, num_rows, num_cols, chunk_size, total_reviews=6739590):
    '''
    Constructs a sparse matrix with one row per indexed review where:
    1. The first len(keyword_list) columns hold how often each keyword was among the top 10 Rake phrases of the review.
    2. The columns after that hold the review text converted to LSI space (one column per LSI topic).

    Only reviews whose asin exists in product_dict get a row; rows are numbered in file order.

    Args:
        reviews_file_path: file path to the line-delimited Amazon review json file
        product_dict: dictionary whose keys are the asin's of known products
        keyword_list: list of the top n keywords; the position of a keyword is its column index
        lsi_dictionary: pre-trained LSI dictionary used to convert the reviewText into bag-of-words
        lsi_model: pre-trained LSI model to convert the BOW into LSI space
        lsi_bigram_model: bigram model applied to the review tokens before the LSI transformation
        num_rows: number of rows of the resulting matrix (number of indexed reviews)
        num_cols: number of columns of the resulting matrix (keywords plus LSI topics)
        chunk_size: size of the chunks to iterate through
        total_reviews: number of reviews in the file, only used to size the progress bar

    Returns:
        sparse_mat: csr_matrix with shape (num_rows, num_cols)
    '''

    reviews_iter_df = pd.read_json(reviews_file_path, lines=True, chunksize=chunk_size)

    # keyword -> column index; setdefault keeps the first position of a repeated keyword,
    # which is what keyword_list.index() returned, without scanning the list for every phrase
    keyword_to_col = dict()
    for col_index, keyword in enumerate(keyword_list):
        keyword_to_col.setdefault(keyword, col_index)

    lsi_offset = len(keyword_list) # LSI topic 0 is stored right after the last keyword column
    idx_tracker = 0 # since we're chunking, the index will reset to 0 each time we do iterrows(), so we track

    # initialize Rake
    r = Rake()

    # initialize sparse matrix to fill out
    sparse_mat = sp.dok_matrix((num_rows, num_cols))

    for chunk in tqdm(reviews_iter_df, total=int(math.ceil(total_reviews/chunk_size))):
        for _, row in chunk.iterrows():

            # interestingly, not all asin's in the reviews exist in the products
            # if the asin doesn't exist in products, ignore it
            if row['asin'] not in product_dict:
                continue

            reviewer_id = row['reviewerID']
            review_text = row['reviewText']

            try:
                # only process reviews that are not empty
                if not pd.isna(review_text):
                    r.extract_keywords_from_text(review_text)

                    # build the keyword frequencies per (review-keyword) cell
                    for keyword in r.get_ranked_phrases()[:10]:
                        keyword_index = keyword_to_col.get(keyword)
                        if keyword_index is not None:
                            sparse_mat[idx_tracker, keyword_index] += 1

                    # use LSI to transform the reviewText into LSI space
                    bigrammed_review_text = convert_reviews_to_tokens(review_text, lsi_bigram_model=lsi_bigram_model, lsi_model=lsi_model, lsi_dictionary=lsi_dictionary)
                    vec_bow = lsi_dictionary.doc2bow(bigrammed_review_text)
                    vec_lsi = lsi_model[vec_bow]  # convert the query to LSI space

                    # vec_lsi holds (topic_index, weight) pairs with topic_index starting at 0
                    for topic_index, weight in vec_lsi:
                        sparse_mat[idx_tracker, lsi_offset + topic_index] = weight

            except Exception as err:
                # best effort: report the review and keep going; unlike a bare except this
                # still lets KeyboardInterrupt stop the multi-hour loop
                print(reviewer_id, '*****', review_text, pd.isna(review_text), '*****', repr(err))

            # increment idx_tracker
            idx_tracker += 1

    return sparse_mat.tocsr()

In [14]:
# print(len(user_keyword_master_index))

6732848


In [15]:
# takes 120 minutes
# review_keyword_sparse_mat_with_lsi_keywords = create_reviews_keyword_lsi_and_keywords('../data/recommend/reviews_with_sentiment_scores_line_delimited.json', product_dict=product_dict, keyword_list=keyword_list, lsi_dictionary=lsi_dictionary, lsi_model=lsi_model, lsi_bigram_model=lsi_bigram_model, num_rows=len(user_keyword_master_index), num_cols=1010, chunk_size=100000)

  0%|          | 0/68 [00:00<?, ?it/s]

In [16]:
# review_keyword_sparse_mat_with_lsi_keywords

<6732848x1010 sparse matrix of type '<class 'numpy.float64'>'
	with 80562256 stored elements in Compressed Sparse Row format>

In [17]:
# joblib.dump(review_keyword_sparse_mat_with_lsi_keywords, '../data/recommend/v2/review_keyword_sparse_mat_with_lsi_and_keywords.joblib')

['../data/recommend/v2/user_keyword_sparse_mat_with_lsi_and_keywords.joblib']

#### Split LSI and keyword sparse matrix into LSI only and keyword only

In [3]:
# review_keyword_sparse_mat_with_lsi_keywords = joblib.load('../data/recommend/v2/review_keyword_sparse_mat_with_lsi_keywords.joblib')

In [8]:
# review_keyword_sparse_mat_with_keywords_only = review_keyword_sparse_mat_with_lsi_keywords[:, :1000]

In [9]:
# review_keyword_sparse_mat_with_lsi_only = review_keyword_sparse_mat_with_lsi_keywords[:, 1000:]

In [10]:
# joblib.dump(review_keyword_sparse_mat_with_keywords_only, '../data/recommend/v2/review_keyword_sparse_mat_with_keywords_only.joblib')
# joblib.dump(review_keyword_sparse_mat_with_lsi_only, '../data/recommend/v2/review_keyword_sparse_mat_with_lsi_only.joblib')

['../data/recommend/v2/user_keyword_sparse_mat_with_lsi_only.joblib']

### Create sparse matrices (DEPRECATED)

In [24]:
def create_sparse_matrix(input_dict, num_rows, num_cols):
    '''
    (LEGACY)
    Builds a csr_matrix from a dictionary of cell values.
    The values are first written into a dok_matrix, which is converted to csr_matrix at the end, because
    incrementally replacing values in a csr_matrix is computationally expensive.

    Args:
        input_dict: dictionary of format {(row_index, col_index) : keyword_frequency_value}, e.g. the output of create_sparse_dict
        num_rows: the number of rows that the resulting matrix should have
        num_cols: the number of columns that the resulting matrix should have

    Returns:
        sparse_mat: csr_matrix with shape (num_rows, num_cols)
    '''
    dok = sp.dok_matrix((num_rows, num_cols))

    for position, frequency in tqdm(input_dict.items()):
        row_idx, col_idx = position[0], position[1]
        dok[row_idx, col_idx] = frequency

    return dok.tocsr()

In [11]:
# review_keyword_sparse_mat = create_sparse_matrix(review_keyword_dict, len(user_keyword_master_index), 1010)

In [28]:
# joblib.dump(review_keyword_sparse_mat, '../data/recommend/user_keyword_sparse_mat_keyword_lsi.joblib')

['../data/recommend/user_keyword_sparse_mat_lsi_only.joblib']

In [25]:
# review_keyword_sparse_mat = create_sparse_matrix(user_keyword_dict, len(user_keyword_master_index), 10)

  0%|          | 0/65586771 [00:00<?, ?it/s]

## Train bigram model on reviews reviewText column

In [13]:
def train_bigram_model(file_path, chunk_size, total_reviews=6739590):
    '''
    Trains a gensim Phrases (bigram) model on the reviewText column of a review file.

    Every review is lower-cased and tokenized; purely numeric tokens, single character tokens and
    English stop words are dropped before training. A missing review text becomes an empty document.

    Args:
        file_path: file path to the line-delimited review json file
        chunk_size: size of the chunks to iterate through
        total_reviews: number of reviews in the file, only used to size the progress bar

    Returns:
        bigram: Phrases model trained with min_count=1 on the cleaned token lists
    '''
    stop_words = set(stopwords.words('english'))
    docs = list()
    iter_reviews = pd.read_json(file_path, lines=True, chunksize=chunk_size)

    for chunk in tqdm(iter_reviews, total=int(math.ceil(total_reviews/chunk_size))):
        # missing texts become '' (no tokens) instead of relying on a numeric placeholder
        # that the number filter has to remove again
        review_texts = chunk['reviewText'].astype('string').fillna('')

        # clean each review in a single pass rather than keeping one full copy of the
        # corpus per cleaning step
        for review in review_texts:
            docs.append([
                token for token in word_tokenize(review.lower())
                if not token.isnumeric()       # remove numbers, but not words that contain numbers
                and len(token) > 1             # remove words that are only one character
                and token not in stop_words    # remove stopwords
            ])

    # Compute bigrams.
    bigram = Phrases(docs, min_count=1)

    return bigram

In [14]:
# takes 100 minutes to train
# reviews_bigram_model = train_bigram_model('../data/Electronics_5.json', chunk_size=100000)

## Create reviewer-product value matrix

Create four of these matrices: one each for top value, top seller, top quality, and top ratings.
Each matrix has one row per reviewer and one column per product.

Note that even though there are 6.7 million reviews, there are only around 700,000 unique reviewers.

In [6]:
def create_user_product_mat(review_keyword_master_index, product_index, reviewer_index, mat_val = 'top_value'):
    '''
    Creates a sparse matrix where rows are reviewers and columns are products. The values are determined by input mat_val.

    Each (reviewer, product) cell is written at most once into the matrix:
    - 'top_value', 'top_ratings', 'top_quality': the cell holds the score of the last review seen for that pair
      (duplicate coordinates would otherwise be summed when the COO matrix is converted to CSR)
    - 'top_seller': the cell holds the number of reviews seen for that pair

    Args:
        review_keyword_master_index: dictionary that maps review index to review attributes
            (must contain 'asin' and 'reviewerID', plus 'value_score', 'overall' or 'quality_score' depending on mat_val)
        product_index: dictionary with {asin: column index} format
        reviewer_index: dictionary with {reviewerID: row index} format
        mat_val: determines the type of value in the matrix, must choose among ('top_value', 'top_seller', 'top_quality', 'top_ratings')

    Returns:
        user_prod_mat: csr_matrix with shape (len(reviewer_index), len(product_index)). If mat_val is not
                       recognised, a message is printed and the matrix has no stored values.
    '''
    # review attribute that supplies the cell value per mode; None means "count the reviews"
    score_key_by_mode = {
        'top_value' : 'value_score',
        'top_ratings' : 'overall',
        'top_quality' : 'quality_score',
        'top_seller' : None,
    }

    # initialize variables
    data = list()
    row = list()
    col = list()
    cell_position = dict() # format is {(row, col) : data_index}

    if mat_val in score_key_by_mode:
        score_key = score_key_by_mode[mat_val]

        for review_id, review_attr in tqdm(review_keyword_master_index.items()):
            asin = review_attr['asin']
            reviewer_id = review_attr['reviewerID']
            if asin not in product_index:
                continue

            try:
                val = 1 if score_key is None else review_attr[score_key]
                matching_product_index = product_index[asin]
                matching_reviewer_index = reviewer_index[reviewer_id]
            except KeyError as missing_key:
                # report only values that are known to exist at this point
                print('skipping review:', review_id, ' missing key:', missing_key, ' asin:', asin)
                continue

            cell = (matching_reviewer_index, matching_product_index)
            position = cell_position.get(cell)

            if position is None:
                cell_position[cell] = len(data)
                data.append(val)
                row.append(matching_reviewer_index)
                col.append(matching_product_index)
            elif score_key is None:
                data[position] += 1 # another review of the same product by the same reviewer
            else:
                data[position] = val # last review wins instead of summing the scores

    else:
        print('please pass one of the following to mat_val: {\'top_value\', \'top_ratings\', \'top_seller\', \'top_quality\'')

    user_prod_mat = sp.coo_matrix((data, (row, col)), shape=(len(reviewer_index), len(product_index)))

    return user_prod_mat.tocsr()

In [7]:
# Reviewer x product matrix filled from each review's 'value_score'.
user_product_value_mat = create_user_product_mat(review_keyword_master_index=review_keyword_master_index, product_index=product_index, reviewer_index=reviewer_index, mat_val='top_value')

  0%|          | 0/6732848 [00:00<?, ?it/s]

In [8]:
# Reviewer x product matrix filled from each review's 'overall' star rating.
user_product_ratings_mat = create_user_product_mat(review_keyword_master_index=review_keyword_master_index, product_index=product_index, reviewer_index=reviewer_index, mat_val='top_ratings')

  0%|          | 0/6732848 [00:00<?, ?it/s]

In [9]:
# Reviewer x product matrix holding the number of reviews per (reviewer, product) pair.
user_product_seller_mat = create_user_product_mat(review_keyword_master_index=review_keyword_master_index, product_index=product_index, reviewer_index=reviewer_index, mat_val='top_seller')

  0%|          | 0/6732848 [00:00<?, ?it/s]

In [10]:
# Reviewer x product matrix filled from each review's 'quality_score'.
user_product_quality_mat = create_user_product_mat(review_keyword_master_index=review_keyword_master_index, product_index=product_index, reviewer_index=reviewer_index, mat_val='top_quality')

  0%|          | 0/6732848 [00:00<?, ?it/s]

In [11]:
# Display the matrix summary (shape, dtype and number of stored elements) as a sanity check.
user_product_value_mat

<728719x748122 sparse matrix of type '<class 'numpy.float64'>'
	with 6481375 stored elements in Compressed Sparse Row format>

In [12]:
# Persist the four reviewer x product matrices under ../data/recommend/v2.
joblib.dump(user_product_value_mat, '../data/recommend/v2/user_product_value_mat.joblib')
joblib.dump(user_product_ratings_mat, '../data/recommend/v2/user_product_ratings_mat.joblib')
joblib.dump(user_product_seller_mat, '../data/recommend/v2/user_product_seller_mat.joblib')
joblib.dump(user_product_quality_mat, '../data/recommend/v2/user_product_quality_mat.joblib')

['../data/recommend/v2/user_product_quality_mat.joblib']