# Preparation #

In [3]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import re
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics, preprocessing
from tensorflow.keras import models, layers, utils
dfs=pd.read_csv("preprocessed.csv")
dfs=dfs.iloc[:,3:]
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Keep copy of original data
dfor=dfs.copy()

## Drop duplicates ##

In [4]:
dfs=dfs[~dfs[['reviewerID', 'asin']].duplicated()]

## Recode 'bottomcat' ##

In [5]:
counts=dfs.groupby('bottomcat').count()['overall']
counts=counts[counts>40]
dfs['bottomcat_other']=[i if i in counts else 'other' for i in dfs['bottomcat']]

## Discretize rating ##

In [6]:
dfs['disc_rating']=(dfs['overall']>4).astype(int)

## Use only users that have given more than 4 reviews ##

In [7]:
mask=dfs.groupby('reviewerID').count()['asin']
mask=mask[mask>4]
dfs=dfs[dfs['reviewerID'].isin(mask.index)]

## Recode nan-votes to zero ##

In [8]:
dfs['vote']=dfs['vote'].fillna(0)

## Re-enumerate user and product ids ##

In [9]:
dfs['reviewerID'], list_userids=pd.factorize(dfs['reviewerID'])
dfs['asin'], list_itemids=pd.factorize(dfs['asin'])

## Make pivot-table user X product ratings DF ##

In [10]:
df_cf=dfs.pivot_table(index="reviewerID", columns="asin", values="overall")

## Recode verified ##

In [11]:
dfs.loc[dfs['verified']=='SAND', 'verified']=1
dfs.loc[dfs['verified']=='FALSK', 'verified']=0

## Merge unixReviewTime ##

In [12]:
df4=pd.read_csv('merged_df.csv')
dfs=dfs.join(df4['unixReviewTime'])

## Text preprocessing ##

In [13]:
from sklearn.feature_extraction import _stop_words
import string
import nltk
nltk.download('wordnet')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
Stop_Words= _stop_words.ENGLISH_STOP_WORDS

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Frede\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Frede\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
def text_processing(text):
    # remove punctuation 
    text = "".join([c for c in text 
                    if c not in string.punctuation])
    # lowercase
    text = "".join([c.lower() for c in text])
    # remove stopwords
    text = " ".join([w for w in text.split() 
                     if w not in Stop_Words])
    # stemming / lematizing (optional)
    text = " ".join([lemmatizer.lemmatize(w) for w in text.split()])
    return text

In [15]:
dfs['reviewText_cleaned'] = dfs['reviewText'].apply(text_processing)

## Sentiment analysis ##

In [16]:
sid_obj = SentimentIntensityAnalyzer()

dfs['compound'] = [sid_obj.polarity_scores(x)['compound'] for x in dfs['reviewText_cleaned']]
dfs['neg'] = [sid_obj.polarity_scores(x)['neg'] for x in dfs['reviewText_cleaned']]
dfs['neu'] = [sid_obj.polarity_scores(x)['neu'] for x in dfs['reviewText_cleaned']]
dfs['pos'] = [sid_obj.polarity_scores(x)['pos'] for x in dfs['reviewText_cleaned']]

## Train/test split ##

In [17]:
# Split data, keep 20% for testing.
split = int(0.8*df_cf.shape[1])
df_train = df_cf.iloc[:, :split-1]
df_test = df_cf.iloc[:, split:]

## Stacked ##

In [18]:
# Convert to stacked format
train = df_train.stack(dropna=True).reset_index().rename(columns={0:"overall"})
train.columns=['user', 'product', 'overall']
test = df_test.stack(dropna=True).reset_index().rename(columns={0:"overall"})
test.columns=['user', 'product', 'overall']

## Get dummies ##

In [19]:
dummy_df=pd.get_dummies(dfs, columns=['bottomcat_other'], drop_first=True)
dummy_df['product']=dummy_df['asin']
dummy_df['user']=dummy_df['reviewerID']
merger_df=dummy_df.drop(columns=['overall', 'asin', 'reviewerID', 'summary', 'reviewTime', 'style', 'bottomcat'
                                 , 'main_cat', 'day', 'month', 'year', 'season'])

In [20]:
trainf=train.merge(merger_df, how = 'inner', on = ['product', 'user'])
testf=test.merge(merger_df, how = 'inner', on = ['product', 'user'])

# Standardize continuous features

In [23]:
np.zeros((10,2)).shape

(10, 2)

In [24]:
continuous = ['vote', 'price', 'review_length']
#to_be_standardized
means_sds_for_standardizing = np.zeros((len(continuous),2))
a=0
for i in continuous:
    meantr=np.nanmean(trainf.loc[:, i])
    stdtr=np.nanstd(trainf.loc[:, i])
    means_sds_for_standardizing[a,0]=meantr
    means_sds_for_standardizing[a,1]=stdtr
    a+=1
    trainf.loc[:,i]=(trainf.loc[:,i]-meantr)/stdtr
    testf.loc[:,i]=(testf.loc[:,i]-meantr)/stdtr

# Impute missing values for price #

In [25]:
trainf['price']=trainf['price'].fillna(np.mean(trainf['price']))
testf['price']=testf['price'].fillna(np.mean(trainf['price']))

In [26]:
df_cf

asin,0,1,2,3,4,5,6,7,8,9,...,2294,2295,2296,2297,2298,2299,2300,2301,2302,2303
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5.0,,,,,,,,,,...,,,,,,,,,,
1,,5.0,,,,,,,,,...,,,,,,,,,,
2,,3.0,,,,,,,,,...,,,,,,,,,,
3,,,5.0,,,,,,,,...,,,,,,,,,,
4,,,5.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1400,,,,,,,,,,,...,,,,,,,,,,
1401,,,,,,,,,,,...,,,,,,,,,,
1402,,,,,,,,,,,...,,,,,,,,,,
1403,,,,,,,,,,,...,,,,,,,,,,


In [27]:
trainf['product'].nunique()

1842