# Sentiment-based product recommendation system for Ebuss
### Author - Kumaraguru Muthuraj

## Install the required libraries

In [1]:
!pip install textblob
!pip install wordcloud



In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame 
import nltk

In [3]:
import re

In [4]:
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
#from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

from sklearn import neighbors
import matplotlib.pyplot as plt

In [5]:
from string import punctuation
from sklearn import svm
from textblob import Word
from wordcloud import WordCloud
import textblob
from textblob import TextBlob

In [6]:
from imblearn.over_sampling import SMOTE

## Task 1: Data Cleaning and Pre-Processing

In [54]:
reviews_df = pd.read_csv("sample30.csv")

In [55]:
reviews_df = reviews_df.drop(['id', 'brand', 'categories', 'manufacturer', 
                              'reviews_date', 'reviews_didPurchase', 'reviews_doRecommend', 
                              'reviews_title', 'reviews_userCity', 'reviews_userProvince'], axis=1)

## Task 2: Text Processing

In [10]:
reviews_df['reviews_tokenized'] = reviews_df['reviews_text'].apply(lambda x: re.sub(r"http\S+", "", x))

def expand_contractions(s):
    s = re.sub(r"won't", "will not", s)
    s = re.sub(r"would't", "would not", s)
    s = re.sub(r"could't", "could not", s)
    s = re.sub(r"\'d", " would", s)
    s = re.sub(r"can\'t", "can not", s)
    s = re.sub(r"n\'t", " not", s)
    s= re.sub(r"\'re", " are", s)
    s = re.sub(r"\'s", " is", s)
    s = re.sub(r"\'ll", " will", s)
    s = re.sub(r"\'t", " not", s)
    s = re.sub(r"\'ve", " have", s)
    s = re.sub(r"\'m", " am", s)
    return s

reviews_df['reviews_tokenized'] =reviews_df['reviews_tokenized'].apply(lambda x:expand_contractions(x))

#Tokenize the review text to words
reg = re.compile('[^a-zA-Z]+')
def tokenizeReview(text):
    text = text.lower()
    text = reg.sub(' ', text).strip()
    return text

reviews_df['reviews_tokenized'] = reviews_df['reviews_tokenized'].apply(tokenizeReview)

reviews_df = reviews_df.drop(['reviews_text'], axis=1)
# Convert 'positive' to 1 and 'negative' to 0
def sentiment2Number(txt):
    if str(txt).lower() == 'positive':
        return 1
    else:
        return 0
    
X = reviews_df['reviews_tokenized']
y = reviews_df['user_sentiment'].apply(sentiment2Number)


0        i love this album. it is very good. more to th...
1        Good flavor. This review was collected as part...
2                                             Good flavor.
3        I read through the reviews on here before look...
4        My husband bought this gel for us. The gel cau...
                               ...                        
29995    I got this conditioner with Influenster to try...
29996    I love it , I received this for review purpose...
29997    First of all I love the smell of this product....
29998    I received this through Influenster and will n...
29999    I received this product complimentary from inf...
Name: reviews_tokenized, Length: 30000, dtype: object

## Task 3: Feature Extraction

In [14]:
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word', stop_words= 'english')
X_features = tfidf.fit_transform(X)

In [15]:
#Handle class imbalance
sm = SMOTE()
X_smote, y_smote = sm.fit_resample(X_features, y)

In [16]:
print(X_smote.shape)
print(y_smote.shape)

(53264, 1000)
(53264,)


### Core function of the project. Fit the tf-idf vectorized features into the text classification model.

In [17]:
def class_balanced_model_fit(X_smote, y_smote, ml_model):
    classification = ml_model.fit(X_smote, y_smote)
    classification_pred = classification.predict(X_smote)

    featureNames = tfidf.get_feature_names()
    coef = classification.coef_.tolist()[0]
    coeff_df = pd.DataFrame({'Word' : featureNames, 'Coefficient' : coef})
    coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
    print('')
    print("************ Top 10 positive features (variables) ************")
    print(coeff_df.head(20).to_string(index=False))
    print('')
    print("************ Top 10 negative features (variables) ************")        
    print(coeff_df.tail(20).to_string(index=False))
    
    return classification

## Task 4: Model Building - Logistic Regression.
### Use Logistic Regression as the classification model and evaluate.

In [18]:
LRClassModel = class_balanced_model_fit(X_smote, y_smote, LogisticRegression(solver='lbfgs', max_iter=500))


************ Top 10 positive features (variables) ************
         Word  Coefficient
        great    18.959889
         love    16.098151
         best    12.944142
         easy    11.889340
         good    11.067077
        clean    10.883783
        loved     9.196505
       better     8.891691
         nice     8.820986
    excellent     8.744004
      perfect     8.560755
      amazing     7.757945
      awesome     7.732272
     favorite     7.645077
      enjoyed     7.521499
    wonderful     6.755897
        handy     6.612516
         free     6.525096
 entertaining     6.095864
        fresh     6.018771

************ Top 10 negative features (variables) ************
          Word  Coefficient
 disappointing    -2.575614
       chicken    -2.730954
 unfortunately    -2.847498
         crazy    -2.862469
      resident    -3.043313
           sad    -3.154446
          base    -3.491000
         wrong    -4.314284
          cold    -4.326171
          hate    -4.7153

## Task 5: Build the user-based recommendation system.

### Creating dummy dataset
These dataset will be used for prediction 
- Dummy train will be used later for prediction of the movies which has not been rated by the user. To ignore the movies rated by the user, we will mark it as 0 during prediction. The movies not rated by user is marked as 1 for prediction in dummy train dataset. 

In [21]:
# Copy the dataset into dummy
dummy_reviews_df = reviews_df.copy()
dummy_reviews_df.head()

Unnamed: 0,name,reviews_rating,reviews_username,user_sentiment,reviews_tokenized
0,Pink Friday: Roman Reloaded Re-Up (w/dvd),5,joshua,Positive,i love this album it is very good more to the ...
1,Lundberg Organic Cinnamon Toast Rice Cakes,5,dorothy w,Positive,good flavor this review was collected as part ...
2,Lundberg Organic Cinnamon Toast Rice Cakes,5,dorothy w,Positive,good flavor
3,K-Y Love Sensuality Pleasure Gel,1,rebecca,Negative,i read through the reviews on here before look...
4,K-Y Love Sensuality Pleasure Gel,1,walker557,Negative,my husband bought this gel for us the gel caus...


In [22]:
print(dummy_reviews_df['reviews_rating'].value_counts())
print(dummy_reviews_df.shape)

5    20831
4     6020
1     1384
3     1345
2      420
Name: reviews_rating, dtype: int64
(30000, 5)


In [23]:
# The movies not rated by user is marked as 1 for prediction. 
dummy_reviews_df['reviews_rating'] = dummy_reviews_df['reviews_rating'].apply(lambda x: 0 if x>=1 else 1)

In [24]:
dummy_reviews_df.reviews_rating.unique()

array([0], dtype=int64)

In [25]:
dummy_reviews_df.head()

Unnamed: 0,name,reviews_rating,reviews_username,user_sentiment,reviews_tokenized
0,Pink Friday: Roman Reloaded Re-Up (w/dvd),0,joshua,Positive,i love this album it is very good more to the ...
1,Lundberg Organic Cinnamon Toast Rice Cakes,0,dorothy w,Positive,good flavor this review was collected as part ...
2,Lundberg Organic Cinnamon Toast Rice Cakes,0,dorothy w,Positive,good flavor
3,K-Y Love Sensuality Pleasure Gel,0,rebecca,Negative,i read through the reviews on here before look...
4,K-Y Love Sensuality Pleasure Gel,0,walker557,Negative,my husband bought this gel for us the gel caus...


In [26]:
# Convert the dummy train dataset into matrix format.
dummy_reviews_df = dummy_reviews_df.pivot_table(index='reviews_username', columns='name', values='reviews_rating').fillna(1)

In [27]:
dummy_reviews_df.shape

(24914, 271)

#### Checking for a given user if the reviewed items are set to 0 and rest to 1. All good so far.

In [28]:
print(dummy_reviews_df.loc['abby'].value_counts())
print(dummy_reviews_df.loc['abismomy'].value_counts())
print(dummy_reviews_df.loc['ac94'].value_counts())


1.0    268
0.0      3
Name: abby, dtype: int64
1.0    269
0.0      2
Name: abismomy, dtype: int64
1.0    269
0.0      2
Name: ac94, dtype: int64


In [29]:
dummy_reviews_df

name,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. Fire File Chest,100:Complete First Season (blu-Ray),2017-2018 Brownline174 Duraflex 14-Month Planner 8 1/2 X 11 Black,"2x Ultra Era with Oxi Booster, 50fl oz","42 Dual Drop Leaf Table with 2 Madrid Chairs""",4C Grated Parmesan Cheese 100% Natural 8oz Shaker,5302050 15/16 FCT/HOSE ADAPTOR,Africa's Best No-Lye Dual Conditioning Relaxer System Super,Alberto VO5 Salon Series Smooth Plus Sleek Shampoo,Alex Cross (dvdvideo),...,Walkers Stem Ginger Shortbread,"Wallmount Server Cabinet (450mm, 9 RU)","Way Basics 3-Shelf Eco Narrow Bookcase Storage Shelf, Espresso - Formaldehyde Free - Lifetime Guarantee","WeatherTech 40647 14-15 Outlander Cargo Liners Behind 2nd Row, Black",Wedding Wishes Wedding Guest Book,Weleda Everon Lip Balm,Wilton Black Dots Standard Baking Cups,Windex Original Glass Cleaner Refill 67.6oz (2 Liter),Yes To Carrots Nourishing Body Wash,Yes To Grapefruit Rejuvenating Body Wash
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
00sab00,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
01impala,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
02dakota,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
02deuce,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zxcsdfd,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
zxjki,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
zyiah4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
zzdiane,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### User Similarity Matrix

### Using adjusted Cosine 

### Here, we are not removing the NaN values and calculating the mean only for the items rated by the user

In [30]:
# Create a user-item review matrix.
df_pivot = reviews_df.pivot_table(index='reviews_username', columns='name', values='reviews_rating')

In [31]:
df_pivot.head()

name,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. Fire File Chest,100:Complete First Season (blu-Ray),2017-2018 Brownline174 Duraflex 14-Month Planner 8 1/2 X 11 Black,"2x Ultra Era with Oxi Booster, 50fl oz","42 Dual Drop Leaf Table with 2 Madrid Chairs""",4C Grated Parmesan Cheese 100% Natural 8oz Shaker,5302050 15/16 FCT/HOSE ADAPTOR,Africa's Best No-Lye Dual Conditioning Relaxer System Super,Alberto VO5 Salon Series Smooth Plus Sleek Shampoo,Alex Cross (dvdvideo),...,Walkers Stem Ginger Shortbread,"Wallmount Server Cabinet (450mm, 9 RU)","Way Basics 3-Shelf Eco Narrow Bookcase Storage Shelf, Espresso - Formaldehyde Free - Lifetime Guarantee","WeatherTech 40647 14-15 Outlander Cargo Liners Behind 2nd Row, Black",Wedding Wishes Wedding Guest Book,Weleda Everon Lip Balm,Wilton Black Dots Standard Baking Cups,Windex Original Glass Cleaner Refill 67.6oz (2 Liter),Yes To Carrots Nourishing Body Wash,Yes To Grapefruit Rejuvenating Body Wash
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,,,,,,,,,,,...,,,,,,,,,,
00sab00,,,,,,,,,,,...,,,,,,,,,,
01impala,,,,,,,,,,,...,,,,,,,,,,
02dakota,,,,,,,,,,,...,,,,,,,,,,
02deuce,,,,,,,,,,,...,,,,,,,,,,


### Normalising the rating of the item for each user around 0 mean

In [32]:
#This does the mean for each row as the axis is 1
mean = np.nanmean(df_pivot, axis=1)

In [33]:
print(mean)
print(len(mean))

[4. 2. 3. ... 5. 5. 4.]
24914


In [34]:
df_subtracted = (df_pivot.T - mean).T

In [35]:
df_subtracted.head()

name,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. Fire File Chest,100:Complete First Season (blu-Ray),2017-2018 Brownline174 Duraflex 14-Month Planner 8 1/2 X 11 Black,"2x Ultra Era with Oxi Booster, 50fl oz","42 Dual Drop Leaf Table with 2 Madrid Chairs""",4C Grated Parmesan Cheese 100% Natural 8oz Shaker,5302050 15/16 FCT/HOSE ADAPTOR,Africa's Best No-Lye Dual Conditioning Relaxer System Super,Alberto VO5 Salon Series Smooth Plus Sleek Shampoo,Alex Cross (dvdvideo),...,Walkers Stem Ginger Shortbread,"Wallmount Server Cabinet (450mm, 9 RU)","Way Basics 3-Shelf Eco Narrow Bookcase Storage Shelf, Espresso - Formaldehyde Free - Lifetime Guarantee","WeatherTech 40647 14-15 Outlander Cargo Liners Behind 2nd Row, Black",Wedding Wishes Wedding Guest Book,Weleda Everon Lip Balm,Wilton Black Dots Standard Baking Cups,Windex Original Glass Cleaner Refill 67.6oz (2 Liter),Yes To Carrots Nourishing Body Wash,Yes To Grapefruit Rejuvenating Body Wash
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,,,,,,,,,,,...,,,,,,,,,,
00sab00,,,,,,,,,,,...,,,,,,,,,,
01impala,,,,,,,,,,,...,,,,,,,,,,
02dakota,,,,,,,,,,,...,,,,,,,,,,
02deuce,,,,,,,,,,,...,,,,,,,,,,


### Finding cosine similarity

In [36]:
from sklearn.metrics.pairwise import pairwise_distances

In [37]:
# Creating the User Similarity Matrix using pairwise_distance function.
user_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Prediction - User User

Doing the prediction for the users which are positively related with other users, and not the users which are negatively related as we are interested in the users which are more similar to the current users. So, ignoring the correlation for values less than 0. 

In [38]:
user_correlation[user_correlation<0]=0
user_correlation

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Rating predicted by the user (for items rated as well as not rated) is the weighted sum of correlation with the item rating (as present in the rating dataset). 

In [39]:
user_predicted_ratings = np.dot(user_correlation, df_pivot.fillna(0))
user_predicted_ratings

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 6.65475678, 0.        , ..., 5.343701  , 0.42640143,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

Since we are interested only in the items not rated by the user, we will ignore the items rated by the user by making it zero. 

In [40]:
user_final_rating = np.multiply(user_predicted_ratings,dummy_reviews_df)
user_final_rating.head()

name,0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. Fire File Chest,100:Complete First Season (blu-Ray),2017-2018 Brownline174 Duraflex 14-Month Planner 8 1/2 X 11 Black,"2x Ultra Era with Oxi Booster, 50fl oz","42 Dual Drop Leaf Table with 2 Madrid Chairs""",4C Grated Parmesan Cheese 100% Natural 8oz Shaker,5302050 15/16 FCT/HOSE ADAPTOR,Africa's Best No-Lye Dual Conditioning Relaxer System Super,Alberto VO5 Salon Series Smooth Plus Sleek Shampoo,Alex Cross (dvdvideo),...,Walkers Stem Ginger Shortbread,"Wallmount Server Cabinet (450mm, 9 RU)","Way Basics 3-Shelf Eco Narrow Bookcase Storage Shelf, Espresso - Formaldehyde Free - Lifetime Guarantee","WeatherTech 40647 14-15 Outlander Cargo Liners Behind 2nd Row, Black",Wedding Wishes Wedding Guest Book,Weleda Everon Lip Balm,Wilton Black Dots Standard Baking Cups,Windex Original Glass Cleaner Refill 67.6oz (2 Liter),Yes To Carrots Nourishing Body Wash,Yes To Grapefruit Rejuvenating Body Wash
reviews_username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00dog3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00sab00,0.0,6.654757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.128914,...,0.0,0.0,0.0,1.218834,0.0,0.0,0.0,5.343701,0.426401,0.0
01impala,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
02dakota,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
02deuce,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Finding the top 20 recommendations for the *user*

In [41]:
#Take the user name as input.
user_input = "abby"

In [42]:
top20UserBased = user_final_rating.loc[user_input].sort_values(ascending=False)[0:20]
top20UserBased

name
Godzilla 3d Includes Digital Copy Ultraviolet 3d/2d Blu-Ray/dvd                                  6.588478
Tostitos Bite Size Tortilla Chips                                                                6.335033
Planes: Fire Rescue (2 Discs) (includes Digital Copy) (blu-Ray/dvd)                              5.757683
Hoover174 Platinum Collection153 Lightweight Bagged Upright Vacuum With Canister - Uh30010com    4.602982
Clorox Disinfecting Bathroom Cleaner                                                             4.180387
Jason Aldean - They Don't Know                                                                   3.965442
Windex Original Glass Cleaner Refill 67.6oz (2 Liter)                                            3.535534
Chester's Cheese Flavored Puffcorn Snacks                                                        3.535534
Clorox Disinfecting Wipes Value Pack Scented 150 Ct Total                                        3.448282
The Resident Evil Collection 5 Discs (blu

### Finding the top 20 recommendation for the *user*

In [43]:
# Take the user ID as input
user_input = "abby"
print(user_input)

abby


## Task 6: Recommendation of Top 20 Products to a specified user. 
### Since user-based collaborative system has a lesser RMSE, we go with that. 

In [44]:
top20UserBased

name
Godzilla 3d Includes Digital Copy Ultraviolet 3d/2d Blu-Ray/dvd                                  6.588478
Tostitos Bite Size Tortilla Chips                                                                6.335033
Planes: Fire Rescue (2 Discs) (includes Digital Copy) (blu-Ray/dvd)                              5.757683
Hoover174 Platinum Collection153 Lightweight Bagged Upright Vacuum With Canister - Uh30010com    4.602982
Clorox Disinfecting Bathroom Cleaner                                                             4.180387
Jason Aldean - They Don't Know                                                                   3.965442
Windex Original Glass Cleaner Refill 67.6oz (2 Liter)                                            3.535534
Chester's Cheese Flavored Puffcorn Snacks                                                        3.535534
Clorox Disinfecting Wipes Value Pack Scented 150 Ct Total                                        3.448282
The Resident Evil Collection 5 Discs (blu

## Task 7: Fine-Tuning the Recommendation System and Recommendation of Top 5 Products.

### Print our old tokenized reviews dataframe

In [45]:
print(reviews_df)

                                                    name  reviews_rating  \
0              Pink Friday: Roman Reloaded Re-Up (w/dvd)               5   
1             Lundberg Organic Cinnamon Toast Rice Cakes               5   
2             Lundberg Organic Cinnamon Toast Rice Cakes               5   
3                       K-Y Love Sensuality Pleasure Gel               1   
4                       K-Y Love Sensuality Pleasure Gel               1   
...                                                  ...             ...   
29995  L'or233al Paris Elvive Extraordinary Clay Reba...               5   
29996  L'or233al Paris Elvive Extraordinary Clay Reba...               5   
29997  L'or233al Paris Elvive Extraordinary Clay Reba...               5   
29998  L'or233al Paris Elvive Extraordinary Clay Reba...               5   
29999  L'or233al Paris Elvive Extraordinary Clay Reba...               5   

      reviews_username user_sentiment  \
0               joshua       Positive   
1    

### Identify the sentiments of the top 20 items, average them per item and pick top 5. 

#### TF-IDF Vectorize all the reviews and classify them as positive or negative and merge them with the dataframe.

In [46]:
reviews_df

Unnamed: 0,name,reviews_rating,reviews_username,user_sentiment,reviews_tokenized
0,Pink Friday: Roman Reloaded Re-Up (w/dvd),5,joshua,Positive,i love this album it is very good more to the ...
1,Lundberg Organic Cinnamon Toast Rice Cakes,5,dorothy w,Positive,good flavor this review was collected as part ...
2,Lundberg Organic Cinnamon Toast Rice Cakes,5,dorothy w,Positive,good flavor
3,K-Y Love Sensuality Pleasure Gel,1,rebecca,Negative,i read through the reviews on here before look...
4,K-Y Love Sensuality Pleasure Gel,1,walker557,Negative,my husband bought this gel for us the gel caus...
...,...,...,...,...,...
29995,L'or233al Paris Elvive Extraordinary Clay Reba...,5,laurasnchz,Positive,i got this conditioner with influenster to try...
29996,L'or233al Paris Elvive Extraordinary Clay Reba...,5,scarlepadilla,Positive,i love it i received this for review purposes ...
29997,L'or233al Paris Elvive Extraordinary Clay Reba...,5,liviasuexo,Positive,first of all i love the smell of this product ...
29998,L'or233al Paris Elvive Extraordinary Clay Reba...,5,ktreed95,Positive,i received this through influenster and will n...


In [47]:
#Do reviews classification as good or bad sentiment. 
tfidfFeatures = tfidf.transform(reviews_df.reviews_tokenized)
sentcls = LRClassModel.predict(tfidfFeatures)

In [48]:
#Merge the class to the dataframe
sntmtClassSeries = pd.Series(sentcls, name = "sentiment_class")
reviews_df = reviews_df.join(sntmtClassSeries)
#print(reviews_df[['name', 'reviews_tokenized', 'sentiment_class']])

#### Group the sentiments by item name and average them.

In [49]:
groupedDf = reviews_df.groupby(['name'])
product_class = groupedDf['sentiment_class'].agg(mean_class=np.mean)

In [50]:
print(type(product_class))
print(product_class.info)

<class 'pandas.core.frame.DataFrame'>
<bound method DataFrame.info of                                                     mean_class
name                                                          
0.6 Cu. Ft. Letter A4 Size Waterproof 30 Min. F...    0.666667
100:Complete First Season (blu-Ray)                   0.899281
2017-2018 Brownline174 Duraflex 14-Month Planne...    0.250000
2x Ultra Era with Oxi Booster, 50fl oz                1.000000
42 Dual Drop Leaf Table with 2 Madrid Chairs"         1.000000
...                                                        ...
Weleda Everon Lip Balm                                1.000000
Wilton Black Dots Standard Baking Cups                1.000000
Windex Original Glass Cleaner Refill 67.6oz (2 ...    0.718391
Yes To Carrots Nourishing Body Wash                   1.000000
Yes To Grapefruit Rejuvenating Body Wash              0.769231

[271 rows x 1 columns]>


In [51]:
top20UserBased

name
Godzilla 3d Includes Digital Copy Ultraviolet 3d/2d Blu-Ray/dvd                                  6.588478
Tostitos Bite Size Tortilla Chips                                                                6.335033
Planes: Fire Rescue (2 Discs) (includes Digital Copy) (blu-Ray/dvd)                              5.757683
Hoover174 Platinum Collection153 Lightweight Bagged Upright Vacuum With Canister - Uh30010com    4.602982
Clorox Disinfecting Bathroom Cleaner                                                             4.180387
Jason Aldean - They Don't Know                                                                   3.965442
Windex Original Glass Cleaner Refill 67.6oz (2 Liter)                                            3.535534
Chester's Cheese Flavored Puffcorn Snacks                                                        3.535534
Clorox Disinfecting Wipes Value Pack Scented 150 Ct Total                                        3.448282
The Resident Evil Collection 5 Discs (blu

### Overwrite the top20 item ratings with the average sentiment ranging from 1 to 5.

In [52]:
for itmName in list(top20UserBased.index):
    top20UserBased[itmName] = product_class.loc[itmName][0]

### Top 5 items picked by sentiments.

In [53]:
top20UserBased.sort_values(ascending=False)[:5]

name
Tostitos Simply Blue Corn Tortilla Chips                                                         0.952381
My Big Fat Greek Wedding 2 (blu-Ray + Dvd + Digital)                                             0.914671
Clorox Disinfecting Bathroom Cleaner                                                             0.882786
Hoover174 Platinum Collection153 Lightweight Bagged Upright Vacuum With Canister - Uh30010com    0.878453
Planes: Fire Rescue (2 Discs) (includes Digital Copy) (blu-Ray/dvd)                              0.876640
Name: abby, dtype: float64

##### END OF FILE