In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string,re
from nltk.corpus import stopwords
import pickle
from sklearn.model_selection import train_test_split,GridSearchCV
from collections import Counter

In [None]:
# Libraries for text preprocessing
import  nltk, spacy
nlp = spacy.load("en_core_web_sm", disable = ['ner', 'parser'])
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [None]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import MinMaxScaler

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)
pd.set_option("display.width",None)
pd.set_option("display.max_colwidth",None)

## Data Cleaning and Preprocessing 

In [None]:
df=pd.read_csv("sample30.csv")

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

###   We observe that there are three columns that will be useful in building a sentiment classification model:
### 1. `reviews_text`: It contains the reviews given by the users to a particular product
### 2. `reviews_title`: It contains the title of the review given in previous column
### 3. `user_sentiment`: It contains the overall sentiment of the user for a particular product (Positive or Negative). We will use them as labels in our model.

In [None]:
df.isnull().sum()

### There are 190 missing values in `reviews_title` ->  we will replace them with blank.
###  There is 1 missing value in `user_sentiment`-> we will simply drop it.

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the in-place form acts on an intermediate object and is not
# guaranteed to update `df` (pandas warns about it; copy-on-write ignores it).
df["reviews_title"] = df["reviews_title"].fillna("")


In [None]:
df=df[~df["user_sentiment"].isnull()]

In [None]:
df.isnull().sum()

###   We now create a new dataframe containing only two columns:-
### 1. First column will be a concatenation of the two columns: `reviews_text` and `reviews_title`.
### 2. Second column will be the `user_sentiment` column and it will serve as our target column.

In [None]:
df['reviews']=df['reviews_text']+" "+df["reviews_title"]
df.head(1)

In [None]:
# Take an explicit copy: the preprocessing cells below assign into
# final_df["reviews"], and writing into a slice of `df` triggers
# SettingWithCopyWarning and ambiguous write-through behaviour.
final_df = df[["reviews", "user_sentiment"]].copy()
final_df.head()

In [None]:
final_df.info()

In [None]:
final_df["user_sentiment"].value_counts()

In [None]:
plt.figure(figsize=(5,5))
sns.countplot(x="user_sentiment",data=final_df)
plt.show()

####  As our target column is highly imbalanced, we will have to use techniques to handle imbalanced data during our model building process.

## ` Text Preprocessing `

In [None]:
final_df.head(10)

In [None]:
def clean_text(text):
    """Normalise a review string before lemmatisation.

    The text is lowercased and every character in ``string.punctuation`` is
    deleted (removed outright, not replaced by a space).
    """
    # Translation table mapping each punctuation character to "delete".
    strip_punctuation = str.maketrans("", "", string.punctuation)
    return text.lower().translate(strip_punctuation)

In [None]:
final_df["reviews"]=final_df["reviews"].apply(lambda x:clean_text(x))
final_df.head(10)

In [None]:
stop_words = stopwords.words('english')

In [None]:
def lemmatize(text):
    """Return `text` with every token replaced by its lemma.

    The string is run through the notebook-level spaCy pipeline `nlp` and the
    lemmas of all tokens are joined back together with single spaces.
    """
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc)

In [None]:
final_df["reviews"]=final_df["reviews"].apply(lambda x:lemmatize(x))

In [None]:
final_df.head(10)

In [None]:
final_df["reviews"] = final_df["reviews"].str.replace('-PRON-','')

In [None]:
final_df.head(10)

In [None]:
doc_lens = [len(d) for d in final_df['reviews']]
doc_lens[:5]

In [None]:
plt.figure(figsize=(10,6))
plt.hist(doc_lens, edgecolor='black', bins = 50)
plt.title('Distribution of Review character length', fontsize=25)
plt.ylabel('Number of Reviews', fontsize=20)
plt.xlabel('Review character length', fontsize=20)

plt.show()

In [None]:
# Build the cloud from the review text itself. str(Series) would pass the
# Series' printed representation (row labels plus the trailing "Name/dtype"
# line) to WordCloud, so join the documents into one string instead.
all_reviews_text = " ".join(final_df["reviews"])

wordcloud = WordCloud(background_color='white',
                      stopwords=stop_words,
                      max_words=40,
                      max_font_size=40,
                      scale=30,
                      random_state=42
                      ).generate(all_reviews_text)

fig = plt.figure(figsize=(10,6))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Forward-slash path: the original '\p' was an invalid escape sequence that only
# worked as a Windows separator. The context manager closes the file handle.
with open('picklecopy/processed_data.pkl', 'wb') as processed_file:
    pickle.dump(final_df, processed_file)

## `Feature Extraction`

In [None]:
X=final_df["reviews"]
y=final_df["user_sentiment"]


In [None]:
# TF-IDF over 1- to 3-grams with English stop words removed; terms appearing in
# more than 95% of the reviews or in fewer than 2 reviews are dropped.
# NOTE(review): the vectorizer is fitted on all of X, i.e. before the train/test
# split, so test reviews influence the vocabulary and IDF weights — consider
# fitting on the training portion only.
tfv = TfidfVectorizer(ngram_range = (1,3),stop_words='english',max_df=.95,min_df=2)
X_tfv=tfv.fit_transform(X)


In [None]:
# get_feature_names() has been removed from scikit-learn;
# get_feature_names_out() is the supported replacement and returns the same
# vocabulary terms (as a NumPy array), so len() is unchanged.
tfv_feature_names=tfv.get_feature_names_out()
len(tfv_feature_names)

In [None]:
# Persist the fitted vocabulary; the context manager closes the file handle.
with open("picklecopy/tfidf_vocabulary.pkl", "wb") as vocabulary_file:
    pickle.dump(tfv.vocabulary_, vocabulary_file)

In [None]:
X_train_tfv, X_test_tfv, y_train, y_test = train_test_split(X_tfv, y, test_size=0.25, random_state=42)

In [None]:
X_train_tfv.shape,y_train.shape

In [None]:
X_test_tfv.shape,y_test.shape

In [None]:
y_test.value_counts()

## `Model Building`

### `1.Logistic Regression`

In [None]:
logit=LogisticRegression()
logit.fit(X_train_tfv,y_train)


In [None]:
y_pred_train=logit.predict(X_train_tfv)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# precision and recall columns of the report are swapped.
print(accuracy_score(y_train,y_pred_train))
print(classification_report(y_train,y_pred_train))

In [None]:
cm = confusion_matrix(y_train,y_pred_train)
print(cm)

In [None]:
y_pred_test=logit.predict(X_test_tfv)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# precision and recall columns of the report are swapped.
print(accuracy_score(y_test,y_pred_test))
print(classification_report(y_test,y_pred_test))

As we can see, the precision is very low for the negative sentiment class and the model is biased towards the majority class. We can try to improve the F1 score by oversampling the minority class with the `SMOTE` technique.

In [None]:
from imblearn.over_sampling import SMOTE
sm=SMOTE(random_state=100)

In [None]:
X_train_tfvsm,y_trainsm=sm.fit_resample(X_train_tfv,y_train)

In [None]:
print(y_trainsm.value_counts())

In [None]:
counter = Counter(y_train)
print('Before',counter)

In [None]:
counter = Counter(y_trainsm)
print('After',counter)

In [None]:
logitsm=LogisticRegression()
logitsm.fit(X_train_tfvsm,y_trainsm)


In [None]:
y_pred_trainsm=logitsm.predict(X_train_tfvsm)
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# precision and recall columns of the report are swapped.
print(accuracy_score(y_trainsm,y_pred_trainsm))
print(classification_report(y_trainsm,y_pred_trainsm))

In [None]:
cm = confusion_matrix(y_trainsm,y_pred_trainsm)
print(cm)

In [None]:
cm = confusion_matrix(y_test,y_pred_testsm)
print(cm)

In [None]:
y_pred_testsm=logitsm.predict(X_test_tfv)
print(accuracy_score(y_pred_testsm,y_test))
print(classification_report(y_pred_testsm,y_test))

In [None]:
f1_lr = f1_score(y_test, y_pred_testsm, average="weighted")
f1_lr

In [None]:
# Hyperparameter tuning to improve performance of the model:
param_grid_lr = {
     'penalty': ['l1', 'l2'],
     'C': [0.001,0.01,0.1,1,10,100],
     'solver':['liblinear', 'saga']
    }

grid_lr = GridSearchCV(estimator=logitsm, 
                       param_grid=param_grid_lr,
                       verbose=1,
                       scoring='roc_auc',
                       n_jobs=-1,
                       cv=4)
grid_lr.fit(X_train_tfvsm, y_trainsm)
print(grid_lr.best_params_)

In [None]:
logitsm_tune=LogisticRegression(C=100, 
                               penalty='l2', 
                               solver='liblinear')

In [None]:
logitsm_tune.fit(X_train_tfvsm, y_trainsm)
y_pred_testsm_tune = logitsm_tune.predict(X_test_tfv)

In [None]:
cm = confusion_matrix(y_test,y_pred_testsm_tune)
print(cm)

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# precision and recall columns of the report are swapped.
print(accuracy_score(y_test,y_pred_testsm_tune))
print(classification_report(y_test,y_pred_testsm_tune))

In [None]:
f1_lr_tune = f1_score(y_test, y_pred_testsm_tune, average="weighted")
f1_lr_tune

## `2.Random Forest` 

In [None]:
# Fix the seed so the forest (and the scores reported for it) are reproducible
# across kernel restarts.
rfc=RandomForestClassifier(n_estimators=50, random_state=42)
rfc.fit(X_train_tfvsm,y_trainsm)

In [None]:
y_pred_rfc=rfc.predict(X_test_tfv)

In [None]:
cm=confusion_matrix(y_test,y_pred_rfc)
print(cm)

In [None]:
f1_rfc = f1_score(y_test, y_pred_rfc, average="weighted")
f1_rfc

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# precision and recall columns of the report are swapped.
print(accuracy_score(y_test,y_pred_rfc))
print(classification_report(y_test,y_pred_rfc))

In [None]:
# Hyperparameter tuning to improve performance of the model:
param_grid_rf = {
     'n_estimators': [50,100,150],
     'criterion':['gini','entropy'],
     'max_depth': [30,40,50],
     'min_samples_split': [2, 5, 10],
     'min_samples_leaf': [1, 5, 10],
       }

grid_rfc = GridSearchCV(estimator=rfc, 
                        param_grid=param_grid_rf,
                        scoring='roc_auc',
                        verbose=1,
                        n_jobs=-1,
                        cv=3)
grid_rfc.fit(X_train_tfvsm, y_trainsm)
print(grid_rfc.best_params_)

In [None]:
# Hyperparameters copied from the grid search above; the seed is fixed so the
# tuned model's reported F1 score is reproducible.
rfc_tune=RandomForestClassifier(criterion= 'gini', 
                                max_depth= 50, 
                                min_samples_leaf= 1, 
                                min_samples_split= 2, 
                                n_estimators= 150,
                                random_state= 42)

In [None]:
rfc_tune.fit(X_train_tfvsm,y_trainsm)

In [None]:
y_pred_rfctune=rfc_tune.predict(X_test_tfv)

In [None]:
cm=confusion_matrix(y_test,y_pred_rfctune)
print(cm)

In [None]:
f1_rfc_tune = f1_score(y_test, y_pred_rfctune, average="weighted")
f1_rfc_tune

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# precision and recall columns of the report are swapped.
print(accuracy_score(y_test,y_pred_rfctune))
print(classification_report(y_test,y_pred_rfctune))

## `3.Naive Bayes`

In [None]:
nb = MultinomialNB()
nb.fit(X_train_tfvsm,y_trainsm)

In [None]:
y_pred_nb=nb.predict(X_test_tfv)

In [None]:
cm=confusion_matrix(y_test,y_pred_nb)
print(cm)

In [None]:
f1_nb = f1_score(y_test, y_pred_nb, average="weighted")
f1_nb

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# precision and recall columns of the report are swapped.
print(accuracy_score(y_test,y_pred_nb))
print(classification_report(y_test,y_pred_nb))

In [None]:
# Hyperparameter tuning to improve performance of the model:
param_grid_nb = {
     'alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001),
     'fit_prior':[True, False]
 }
grid_nb = GridSearchCV(estimator=nb, 
                       param_grid=param_grid_nb,
                       verbose=1,
                       scoring='f1_weighted',
                       n_jobs=-1,
                       cv=10)
grid_nb.fit(X_train_tfvsm, y_trainsm)
print(grid_nb.best_params_)

In [None]:
nb_tune = MultinomialNB(alpha=0.00001, fit_prior=True)
nb_tune.fit(X_train_tfvsm,y_trainsm)

In [None]:
y_pred_nbtune=nb_tune.predict(X_test_tfv)

In [None]:
cm=confusion_matrix(y_test,y_pred_nbtune)
print(cm)

In [None]:
f1_nb_tune = f1_score(y_test, y_pred_nbtune, average="weighted")
f1_nb_tune

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed the
# precision and recall columns of the report are swapped.
print(accuracy_score(y_test,y_pred_nbtune))
print(classification_report(y_test,y_pred_nbtune))

In [None]:
data=[["Logistic Regression",round(f1_lr,2),round(f1_lr_tune,2)],
      ["Random Forest",round(f1_rfc,2),round(f1_rfc_tune,2)],
      ["Naive Bayes",round(f1_nb,2),round(f1_nb_tune,2)]]
summary=pd.DataFrame(data,columns=["Model Name","F1_score(untune)","F1_score(tune)"])
summary
                                   

#### The XGBoost model takes more time to train than the other models and its serialized size is larger, so it cannot be deployed on `HEROKU` (cloud application platform). 

###  From the above summary table, we compare all the models built and select the `tuned Logistic Regression` model. There are two reasons we select this model:-

### 1. This model gives the highest `weighted F1 Score` 

### 2. It takes much less time to train than other models giving same `weighted F1 Score`

In [None]:
# Persist the selected model; the context manager closes the file handle.
with open("picklecopy/Tuned_logreg_model.pkl", "wb") as model_file:
    pickle.dump(logitsm_tune, model_file)

# 2. Building a recommendation system 

In this task, we will perform the following sub-tasks:

2.1 - **User-based Recommendation System**  
2.2 - **Item-based Recommendation System**  
2.3 - **Select best Recommendation System**  
2.4 - **Recommend top-20 products to user**

## 2.1 User-based Recommendation System

In [None]:
df.head(2)

In [None]:
ratings=df[["reviews_username","name","reviews_rating"]]

In [None]:
ratings.head()

In [None]:
ratings.shape

In [None]:
ratings.isnull().sum()

In [None]:
# Drop the rows whose reviews_username is missing, then re-check the null counts
ratings=ratings[~ratings["reviews_username"].isnull()]
ratings.isnull().sum()

In [None]:
# Re-bind instead of inplace=True: `ratings` is a filtered slice of `df`, and an
# in-place drop on it raises SettingWithCopyWarning and makes the cell
# non-idempotent.
ratings = ratings.drop_duplicates(subset=["reviews_username","name","reviews_rating"],keep='first')

In [None]:
ratings.info()

In [None]:
ratings.shape

In [None]:
# Test and Train split of the dataset.
train,test=train_test_split(ratings,test_size=0.3,random_state=42)
print("shape of the train set = {}".format(train.shape))
print("shape of the test set = {}".format(test.shape))

In [None]:
#Create a pivot table with all user names as index, all products as columns and ratings as values
# Note: Here we use fillna(0) to give 0 ratings to products that have not been rated by corresponding user
df_pivot = train.pivot_table(
    index="reviews_username",
    columns='name',
    values='reviews_rating'
).fillna(0)

# View first five rows of the pivot table
df_pivot.head()

In [None]:
df_pivot.shape

#### `dummy train` will be used for prediction of ratings given by peer users to the products that have not been rated by user `u`. For this, we create a copy of the training dataset and then give `0` rating to the products that have already been rated and `1` to the non-rated products.
####  `dummy test` will be used for evaluation. As we want to evaluate only those products that have been rated, we give `1` rating to the products that have already been rated by user `u` and `0` to the non-rated products.

In [None]:
#Copy the train dataset into dummy_train.
dummy_train=train.copy()
dummy_train.head()

In [None]:
dummy_train.reviews_rating.value_counts()

In [None]:
# The products not rated by user is marked as 1 for prediction.

dummy_train["reviews_rating"]=dummy_train["reviews_rating"].apply(lambda x:0 if x>=1 else 1)

In [None]:
# Convert the dummy train dataset into matrix format.
dummy_train=dummy_train.pivot_table(
        index="reviews_username",
        columns="name",
        values="reviews_rating",
        ).fillna(1)

dummy_train.head()

In [None]:
dummy_train.shape

#### Next, we find similarity between the users by using ` cosine similarity` metric.

In [None]:
# Create the user similarity Matrix using pairwise_distance function
user_correlation = 1 - pairwise_distances(df_pivot, metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

In [None]:
user_correlation.shape

## Prediction (UBCF) 

In [None]:
# Replace negative correlations with 0
user_correlation[user_correlation<0]=0
user_correlation

In [None]:
# Matrix multiplication of matrices user similarity matrix and original matrix df_pivot
# This will give predicted ratings of the users corresponding to each product in the dataset
user_predicted_ratings = np.dot(user_correlation, df_pivot.fillna(0))
user_predicted_ratings

In [None]:
user_predicted_ratings.shape

###  We are only interested in products that are not rated by user `u`. So we ignore the rated products by setting their ratings as `0`.
###   For this, we multiply the `dummy_train` matrix with the `user_predicted_ratings` matrix. 

In [None]:
# Element-wise multiply by 'dummy_train' (0 for products the user already rated,
# 1 otherwise) so the predictions for already-rated products become 0
user_final_rating = np.multiply(user_predicted_ratings, dummy_train)
user_final_rating.head()

In [None]:
user_final_rating.sample(10)

In [None]:
d = user_final_rating.loc['lucky'].sort_values(ascending=False)[0:20]
d

##  Evaluation (UBCF) 

### Evaluation is the same as prediction except for one thing: here we will evaluate for products that are already rated by user `u` 

In [None]:
common = test[test.reviews_username.isin(train.reviews_username)]
common.shape

In [None]:
common.head()

In [None]:
# convert into the user-product matrix (pivot form)
common_user_based_matrix = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating')
common_user_based_matrix.head()

In [None]:
common_user_based_matrix.shape

### Now, we will filter out correlations of those users that are common in both train and test datasets.

In [None]:
# Convert the user_correlation matrix into dataframe.
user_correlation_df = pd.DataFrame(user_correlation)
user_correlation_df.head()

In [None]:
# Set index of user correlation df as index of df_pivot
user_correlation_df['reviews_username'] = df_pivot.index
user_correlation_df.set_index('reviews_username',inplace=True)
user_correlation_df.head()

In [None]:
# User names of the test rows whose user also appears in train (may contain repeats)
list_name = common.reviews_username.tolist()

# Label the columns with the user names from df_pivot's index, matching the row labels
user_correlation_df.columns = df_pivot.index.tolist()

# Keep only the rows (users) that are present in both train and test
user_correlation_df_1 =  user_correlation_df[user_correlation_df.index.isin(list_name)]

In [None]:
user_correlation_df_1.shape

In [None]:
user_correlation_df_2 = user_correlation_df_1.T[user_correlation_df_1.T.index.isin(list_name)]

user_correlation_df_3 = user_correlation_df_2.T

user_correlation_df_3.head()

In [None]:
user_correlation_df_3.shape

In [None]:
# Set negative correlations to 0
user_correlation_df_3[user_correlation_df_3<0]=0

# Dot product of 'user_correlation_df_3' and 'common_user_based_matrix'
common_user_predicted_ratings = np.dot(user_correlation_df_3, common_user_based_matrix.fillna(0))
common_user_predicted_ratings

In [None]:
# We get the predicted ratings of users that have already rated the products
dummy_test = common.copy()

dummy_test['reviews_rating'] = dummy_test["reviews_rating"].apply(lambda x: 1 if x>=1 else 0)

dummy_test = dummy_test.pivot_table(index='reviews_username', columns='name', values='reviews_rating').fillna(0)

In [None]:
dummy_test.shape

In [None]:
# Multiply 'common_user_predicted_ratings' with 'dummy_test'
common_user_predicted_ratings = np.multiply(common_user_predicted_ratings,dummy_test)

# Check first few rows
common_user_predicted_ratings.head()

###  Now, we have to calculate the RMSE for only the products rated by the users
###  For this, we normalize the ratings to bring them in range 1 to 5.

In [None]:
from numpy import *

# Create a copy of 'common_user_predicted_ratings'
X  = common_user_predicted_ratings.copy() 

# Filter out positive ratings
X = X[X>0]

# Normalize the ratings and bring them within range 1 to 5
scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = (scaler.transform(X))

print(y)

In [None]:
total_non_nan = np.count_nonzero(~np.isnan(y))
total_non_nan

In [None]:
rmse = (sum(sum((common_user_based_matrix - y )**2))/total_non_nan)**0.5
print(rmse)

## 2.2 Item-based Recommendation System 

In [None]:
# Create a pivot table with all user names as index, all products as columns and ratings as values
# Note: Here we take a transpose to get products (items) as indices and users as columns
df_pivot = train.pivot_table(
    index='reviews_username',
    columns='name',
    values='reviews_rating'
).T.fillna(0)

# View first five rows of the pivot table
df_pivot.head()

In [None]:
# Create the item similarity Matrix using pairwise_distance function
item_correlation = 1 - pairwise_distances(df_pivot, metric='cosine')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation)

In [None]:
# Replace negative correlations with 0
item_correlation[item_correlation<0]=0
item_correlation

# Prediction - Item Item

In [None]:
# Matrix multiplication of the transposed df_pivot (users x products) with the item similarity matrix
# This gives a predicted rating for every (user, product) pair in the training data
item_predicted_ratings = np.dot((df_pivot.fillna(0).T),item_correlation)
item_predicted_ratings

In [None]:
item_predicted_ratings.shape

In [None]:
# Check whether the above shape is same as that of 'dummy_train'
dummy_train.shape

In [None]:
# Multiplying 'dummy_train' with 'item_predicted_ratings' to make ratings of rated products 0
item_final_rating = np.multiply(item_predicted_ratings,dummy_train)
item_final_rating.head()

In [None]:
item_final_rating.sample(10)

In [None]:
# Recommending top 20 products to the user.
d = item_final_rating.loc['00sab00'].sort_values(ascending=False)[0:20]
d

## Evaluation (IBCF) 

In [None]:
# Keep the test rows whose product ('name') is also present in the train dataset
common = test[test['name'].isin(train['name'])]
common.shape

In [None]:
common.head()

In [None]:
common_item_based_matrix = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T
common_item_based_matrix.head()

In [None]:
# Check the shape
common_item_based_matrix.shape

In [None]:
# Convert the item_correlation matrix into dataframe.
item_correlation_df = pd.DataFrame(item_correlation)
item_correlation_df.head()

In [None]:
# Set index of item correlation df as index of df_pivot
item_correlation_df['name'] = df_pivot.index
item_correlation_df.set_index('name',inplace=True)
item_correlation_df.head()

In [None]:
# Product names of the test rows whose product also appears in train (may contain repeats)
list_name = common['name'].tolist()

# Label the columns with the product names from df_pivot's index, matching the row labels
item_correlation_df.columns = df_pivot.index.tolist()

# Keep only the rows (products) that are present in both train and test datasets
item_correlation_df_1 =  item_correlation_df[item_correlation_df.index.isin(list_name)]

# Apply the same filter to the columns by filtering the transpose ...
item_correlation_df_2 = item_correlation_df_1.T[item_correlation_df_1.T.index.isin(list_name)]

# ... and transpose back, leaving a square common-products similarity matrix
item_correlation_df_3 = item_correlation_df_2.T

item_correlation_df_3.head()

In [None]:
# Set negative correlations to 0
item_correlation_df_3[item_correlation_df_3<0]=0

# Dot product of 'item_correlation_df_3' and 'common_item_based_matrix'
common_item_predicted_ratings = np.dot(item_correlation_df_3, common_item_based_matrix.fillna(0))
common_item_predicted_ratings

In [None]:
# Build a 0/1 mask (products x users) marking the ratings that exist in the test rows
dummy_test = common.copy()

# Any rating >= 1 becomes 1 (rated), everything else 0
dummy_test['reviews_rating'] = dummy_test['reviews_rating'].apply(lambda x: 1 if x>=1 else 0)

# Pivot to users x products, transpose to products x users, and mark unrated pairs as 0
dummy_test = dummy_test.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T.fillna(0)

# Multiply 'common_item_predicted_ratings' with 'dummy_test' so only predictions
# for pairs that were actually rated in the test rows are kept
common_item_predicted_ratings = np.multiply(common_item_predicted_ratings,dummy_test)

In [None]:
common_item_predicted_ratings.head()

In [None]:
from numpy import *

# Create a copy of 'common_item_predicted_ratings'
X  = common_item_predicted_ratings.copy() 

# Filter out positive ratings
X = X[X>0]

# Normalize the ratings and bring them within range 1 to 5
scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = (scaler.transform(X))

print(y)

In [None]:
total_non_nan = np.count_nonzero(~np.isnan(y))
total_non_nan 

In [None]:
rmse = (sum(sum((common_item_based_matrix - y )**2))/total_non_nan)**0.5
print(rmse)

### The item-based collaborative filtering model that we built gives an RMSE value of **3.57**.
### The RMSE of `IBCF` (**3.57**) is higher than the RMSE of `UBCF` (**2.13**).
### Thus, we will choose `UBCF` as our recommendation system as it has the lower error.

In [None]:
# Save the final user-based ratings (downcast to float32 to shrink the file);
# the context manager closes the file handle.
with open('picklecopy/user_final_rating.pkl', 'wb') as ratings_file:
    pickle.dump(user_final_rating.astype('float32'), ratings_file)