In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import zscore
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import power_transform
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_curve,auc,classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

In [3]:
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import  stopwords
import string

In [5]:
df=pd.read_csv('review.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Review,Rating
0,0,Nice product,4.0
1,1,Good choice,4.0
2,2,Classy product,5.0
3,3,Moderate,5.0
4,4,Super!,4.0


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5474 entries, 0 to 5473
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Review  5467 non-null   object 
 1   Rating  5467 non-null   float64
dtypes: float64(1), object(1)
memory usage: 85.7+ KB


We can delete unnamed: 0 column as it is just an indexing column which is not required in analysis. We will also check the values of rating column as all the values should be in integer form.

In [7]:
df=df.drop(['Unnamed: 0'],axis=1)

In [8]:
df['Rating'].value_counts()

5.0    3607
4.0    1453
3.0     319
2.0      88
Name: Rating, dtype: int64

## data balancing

In [9]:
from sklearn.utils import resample

rate5=df[df.Rating==5]
rate4=df[df.Rating==4]
rate3=df[df.Rating==3]
rate2=df[df.Rating==2]
rate4_upsampled=resample(rate4,replace=True,n_samples=len(rate5),random_state=27)
rate3_upsampled=resample(rate3,replace=True,n_samples=len(rate5),random_state=27)
rate2_upsampled=resample(rate2,replace=True,n_samples=len(rate5),random_state=27)
df_up=pd.concat([rate5,rate4_upsampled,rate3_upsampled,rate2_upsampled])
df_up['Rating'].value_counts()

5.0    3607
4.0    3607
3.0    3607
2.0    3607
Name: Rating, dtype: int64

## Feature Engineering

In [11]:
#converting all the reviews to lower case so thats its easy to analyse them.
df_up['Review']=df_up['Review'].str.lower()

In [12]:
#Replacing email address,links, phone numbers, any sort of numbers and currency as they are not a review
df_up['Review'] = df_up['Review'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailid')
df_up['Review'] = df_up['Review'].str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','link')
df_up['Review'] = df_up['Review'].str.replace(r'£|\$', 'currency')   
df_up['Review'] = df_up['Review'].str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phno')
df_up['Review'] = df_up['Review'].str.replace(r'\d+(\.\d+)?', 'numbr')   

In [63]:
df['Review']=df['Review'].apply(lambda x: np.str_(x))

In [67]:
#Removing punctuations
df_up['Review'] = df_up['Review'].apply(lambda x: ' '.join(
    term for term in str(x).split() if term not in string.punctuation))

In [68]:
# Removing stop words
sw = set(stopwords.words('english') + ['u', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure'])
df_up['Review'] = df_up['Review'].apply(lambda x: ' '.join(
    term for term in str(x).split() if term not in sw))

In [69]:
lm=WordNetLemmatizer()
df_up['Review'] = df_up['Review'].apply(lambda x: ' '.join(
 lm.lemmatize(t) for t in str(x).split()))

In [73]:
# Convert text into vectors using TFIDF

tfidf = TfidfVectorizer(max_features = 10000, stop_words='english')
x = tfidf.fit_transform(df_up['Review'].apply(lambda x: np.str_(x)))

In [74]:
y=df_up['Rating']

## Train test split

In [75]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
max_ac=0
randomState=0
lr=LogisticRegression()
for i in range(10):
    x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=i,test_size=0.20)
    lr.fit(x_train,y_train)
    pred_train=lr.predict(x_train)
    pred_test=lr.predict(x_test)
    if round(accuracy_score(y_train,pred_train)*100,1)==round(accuracy_score(y_test,pred_test)*100,1):
        print("\n\nAt random state:",i)
        print("\nTrain Accuracy- ",round(accuracy_score(y_train,pred_train)*100,1))
        print("\nTest Accuracy- ",round(accuracy_score(y_test,pred_test)*100,1))
        if round(accuracy_score(y_test,pred_test)*100,1)>max_ac:
            randomState=i
            max_ac=round(accuracy_score(y_test,pred_test)*100,1)
    print("\n\n Best accuracy at random state-",i)
    print("\nTrain Accuracy- ",round(accuracy_score(y_train,pred_train)*100,1))
    print("\nTest Accuracy- ",round(accuracy_score(y_test,pred_test)*100,1))



 Best accuracy at random state- 0

Train Accuracy-  36.5

Test Accuracy-  36.1


 Best accuracy at random state- 1

Train Accuracy-  36.6

Test Accuracy-  35.6


 Best accuracy at random state- 2

Train Accuracy-  36.7

Test Accuracy-  35.5


 Best accuracy at random state- 3

Train Accuracy-  36.5

Test Accuracy-  36.1


 Best accuracy at random state- 4

Train Accuracy-  36.6

Test Accuracy-  36.0


 Best accuracy at random state- 5

Train Accuracy-  36.5

Test Accuracy-  36.2


 Best accuracy at random state- 6

Train Accuracy-  36.4

Test Accuracy-  36.7


 Best accuracy at random state- 7

Train Accuracy-  37.1

Test Accuracy-  33.7


 Best accuracy at random state- 8

Train Accuracy-  36.5

Test Accuracy-  36.2


 Best accuracy at random state- 9

Train Accuracy-  36.6

Test Accuracy-  36.0


## Logistic Regression

In [99]:
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=9,test_size=0.20)
lr.fit(x_train,y_train)
pred_train=lr.predict(x_train)
pred_test=lr.predict(x_test)
train_accuracy=round(accuracy_score(y_train,pred_train)*100,1)
test_accuracy=round(accuracy_score(y_test,pred_test)*100,1)
print("\ntrain accuracy-",train_accuracy)
print("\ntest accuracy-",test_accuracy)


train accuracy- 36.6

test accuracy- 36.0


In [100]:
# CV Score
cv_score_best=cross_val_score(lr,x,y,cv=4).mean()*100
print("cross validation score is-",cv_score_best)
print("accuracy score for logistic regression model is-",test_accuracy)

cross validation score is- 36.42223454394233
accuracy score for logistic regression model is- 36.0


In [83]:
#classification report
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

         2.0       0.00      0.00      0.00       747
         3.0       0.00      0.00      0.00       739
         4.0       0.28      1.00      0.44       729
         5.0       1.00      0.46      0.63       671

    accuracy                           0.36      2886
   macro avg       0.32      0.37      0.27      2886
weighted avg       0.30      0.36      0.26      2886



## Decision tree classifier

In [97]:
from sklearn.tree import DecisionTreeClassifier
dt=DecisionTreeClassifier()
dt.fit(x_train,y_train)
dt_pred_train=dt.predict(x_train)
dt_pred_test=dt.predict(x_test)
dt_acc_train=round(accuracy_score(y_train,dt_pred_train)*100,1)
dt_acc_test=round(accuracy_score(y_test,dt_pred_test)*100,1)
print("acc train",dt_acc_train)
print("acc test",dt_acc_test)

acc train 36.4
acc test 36.6


In [98]:
# CV Score
cv_score_best_dt=cross_val_score(dt,x,y,cv=4).mean()*100
print("cross validation score is-",cv_score_best)
print("accuracy score for decision tree model is-",dt_acc_test)

cross validation score is- 36.42223454394233
accuracy score for decision tree model is- 36.6


In [92]:
#classification report
print(classification_report(y_test, dt_pred_test))

              precision    recall  f1-score   support

         2.0       0.00      0.00      0.00       737
         3.0       0.00      0.00      0.00       688
         4.0       0.25      0.89      0.39       713
         5.0       0.25      0.10      0.15       748

    accuracy                           0.25      2886
   macro avg       0.13      0.25      0.13      2886
weighted avg       0.13      0.25      0.13      2886



## Knn Classifier

In [88]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier()
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=1,test_size=0.20)
knn.fit(x_train,y_train)
pred_train=knn.predict(x_train)
pred_test=knn.predict(x_test)
knn_train_acc=round(accuracy_score(y_train,pred_train)*100,1)
knn_test_acc=round(accuracy_score(y_test,pred_test)*100,1)
print("\nTrain Accuracy- ",knn_train_acc)
print("\nTest Accuracy- ",knn_test_acc)


Train Accuracy-  36.6

Test Accuracy-  35.6


In [89]:
cv_score_best_knn=cross_val_score(knn,x,y,cv=11).mean()*100
print("cross validation score is-",cv_score_best_knn)
print("accuracy score for Knn classifier model is-",knn_test_acc)

cross validation score is- 36.37056541019955
accuracy score for Knn classifier model is- 35.6


In [93]:
#classification report
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

         2.0       0.00      0.00      0.00       737
         3.0       0.27      1.00      0.43       688
         4.0       0.00      0.00      0.00       713
         5.0       1.00      0.45      0.62       748

    accuracy                           0.36      2886
   macro avg       0.32      0.36      0.26      2886
weighted avg       0.32      0.36      0.26      2886



## Random Forest classifier

In [94]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
x_train,x_test,y_train,y_test=train_test_split(x,y,random_state=10,test_size=0.20)
rf.fit(x_train,y_train)
pred_train=rf.predict(x_train)
pred_test=rf.predict(x_test)
rf_train_acc=round(accuracy_score(y_train,pred_train)*100,1)
rf_test_acc=round(accuracy_score(y_test,pred_test)*100,1)
print("\nTrain Accuracy- ",rf_train_acc)
print("\nTest Accuracy- ",rf_test_acc)


Train Accuracy-  36.4

Test Accuracy-  36.6


In [95]:
cv_score_best_rf=cross_val_score(rf,x,y,cv=11).mean()*100
print("cross validation score is-",cv_score_best_rf)
print("accuracy score for Knn classifier model is-",rf_test_acc)

cross validation score is- 36.43986116114542
accuracy score for Knn classifier model is- 36.6


In [96]:
#classification report
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

         2.0       0.00      0.00      0.00       726
         3.0       0.28      1.00      0.44       717
         4.0       0.00      0.00      0.00       746
         5.0       1.00      0.49      0.65       697

    accuracy                           0.37      2886
   macro avg       0.32      0.37      0.27      2886
weighted avg       0.31      0.37      0.27      2886



In [102]:
models=['Logistic Regression','Decision Tree Classifier','Knn Classifier','Random Forest Classifier']
test_acc=[test_accuracy,dt_acc_test,knn_test_acc,rf_test_acc]
cv=[cv_score_best,cv_score_best_dt,cv_score_best_knn,cv_score_best_rf]
dfm=pd.DataFrame(list(zip(models,test_acc,cv)),columns=['Models','Test Accuracy','CV Score'])
dfm

Unnamed: 0,Models,Test Accuracy,CV Score
0,Logistic Regression,36.0,36.422235
1,Decision Tree Classifier,36.6,36.422235
2,Knn Classifier,35.6,36.370565
3,Random Forest Classifier,36.6,36.439861


We would finalize Random Forest Classifier as our final model because it has highest accuracy and recall and f1-score is also highest for this model among all others.

## Hyperparameter tuning

In [103]:
param_grid = {
    "n_estimators":[100,200,300],
    "max_depth":[10, 50, 100],
    "max_features":[6,8,10,12,14,16],
    'bootstrap': [True, False],
    "min_samples_split": [2, 6, 10]
}

rf_reg = RandomForestClassifier()

rf_reg_tuned = GridSearchCV(estimator=rf_reg,
                            param_grid=param_grid,
                            cv=3,
                            n_jobs=-1,
                            verbose=2)

rf_reg_tuned.fit(x_train, y_train)
rf_reg_tuned.best_estimator_

Fitting 3 folds for each of 324 candidates, totalling 972 fits


RandomForestClassifier(max_depth=10, max_features=6)

In [106]:
rf1=RandomForestClassifier(max_depth=10, max_features=6,bootstrap=True)
rf1.fit(x_train,y_train)
pred_train=rf1.predict(x_train)
pred_test=rf1.predict(x_test)
train_acc=round(accuracy_score(y_train,pred_train)*100,1)
test_acc=round(accuracy_score(y_test,pred_test)*100,1)
print("\nTrain Accuracy- ",train_acc)
print("\nTest Accuracy- ",test_acc)


Train Accuracy-  36.6

Test Accuracy-  36.0


No change in accuracy.

## Model Saving

In [107]:
import pickle
filename = 'finalized_model.pkl'
pickle.dump(rf, open(filename, 'wb'))