# ***Importing Packages***

In [0]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, ensemble
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MaxAbsScaler
import pandas as pd, xgboost, numpy, textblob, string, random

# ***1. Dataset preparation***

In [6]:
# LOADING THE DATASET AND SEEING THE DETAILS
data_df_train = pd.read_csv('Amazon_Unlocked_Mobile.csv', encoding = 'unicode_escape',header=0)
data_df_test = pd.read_csv('Review.csv', encoding = 'unicode_escape',header=0)
# SHAPE OF THE DATASET
print("Shape of the dataset:")
print(data_df_train.shape)
# COLUMN NAMES
print("Column names:")
print(data_df_train.columns)
# DATATYPE OF EACH COLUMN
print("Datatype of each column:")
print(data_df_train.dtypes)
# SEEING FEW OF THE ENTRIES
print("Few dataset entries:")
print(data_df_train.head())
# DATASET SUMMARY
data_df_train.describe(include='all')

Shape of the dataset:
(413746, 9)
Column names:
Index(['Pd_id', 'User_id', 'Review_id', 'Product Name', 'Brand Name', 'Price',
       'Rating', 'Reviews', 'Review Votes'],
      dtype='object')
Datatype of each column:
Pd_id             int64
User_id           int64
Review_id         int64
Product Name     object
Brand Name       object
Price           float64
Rating            int64
Reviews          object
Review Votes    float64
dtype: object
Few dataset entries:
    Pd_id  ...  Review Votes
0  413841  ...           1.0
1  413842  ...           0.0
2  413843  ...           0.0
3  413844  ...           0.0
4  413845  ...           0.0

[5 rows x 9 columns]


Unnamed: 0,Pd_id,User_id,Review_id,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
count,413746.0,413746.0,413746.0,413746,348590,407814.0,413746.0,413684,401451.0
unique,,,,4410,384,,,162483,
top,,,,Apple iPhone 4s 8GB Unlocked Smartphone w/ 8MP...,Samsung,,,Good,
freq,,,,1451,65730,,,2879,
mean,206935.91875,206935.91875,206935.91875,,,226.868381,3.819331,,1.507581
std,119464.473511,119464.473511,119464.473511,,,273.015107,1.548296,,9.164886
min,16.0,16.0,16.0,,,1.73,1.0,,0.0
25%,103483.25,103483.25,103483.25,,,79.99,3.0,,0.0
50%,206927.5,206927.5,206927.5,,,144.71,5.0,,0.0
75%,310395.75,310395.75,310395.75,,,269.99,5.0,,1.0


In [7]:
# Checking if data set is balanced for all the ratings
print('Total no. of training records for Rating 5:', len(data_df_train[(data_df_train['Rating']==5)]))
print('Total no. of training records for Rating 4:',len(data_df_train[(data_df_train['Rating']==4)]))
print('Total no. of training records for Rating 3:',len(data_df_train[(data_df_train['Rating']==3)]))
print('Total no. of training records for Rating 2:',len(data_df_train[(data_df_train['Rating']==2)]))
print('Total no. of training records for Rating 1:',len(data_df_train[(data_df_train['Rating']==1)]))

Total no. of training records for Rating 5: 223518
Total no. of training records for Rating 4: 61387
Total no. of training records for Rating 3: 31763
Total no. of training records for Rating 2: 24728
Total no. of training records for Rating 1: 72350


In [0]:
# Taking a copy of the train and test data set
train_original = data_df_train.copy() 
test_original = data_df_test.copy()

In [9]:
# Dropping rows with NA
data_df_train = data_df_train.dropna(axis = 0, how ='any')
print(data_df_train.shape)

(334257, 9)


In [10]:
# Viewing the test data(Reviews to be rated)
test = pd.DataFrame(data_df_test, columns=['Reviews','Rating'])
test

Unnamed: 0,Reviews,Rating
0,This is the best phone I've ever used. The cam...,
1,Worth it's price. Better than the previous gen...,
2,Great value for money. Good phone.,
3,Worst phone. I did not like the phone. Google ...,
4,Compared to Pixel XL seems to be good. The bat...,
5,Not a good phone,
6,Worst phone. Poor batery performance. The phon...,
7,not good. Not bad,
8,worst phone,
9,I like it. Has been one week since I bought it...,


In [0]:
#Taking only the necessary columns for train data
X = data_df_train['Reviews']
y = data_df_train['Rating']

In [0]:
# Split the dataset into training and validation datasets 

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(X, y, test_size=0.3, random_state=42)

#Test data
test_x = test['Reviews']
test_y = test['Rating']

# Label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)
test_y = encoder.fit_transform(test_y)

# ***2. Feature Engineering***

***2.1 Count Vectors as Features***

In [0]:
# create a count vectorizer object 

count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
ctrain = count_vect.fit(train_x)

# transform the training and validation data using count vectorizer object

xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)
xtest_count =  count_vect.transform(test_x)

***2.2 TF-IDF Vectors as features***

In [0]:
# word level tf-idf

tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train_x)
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)
xtest_tfidf =  tfidf_vect.transform(test_x)

In [0]:
# ngram level tf-idf 

tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(train_x)
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_x)

In [16]:
# characters level tf-idf

tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(train_x)
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 
xtest_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_x)



# ***3. Model Building*** 

***3.1 Naive Bayes***

In [17]:
# Naive Bayes on Count Vectors

random.seed(100)
classifier = naive_bayes.MultinomialNB()
clf = classifier.fit(xtrain_count, train_y)
predictions = clf.predict(xvalid_count)
predictions_test = clf.predict(xtest_count)
classification = metrics.classification_report(predictions, valid_y)
print(classification)

              precision    recall  f1-score   support

           0       0.83      0.67      0.74     21303
           1       0.17      0.58      0.26      1818
           2       0.25      0.50      0.34      3940
           3       0.29      0.49      0.36      8914
           4       0.93      0.78      0.85     64303

    accuracy                           0.72    100278
   macro avg       0.49      0.60      0.51    100278
weighted avg       0.81      0.72      0.75    100278



In [18]:
# Naive Bayes on Word Level TF IDF Vectors

random.seed(100)
classifier = naive_bayes.MultinomialNB()
clf = classifier.fit(xtrain_tfidf, train_y)
predictions = clf.predict(xvalid_tfidf)
predictions_test = clf.predict(xtest_tfidf)
classification = metrics.classification_report(predictions, valid_y)
print(classification)

              precision    recall  f1-score   support

           0       0.81      0.65      0.72     21431
           1       0.05      0.63      0.09       450
           2       0.09      0.49      0.15      1422
           3       0.18      0.41      0.25      6526
           4       0.95      0.72      0.82     70449

    accuracy                           0.68    100278
   macro avg       0.41      0.58      0.40    100278
weighted avg       0.85      0.68      0.75    100278



In [19]:
# Predicted Ratings (Scale of 0-4)
predictions_test

array([4, 4, 4, 0, 4, 4, 0, 0, 0, 4, 4, 4, 4, 4, 4])

***Found the Naive Bayes on Ngram Level TF IDF Vectors model to be the best Naive Bayes model as the precision for all ratings are higher compared to other models.***

In [20]:
# Naive Bayes on Ngram Level TF IDF Vectors

random.seed(100)
classifier = naive_bayes.MultinomialNB()
clf = classifier.fit(xtrain_tfidf_ngram, train_y)
predictions = clf.predict(xvalid_tfidf_ngram)
predictions_test = clf.predict(xtest_tfidf_ngram)
classification = metrics.classification_report(predictions, valid_y)
print(classification)

              precision    recall  f1-score   support

           0       0.76      0.65      0.70     20275
           1       0.06      0.49      0.10       681
           2       0.15      0.44      0.22      2700
           3       0.22      0.46      0.29      7054
           4       0.94      0.73      0.82     69568

    accuracy                           0.69    100278
   macro avg       0.43      0.55      0.43    100278
weighted avg       0.83      0.69      0.74    100278



In [21]:
# Predicted Ratings(Scale of 0-4)
predictions_test

array([4, 4, 4, 0, 4, 3, 0, 2, 0, 4, 3, 4, 4, 4, 4])

In [22]:
# Converting the predicted ratings to a scale of 1-5
predictions_test = [i+1 for i in predictions_test]
predictions_test

[5, 5, 5, 1, 5, 4, 1, 3, 1, 5, 4, 5, 5, 5, 5]

In [23]:
# Reviews and Predicted Ratings
test['Rating'] = predictions_test
test

Unnamed: 0,Reviews,Rating
0,This is the best phone I've ever used. The cam...,5
1,Worth it's price. Better than the previous gen...,5
2,Great value for money. Good phone.,5
3,Worst phone. I did not like the phone. Google ...,1
4,Compared to Pixel XL seems to be good. The bat...,5
5,Not a good phone,4
6,Worst phone. Poor batery performance. The phon...,1
7,not good. Not bad,3
8,worst phone,1
9,I like it. Has been one week since I bought it...,5


In [0]:
# Saving the Predicted ratings as per the format of Review table

submission=pd.read_csv("Sample_Submission.csv")
submission['Rating']=predictions_test 
submission['Pd_id']=test_original['Pd_id']
submission['User_id']=test_original['User_id']
submission['Review_id']=test_original['Review_id']
submission['Reviews']=test_original['Reviews']
pd.DataFrame(submission, columns=['Pd_id','User_id','Review_id','Reviews','Rating']).to_csv('Review_new.csv')

In [26]:
# Naive Bayes on Character Level TF IDF Vectors

classifier = naive_bayes.MultinomialNB()
clf = classifier.fit(xtrain_tfidf_ngram_chars, train_y)
predictions = clf.predict(xvalid_tfidf_ngram_chars)
predictions_test = clf.predict(xtest_tfidf_ngram_chars)
classification = metrics.classification_report(predictions, valid_y)
print(classification)

              precision    recall  f1-score   support

           0       0.81      0.57      0.67     24742
           1       0.02      0.42      0.03       225
           2       0.10      0.35      0.16      2300
           3       0.25      0.35      0.30     10904
           4       0.87      0.76      0.81     62107

    accuracy                           0.66    100278
   macro avg       0.41      0.49      0.39    100278
weighted avg       0.77      0.66      0.70    100278



In [27]:
# Predicted Ratings(Scale of 0-4)
predictions_test

array([4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 3, 4, 4, 4, 4])

***3.2 Linear Classifier***

In [28]:
# Linear Classifier on Count Vectors

classifier = linear_model.LogisticRegression(solver='sag', max_iter = 3000)
clf = classifier.fit(xtrain_count, train_y)
predictions = clf.predict(xvalid_count)
predictions_test = clf.predict(xtest_count)
classification = metrics.classification_report(predictions, valid_y)
print(classification)

              precision    recall  f1-score   support

           0       0.84      0.71      0.77     20404
           1       0.20      0.56      0.30      2236
           2       0.28      0.52      0.37      4298
           3       0.25      0.56      0.35      6805
           4       0.96      0.78      0.86     66535

    accuracy                           0.73    100278
   macro avg       0.51      0.62      0.53    100278
weighted avg       0.84      0.73      0.77    100278



In [29]:
# Predicted Ratings(Scale of 0-4)
predictions_test

array([4, 4, 4, 0, 4, 4, 0, 0, 0, 4, 3, 4, 4, 4, 4])

In [30]:
# Linear Classifier on Word Level TF IDF Vectors

classifier = linear_model.LogisticRegression(solver='sag', max_iter = 3000)
clf = classifier.fit(xtrain_tfidf, train_y)
predictions = clf.predict(xvalid_tfidf)
accuracy = metrics.accuracy_score(predictions, valid_y)
print ("LR Validation, WordLevel TF-IDF: ", accuracy)
predictions_test = clf.predict(xtest_tfidf)

LR Validation, WordLevel TF-IDF:  0.7354654061708451


In [31]:
# Predicted Ratings(Scale of 0-4)
predictions_test

array([4, 4, 4, 0, 3, 0, 0, 0, 0, 4, 3, 4, 4, 4, 4])

In [32]:
# Linear Classifier on Ngram Level TF IDF Vectors

classifier = linear_model.LogisticRegression(solver='sag', max_iter = 3000)
clf = classifier.fit(xtrain_tfidf_ngram, train_y)
predictions = clf.predict(xvalid_tfidf_ngram)
accuracy = metrics.accuracy_score(predictions, valid_y)
print ("LR Validation, N-Gram Vectors: ", accuracy)
predictions_test = clf.predict(xtest_tfidf_ngram)

LR Validation, N-Gram Vectors:  0.7180936995153473


In [33]:
# Predicted Ratings(Scale of 0-4)
predictions_test

array([4, 4, 4, 0, 4, 0, 0, 2, 0, 4, 3, 4, 4, 4, 4])

In [34]:
# Linear Classifier on Character Level TF IDF Vectors

classifier = linear_model.LogisticRegression(solver='sag', max_iter = 500)
clf = classifier.fit(xtrain_tfidf_ngram_chars, train_y)
predictions = clf.predict(xvalid_tfidf_ngram_chars)
accuracy = metrics.accuracy_score(predictions, valid_y)
print ("LR Validation, CharLevel Vectors: ", accuracy)
predictions_test = clf.predict(xtest_tfidf_ngram_chars)

LR Validation, CharLevel Vectors:  0.7296316240850436


In [35]:
# Predicted Ratings(Scale of 0-4)
predictions_test

array([4, 4, 4, 4, 3, 3, 0, 0, 0, 4, 3, 4, 4, 4, 4])