In [60]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
data = pd.read_csv('chrome_reviews.csv')

In [20]:
data.head(1)

Unnamed: 0,ID,Review URL,Text,Star,Thumbs Up,User Name,Developer Reply,Version,Review Date,App ID
0,3886,https://play.google.com/store/apps/details?id=...,This is very helpfull aap.,5,0,INDIAN Knowledge,,83.0.4103.106,2020-12-19,com.android.chrome


In [21]:
data.shape

(7204, 10)

In [22]:
cols = ['ID','Review URL','Thumbs Up','User Name','Developer Reply','Version','Review Date','App ID']

In [23]:
data.drop(cols, axis=1, inplace=True)

In [24]:
data.head()

Unnamed: 0,Text,Star
0,This is very helpfull aap.,5
1,Good,3
2,Not able to update. Neither able to uninstall.,1
3,Nice app,4
4,Many unwanted ads,1


In [25]:
data['Star'].value_counts()

5    3871
1    1894
4     652
3     451
2     336
Name: Star, dtype: int64

In [26]:
data.dropna(inplace=True)
data = data[data['Star']!=3]
data['label'] = np.where(data['Star']>3,1,0)
data.drop('Star', axis=1, inplace=True)
data.head()

Unnamed: 0,Text,label
0,This is very helpfull aap.,1
2,Not able to update. Neither able to uninstall.,0
3,Nice app,1
4,Many unwanted ads,0
5,This app good,1


In [27]:
data.shape

(6752, 2)

In [28]:
data['label'].value_counts()

1    4522
0    2230
Name: label, dtype: int64

In [29]:
stop_words = set(stopwords.words('english'))
stop_words.remove('not')
stop_words.remove('nor')

In [30]:
ps = PorterStemmer()
def clean_text(text):
  text = re.sub('[^A-Za-z]',' ',text)
  text = text.lower()
  word = nltk.word_tokenize(text)
  word = [ps.stem(j) for j in word if j not in stop_words]
  return (' '.join(word))

In [31]:
cl_text = [clean_text(i) for i in data['Text']]

In [32]:
data['clean_text'] = cl_text
data.drop('Text', axis=1, inplace=True)

In [33]:
data.head()

Unnamed: 0,label,clean_text
0,1,helpful aap
2,0,not abl updat neither abl uninstal
3,1,nice app
4,0,mani unwant ad
5,1,app good


In [34]:
data.shape

(6752, 2)

In [35]:
data.isnull().any()

label         False
clean_text    False
dtype: bool

In [36]:
X = data['clean_text']
y = data['label']

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

In [38]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(6076,) (676,) (6076,) (676,)


In [39]:
one_per_tr = (np.sum(y_train==1) / len(y_train))*100
zero_per_tr = (np.sum(y_train==0) / len(y_train))*100
one_per_te = (np.sum(y_test==1) / len(y_test))*100
zero_per_te = (np.sum(y_test==0) / len(y_test))*100

print('{}% 1 and {}% 0 is in train dataset'.format(one_per_tr, zero_per_tr))
print('{}% 1 and {}% 0 is in test dataset'.format(one_per_te, zero_per_te))

67.3633969716919% 1 and 32.636603028308095% 0 is in train dataset
63.46153846153846% 1 and 36.53846153846153% 0 is in test dataset


In [40]:
tf_idf = TfidfVectorizer(max_features=2000, min_df=2, ngram_range=(1,2))
tf_idf.fit(X_train)
X_train_tfidf = tf_idf.fit_transform(X_train)
X_test_tfidf = tf_idf.transform(X_test)

In [44]:
pickle.dump(tf_idf.vocabulary_,open('tfidf.pkl','wb'))

In [45]:
X_train_tfidf.shape

(6076, 2000)

In [48]:
X_test_tfidf.shape

(676, 2000)

In [47]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier

### Naive Bayes

In [49]:
param_nb = {'alpha' : [0.01,0.1,1,10]}
model_nb = MultinomialNB()
clf_nb = GridSearchCV(model_nb, param_nb, cv=5 ,return_train_score=True)
clf_nb.fit(X_train_tfidf, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.01, 0.1, 1, 10]}, pre_dispatch='2*n_jobs',
             refit=True, return_train_score=True, scoring=None, verbose=0)

In [50]:
y_pred_tr = clf_nb.predict(X_train_tfidf)
y_pred_te = clf_nb.predict(X_test_tfidf)

In [51]:
cm_tr = confusion_matrix(y_train, y_pred_tr)
acc_tr = accuracy_score(y_train, y_pred_tr)

cm_te = confusion_matrix(y_test, y_pred_te)
acc_te = accuracy_score(y_test, y_pred_te)
print('Train confusion matrix: ')
print(cm_tr)
print('Test confusion matrix: ')
print(cm_te)
print('Train accuracy: ',acc_tr,' Testaccuracy: ',acc_te)

Train confusion matrix: 
[[1603  380]
 [ 235 3858]]
Test confusion matrix: 
[[181  66]
 [ 32 397]]
Train accuracy:  0.8987820934825543  Testaccuracy:  0.8550295857988166


In [52]:
clf_nb.best_params_

{'alpha': 1}

### KNN

In [330]:
param_knn = {'n_neighbors' : [1,5,10,20,50]}
model_knn = KNeighborsClassifier()
clf_knn = GridSearchCV(model_knn, param_knn, cv=5 ,return_train_score=True)
clf_knn.fit(X_train_tfidf, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [1, 5, 10, 20, 50]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [331]:
y_pred_tr = clf_knn.predict(X_train_tfidf)
y_pred_te = clf_knn.predict(X_test_tfidf)

In [332]:
cm_tr = confusion_matrix(y_train, y_pred_tr)
acc_tr = accuracy_score(y_train, y_pred_tr)

cm_te = confusion_matrix(y_test, y_pred_te)
acc_te = accuracy_score(y_test, y_pred_te)
print('Train confusion matrix: ')
print(cm_tr)
print('Test confusion matrix: ')
print(cm_te)
print('Train accuracy: ',acc_tr,' Testaccuracy: ',acc_te)

Train confusion matrix: 
[[1007 1016]
 [ 142 3911]]
Test confusion matrix: 
[[ 85 122]
 [ 20 449]]
Train accuracy:  0.8094140882159315  Testaccuracy:  0.7899408284023669


### Logistic Regression

In [333]:
param_lr = {'C' : [0.0001, 0.001, 0.01,0.1,1,10,100]}
model_lr = LogisticRegression()
clf_lr = GridSearchCV(model_lr, param_lr, cv=5 ,return_train_score=True)
clf_lr.fit(X_train_tfidf, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [334]:
y_pred_tr = clf_lr.predict(X_train_tfidf)
y_pred_te = clf_lr.predict(X_test_tfidf)

In [335]:
cm_tr = confusion_matrix(y_train, y_pred_tr)
acc_tr = accuracy_score(y_train, y_pred_tr)

cm_te = confusion_matrix(y_test, y_pred_te)
acc_te = accuracy_score(y_test, y_pred_te)
print('Train confusion matrix: ')
print(cm_tr)
print('Test confusion matrix: ')
print(cm_te)
print('Train accuracy: ',acc_tr,' Testaccuracy: ',acc_te)

Train confusion matrix: 
[[1592  431]
 [ 213 3840]]
Test confusion matrix: 
[[159  48]
 [ 28 441]]
Train accuracy:  0.8940092165898618  Testaccuracy:  0.8875739644970414


### SVC

In [336]:
param_svm = {'alpha' : [0.0001, 0.001, 0.01,0.1,1]}
model_svm = SGDClassifier(loss='hinge')
clf_svm = GridSearchCV(model_svm, param_svm, cv=5 ,return_train_score=True)
clf_svm.fit(X_train_tfidf, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5,
                                     random_state=None, shuffle=True, tol=0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)

In [337]:
y_pred_tr = clf_svm.predict(X_train_tfidf)
y_pred_te = clf_svm.predict(X_test_tfidf)

In [338]:
cm_tr = confusion_matrix(y_train, y_pred_tr)
acc_tr = accuracy_score(y_train, y_pred_tr)

cm_te = confusion_matrix(y_test, y_pred_te)
acc_te = accuracy_score(y_test, y_pred_te)
print('Train confusion matrix: ')
print(cm_tr)
print('Test confusion matrix: ')
print(cm_te)
print('Train accuracy: ',acc_tr,' Testaccuracy: ',acc_te)

Train confusion matrix: 
[[1677  346]
 [ 186 3867]]
Test confusion matrix: 
[[157  50]
 [ 34 435]]
Train accuracy:  0.9124423963133641  Testaccuracy:  0.8757396449704142


# Naive Bayes classifier with alpha=1 is best classifier

In [53]:
import pickle

In [54]:
model = MultinomialNB(alpha=1)
model.fit(X_train_tfidf.toarray(), y_train)

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [55]:
pickle.dump(model, open('model.pkl','wb'))

In [56]:
model = pickle.load(open('model.pkl','rb'))

In [57]:
tfidf = TfidfVectorizer(vocabulary=pickle.load(open("tfidf.pkl", "rb")))

In [61]:
transformer = TfidfTransformer()

In [71]:
temp = transformer.fit_transform(tfidf.fit_transform(["aaa ccc eee"]))

In [69]:
temp.toarray().shape

(1, 2000)

In [73]:
model.predict(temp)[0]

1