In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
from pickle import dump

In [2]:
# Load the Play Store reviews from disk and preview the first rows.
reviews_csv = 'data/playstore_reviews.csv'
df = pd.read_csv(reviews_csv)
df.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [3]:
# The package name is not used as a feature; keep only the text and the label.
df = df.drop(columns=['package_name'])
df.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0


In [4]:
# Normalise the review text: trim surrounding whitespace, then lower-case it.
cleaned_reviews = df['review'].str.strip().str.lower()
df['review'] = cleaned_reviews
df.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [5]:
# Class balance of the target column.
#   0 = Negative
#   1 = Positive
polarity_counts = df['polarity'].value_counts()
polarity_counts

polarity
0    584
1    307
Name: count, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

# Feature is the review text; target is the polarity label.
X, y = df['review'], df['polarity']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

### Vectorize Data

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts with English stop words removed. The vocabulary is
# learned from the training reviews only and then reused for the test reviews.
vec_model = CountVectorizer(stop_words="english")
train_counts = vec_model.fit_transform(X_train)
test_counts = vec_model.transform(X_test)

# Replace the text splits with dense count arrays.
# NOTE: this overwrites X_train / X_test, so re-running this cell requires
# re-running the train/test split cell first.
X_train = train_counts.toarray()
X_test = test_counts.toarray()

X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Bernoulli Model

In [8]:
# Baseline Bernoulli Naive Bayes with default hyperparameters.
# fit() returns the estimator itself, so the fitted model is what gets displayed.
model_B = BernoulliNB().fit(X_train, y_train)
model_B

In [9]:
# Predicted polarity for every held-out review (baseline Bernoulli model).
y_pred_B = model_B.predict(X_test)
y_pred_B

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0])

In [10]:
# Score the baseline Bernoulli predictions against the held-out labels.
accuracy_B = accuracy_score(y_test, y_pred_B)
report_B = classification_report(y_test, y_pred_B)

print(f'Bernoulli Accuracy Score:\n {accuracy_B}')
print(report_B)

Bernoulli Accuracy Score:
 0.770949720670391
              precision    recall  f1-score   support

           0       0.79      0.93      0.85       126
           1       0.70      0.40      0.51        53

    accuracy                           0.77       179
   macro avg       0.74      0.66      0.68       179
weighted avg       0.76      0.77      0.75       179



### Gaussian Model

In [11]:
# Baseline Gaussian Naive Bayes with default hyperparameters.
# fit() returns the estimator itself, so the fitted model is what gets displayed.
model_G = GaussianNB().fit(X_train, y_train)
model_G

In [12]:
# Predicted polarity for every held-out review (baseline Gaussian model).
y_pred_G = model_G.predict(X_test)
y_pred_G

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0])

In [13]:
# Score the baseline Gaussian predictions against the held-out labels.
accuracy_G = accuracy_score(y_test, y_pred_G)
report_G = classification_report(y_test, y_pred_G)

print(f'Gaussian Accuracy Score:\n {accuracy_G}')
print(report_G)

Gaussian Accuracy Score:
 0.8044692737430168
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179



### Multinomial Model

In [14]:
# Baseline Multinomial Naive Bayes with default hyperparameters.
# fit() returns the estimator itself, so the fitted model is what gets displayed.
model_M = MultinomialNB().fit(X_train, y_train)
model_M

In [15]:
# Predicted polarity for every held-out review (baseline Multinomial model).
y_pred_M = model_M.predict(X_test)
y_pred_M

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [16]:
# Side-by-side comparison of the three baseline models on the held-out set.
# The reports are kept under their usual names; the printing is done in one
# loop instead of three copy-pasted stanzas so the format cannot drift.
report_M = classification_report(y_test, y_pred_M)
report_G = classification_report(y_test, y_pred_G)
report_B = classification_report(y_test, y_pred_B)

baseline_results = (
    ('Multinomial', y_pred_M, report_M),
    ('Gaussian', y_pred_G, report_G),
    ('Bernoulli', y_pred_B, report_B),
)

for model_name, y_pred, report in baseline_results:
    print(f'{model_name} Accuracy Score:\n {accuracy_score(y_test, y_pred)}')
    print(report)
    print()

Multinomial Accuracy Score:
 0.8156424581005587
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179

Gaussian Accuracy Score:
 0.8044692737430168
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179

Bernoulli Accuracy Score:
 0.770949720670391
              precision    recall  f1-score   support

           0       0.79      0.93      0.85       126
           1       0.70      0.40      0.51        53

    accuracy                           0.77 

### Optimized Models

### Optimized Multinomial Model

In [17]:
# Search space for MultinomialNB: 200 evenly spaced smoothing values in
# [0.001, 10] and both settings of fit_prior.
hyperparameters = dict(
    alpha=np.linspace(0.001, 10.0, 200),
    fit_prior=[True, False],
)

# Randomly sample 10 candidates and score each with 5-fold cross-validated
# accuracy; the fixed seed makes the sampled candidates repeatable.
random_search_m = RandomizedSearchCV(
    estimator=model_M,
    param_distributions=hyperparameters,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    random_state=42,
)
random_search_m

In [18]:
# Run the randomized search on the training split and report the winning candidate.
random_search_m.fit(X_train, y_train)
best_params_M = random_search_m.best_params_
print(f'Best Hyperparameters:\n{best_params_M}')


Best Hyperparameters:
{'fit_prior': False, 'alpha': np.float64(2.312326633165829)}


In [19]:
# Build the tuned Multinomial model directly from the search result rather
# than from values copied by hand out of the printed output, so this cell
# stays correct if the search space, seed or data changes on a re-run.
opt_model_M = MultinomialNB(**random_search_m.best_params_)
opt_model_M.fit(X_train, y_train)
opt_y_pred_M = opt_model_M.predict(X_test)

In [20]:
# Baseline vs. tuned Multinomial model on the held-out set.
baseline_accuracy_M = accuracy_score(y_test, y_pred_M)
report_M = classification_report(y_test, y_pred_M)
print(f'Multinomial Accuracy Score:\n {baseline_accuracy_M}')
print(report_M)

optimized_accuracy_M = accuracy_score(y_test, opt_y_pred_M)
opt_report_M = classification_report(y_test, opt_y_pred_M)
print(f'Optimized Accuracy Score:\n {optimized_accuracy_M}')
print(opt_report_M)

Multinomial Accuracy Score:
 0.8156424581005587
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179

Optimized Accuracy Score:
 0.8268156424581006
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       126
           1       0.74      0.64      0.69        53

    accuracy                           0.83       179
   macro avg       0.80      0.77      0.78       179
weighted avg       0.82      0.83      0.82       179



### Optimized Bernoulli Model

In [21]:
from sklearn.model_selection import GridSearchCV

# Candidate smoothing strengths and binarization thresholds for BernoulliNB.
param_grid_B = dict(
    alpha=[0.1, 0.5, 1.0, 2.0, 100],
    binarize=[0.01, 0.0, 0.5, 1.0, 10],
)

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy.
grid_search_B = GridSearchCV(
    estimator=model_B,
    param_grid=param_grid_B,
    scoring='accuracy',
    cv=5,
)
grid_search_B

In [22]:
# Run the grid search on the training split and report the winning combination.
grid_search_B.fit(X_train, y_train)
best_params_B = grid_search_B.best_params_
print(f'Best Hyperparameters:\n{best_params_B}')

Best Hyperparameters:
{'alpha': 0.1, 'binarize': 0.01}


In [23]:
# Build the tuned Bernoulli model from the grid-search result itself.
# The previous hand-typed values used binarize=0.0 although the search
# selected binarize=0.01; unpacking best_params_ removes that mismatch and
# keeps this cell in sync with whatever the search finds on a re-run.
# (On integer word counts both thresholds map "count > 0" to 1.)
opt_model_B = BernoulliNB(**grid_search_B.best_params_)
opt_model_B.fit(X_train, y_train)
opt_y_pred_B = opt_model_B.predict(X_test)

In [24]:
# Baseline vs. tuned Bernoulli model on the held-out set.
baseline_accuracy_B = accuracy_score(y_test, y_pred_B)
report_B = classification_report(y_test, y_pred_B)
print(f'Bernoulli Accuracy Score:\n {baseline_accuracy_B}')
print(report_B)

optimized_accuracy_B = accuracy_score(y_test, opt_y_pred_B)
opt_report_B = classification_report(y_test, opt_y_pred_B)
print(f'Optimized Accuracy Score:\n {optimized_accuracy_B}')
print(opt_report_B)

Bernoulli Accuracy Score:
 0.770949720670391
              precision    recall  f1-score   support

           0       0.79      0.93      0.85       126
           1       0.70      0.40      0.51        53

    accuracy                           0.77       179
   macro avg       0.74      0.66      0.68       179
weighted avg       0.76      0.77      0.75       179

Optimized Accuracy Score:
 0.8324022346368715
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       126
           1       0.71      0.74      0.72        53

    accuracy                           0.83       179
   macro avg       0.80      0.80      0.80       179
weighted avg       0.83      0.83      0.83       179



### Optimized Gaussian Model

In [25]:
# Candidate variance-smoothing values for GaussianNB.
param_grid_G = dict(
    var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6, 1e-5],
)

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy.
grid_search_G = GridSearchCV(
    estimator=model_G,
    param_grid=param_grid_G,
    scoring='accuracy',
    cv=5,
)
grid_search_G

In [26]:
# Run the grid search on the training split and report the winning value.
grid_search_G.fit(X_train, y_train)
best_params_G = grid_search_G.best_params_
print(f'Best Hyperparameters:\n{best_params_G}')

Best Hyperparameters:
{'var_smoothing': 1e-09}


In [27]:
# Build the tuned Gaussian model from the grid-search result rather than a
# hand-copied literal, so the cell stays in sync with the search on re-runs.
opt_model_G = GaussianNB(**grid_search_G.best_params_)
opt_model_G.fit(X_train, y_train)
opt_y_pred_G = opt_model_G.predict(X_test)

In [28]:
# Baseline vs. tuned Gaussian model on the held-out set.
baseline_accuracy_G = accuracy_score(y_test, y_pred_G)
report_G = classification_report(y_test, y_pred_G)
print(f'GaussianNB Accuracy Score:\n {baseline_accuracy_G}')
print(report_G)

optimized_accuracy_G = accuracy_score(y_test, opt_y_pred_G)
opt_report_G = classification_report(y_test, opt_y_pred_G)
print(f'Optimized Accuracy Score:\n {optimized_accuracy_G}')
print(opt_report_G)

GaussianNB Accuracy Score:
 0.8044692737430168
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179

Optimized Accuracy Score:
 0.8044692737430168
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179



### The model with the highest test-set accuracy is the Optimized Bernoulli Model (≈83%)

In [30]:
# Persist the tuned Bernoulli model. The context manager guarantees the file
# handle is flushed and closed even if pickling fails part-way through.
# NOTE: the classifier was trained on the output of `vec_model`; that fitted
# vectorizer is not saved here and is needed to transform new review text.
with open("model/optimized_bernoulli_model-review_sentiment_analysis.sav", "wb") as model_file:
    dump(opt_model_B, model_file)