In [473]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
nltk.download('punkt')  # for tokenization
nltk.download('stopwords')  # for stopword removal
nltk.download('wordnet')  # for lemmatization


[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [474]:
df = pd.read_csv('data/raw/playstore_reviews.csv')
df

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


In [475]:

if 'package_name' in df.columns:
    df = df.drop(['package_name'], axis=1)
#df["review"] = df["review"].str.strip().str.lower()

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\W', ' ', text)      # Remove non-alphanumeric characters
    text = text.lower()                  # Convert to lowercase
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['review'] = df['review'].apply(preprocess_text)

df

Unnamed: 0,review,polarity
0,privacy least put option appear offline mean p...,0
1,messenger issue ever since last update initial...,0
2,profile time wife anybody one post view would ...,0
3,new feature suck u working back button guy mak...,0
4,forced reload uploading pic replying comment l...,0
...,...,...
886,loved loooooooooooooovvved incredible awesome ...,1
887,time legendary game birthday party level short...,1
888,ad way heavy listen bad review ad every round ...,0
889,fun work perfectly well ad annoying think espe...,1


In [476]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['review'])


X_train, X_test, y_train, y_test = train_test_split(X, df['polarity'], test_size=0.2, random_state=42)

X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 11811 stored elements and shape (712, 3345)>

In [477]:
#Feature selection

from sklearn.feature_selection import SelectKBest, chi2
selector = SelectKBest(chi2, k=1000)  # Select top 1000 features
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

## BernoulliNB model

In [478]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report

In [479]:
model = BernoulliNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7821229050279329
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.93      0.86       126
           1       0.72      0.43      0.54        53

    accuracy                           0.78       179
   macro avg       0.76      0.68      0.70       179
weighted avg       0.77      0.78      0.76       179



## Optimization

Grid Search

In [480]:
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0]}

model = BernoulliNB()

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-validation Accuracy:", grid_search.best_score_)

best_bnb = grid_search.best_estimator_

Best Parameters: {'alpha': 0.01}
Best Cross-validation Accuracy: 0.905919432679996


In [481]:
model = BernoulliNB(alpha=grid_search.best_params_['alpha'])
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8435754189944135
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.90      0.89       126
           1       0.76      0.70      0.73        53

    accuracy                           0.84       179
   macro avg       0.82      0.80      0.81       179
weighted avg       0.84      0.84      0.84       179



In [482]:
# Saving the model
import pickle


with open(f"models/BernoulliNB_alpha-{grid_search.best_params_["alpha"]}.pkl", 'wb') as file:
    pickle.dump(model, file)

Ensemble Stacking

In [483]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC

# Combine multiple models
estimators = [('nb', BernoulliNB()), ('svm', SVC())]

# Final estimator
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8100558659217877
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.91      0.87       126
           1       0.73      0.57      0.64        53

    accuracy                           0.81       179
   macro avg       0.78      0.74      0.75       179
weighted avg       0.80      0.81      0.80       179



Random Search

In [484]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import scipy.stats as stats

X_train, X_test, y_train, y_test = train_test_split(df['review'], df['polarity'], test_size=0.2, random_state=42)

# Define the pipeline with CountVectorizer and BernoulliNB
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(binary=True, stop_words='english')),
    ('bnb', BernoulliNB())
])

# Define the hyperparameter search space
param_distributions = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],  # Unigrams or Bigrams
    'vectorizer__max_df': stats.uniform(0.7, 0.3),  # Max document frequency between 0.7 and 1.0
    'vectorizer__min_df': stats.uniform(0.0, 0.1),  # Min document frequency between 0.0 and 0.1
    'bnb__alpha': stats.uniform(0.0, 1.0)  # Alpha between 0 and 1 (Laplace smoothing)
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(pipeline, param_distributions, n_iter=10, cv=5, verbose=1, random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best Cross-validation Score:", random_search.best_score_)

# Predict on the test set using the best estimator
y_pred = random_search.best_estimator_.predict(X_test)

# Evaluate the model
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'bnb__alpha': np.float64(0.596850157946487), 'vectorizer__max_df': np.float64(0.8337498258560773), 'vectorizer__min_df': np.float64(0.00999749158180029), 'vectorizer__ngram_range': (1, 1)}
Best Cross-validation Score: 0.8089333202009259
Test Accuracy: 0.7821229050279329
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.77      0.83       126
           1       0.60      0.81      0.69        53

    accuracy                           0.78       179
   macro avg       0.75      0.79      0.76       179
weighted avg       0.81      0.78      0.79       179

