In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import classification_report
import joblib

In [2]:
url="https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
df=pd.read_csv(url)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [4]:
df

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0
...,...,...,...
886,com.rovio.angrybirds,loved it i loooooooooooooovvved it because it...,1
887,com.rovio.angrybirds,all time legendary game the birthday party le...,1
888,com.rovio.angrybirds,ads are way to heavy listen to the bad review...,0
889,com.rovio.angrybirds,fun works perfectly well. ads aren't as annoy...,1


In [5]:
df.drop(columns=['package_name'], inplace=True)

In [6]:
df['review'] = df['review'].str.strip().str.lower()

In [7]:
X = df['review']
y = df['polarity']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
#vectorizacion

v_model = CountVectorizer(stop_words= 'english')
X_train_count = v_model.fit_transform(X_train).toarray()
X_test_count = v_model.transform(X_test).toarray()

In [10]:
#GaussianNB model

gaussian_model = GaussianNB()
gaussian_model.fit(X_train_count, y_train)
y_pred_gaussian = gaussian_model.predict(X_test_count)
print(accuracy_score(y_test, y_pred_gaussian))
print(classification_report(y_test, y_pred_gaussian))

0.8044692737430168
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179



In [11]:
# MultinomialNB model

multinomial_model = MultinomialNB()
multinomial_model.fit(X_train_count, y_train)
y_pred_multin = multinomial_model.predict(X_test_count)
print(accuracy_score(y_test, y_pred_multin))
print(classification_report(y_test, y_pred_multin))


0.8156424581005587
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179



In [12]:
# BernoulliNB model

bernoulli_model = BernoulliNB()
bernoulli_model.fit(X_train_count, y_train)
y_pred_bernou = bernoulli_model.predict(X_test_count)
print(accuracy_score(y_test, y_pred_gaussian))
print(classification_report(y_test, y_pred_bernou))

0.8044692737430168
              precision    recall  f1-score   support

           0       0.79      0.93      0.85       126
           1       0.70      0.40      0.51        53

    accuracy                           0.77       179
   macro avg       0.74      0.66      0.68       179
weighted avg       0.76      0.77      0.75       179



In [13]:
# RandomForest
def grid_RandomForest(X_train_res, y_train_res):
    model = RandomForestClassifier(random_state=0, class_weight='balanced')
    n_estimators = [100, 200]
    criterion = ['gini', 'entropy']
    min_samples_split = [2, 5, 10, 20, 30]
    max_depth = [10, 20, 30, 50, None]
    min_samples_leaf = [1, 2, 4, 10]
    max_features = ['sqrt', 'log2']

    grid = dict(n_estimators=n_estimators, criterion=criterion, min_samples_split=min_samples_split, max_depth=max_depth, min_samples_leaf=min_samples_leaf, max_features=max_features)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)

    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='roc_auc', error_score='raise')
    grid_result = grid_search.fit(X_train_res, y_train_res)

    print("Random Forest ROC AUC Score: ", grid_result.best_score_)
    return grid_result.best_estimator_


In [14]:
model_rf = grid_RandomForest(X_train_count, y_train)
model_rf

Random Forest ROC AUC Score:  0.8719008131680767


In [15]:
y_pred_rf = model_rf.predict(X_test_count)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.85      0.86       126
           1       0.65      0.68      0.67        53

    accuracy                           0.80       179
   macro avg       0.76      0.76      0.76       179
weighted avg       0.80      0.80      0.80       179



In [16]:
roc_auc = roc_auc_score(y_test, model_rf.predict_proba(X_test_count)[:, 1])
print("Test ROC AUC Score: ", roc_auc)

Test ROC AUC Score:  0.852051512428871


In [17]:
joblib.dump(model_rf, 'best_rf_model.pkl')

['best_rf_model.pkl']