# Playstore Reviews

In [2]:
import logging
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import (
        accuracy_score,
        f1_score,
        matthews_corrcoef,
        classification_report,
        ConfusionMatrixDisplay )
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ( train_test_split, GridSearchCV )
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import ( MinMaxScaler, LabelEncoder, StandardScaler, OneHotEncoder )
# Grab the root logger (no name passed) and lower its threshold to INFO so
# informational messages are not filtered out by level.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

### **1. Data loading**
**Objective**: Obtain the data from its source and get a first glimpse of its properties and presentation

In [3]:
# Step 1: load the dataset directly from the raw GitHub CSV
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
data = pd.read_csv(DATA_URL)
data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


### **2. Data preprocessing**
**Objectives**: Perform the data cleaning, data transformation and data reduction steps to avoid mismatched, noisy or unwrangled data

In [4]:
# Step 2: study the variables and their content.
# Only `review` and `polarity` are used by the later cells, so the
# `package_name` column is removed here.
#
# The frame is reassigned instead of mutated with `inplace=True`, and
# `errors="ignore"` makes the drop a no-op when the column is already gone,
# so re-running this cell no longer raises KeyError.
data = data.drop(columns=["package_name"], errors="ignore")
data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0


In [5]:
# Text normalisation: trim leading/trailing whitespace from each review,
# then convert it to lowercase.
normalized_reviews = data["review"].str.strip().str.lower()
data["review"] = normalized_reviews
data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


### **3. Exploratory Data Analysis**
**Objective**: Summarize the main characteristics of the dataset using descriptive statistics and data visualization methods

In [6]:
# Divide the dataset into training and test samples.
# `train_test_split` is already imported in the notebook's import cell, so it
# is not re-imported here.
X = data["review"]      # raw review text (features)
y = data["polarity"]    # target label

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [7]:
# Preview the first rows of the held-out test reviews
X_test.head()

709    love/hate has bug and security issues. i tried...
439    whatsapp i use this app now that blackberry me...
840                             usefully verry  nice app
720    fonts why in the heck is this thing analysing ...
39     app doesn't work after latest upgrade the face...
Name: review, dtype: object

### **4. Machine learning**
**Objective**: Create a model that learns from data to make predictions and generalize to unseen data, and thus perform tasks without explicit instructions

In [21]:
# Vectorisation (bag of words).
# X_train / X_test come from the train/test split cell earlier in the
# notebook (same columns, test_size=0.2, random_state=42), so the identical
# split is not recomputed here.
#
# `fit_transform` is called on the training reviews only and the test reviews
# go through `transform`, so the test set is encoded with the vocabulary
# learned from the training data.
vec_model = CountVectorizer(stop_words="english")
X_train_vec = vec_model.fit_transform(X_train)
X_test_vec = vec_model.transform(X_test)

In [None]:
# --- Train and evaluate the models ---
# All estimator classes and metric functions come from the notebook's import
# cell; nothing is imported locally.


def evaluate_classifier(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` and score its predictions on the evaluation split.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``.
    X_fit, y_fit
        Features and labels passed to ``model.fit``.
    X_eval, y_eval
        Features passed to ``model.predict`` and the labels the predictions
        are scored against.

    Returns
    -------
    tuple
        ``(y_pred, metrics)`` where ``metrics`` is a dict with the keys
        ``accuracy``, ``f1_score``, ``precision`` and ``recall``. The last
        three are computed with ``average='weighted'``.
    """
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    metrics = {
        'accuracy': accuracy_score(y_eval, y_pred),
        'f1_score': f1_score(y_eval, y_pred, average='weighted'),
        'precision': precision_score(y_eval, y_pred, average='weighted'),
        'recall': recall_score(y_eval, y_pred, average='weighted'),
    }
    return y_pred, metrics


# The Gaussian model is trained on dense arrays, so the count matrices are
# densified once here.
X_train_dense = X_train_vec.toarray()
X_test_dense = X_test_vec.toarray()

mnb_model = MultinomialNB()
bnb_model = BernoulliNB()
gnb_model = GaussianNB()
rf_model = RandomForestClassifier(random_state=42)

# Display name -> (estimator, training features, test features).
# Insertion order fixes the order of the printed report.
experiments = {
    'MultinomialNB': (mnb_model, X_train_vec, X_test_vec),
    'BernoulliNB': (bnb_model, X_train_vec, X_test_vec),
    'GaussianNB': (gnb_model, X_train_dense, X_test_dense),
    'RandomForest': (rf_model, X_train_vec, X_test_vec),
}

results = {}
predictions = {}
for model_name, (model, X_fit, X_eval) in experiments.items():
    predictions[model_name], results[model_name] = evaluate_classifier(
        model, X_fit, y_train, X_eval, y_test
    )

# Keep the per-model prediction variables available under their original names.
y_pred_mnb = predictions['MultinomialNB']
y_pred_bnb = predictions['BernoulliNB']
y_pred_gnb = predictions['GaussianNB']
y_pred_rf = predictions['RandomForest']

# Print the results
print("Resultados de los Modelos:")
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

Resultados de los Modelos:

MultinomialNB:
  accuracy: 0.8156
  f1_score: 0.8103
  precision: 0.8098
  recall: 0.8156

BernoulliNB:
  accuracy: 0.7709
  f1_score: 0.7488
  precision: 0.7600
  recall: 0.7709

GaussianNB:
  accuracy: 0.8045
  f1_score: 0.8015
  precision: 0.8000
  recall: 0.8045

RandomForest:
  accuracy: 0.7989
  f1_score: 0.8026
  precision: 0.8097
  recall: 0.7989


In [24]:
# Turn the nested {model: {metric: value}} dict into a table with one row per
# model and one column per metric, then print it under its heading.
results_df = pd.DataFrame.from_dict(results, orient='index')
print("\nTabla de Comparación de Resultados:", results_df, sep="\n")


Tabla de Comparación de Resultados:
               accuracy  f1_score  precision    recall
MultinomialNB  0.815642  0.810268   0.809751  0.815642
BernoulliNB    0.770950  0.748792   0.759998  0.770950
GaussianNB     0.804469  0.801531   0.800005  0.804469
RandomForest   0.798883  0.802642   0.809699  0.798883


### **5. Model saving**
**Objective**: Save the model and some critical information we consider necessary


In [25]:
# Persist each trained classifier together with the fitted vectorizer.
# `os` and `pickle` are imported in the notebook's import cell.
#
# Every pickle holds a dict {'model': ..., 'vectorizer': ...} so the text can
# be encoded with the same vocabulary at prediction time.
# NOTE: gnb_model was fitted on dense arrays (`.toarray()`), so the vectorizer
# output must be densified the same way before calling it.
# NOTE: only unpickle these files from a trusted location; `pickle.load` can
# execute arbitrary code.
output_dir = 'saved_models'
os.makedirs(output_dir, exist_ok=True)

# Target file name -> (classifier, vectorizer)
models_to_save = {
    'multinomial_nb_model.pkl': (mnb_model, vec_model),
    'bernoulli_nb_model.pkl': (bnb_model, vec_model),
    'gaussian_nb_model.pkl': (gnb_model, vec_model),
    'random_forest_model.pkl': (rf_model, vec_model)
}

for filename, (model, vectorizer) in models_to_save.items():
    filepath = os.path.join(output_dir, filename)
    with open(filepath, 'wb') as f:
        pickle.dump({'model': model, 'vectorizer': vectorizer}, f)
    print(f"Modelo y vectorizador guardados en: {filepath}")

Modelo y vectorizador guardados en: saved_models/multinomial_nb_model.pkl
Modelo y vectorizador guardados en: saved_models/bernoulli_nb_model.pkl
Modelo y vectorizador guardados en: saved_models/gaussian_nb_model.pkl
Modelo y vectorizador guardados en: saved_models/random_forest_model.pkl
