### Proyecto Naive Bayes

#### 0.- Importación de librerías

In [63]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

#### 1.- Carga y guardado del conjunto de datos

Leer el archivo CSV y guardar una copia en `data/raw`

In [64]:
# Remote CSV with the Play Store reviews, and the local path for the raw copy.
DATA_URL = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
RAW_DATA_PATH = Path('/workspaces/Tutorial-del-Proyecto-Naive-Bayes/data/raw/playstore_reviews_guardado.csv')

total_data = pd.read_csv(DATA_URL)

# to_csv does not create missing directories, so make sure the target folder
# exists before writing the raw copy.
RAW_DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
total_data.to_csv(RAW_DATA_PATH, index=False)
total_data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


#### 2.- Estudio de variables y su contenido + Limpieza

In [65]:
# Drop the package name column. errors='ignore' keeps this cell re-runnable:
# total_data is reassigned here, so on a second run the column is already gone
# and a plain drop() would raise KeyError.
total_data = total_data.drop(columns=['package_name'], errors='ignore')
# Normalize the review text: trim surrounding whitespace and lowercase it.
total_data["review"] = total_data["review"].str.strip().str.lower()
total_data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


Dividir el conjunto de datos en train y test

In [66]:
# Feature (review text) and target (polarity label).
X, y = total_data['review'], total_data['polarity']

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts with English stop words removed. The vocabulary is
# learned from the training split only and then reused on the test split.
# The sparse results are converted to dense arrays with .toarray().
vec_model = CountVectorizer(stop_words='english')
X_train_vec = vec_model.fit_transform(X_train).toarray()
X_test_vec = vec_model.transform(X_test).toarray()

#### 3.- Construir modelos Naive Bayes

In [67]:
# BernoulliNB: fit on the training counts, then score on the held-out split.
bernoulli_nb = BernoulliNB().fit(X_train_vec, y_train)
y_pred_bernoulli = bernoulli_nb.predict(X_test_vec)

print(
    "\nBernoulliNB Accuracy:",
    accuracy_score(y_test, y_pred_bernoulli),
)
print(classification_report(y_test, y_pred_bernoulli))


BernoulliNB Accuracy: 0.770949720670391
              precision    recall  f1-score   support

           0       0.79      0.93      0.85       126
           1       0.70      0.40      0.51        53

    accuracy                           0.77       179
   macro avg       0.74      0.66      0.68       179
weighted avg       0.76      0.77      0.75       179



In [68]:
# GaussianNB: fit on the dense training matrix and evaluate on the test split.
gaussian_nb = GaussianNB().fit(X_train_vec, y_train)
y_pred_gaussian = gaussian_nb.predict(X_test_vec)

print(
    "\nGaussianNB Accuracy:",
    accuracy_score(y_test, y_pred_gaussian),
)
print(classification_report(y_test, y_pred_gaussian))

# MultinomialNB: same features and same evaluation as above.
multi_nb = MultinomialNB().fit(X_train_vec, y_train)
y_pred_multi = multi_nb.predict(X_test_vec)

print(
    "MultinomialNB Accuracy:",
    accuracy_score(y_test, y_pred_multi),
)
print(classification_report(y_test, y_pred_multi))


GaussianNB Accuracy: 0.8044692737430168
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179

MultinomialNB Accuracy: 0.8156424581005587
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       126
           1       0.73      0.60      0.66        53

    accuracy                           0.82       179
   macro avg       0.79      0.75      0.77       179
weighted avg       0.81      0.82      0.81       179



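Vemos que el MultinomialNB es el más efectivo de los tres (accuracy de 0.816, frente a 0.804 de GaussianNB y 0.771 de BernoulliNB), por lo que es el modelo que se guarda más adelante.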

#### 5.- Optimizaciones y guardado

In [69]:
# Random Forest trained on the same bag-of-words features, for comparison.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_vec, y_train)
y_pred_rf = rf_model.predict(X_test_vec)

print("\nRandom Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# Persist the MultinomialNB model together with the fitted vectorizer it was
# trained with. The target folder is created first so the dumps do not fail
# on a fresh checkout where it does not exist yet.
MODELS_DIR = Path('/workspaces/Tutorial-del-Proyecto-Naive-Bayes/models')
MODELS_DIR.mkdir(parents=True, exist_ok=True)
# Paths are passed as str so the returned (and displayed) file list is a list of plain strings.
joblib.dump(multi_nb, str(MODELS_DIR / 'multinomial_nb_model.pkl'))
joblib.dump(vec_model, str(MODELS_DIR / 'vectorizer.pkl'))



Random Forest Accuracy: 0.7988826815642458
              precision    recall  f1-score   support

           0       0.88      0.83      0.85       126
           1       0.64      0.74      0.68        53

    accuracy                           0.80       179
   macro avg       0.76      0.78      0.77       179
weighted avg       0.81      0.80      0.80       179



['/workspaces/Tutorial-del-Proyecto-Naive-Bayes/models/vectorizer.pkl']

#### 6.- Explora otras alternativas

In [70]:
# Logistic Regression with default settings on the same features, as a
# non-Naive-Bayes alternative.
lr_model = LogisticRegression().fit(X_train_vec, y_train)
y_pred_lr = lr_model.predict(X_test_vec)

print(
    "\nLogistic Regression Accuracy:",
    accuracy_score(y_test, y_pred_lr),
)
print(classification_report(y_test, y_pred_lr))




Logistic Regression Accuracy: 0.8324022346368715
              precision    recall  f1-score   support

           0       0.91      0.84      0.88       126
           1       0.68      0.81      0.74        53

    accuracy                           0.83       179
   macro avg       0.80      0.83      0.81       179
weighted avg       0.85      0.83      0.84       179



##### Argumento:  Modelos como Logistic Regression o SVM suelen superar a Naive Bayes cuando hay muchos datos y relaciones más complejas, ya que no hacen suposiciones tan fuertes como la independencia entre features.