<a href="https://colab.research.google.com/github/Nathan2605/naive-bayes-project-tutorial/blob/main/proyecto4geeks_NaiveBayes_NH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

####DATA

In [2]:
url = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv'
df = pd.read_csv(url)

In [3]:
def pre_clean(x):
  x = str(x)
  x = x.lower()
  x = x.lstrip()
  x = re.sub(r'\s+','', x)
  return x

In [4]:
df.drop(columns='package_name', inplace=True) #eliminar columna que no aporta
df['review'] = df['review'].str.lower() #pasar minuscula
df['review'] = df['review'].apply(lambda x: pre_clean(x)) #eliminar espacios

In [5]:
#split
X = df['review']
y = df['polarity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
#vectorizar - crear matriz de recuento de palabras
vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

####NaiveBayes

In [7]:
#MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.97      0.84       126
           1       0.69      0.17      0.27        53

    accuracy                           0.73       179
   macro avg       0.71      0.57      0.55       179
weighted avg       0.72      0.73      0.67       179



In [8]:
#BernoulliNB
model2 = BernoulliNB()
model2.fit(X_train, y_train)

y_pred2 = model2.predict(X_test)

print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.71      1.00      0.83       126
           1       1.00      0.04      0.07        53

    accuracy                           0.72       179
   macro avg       0.86      0.52      0.45       179
weighted avg       0.80      0.72      0.61       179



In [9]:
#GaussianNB
model3 = GaussianNB()
model3.fit(X_train, y_train)

y_pred3 = model3.predict(X_test)

print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.86      0.14      0.24       126
           1       0.32      0.94      0.47        53

    accuracy                           0.38       179
   macro avg       0.59      0.54      0.36       179
weighted avg       0.70      0.38      0.31       179



####Optimizar

In [10]:
#gridSearch
param_grid = { 'alpha': np.linspace(0.1, 50, 200),
               'fit_prior': [True, False],}

grid_search = GridSearchCV(model, param_grid, cv=8, scoring= 'f1')
grid_search.fit(X_train, y_train)
print("Best hyperparameters:", grid_search.best_params_)


y_pred4 = grid_search.predict(X_test)

print(classification_report(y_test, y_pred4))

Best hyperparameters: {'alpha': 25.676884422110557, 'fit_prior': False}
              precision    recall  f1-score   support

           0       0.74      0.96      0.83       126
           1       0.67      0.19      0.29        53

    accuracy                           0.73       179
   macro avg       0.70      0.57      0.56       179
weighted avg       0.72      0.73      0.67       179



In [11]:
#randomSearch
param_dist = {'alpha': np.linspace(0.1, 50, 200),
              'fit_prior': [True, False]}

random_search = RandomizedSearchCV(estimator=model,param_distributions=param_dist,n_iter=200,cv=5, scoring='f1',random_state=42)
random_search.fit(X_train, y_train)
print("Mejores hiperparámetros:", random_search.best_params_)


y_pred5 = random_search.predict(X_test)

print(classification_report(y_test, y_pred5))

Mejores hiperparámetros: {'fit_prior': False, 'alpha': 26.178391959798997}
              precision    recall  f1-score   support

           0       0.74      0.96      0.83       126
           1       0.67      0.19      0.29        53

    accuracy                           0.73       179
   macro avg       0.70      0.57      0.56       179
weighted avg       0.72      0.73      0.67       179



####alternativa Logistic Regresion

In [12]:
def grid_lr(X_train, y_train):
    model = LogisticRegression(random_state=666, max_iter=1000)
    class_weight =  [{0:0.05, 1:0.95}, {0:0.1, 1:0.9}, {0:0.2, 1:0.8}]
    solvers = ['liblinear']
    penalty = ['l2','l1']
    c_values = [ 10, 1.0, 0.1, 0.01, 0.001, ]
    grid = dict(solver=solvers,penalty=penalty,C=c_values, class_weight= class_weight)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy',error_score='raise')
    grid_result = grid_search.fit(X_train, y_train)
    return  grid_result.best_estimator_

In [13]:
logistic_model = grid_lr(X_train, y_train)
y_pred6 = logistic_model.predict(X_test)
print(classification_report(y_test, y_pred6))

              precision    recall  f1-score   support

           0       0.72      0.98      0.83       126
           1       0.62      0.09      0.16        53

    accuracy                           0.72       179
   macro avg       0.67      0.54      0.50       179
weighted avg       0.69      0.72      0.63       179



####alternativa SVM

In [14]:
def grid_SVC(X_train, y_train, performance_metric='f1', resultsGrid=False):
    model = SVC()
    C = np.linspace(0.000001 , 1000, 10)
    kernels = ['poly', 'rbf', 'linear', 'sigmoid']
    gamma = ['scale', 'auto']
    grid = dict(C = C, kernel = kernels, gamma = gamma)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring=performance_metric,error_score='raise')
    grid_result = grid_search.fit(X_train, y_train)
    if resultsGrid==True:
        return grid_result.cv_results_
    else:
        return  grid_result.best_estimator_

In [15]:
SVM_model = grid_SVC(X_train, y_train)
y_pred7 = SVM_model.predict(X_test)
print(classification_report(y_test, y_pred7))

              precision    recall  f1-score   support

           0       1.00      0.02      0.03       126
           1       0.30      1.00      0.46        53

    accuracy                           0.31       179
   macro avg       0.65      0.51      0.25       179
weighted avg       0.79      0.31      0.16       179

