![ITI.jpg](https://drive.google.com/uc?export=view&id=1ois0vnRw0a0326tbE-ZA-8y3san-gf4d)

# Capstone - Master en Data Science (2406VDSO)
# IMMUNE Technology Institute

Autores:

* Keilor Fallas
* Lindsay López
* Wendy Rodriguez
* Allan Vargas

# **Sección 1**: Exploración y preparación de datos

## Librerías

In [None]:
!pip install pyjanitor # Solo si no está instalada

In [None]:
import pandas as pd
import numpy as np
import janitor

In [None]:
# Conexión con Google Drive

from google.colab import drive
drive.mount('/content/drive')

## Limpieza de Datos

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Capstone IMMUNE /Datasets/DisneylandReviews/DisneylandReviews.csv', encoding='latin-1')
df

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.index

In [None]:
df_m=df.loc[df['Year_Month']=='missing']
df_m

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Capstone IMMUNE /Datasets/DisneylandReviews/DisneylandReviews.csv', encoding='latin-1', na_values=['missing'])
df

In [None]:
# Drop every row that still has a missing value; reset_index() renumbers the
# rows and keeps the previous labels in a new 'index' column.
df = df.dropna()
df = df.reset_index()

# Sanity check: total number of null cells left in the frame
remaining_nulls = df.isnull().sum().values.sum()
print("\nMissing values :  ", remaining_nulls)

In [None]:
df

In [None]:
# Estandarizar nombre de variables

df= df.clean_names()
print(df.columns)

In [None]:
df.info()

In [None]:
df.dtypes

In [None]:
df.describe()

In [None]:
print(len(df['reviewer_location'].unique()))

In [None]:
print(df['reviewer_location'].unique())

In [None]:
print(df['branch'].value_counts())

In [None]:
## Normalise year_month so that the month part always has 2 digits
## (e.g. '2019-4' -> '2019-04'); values without a '-' are left untouched

df['year_month'] = df['year_month'].astype(str).apply(
    lambda x: x if '-' not in x else x.split('-')[0] + '-' + x.split('-')[1].zfill(2)
)

In [None]:
df = df.drop('index', axis=1)
df.shape

In [None]:
df['reviewer_location'] = df['reviewer_location'].str.strip()
df['branch'] = df['branch'].str.strip()
df

In [None]:
print(df['rating'].unique())

# Análisis Exploratorio de Datos

## Distribución Rating

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
plt.figure(figsize=(8, 4))
ax = sns.histplot(data=df, x='rating', bins=range(1, 7), color='mediumslateblue', edgecolor='black', discrete=True)

for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x() + p.get_width()/2, height + 1, int(height), ha="center")

plt.title('Distribución de Ratings')
plt.xlabel('Rating')
plt.ylabel('Frecuencia')
plt.show()


## Distribución por Parque

In [None]:
branch_counts = df['branch'].value_counts()

plt.figure(figsize=(8, 8))
branch_counts.plot.pie(autopct='%1.1f%%', colors=sns.color_palette('pastel', len(branch_counts)), startangle=90)
plt.title('Distribución de reseñas por Parque')
plt.ylabel('')
plt.show()

## Cantidad de países

In [None]:
print("Número de países únicos:", df['reviewer_location'].nunique())

## Países con más reseñas

In [None]:
top_paises = df['reviewer_location'].value_counts().head(10)
sns.barplot(x=top_paises.values, y=top_paises.index, palette='coolwarm')
plt.title('Top 10 países con mas reseñas')
plt.xlabel('Cantidad de reseñas')
plt.ylabel('País')
plt.show()

## Reseñas por año

In [None]:
# Parse the 'YYYY-MM' strings into timestamps. errors='coerce' turns any value
# that does not match the format into NaT (which groupby leaves out) instead of
# errors='ignore', which is deprecated in recent pandas releases and returns the
# whole column unparsed as soon as a single value fails.
df['year_month'] = pd.to_datetime(df['year_month'], format='%Y-%m', errors='coerce')

# One point per month: number of reviews dated in that month
reviews_by_month = df.groupby('year_month').size()
reviews_by_month.plot()
plt.title('Reseñas a lo largo del tiempo')
plt.xlabel('Fecha')
plt.ylabel('Cantidad de reseñas')
plt.grid()
plt.show()

## Distribución de sentimiento

In [None]:
df['target'] = df['rating'].apply(lambda x: 1 if x >= 4 else 0)
print(df['target'].value_counts())
sns.countplot(x='target', data=df, palette='muted')
plt.title('Distribución de Sentimiento')
plt.xlabel('Sentimiento')
plt.ylabel('Cantidad')
plt.xticks([0,1], ['Negativa', 'Positiva'])
plt.show()

# Sección 2: Modelos de Machine Learning

# Modelos

## Librerías para modelos

In [None]:
# Cargar df nuevamente para utilizar dataset completo

df = pd.read_csv('/content/drive/MyDrive/Capstone IMMUNE /Datasets/DisneylandReviews/DisneylandReviews.csv', encoding='latin-1').clean_names()
df

In [None]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score,
                            recall_score, f1_score, confusion_matrix, roc_curve,
                            auc, classification_report, precision_recall_curve,
                            average_precision_score)
from sklearn.svm import LinearSVC, SVC
import joblib

## Limpieza de reseñas para modelos

In [None]:
# Text-processing libraries
import nltk
from nltk.corpus import stopwords # Stop-word lists ("empty" words)
from nltk.tokenize import word_tokenize # Splits text into tokens (individual words)
from nltk.stem import WordNetLemmatizer # Reduces words to their base form
nltk.download('all',quiet=True) # Fetch the NLTK 'all' data collection without progress output
import re # Regular expressions


In [None]:
# Limpieza

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Creación de función para limpieza de texto

def clean_text(text):
    """Normalise a raw review into a space-separated string of lemmatised tokens.

    Steps, in order: strip URLs, strip @mentions and #hashtags, delete every
    character that is not an ASCII letter or whitespace, lower-case, tokenise
    with NLTK, drop English stop words and lemmatise what remains.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance.
    """
    without_urls = re.sub(r'http\S+|www\.\S+', '', text)
    without_tags = re.sub(r'@\w+|#\w+', '', without_urls)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_tags)

    kept = []
    for token in nltk.word_tokenize(letters_only.lower()):
        if token in stop_words:
            continue  # stop words are filtered before lemmatisation
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)


In [None]:
# Aplicar fórmula al texto de las reseñas

df['clean_review_text'] = df['review_text'].apply(clean_text)

In [None]:
# Muestra de reviews limpias

print(df[['review_text','clean_review_text']].head())

## Definición de variable 'target'

In [None]:
df['target'] = df['rating'].apply(lambda x: 1 if x >= 4 else 0) # Definición de variable target

In [None]:
print(df['target'].value_counts())

In [None]:
# Guardar df limpio

df.to_csv('/content/drive/MyDrive/Capstone IMMUNE /Datasets/DisneylandReviews/DisneylandReviews_clean.csv', index=False)

In [None]:
# Load the cleaned dataset

df = pd.read_csv('/content/drive/MyDrive/Capstone IMMUNE /Datasets/DisneylandReviews/DisneylandReviews_clean.csv')

# read_csv parses empty fields as NaN, so a review whose cleaned text ended up
# empty comes back as a float NaN. Restore those cells to '' so that the text
# vectorisers and the str.split() calls further down receive strings only.
df['clean_review_text'] = df['clean_review_text'].fillna('')


## Split Train - Test

In [None]:
# Train/test


df_train, df_test = train_test_split(df, test_size=0.2,
                                     stratify=df["target"],
                                     random_state=42)


print('Clases en datos de entrenamiento')
print(df_train['target'].value_counts())


print('Clases en datos de prueba')
print(df_test['target'].value_counts())

X_train = df_train['clean_review_text']
y_train = df_train['target']
X_test = df_test['clean_review_text']
y_test = df_test['target']

In [None]:
# Mostrar composición de los DF

print(df_train.shape)
print(df_test.shape)

In [None]:
'''# Balanceo de clases

from sklearn.utils import resample
from sklearn.model_selection import train_test_split


df_train_positive= df_train[df_train['target'] == 1]
df_train_negative = df_train[df_train['target'] == 0]

df_train_positive_downsampled =  resample(df_train_positive,
                                    replace=False,
                                    n_samples=len(df_train_negative)
                                    )
df_train_bal = pd.concat([df_train_positive_downsampled,df_train_negative],axis=0)

# Verificar balanceo de clases

print('Clases en datos de entrenamiento')
print(df_train_bal['target'].value_counts())


print('Clases en datos de prueba')
print(df_test['target'].value_counts())'''

In [None]:
# Compute 'balanced' class weights from the training labels so the models can
# compensate for the difference in class sizes

from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
# Map position -> weight. np.unique returns the labels sorted, so the positions
# 0, 1 only coincide with the labels because 'target' is encoded as 0/1.
class_weights_dict = dict(enumerate(class_weights))
print(class_weights_dict)

## Regresión Logística

In [None]:
# Creación de variables de test y train

features = ['branch', 'reviewer_location', 'clean_review_text']
X_train = df_train[features]
y_train = df_train['target']
X_test = df_test[features]
y_test = df_test['target']

In [None]:
preprocessor = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(max_features=5000, stop_words='english'), 'clean_review_text'),
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['branch', 'reviewer_location'])
])

In [None]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(C=0.1, penalty='l2', solver='liblinear', class_weight='balanced', max_iter=1000))
])

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear']
}

logreg_grid = GridSearchCV(model, param_grid, cv=5, scoring='f1')
logreg_grid.fit(X_train, y_train)

print("Mejores parámetros:", logreg_grid.best_params_)

In [None]:
# Predecir las probabilidades para el set de prueba

y_probs = logreg_grid.predict_proba(X_test)[:, 1]

# Predecir la clases para el set de prueba

y_pred = logreg_grid.predict(X_test)

# Matriz de confusión

print("Matriz de confusión:")
print(confusion_matrix(y_test, y_pred))

In [None]:
report_rl = classification_report(y_test, y_pred, target_names=['Negativa', 'Positiva'],output_dict=True)
print("\nReporte de clasificación: Regresión Logística")
print(classification_report(y_test, y_pred, target_names=['Negativa', 'Positiva']))


In [None]:
# Calcular el AUC - ROC score
roc_auc_rl = roc_auc_score(y_test, y_probs)

In [None]:
# Generar curva ROC

fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc_rl = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_rl:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Tasa de Falsos Positivos')
plt.ylabel('Tasa de Verdaderos Positivos')
plt.title('ROC')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Calcular precision y recall para varios umbrales
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

# Calcular el Average Precision Score
avg_precision = average_precision_score(y_test, y_probs)

# Graficar la curva Precision-Recall
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'AP = {avg_precision:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Curva Precision-Recall')
plt.legend()
plt.grid(True)
plt.show()

El modelo es bueno detectando positivos, con un 85% de recall para la clase 1. Su desempeño al detectar negativos es algo más limitado, con un 82% de recall para la clase 0.
En la matriz de confusión se observa que 1324 ejemplos negativos fueron correctamente clasificados como negativos, y 286 ejemplos negativos fueron erróneamente clasificados como positivos.
La exactitud general del modelo (accuracy) fue del 84%.

In [None]:
# Guardar el modelo

joblib.dump(logreg_grid, '/content/drive/MyDrive/Capstone IMMUNE /Entregables/Modelos/logreg_grid.pkl')

## Support vector machine (SVM)

In [None]:
X_train = df_train['clean_review_text']
y_train = df_train['target']
X_test = df_test['clean_review_text']
y_test = df_test['target']

In [None]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
svm = SVC(kernel='linear',probability=True, random_state=42,class_weight='balanced')
svm.fit(X_train_tfidf, y_train)

In [None]:
# Predicted probability of the positive class for the test set

y_probs = svm.predict_proba(X_test_tfidf)[:,1]

# Predicted class labels for the test set

y_pred = svm.predict(X_test_tfidf)

# Confusion matrix

print("Matriz de confusión:")
print(confusion_matrix(y_test, y_pred))

# Classification report (dict version is kept for later comparison)

report_svm = classification_report(y_test, y_pred, target_names=['Negativa', 'Positiva'], output_dict=True)
# Single print: the previous nested print(print(...)) also emitted a stray "None"
print("\nReporte de clasificación: SVM")
print(classification_report(y_test, y_pred, target_names=['Negativa', 'Positiva']))

In [None]:
# Calcular ROC y AUC
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc_svm = auc(fpr, tpr)

# Mostrar resultados
print(f"AUC: {roc_auc_svm:.2f}")

# Graficar la curva ROC

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC curve (AUC = {roc_auc_svm:.2f})")
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # línea base
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

In [None]:
# Calcular precision y recall para la curva
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

# Calcular el average precision score
avg_precision = average_precision_score(y_test, y_probs)

# Graficar la curva Precision-Recall
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'AP = {avg_precision:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Curva Precision-Recall (SVM)')
plt.legend()
plt.grid(True)
plt.show()

El modelo SVM tiene un buen desempeño general: logra un 85% de recall para la clase Positiva, pero solo un 81% de recall para la clase Negativa.
En la matriz de confusión se observa que el modelo identifica correctamente la mayoría de los casos positivos, aunque le cuesta más clasificar adecuadamente los negativos.
La exactitud general del modelo (accuracy) fue del 84%.

In [None]:
# Guardar el modelo SVM y el vectorizador

joblib.dump(svm, '/content/drive/MyDrive/Capstone IMMUNE /Entregables/Modelos/svm_model.pkl')
joblib.dump(vectorizer, '/content/drive/MyDrive/Capstone IMMUNE /Entregables/Modelos/tfidf_vectorizer.pkl')

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
X_train = df_train['clean_review_text']
y_train = df_train['target']
X_test = df_test['clean_review_text']
y_test = df_test['target']

In [None]:
# Normalise the class weights so that they sum to 1
# NOTE(review): class_weights_norm is not used anywhere in this cell; the
# MultinomialNB below is built with its default priors. Presumably it was meant
# to be passed as class_prior - confirm before relying on it.

class_weights_norm = class_weights / class_weights.sum()

# Model
# Trains and predicts on X_train_tfidf / X_test_tfidf, the TF-IDF matrices
# produced by the vectorizer cell of the SVM section.

nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred_nb = nb.predict(X_test_tfidf)
y_probs = nb.predict_proba(X_test_tfidf)[:, 1]

In [None]:
report_nb = classification_report(y_test, y_pred_nb, target_names=['Negativa', 'Positiva'],output_dict=True)
print("\n Naive Bayes ")
print(classification_report(y_test, y_pred_nb, target_names=['Negativa', 'Positiva']))

In [None]:
print("Matriz de confusión:")
print(confusion_matrix(y_test, y_pred_nb))

In [None]:
# Calcular ROC y AUC
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc_nb = auc(fpr, tpr)
print(f"AUC: {roc_auc_nb:.2f}")

# Graficar curva ROC
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC curve (AUC = {roc_auc_nb:.2f})")
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel("Tasa de falsos positivos")
plt.ylabel("Tasa de verdaderos positivos")
plt.title("ROC")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

In [None]:
# Calcular precisión y recall
precision, recall, _ = precision_recall_curve(y_test, y_probs)
avg_precision = average_precision_score(y_test, y_probs)

# Graficar
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'AP = {avg_precision:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Curva Precision-Recall (Naive Bayes)')
plt.legend()
plt.grid(True)
plt.show()

El modelo es bueno detectando reseñas positivas (86%), pero de las reseñas realmente negativas solo detectó un 32%.
En la matriz de confusión se observa que 518 reseñas negativas fueron correctamente clasificadas como negativas y 1092 fueron erróneamente clasificadas como positivas.

In [None]:
# Guardar el modelo Naive Bayes
joblib.dump(nb, '/content/drive/MyDrive/Capstone IMMUNE /Entregables/Modelos/nb_model.pkl')

## Árbol de Decisión

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
X_train = df_train['clean_review_text']
y_train = df_train['target']
X_test = df_test['clean_review_text']
y_test = df_test['target']

In [None]:
tree = DecisionTreeClassifier(max_depth=15, random_state=42,class_weight='balanced')
tree.fit(X_train_tfidf, y_train)


In [None]:
y_pred_tree = tree.predict(X_test_tfidf)
y_probs = tree.predict_proba(X_test_tfidf)

In [None]:
report_dt = classification_report(y_test, y_pred_tree, target_names=['Negativa', 'Positiva'],output_dict=True)
print("\n Reporte de Clasificación: Árbol de Decisión ")
print(pd.DataFrame.from_dict(report_dt).round(2).T)

In [None]:
print("Matriz de confusión:")
print(confusion_matrix(y_test, y_pred_tree))

In [None]:
# Calcular ROC y AUC
fpr, tpr, thresholds = roc_curve(y_test, y_probs[:, 1])
roc_auc_dt = auc(fpr, tpr)
print(f"AUC: {roc_auc_dt:.2f}")

# Graficar curva ROC
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC curve (AUC = {roc_auc_dt:.2f})")
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel("Tasa de falsos positivos")
plt.ylabel("Tasa de verdaderos positivos")
plt.title("ROC")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

In [None]:
# Obtener probabilidades de la clase positiva
y_probs_tree = y_probs[:, 1]

# Calcular precision y recall
precision, recall, _ = precision_recall_curve(y_test, y_probs_tree)
avg_precision = average_precision_score(y_test, y_probs_tree)

# Graficar
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'AP = {avg_precision:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Curva Precision-Recall (Árbol de Decisión)')
plt.legend()
plt.grid(True)
plt.show()

El modelo detecta el 89% de las reseñas positivas pero solo detecta el 43% de las reseñas negativas

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

In [None]:
plt.figure(figsize=(20, 10))  # Ajusta el tamaño si quieres
plot_tree(tree,
          filled=True,
          feature_names=vectorizer.get_feature_names_out(),
          class_names=['Negativa', 'Positiva'],
          max_depth=2, # Solo mostrar 2 niveles para que sea más claro
          fontsize=10)
plt.show()

Con el árbol de decisión se observa que hay más reseñas positivas, lo que genera una tendencia en los modelos a predecir mayormente reseñas positivas para tener más aciertos.

In [None]:
# Guardar el modelo de Árbol de Decisión
joblib.dump(tree, '/content/drive/MyDrive/Capstone IMMUNE /Entregables/Modelos/tree_model.pkl')

## XGBoost

In [None]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

In [None]:
# Text and label columns for the train / test partitions

X_train, y_train = df_train['clean_review_text'], df_train['target']
X_test, y_test = df_test['clean_review_text'], df_test['target']

# Class-imbalance ratio handed to XGBoost: number of negatives / number of positives
negatives = (y_train == 0).sum()
positives = (y_train == 1).sum()
ratio = negatives / positives


# Pipeline: TF-IDF vectorisation followed by the gradient-boosted classifier

xgb_classifier = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=ratio,
)
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('xgb', xgb_classifier),
])

# Search space (step-name prefixes route each value to the right pipeline step)

param_dist = dict(
    tfidf__max_df=[0.8, 0.9, 1.0],
    tfidf__ngram_range=[(1,1), (1,2)],
    xgb__n_estimators=[100, 200, 300],
    xgb__max_depth=[3, 4, 5, 6],
    xgb__learning_rate=[0.01, 0.1, 0.2],
    xgb__subsample=[0.8, 1.0],
    xgb__colsample_bytree=[0.8, 1.0],
)

# Randomised search: 20 sampled configurations, 3-fold CV, ranked by ROC AUC

search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring='roc_auc',
    n_jobs=-1,
    cv=3,
    verbose=1,
    random_state=42,
)

# Fit the search on the training partition

search.fit(X_train, y_train)

In [None]:
# Evaluar el modelo
y_pred = search.predict(X_test)
report_xgb = classification_report(y_test, y_pred,target_names=['Negativa', 'Positiva'],output_dict=True)
print("Mejores parámetros encontrados:", search.best_params_)
print("\n Reporte de Clasificación: XGBoost")
print(pd.DataFrame.from_dict(report_xgb).round(2).T)

In [None]:
# Obtener las probabilidades con el mejor modelo
best_model = search.best_estimator_
y_probs = best_model.predict_proba(X_test)[:, 1]  # Probabilidad de clase positiva

# Calcular curva ROC y AUC
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc_xgb = auc(fpr, tpr)
print(f"AUC: {roc_auc_xgb:.2f}")

# Graficar curva ROC
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC curve (AUC = {roc_auc_xgb:.2f})")
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

In [None]:
# Calcular precisión y recall
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)
avg_precision = average_precision_score(y_test, y_probs)
print(f"Average Precision (AP): {avg_precision:.2f}")

# Graficar curva Precision-Recall
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'AP = {avg_precision:.2f}', color='blue')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Curva Precision-Recall del Mejor Modelo')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Guardar el modelo encontrado por RandomizedSearchCV
best_model = search.best_estimator_

# Guardar el modelo en Google Drive
joblib.dump(best_model, '/content/drive/MyDrive/Capstone IMMUNE /Entregables/Modelos/xgb_best_model.pkl')

## SVCLinear

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, precision_recall_curve
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sentence_transformers import SentenceTransformer
from scipy.sparse import hstack
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.combine import SMOTEENN
from sklearn.decomposition import TruncatedSVD


In [None]:
# División train/test

X, X_test, y, y_test = train_test_split(
    df['clean_review_text'], df['target'], test_size=0.2,
    random_state=42, stratify=df['target']
)

In [None]:
print(df['target'].value_counts())

In [None]:
# Feature extraction

# Character n-gram (3 to 5 characters) TF-IDF, fitted on the training texts only
tfidf = TfidfVectorizer(max_features=15000, ngram_range=(3,5), analyzer='char')
X_tfidf = tfidf.fit_transform(X)
X_test_tfidf = tfidf.transform(X_test)

# Dense sentence embeddings from a pretrained SentenceTransformer model
embed = SentenceTransformer('all-mpnet-base-v2')
emb = embed.encode(X.tolist(), batch_size=128, show_progress_bar=True)
emb_test = embed.encode(X_test.tolist(), batch_size=128, show_progress_bar=True)

# X_feat / X_test_feat are not built here: the following cell assembles them
# from the SVD-reduced TF-IDF plus these embeddings, so stacking the full
# sparse TF-IDF with the dense embeddings first only cost time and memory.

In [None]:
# Reducir solo la parte TF-IDF
svd = TruncatedSVD(n_components=300, random_state=42)
X_tfidf_reduced = svd.fit_transform(X_tfidf)
X_test_tfidf_reduced = svd.transform(X_test_tfidf)

# Concatenar con los embeddings densos
X_feat = np.hstack([X_tfidf_reduced, emb])
X_test_feat = np.hstack([X_test_tfidf_reduced, emb_test])


In [None]:
# División para calibración y umbral
X_train, X_val, y_train, y_val = train_test_split(
    X_feat, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Hyper-parameter search on the training split

# NOTE(review): base_svc is never used below - the search builds its own
# LinearSVC with max_iter=15000 (not 10000). Confirm which one is intended.
base_svc = LinearSVC(class_weight='balanced', max_iter=10000)
# Randomised search over C: 15 of 20 log-spaced values between 1e-4 and 10,
# scored by F1 with 5-fold stratified cross-validation.
search = RandomizedSearchCV(
    LinearSVC(class_weight='balanced', max_iter=15000),
    {'C': np.logspace(-4,1,20)},
    n_iter=15, scoring='f1', cv=StratifiedKFold(5),
    random_state=42, n_jobs=-1
)
search.fit(X_train, y_train)
best_svc = search.best_estimator_

print(f"Mejor SVC C={search.best_params_['C']}, F1 train CV={search.best_score_:.3f}")

In [None]:
# Calibration and decision threshold

# best_svc is already fitted on X_train, so the sigmoid calibrator is fitted on
# the held-out validation split: with cv='prefit' the calibration data has to be
# disjoint from the data the classifier was trained on, otherwise the calibrated
# probabilities are biased.
calibrated = CalibratedClassifierCV(best_svc, method='sigmoid', cv='prefit')
calibrated.fit(X_val, y_val)

# Pick the probability threshold that maximises F1 on the validation split
y_val_probs = calibrated.predict_proba(X_val)[:,1]
dp_val, dr_val, dt_val = precision_recall_curve(y_val, y_val_probs)
# precision / recall carry one extra final point (precision=1, recall=0) that has
# no matching threshold; drop it so every F1 index maps to an entry of dt_val.
f1_scores_val = 2 * dp_val[:-1] * dr_val[:-1] / (dp_val[:-1] + dr_val[:-1] + 1e-6)
best_idx_val = np.nanargmax(f1_scores_val)
best_thr = dt_val[best_idx_val]


In [None]:
# Evaluación final en test con umbral fijado
y_test_probs = calibrated.predict_proba(X_test_feat)[:,1]
y_pred = (y_test_probs >= best_thr).astype(int)
report_svcl = classification_report(y_test, y_pred, target_names=['Negativa','Positiva'],
                            output_dict=True)

print("\n Reporte de Clasificación: SVC")
print(classification_report(y_test, y_pred, target_names=['Negativa','Positiva']))


In [None]:
# Calcular ROC y AUC
fpr, tpr, thresholds = roc_curve(y_test, y_test_probs)
roc_auc_svcl = auc(fpr, tpr)
print(f"AUC: {roc_auc_svcl:.2f}")

# Graficar curva ROC
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC curve (AUC = {roc_auc_svcl:.2f})")
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel("Tasa de falsos positivos")
plt.ylabel("Tasa de verdaderos positivos")
plt.title("ROC")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

In [None]:
# Calcular precision y recall
precision, recall, _ = precision_recall_curve(y_test, y_test_probs)
avg_precision = average_precision_score(y_test, y_test_probs)

# Graficar
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'AP = {avg_precision:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Curva Precision-Recall (SVCLinear)')
plt.legend()
plt.grid(True)
plt.show()

# **Sección 3**: Modelos de DeepLearning

# Modelos DeepLearning

## Análisis de sentimiento con NLTK

### Importar paquetes necesarios

In [None]:
import nltk
from nltk.corpus import stopwords # Para manejar las stopwords ('palabras vacías')
from nltk.tokenize import word_tokenize # Divide el texto en tokens ('palabras individuales')
from nltk.stem import WordNetLemmatizer # Reduce las palabras a su forma base
from nltk.sentiment.vader import SentimentIntensityAnalyzer # Analizador de sentimientos

nltk.download('all',quiet=True)


### Modelo

In [None]:
# Instanciar SentimentIntensityAnalizer

sia = SentimentIntensityAnalyzer()

# Crear una fucnión para definir el mejor umbral para la clasificación con SIA

def find_best_threshold(df, true_label_col='true_label', text_col='clean_review_text'):
    """Find the VADER compound-score cut-off that maximises F1 on ``df``.

    Every text in ``df[text_col]`` is scored with SentimentIntensityAnalyzer and
    candidate thresholds from -1.00 to 1.00 (step 0.01) are tried; a row is
    predicted positive (1) when its compound score is >= the threshold.

    Args:
        df: DataFrame holding the texts and the true binary labels.
        true_label_col: name of the column with the 0/1 ground-truth labels.
        text_col: name of the column with the texts to score.

    Returns:
        Tuple ``(best_threshold, best_f1)``. If no threshold reaches an F1 above
        0, the initial values ``(0.0, 0.0)`` are returned.

    Side effects:
        Adds (or overwrites) the column ``df['compound']`` with the scores.

    Note:
        The threshold is tuned on the rows passed in; evaluating on those same
        rows afterwards gives an optimistic estimate.
    """
    thresholds = np.arange(-1.0, 1.01, 0.01)
    best_threshold = 0.0
    best_f1 = 0.0

    sia = SentimentIntensityAnalyzer()

    # Score every text once; str() guards against non-string cells such as NaN
    df['compound'] = df[text_col].apply(lambda x: sia.polarity_scores(str(x))['compound'])

    # Plain arrays so each threshold is applied with one vectorised comparison
    # instead of a Python-level apply over all rows on every iteration.
    compound = df['compound'].to_numpy()
    y_true = df[true_label_col].to_numpy()

    for threshold in thresholds:
        preds = (compound >= threshold).astype(int)
        f1 = f1_score(y_true, preds)
        if f1 > best_f1:  # strict '>' keeps the lowest threshold on ties
            best_f1 = f1
            best_threshold = threshold

    return best_threshold, best_f1

# Definir el mejor umbral

best_thresh, best_f1 = find_best_threshold(df,'target')
print(f"Best threshold: {best_thresh}, F1-score: {best_f1:.4f}")


In [None]:
# Se crea una función para clasificar el sentimiento de las reseñas

def classify_sentiment(text,threshold=0):
    """Classify a text as positive (1) or negative (0) from its VADER compound score.

    English stop words are removed from ``text`` before it is scored with the
    module-level ``sia`` analyzer.

    Args:
        text: text to classify; non-string values are converted with ``str()``.
        threshold: compound score at or above which the text counts as positive.

    Returns:
        1 if the compound score is >= ``threshold``, otherwise 0.
    """
    # Cache the stop-word set on the function object: this function is applied
    # row by row, and rebuilding the set re-reads the NLTK corpus on every call.
    stop_words = getattr(classify_sentiment, '_stop_words', None)
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
        classify_sentiment._stop_words = stop_words

    # str() keeps NaN / non-string cells from crashing the tokenizer
    words = word_tokenize(str(text))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    text = ' '.join(filtered_words)

    # 'compound' summarises the overall sentiment of the text
    compound_score = sia.polarity_scores(text)['compound']
    return 1 if compound_score >= threshold else 0

# Aplicar función a los datos

df['sentiment_score'] = df['clean_review_text'].\
                        apply(lambda x: sia.polarity_scores(str(x))['compound'])

df['sentiment'] = df['clean_review_text'].\
                  apply(lambda text: classify_sentiment(text, threshold=best_thresh))

In [None]:
print(df[['clean_review_text', 'sentiment_score', 'sentiment']].head())

### Evaluación

In [None]:
# Confusion matrix

print('Matriz de confusión: NLTK')

cm = confusion_matrix(df['target'],
                      df['sentiment'])

print("Matriz de confusión:")
print(cm)

# Overall accuracy between 'target' (truth) and 'sentiment' (prediction)

print('Accuracy Score: NLTK')

accuracy = accuracy_score(df['target'],
                          df['sentiment'])

print(f"Accuracy: {accuracy:.2f}")

# Accuracy within each predicted class: the score is computed on the rows
# predicted as that class only (scoring the full frame, as before, printed the
# same overall accuracy once per class).

for sentiment in df['sentiment'].unique():
    validation_df = df[df['sentiment'] == sentiment]
    class_accuracy = accuracy_score(validation_df['target'],
                                    validation_df['sentiment'])
    print(f"Accuracy for {sentiment}: {class_accuracy:.2f}")

# Classification report

report_nltk = classification_report(df['target'],df['sentiment'],target_names=['Negativa', 'Positiva'],output_dict=True)
print("\n Reporte de Clasificación: NLTK")
print(pd.DataFrame.from_dict(report_nltk).round(2))

In [None]:
# Etiquetas reales
y_true = df['target']

# Puntajes continuos
y_scores = df['sentiment_score']

# Calcular puntos ROC
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc_nltk = auc(fpr, tpr)

# Graficar curva ROC
print(f"AUC: {roc_auc_nltk:.2f}")
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc_nltk:.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Curva ROC - NLTK')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
# Calcular precisión, recall y Average Precision
precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
avg_precision = average_precision_score(y_true, y_scores)

# Graficar curva Precision-Recall
print(f"Average Precision (AP): {avg_precision:.2f}")
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'AP = {avg_precision:.2f}', color='blue')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Curva Precision-Recall - NLTK')
plt.legend()
plt.grid(True)
plt.show()


## Análisis de sentimiento con RNN

### Importar paquetes necesarios

In [None]:
import tensorflow as tf # Tensor flow
from tensorflow.keras.models import Sequential, Model #  Modelo secuencial
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN, Dropout # Layers requeridas
from tensorflow.keras.preprocessing.text import Tokenizer # Creación de tokens
from tensorflow.keras.preprocessing.sequence import pad_sequences # Normalización de texto
from tensorflow.keras.callbacks import EarlyStopping # EarlyStopping para evitar el overfitting
from sklearn.model_selection import train_test_split # Train/Test split


### Modelo

In [None]:
# Tokenise the training reviews, build a SimpleRNN classifier and train it.
#
# NOTE: df_test is not used in this cell. X_test / y_test below are a 20 %
# hold-out taken from df_train (via train_test_split) and act as the
# validation set for early stopping and for the evaluation cells that follow.
# The earlier df_train/df_test assignments to X_train/X_test were overwritten
# before being read, so they have been removed.

# Vocabulary size and sequence length
max_vocab = 10000  # Maximum number of distinct words kept by the tokenizer

# Number of whitespace-separated words per review; the 95th percentile is used
# as the padded length so that at least 95 % of reviews fit without truncation.
lengths = df['clean_review_text'].apply(lambda text: len(text.split()))
max_len = int(np.percentile(lengths, 95))

# Fit the tokenizer on the training texts only (OOV: out of vocabulary)
tokenizer = Tokenizer(num_words=max_vocab, oov_token="<OOV>")
tokenizer.fit_on_texts(df_train['clean_review_text'])

# Convert the texts to integer sequences and pad/truncate them at the end
sequences = tokenizer.texts_to_sequences(df_train['clean_review_text'])
padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

# Train / validation split of the padded training data
X_train, X_test, y_train, y_test = train_test_split(
    padded, df_train['target'], test_size=0.2, random_state=42
)

# Build the model: embedding -> SimpleRNN -> sigmoid output
model_rnn = Sequential()
model_rnn.add(Embedding(input_dim=max_vocab, output_dim=100, input_length=max_len))
model_rnn.add(SimpleRNN(64))
model_rnn.add(Dense(1, activation='sigmoid'))

# Compile the model
model_rnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Show the model summary
model_rnn.summary()

# Early stopping on validation loss to limit overfitting
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model; class_weights_dict is defined elsewhere in the notebook
history = model_rnn.fit(X_train, y_train, validation_data=(X_test, y_test),
                        epochs=25, batch_size=128, callbacks=[early_stop],
                        class_weight=class_weights_dict)


### Evaluación

In [None]:
# Predicted probabilities for the hold-out set
y_probs_rnn = model_rnn.predict(X_test)

# Threshold the probabilities at 0.5 and flatten to a 1-D label vector
y_pred_rnn = np.where(y_probs_rnn > 0.5, 1, 0).ravel()

# Classification report: dict form stored for later use, text form printed
report_rnn = classification_report(
    y_test,
    y_pred_rnn,
    target_names=['Negativa', 'Positiva'],
    output_dict=True,
)
print("\n Reporte de Clasificación: RNN")
print(
    classification_report(
        y_test,
        y_pred_rnn,
        target_names=['Negativa', 'Positiva'],
    )
)

# Loss and accuracy as computed by Keras on the same hold-out set
loss, accuracy = model_rnn.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")



In [None]:
# Training vs. validation accuracy per epoch for the RNN

for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve], label=curve)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

In [None]:
# Positive-class probabilities as a 1-D vector
y_probs = np.ravel(model_rnn.predict(X_test))

# ROC points and area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
roc_auc_rnn = auc(fpr, tpr)
print(f"AUC: {roc_auc_rnn:.2f}")

# Draw the ROC curve together with the chance diagonal
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2,
        label=f"ROC curve (AUC = {roc_auc_rnn:.2f})")
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC")
ax.legend(loc="lower right")
ax.grid(True)
plt.show()

In [None]:
# Flatten the predicted probabilities to 1-D before passing them to sklearn
y_probs_rnn = np.ravel(y_probs_rnn)

# Precision-recall points and Average Precision
precision, recall, thresholds = precision_recall_curve(y_test, y_probs_rnn)
avg_precision = average_precision_score(y_test, y_probs_rnn)
print(f"Average Precision (AP): {avg_precision:.2f}")

# Draw the precision-recall curve
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, label=f'AP = {avg_precision:.2f}', color='blue')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Curva Precision-Recall - Modelo RNN')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Save the full RNN model to Google Drive as a single .keras file
model_rnn.save('/content/drive/MyDrive/Capstone IMMUNE /Entregables/Modelos/model_rnn.keras')

## Análisis de sentimiento con BERT

### Importar los paquetes necesarios

In [None]:
from sentence_transformers import SentenceTransformer  # Para convertir oraciones en embeddings
from sklearn.model_selection import train_test_split #split Train/Test
from sklearn.linear_model import LogisticRegression # Regresión logística
from sklearn.metrics import classification_report, confusion_matrix # Para evaluación del modelo

### Modelo

In [None]:
# Train / test data for the BERT pipeline, taken from the prepared
# df_train and df_test frames (text column and binary target)

X_train, y_train = df_train['clean_review_text'], df_train['target']
X_test, y_test = df_test['clean_review_text'], df_test['target']

In [None]:
# Sentence-embedding model used to turn each review into a fixed-size vector

model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# Reviews as plain Python lists of strings, which is what encode() receives

train_texts = df_train['clean_review_text'].astype(str).tolist()
test_texts = df_test['clean_review_text'].astype(str).tolist()

# Encode the training and test reviews

X_train_encoded = model.encode(train_texts, show_progress_bar=True)

X_test_encoded = model.encode(test_texts, show_progress_bar=True)


# Fit a class-balanced logistic regression on top of the embeddings

clf_bert = LogisticRegression(max_iter=1000, class_weight='balanced')
clf_bert.fit(X_train_encoded, y_train)

### Evaluar el modelo

In [None]:
# Predicted labels for the encoded test reviews
y_pred = clf_bert.predict(X_test_encoded)

# Confusion matrix

cm = confusion_matrix(y_test,
                      y_pred)
print("Matriz de confusión:")
print(cm)

# Classification report: dict form stored for later use, text form printed

report_bert = classification_report(y_test, y_pred,target_names=['Negativa', 'Positiva'],output_dict=True)
# The header previously read 'n\C...', which printed a literal "n\" and used
# an invalid escape sequence; a leading newline was intended.
print('\nClassification report: BERT')
print(classification_report(y_test, y_pred,target_names=['Negativa', 'Positiva']))


In [None]:
# Per-class probabilities from the logistic regression
y_probs = clf_bert.predict_proba(X_test_encoded)

# ROC points and AUC, using the second column (positive class)
positive_scores = y_probs[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
roc_auc_bert = auc(fpr, tpr)

# Report the AUC
print(f"AUC: {roc_auc_bert:.2f}")

# Draw the ROC curve together with the chance diagonal
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2,
        label=f"ROC curve (AUC = {roc_auc_bert:.2f})")
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves")
ax.legend(loc="lower right")
ax.grid(True)
plt.show()

In [None]:
# Positive-class probabilities (second column of predict_proba)
y_probs_bert = y_probs[:, 1]

# Precision-recall points and Average Precision
precision, recall, thresholds = precision_recall_curve(y_test, y_probs_bert)
avg_precision = average_precision_score(y_test, y_probs_bert)
print(f"Average Precision (AP): {avg_precision:.2f}")

# Draw the precision-recall curve
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, label=f'AP = {avg_precision:.2f}', color='blue')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Curva Precision-Recall - Modelo BERT')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Save the model

import joblib

# Persist the fitted logistic-regression classifier to Google Drive.
# Only the classifier is written here; the sentence-embedding model is not.
joblib.dump(clf_bert, '/content/drive/MyDrive/Capstone IMMUNE /Entregables/Modelos/clf_bert.pkl')


## Análisis de sentimiento con Hugging Face Transformers (DistilBERT)


### Importar los paquetes necesarios

In [None]:
!pip install -q transformers
!pip install -q datasets

In [None]:
import tensorflow as tf
from transformers import (AutoTokenizer,TFAutoModelForSequenceClassification,
                          DataCollatorWithPadding, create_optimizer)
from datasets import Dataset
from tensorflow.keras.callbacks import EarlyStopping

### Modelo

In [None]:
# Train / test data taken from the prepared df_train and df_test frames
X_train, y_train = df_train['clean_review_text'], df_train['target']
X_test, y_test = df_test['clean_review_text'], df_test['target']

# Wrap the text and label columns of each frame in a Hugging Face Dataset
hf_train_dataset = Dataset.from_pandas(
    df_train[["clean_review_text", "target"]]
)
hf_test_dataset = Dataset.from_pandas(
    df_test[["clean_review_text", "target"]]
)

# Load the pretrained tokenizer and a two-label classification model
model_name = "distilbert-base-uncased"
tokenizer_hft = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model_hft = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Tokenise the data
def tokenize_function(example):
    """Tokenise the ``clean_review_text`` field of *example* with ``tokenizer_hft``.

    Truncation is enabled; no explicit maximum length is passed here.
    Returns whatever the tokenizer call returns.
    """
    texts = example["clean_review_text"]
    return tokenizer_hft(texts, truncation=True)

tokenized_train_dataset = hf_train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = hf_test_dataset.map(tokenize_function, batched=True)

# Collator that pads each batch and returns TensorFlow tensors
data_collator = DataCollatorWithPadding(tokenizer=tokenizer_hft, return_tensors="tf")


def _as_tf_dataset(tokenized, *, shuffle):
    """Convert a tokenised Hugging Face dataset to a batched tf.data.Dataset."""
    return tokenized.to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        label_cols=["target"],
        shuffle=shuffle,
        batch_size=16,
        collate_fn=data_collator,
    )


# Training batches are shuffled; test batches keep their original order
tf_train_set = _as_tf_dataset(tokenized_train_dataset, shuffle=True)

tf_test_set = _as_tf_dataset(tokenized_test_dataset, shuffle=False)

# EarlyStopping variant intended to be compatible with the Hugging Face model

class SafeEarlyStopping(EarlyStopping):
    """EarlyStopping subclass whose three batch-hook probes always return True."""

    def _implements_train_batch_hooks(self):
        return True

    def _implements_test_batch_hooks(self):
        return True

    def _implements_predict_batch_hooks(self):
        return True

# Compile and train the model.
# The number of optimizer steps is derived from the actual number of training
# batches and the epoch count instead of a hard-coded constant, so the
# learning-rate decay schedule stays aligned with the data and with `epochs`.
num_epochs = 10
num_train_steps = len(tf_train_set) * num_epochs

optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=num_train_steps)
# The loss is configured for raw logits (from_logits=True)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ["accuracy"]

model_hft.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Stop when validation loss has not improved for 3 epochs and restore the best weights
early_stop = SafeEarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# class_weights_dict is defined elsewhere in the notebook
history = model_hft.fit(
    tf_train_set,
    validation_data=tf_test_set,
    epochs=num_epochs,
    class_weight=class_weights_dict,
    callbacks=[early_stop]
)


### Evaluar el modelo

In [None]:
# Training vs. validation accuracy per epoch for the Hugging Face model

for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve], label=curve)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

In [None]:
# Predict with the fine-tuned Hugging Face model.
# Despite its name, y_pred_probs holds the raw logits at this point.
y_pred_probs = model_hft.predict(tf_test_set).logits
y_pred = np.argmax(y_pred_probs, axis=1)

# Collect the true labels batch by batch from the tf.data test set
label_batches = [labels for _, labels in tf_test_set]
y_true = np.concatenate(label_batches, axis=0)


# Classification report: dict form stored for later use, text form printed
report_hft = classification_report(
    y_true,
    y_pred,
    target_names=['Negativa', 'Positiva'],
    output_dict=True,
)
print("\n Reporte de Clasificación: Hugging Face Transformers")
print(
    classification_report(
        y_true,
        y_pred,
        target_names=['Negativa', 'Positiva'],
    )
)


In [None]:
# Convert the logits to positive-class probabilities.
# y_pred_probs is rebound in place (later cells read it as probabilities), so
# the conversion is guarded: on a re-run of this cell the variable is already
# a 1-D probability vector and indexing it with [:, 1] would raise IndexError.
if np.ndim(y_pred_probs) == 2:
    y_pred_probs = tf.nn.softmax(y_pred_probs, axis=-1).numpy()[:, 1]  # Positive-class probabilities

# ROC points and AUC
fpr, tpr, thresholds = roc_curve(y_true, y_pred_probs)
roc_auc_hft = auc(fpr, tpr)

# Report the AUC
print(f"AUC: {roc_auc_hft:.2f}")

# Draw the ROC curve together with the chance diagonal
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC curve (AUC = {roc_auc_hft:.2f})")
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

In [None]:
# Precision-recall points and Average Precision for the Hugging Face model
precision, recall, thresholds = precision_recall_curve(y_true, y_pred_probs)
avg_precision = average_precision_score(y_true, y_pred_probs)
print(f"Average Precision (AP): {avg_precision:.2f}")

# Draw the precision-recall curve
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, label=f'AP = {avg_precision:.2f}', color='blue')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Curva Precision-Recall - Modelo HFT')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Save the fine-tuned model and its tokenizer to separate Google Drive folders

model_hft.save_pretrained('/content/drive/MyDrive/Capstone IMMUNE /Entregables/Modelos/hft_model')
tokenizer_hft.save_pretrained('/content/drive/MyDrive/Capstone IMMUNE /Entregables/Modelos/hft_tokenizer')



## Análisis de sentimiento con LSTM

### Importar paquetes necesarios

In [None]:
import tensorflow as tf #Importar Tensorflow
from tensorflow.keras.models import Sequential #  Modelo secuencial
from tensorflow.keras.layers import (Embedding, LSTM, Bidirectional, Dense,
                                     Dropout, Conv1D, GlobalMaxPooling1D) # Layers requeridas
from tensorflow.keras.preprocessing.text import Tokenizer # Creación de tokens
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau # Para evitar el overfitting
from tensorflow.keras import regularizers # Para regularización
import tensorflow.keras.backend as K # Para crear una métrica personalizada
from tensorflow.keras.preprocessing.sequence import pad_sequences # Normalización de texto

from sklearn.model_selection import train_test_split # Train/Test split
from sklearn.preprocessing import LabelEncoder # Codificar etiquetas

### Modelo

In [None]:
# Tokenise the training reviews, build a bidirectional LSTM classifier and
# train it.
#
# This cell previously ran the tokenisation and the split twice: once with
# `tokenizer_lstm` (vocabulary 10000) and once with `tokenizer` (vocabulary
# 20000). The model was trained on the second one, while `tokenizer_lstm` is
# the name used when the tokenizer is saved, so the saved tokenizer did not
# match the trained model. A single pipeline is now used and `tokenizer_lstm`
# is the tokenizer the model is trained with.
#
# NOTE: df_test is not used in this cell. X_test / y_test are a 20 % hold-out
# taken from df_train and act as the validation set.


# Custom F1 metric
def f1_metric(y_true, y_pred):
    """Return the F1 score of the rounded predictions against *y_true*.

    Predictions are rounded to the nearest integer before true positives,
    false positives and false negatives are summed. ``K.epsilon()`` is added
    to each denominator to avoid division by zero.
    """
    y_true = K.cast(y_true, 'float32')
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true * y_pred, 'float32'))
    fp = K.sum(K.cast((1 - y_true) * y_pred, 'float32'))
    fn = K.sum(K.cast(y_true * (1 - y_pred), 'float32'))
    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())
    return 2 * (precision * recall) / (precision + recall + K.epsilon())


# Vocabulary size and sequence length
max_vocab = 20000  # Maximum number of distinct words kept by the tokenizer

# Number of whitespace-separated words per review; the 95th percentile is used
# as the padded length so that at least 95 % of reviews fit without truncation.
lengths = df['clean_review_text'].apply(lambda x: len(x.split()))
max_len = int(np.percentile(lengths, 95))

# Fit the tokenizer on the training texts only (OOV: out of vocabulary)
tokenizer_lstm = Tokenizer(num_words=max_vocab, oov_token="<OOV>")
tokenizer_lstm.fit_on_texts(df_train['clean_review_text'])
# Alias kept because this cell previously bound the trained tokenizer to `tokenizer`
tokenizer = tokenizer_lstm

# Convert the texts to integer sequences and pad/truncate them at the end
X_sequences = tokenizer_lstm.texts_to_sequences(df_train['clean_review_text'])
X_padded = pad_sequences(X_sequences, maxlen=max_len, padding='post', truncating='post')

# Train / validation split of the padded training data
X_train, X_test, y_train, y_test = train_test_split(X_padded, df_train['target'], test_size=0.2, random_state=42)

# Model: embedding -> bidirectional LSTM -> dense head with dropout
model_lstm = Sequential([
    Embedding(max_vocab, 128, input_length=max_len),
    Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model_lstm.compile(loss='binary_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy', f1_metric])

# Callbacks: early stopping and learning-rate reduction, both on validation loss
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1, min_lr=1e-6)

# Training; class_weights_dict is defined elsewhere in the notebook
history_lstm = model_lstm.fit(X_train, y_train,
                         validation_data=(X_test, y_test),
                         epochs=12,
                         batch_size=64,
                         class_weight=class_weights_dict,
                         callbacks=[early_stop, lr_reduce])


### Evaluar el modelo

In [None]:
# Training vs. validation accuracy per epoch for the LSTM

for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history_lstm.history[curve], label=curve)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

In [None]:
# Predict with the trained LSTM model and threshold the probabilities at 0.5

y_pred_probs = model_lstm.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype("int32")

# Classification report: dict form stored for later use, text form printed

report_lstm = classification_report(y_test, y_pred, target_names=['Negativa', 'Positiva'],output_dict=True)
# The header previously started with the invalid escape "\R", which printed a
# literal backslash; a leading newline was intended.
print("\nReporte de clasificación: LSTM")
print(classification_report(y_test, y_pred, target_names=['Negativa', 'Positiva']))

In [None]:
# ROC points and area under the curve for the LSTM
fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)
roc_auc_lstm = auc(fpr, tpr)

# Report the AUC
print(f"AUC: {roc_auc_lstm:.2f}")

# Draw the ROC curve together with the chance diagonal
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2,
        label=f"ROC curve (AUC = {roc_auc_lstm:.2f})")
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC")
ax.legend(loc="lower right")
ax.grid(True)
plt.show()

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

# Precision-recall points and Average Precision for the LSTM
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_probs)
avg_precision = average_precision_score(y_test, y_pred_probs)
print(f"Average Precision (AP): {avg_precision:.2f}")

# Draw the precision-recall curve
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, label=f'AP = {avg_precision:.2f}', color='blue')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Curva Precision-Recall - Modelo LSTM')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Persist the trained LSTM model as a single .keras file
model_lstm.save('/content/drive/MyDrive/Capstone IMMUNE /Entregables/Modelos/lstm_model.keras')

# Serialise the tokenizer to a JSON string and choose its destination
tokenizer_json = tokenizer_lstm.to_json()
tokenizer_path = '/content/drive/MyDrive/Capstone IMMUNE /Entregables/Modelos/tokenizer.json'

# Write the tokenizer JSON next to the model
with open(tokenizer_path, 'w') as f:
    f.write(tokenizer_json)

# Sección 4: Comparación entre modelos

In [None]:
# Helper that flattens one classification report into a single table row

def extract_metrics(report, model_name, roc_auc):
    """Flatten a classification-report dict into one flat row.

    Args:
        report: Mapping with ``"Negativa"``, ``"Positiva"`` and ``"macro avg"``
            entries (each holding ``"precision"``, ``"recall"`` and
            ``"f1-score"``) plus a top-level ``"accuracy"`` value.
        model_name: Label stored under ``"Modelo"``.
        roc_auc: Value stored under ``"ROC AUC"``.

    Returns:
        A dict whose keys appear in this order: ``"Modelo"``, the three metrics
        for each of the three groups, ``"Accuracy"`` and ``"ROC AUC"``.
    """
    groups = (
        ("Negativa", " Negativa"),
        ("Positiva", " Positiva"),
        ("macro avg", " (macro avg)"),
    )
    metric_names = (
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("F1-score", "f1-score"),
    )

    row = {"Modelo": model_name}
    for report_key, column_suffix in groups:
        group_scores = report[report_key]
        for column_prefix, metric_key in metric_names:
            row[column_prefix + column_suffix] = group_scores[metric_key]
    row["Accuracy"] = report["accuracy"]
    row["ROC AUC"] = roc_auc
    return row


In [None]:
# One (display name, classification report, ROC AUC) entry per model, in the
# order the models should appear in the lookup dictionaries
model_entries = [
    ("Regresión Logística", report_rl, roc_auc_rl),
    ("SVM", report_svm, roc_auc_svm),
    ("Naive Bayes", report_nb, roc_auc_nb),
    ("Árbol de Decisión", report_dt, roc_auc_dt),
    ("XGBoost", report_xgb, roc_auc_xgb),
    ("SVCLinear", report_svcl, roc_auc_svcl),
    ("Clasificador NLTK", report_nltk, roc_auc_nltk),
    ("RNN", report_rnn, roc_auc_rnn),
    ("BERT", report_bert, roc_auc_bert),
    ("HuggingFace Transformers", report_hft, roc_auc_hft),
    ("LSTM", report_lstm, roc_auc_lstm),
]

# Display name -> classification report
report_dict = {name: model_report for name, model_report, _ in model_entries}

# Display name -> ROC AUC
roc_auc_dict = {name: model_auc for name, _, model_auc in model_entries}




In [None]:
# Build the comparison table: one flattened row per model
comparative_results = []

for model_name, report in report_dict.items():
    roc_auc = roc_auc_dict[model_name]
    row = extract_metrics(report, model_name, roc_auc)
    comparative_results.append(row)

df_comparative = pd.DataFrame(comparative_results)
df_comparative = df_comparative.set_index("Modelo").round(3)

# Show the table, best ROC AUC first
display(df_comparative.sort_values(by='ROC AUC',ascending=False))

# Save the table (in its original, unsorted order)

df_comparative.to_csv('/content/drive/MyDrive/Capstone IMMUNE /Datasets/comparativa de modelos.csv')



In [None]:
# Models ordered from highest to lowest ROC AUC
df_sorted = df_comparative.sort_values(by="ROC AUC", ascending=False)

# One bar per model
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_sorted, x=df_sorted.index, y="ROC AUC", palette="viridis", ax=ax)
ax.set_title("Comparación de ROC AUC por Modelo", fontsize=14)
ax.set_ylabel("ROC AUC")
ax.set_xlabel("Modelo")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Order the rows by the chosen metric, best first
metric_sort = "ROC AUC"
df_sorted = df_comparative.sort_values(by=metric_sort, ascending=False)

# Heatmap of every metric (columns) for every model (rows)
fig, ax = plt.subplots(figsize=(12, 7))
sns.heatmap(
    df_sorted,
    annot=True,
    fmt=".2f",
    cmap="YlGnBu",
    linewidths=0.5,
    cbar_kws={'label': 'Valor de Métrica'},
    ax=ax,
)

ax.set_title("Métricas por Modelo", fontsize=14, weight='bold')
ax.set_xlabel("Métricas")
ax.set_ylabel("Modelo")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
# Write the figure to the working directory before showing it
plt.savefig("heatmap_metricas_modelos.png", dpi=300)
plt.show()
