In [1]:
!pip install scikit-learn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score




In [2]:
# === Mount Google Drive ===
from google.colab import drive

# Directory where Drive is mounted; the dataset paths read later live under it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [4]:
# Load the training and test spreadsheets from the mounted Drive folder.
# Both files share the same project root, so it is spelled out once.
RUTA_BASE = "/content/drive/MyDrive/MachineLearning"
df_train = pd.read_excel(f"{RUTA_BASE}/TRAIN/GPT/grouped_trainytrial.xlsx")
df_test = pd.read_excel(f"{RUTA_BASE}/TEST/grouped_data_test.xlsx")




**Entrenar el modelo**

In [8]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    balanced_accuracy_score, classification_report
)
import joblib

# === PREPROCESSING OBJECTS ===
# label_encoder turns the values of df_train['label'] into integer class ids;
# scaler min-max scales the columns listed in columnas_escaladas below.
label_encoder = LabelEncoder()
scaler = MinMaxScaler()

# === Feature selection ===
# Commented-out entries are candidate features currently left out of the model.
columnas_seleccionadas = [
    'toxicity',

    'POS','NEU','NEG',

    'alegria', 'tristeza' ,'miedo','disgusto','enojo', 'sorpresa',

    'yo', 'me', 'mi', 'mí',

    'num_palabras_largas',
    #'num_signos_puntuacion'
    #'num_palabras_mayusculas',
    #'num_palabras_primera_mayuscula',
    'negaciones',

    #'hora'

    #'Medicamento_ansiedad','Medicamento_depresion',

    'falta de motivacion',
    'aislamiento social',
    'pensamientos suicidas',
    #'baja autoestima',

    #'insomnio',
    'pensamientos acelerados',
    #'ataques de panico',
]

# Only these selected columns are min-max scaled; every other feature is
# passed to the model as-is.
columnas_escaladas = ['num_palabras_largas', 'negaciones']

# === Build X and y ===
# X_train keeps the raw (unscaled) selected features; scaling is applied to
# the partitions below.
X_train = df_train[columnas_seleccionadas].copy()
y_train = label_encoder.fit_transform(df_train['label'])

# === VALIDATION SPLIT ===
# Split BEFORE fitting the scaler so that the validation rows never influence
# the min/max learned by the scaler (fitting on the full frame first leaks
# validation information into preprocessing).
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42
)
# Work on explicit copies so the column assignments below do not trigger
# pandas' SettingWithCopyWarning.
X_train_split = X_train_split.copy()
X_val_split = X_val_split.copy()

# === Scaling ===
# Fit on the training partition only, then reuse the fitted scaler for validation.
X_train_split[columnas_escaladas] = scaler.fit_transform(
    X_train_split[columnas_escaladas]
)
X_val_split[columnas_escaladas] = scaler.transform(
    X_val_split[columnas_escaladas]
)

# === MANUALLY CHOSEN GBM HYPERPARAMETERS ===
modelo_gbm = GradientBoostingClassifier(
    loss='log_loss',
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    subsample=1.0,
    max_features=None,
    min_samples_split=2,
    random_state=42
)

# === TRAINING ===
modelo_gbm.fit(X_train_split, y_train_split)

# === PREDICTION AND EVALUATION ON THE VALIDATION PARTITION ===
y_val_pred = modelo_gbm.predict(X_val_split)

# Precision / recall / F1 are support-weighted averages over the classes.
accuracy = accuracy_score(y_val_split, y_val_pred)
precision = precision_score(y_val_split, y_val_pred, average='weighted')
recall = recall_score(y_val_split, y_val_pred, average='weighted')
f1 = f1_score(y_val_split, y_val_pred, average='weighted')
balanced_accuracy = balanced_accuracy_score(y_val_split, y_val_pred)

# === RESULTS ===
print("\nResultados del modelo GBM:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Balanced Accuracy: {balanced_accuracy:.4f}")
print("\nReporte de Clasificación:")
print(classification_report(y_val_split, y_val_pred))

# === SAVE THE MODEL AND THE FITTED TRANSFORMERS ===
# Everything needed to reproduce the preprocessing at inference time is
# persisted next to the model: scaler, label encoder, feature list and classes.
joblib.dump(modelo_gbm, 'gbm_model_manual.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')
joblib.dump(columnas_seleccionadas, 'columnas_seleccionadas.pkl')
joblib.dump(label_encoder.classes_, 'clases_modelo.pkl')

print("\nModelo GBM guardado exitosamente.")


Resultados del modelo GBM:
Accuracy: 0.8219
Precision: 0.8219
Recall: 0.8219
F1 Score: 0.8219
Balanced Accuracy: 0.8097

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86        76
           1       0.78      0.78      0.78        51
           2       0.79      0.79      0.79        19

    accuracy                           0.82       146
   macro avg       0.81      0.81      0.81       146
weighted avg       0.82      0.82      0.82       146


Modelo GBM guardado exitosamente.


**Pruebas con TEST**

In [9]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# === 1. Load the model and the fitted transformers ===
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts that were
# produced by the training cell of this notebook.
modelo_gbm = joblib.load('gbm_model_manual.pkl')
scaler = joblib.load('scaler.pkl')
columnas_seleccionadas = joblib.load('columnas_seleccionadas.pkl')
label_encoder = joblib.load('label_encoder.pkl')

# === 2. Preprocess the test set ===
# Features missing from df_test are still filled with 0 (best-effort), but the
# fill is reported so a schema mismatch does not go unnoticed.
columnas_faltantes = [col for col in columnas_seleccionadas if col not in df_test.columns]
if columnas_faltantes:
    print(f"Advertencia: columnas ausentes en df_test, rellenadas con 0: {columnas_faltantes}")
X_test = df_test.reindex(columns=columnas_seleccionadas, fill_value=0)

# === Scale exactly the columns the scaler was fitted on ===
# The column list comes from the fitted scaler itself instead of a hard-coded
# list, so it cannot drift from what was used at training time.
columnas_a_escalar = list(scaler.feature_names_in_)
# Cast to float first and assign whole columns: writing floats into integer
# columns through .loc triggers pandas' incompatible-dtype FutureWarning.
X_test = X_test.astype({col: 'float64' for col in columnas_a_escalar})
X_test[columnas_a_escalar] = scaler.transform(X_test[columnas_a_escalar])

# === Encode the true labels with the encoder fitted during training ===
# The model predicts encoded class ids, so y_test must use the same encoding.
y_test = label_encoder.transform(df_test['label'])

# === 3. Prediction ===
y_pred = modelo_gbm.predict(X_test)

# === 4. Metrics ===
# Precision / recall / F1 are support-weighted averages over the classes.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Resultados finales en el conjunto de prueba (GBM):")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

print("\nReporte de Clasificación:")

print(classification_report(y_test, y_pred))


Resultados finales en el conjunto de prueba (GBM):
Accuracy: 0.8200
Precision: 0.8251
Recall: 0.8200
F1 Score: 0.8168

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.87      0.94      0.90       200
           1       0.69      0.76      0.72       100
           2       0.88      0.64      0.74       100

    accuracy                           0.82       400
   macro avg       0.81      0.78      0.79       400
weighted avg       0.83      0.82      0.82       400



 0.07246377 0.01811594 0.04347826 0.04891304 0.3442029  0.09782609
 0.01811594 0.13224638 0.11956522 0.0615942  0.05434783 0.09057971
 0.04347826 0.21014493 0.0307971  0.14673913 0.05253623 0.08152174
 0.10507246 0.125      0.23550725 0.04166667 0.04166667 0.1557971
 0.25905797 0.07608696 0.03442029 0.25181159 0.06702899 0.05978261
 0.20833333 0.05978261 0.05434783 0.04347826 0.20108696 0.08333333
 0.04347826 0.04891304 0.15036232 0.20652174 0.04710145 0.01630435
 0.1576087  0.15036232 0.10688406 0.01449275 0.0307971  0.08695652
 0.10326087 0.02898551 0.33514493 0.05797101 0.08514493 0.06702899
 0.04710145 0.22101449 0.10144928 0.11050725 0.03804348 0.02717391
 0.04891304 0.01449275 0.02536232 0.03804348 0.15398551 0.11231884
 0.02355072 0.08152174 0.0634058  0.04891304 0.10507246 0.09057971
 0.19746377 0.13224638 0.02173913 0.02536232 0.16123188 0.08695652
 0.08333333 0.00905797 0.01449275 0.07246377 0.03623188 0.04347826
 0.06702899 0.32427536 0.13043478 0.01449275 0.01268116 0.01992