In [None]:
!pip install scikit-learn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score


In [None]:
# === Mount Google Drive ===
# Makes the user's Drive available under the Colab filesystem.
from google.colab import drive

PUNTO_DE_MONTAJE = '/content/drive'
drive.mount(PUNTO_DE_MONTAJE)

In [None]:
# Load the training and test spreadsheets.
# NOTE(review): both paths are the same placeholder string. If they end up
# pointing at the same workbook, the "test" metrics computed later are measured
# on the training rows — set RUTA_TEST to a held-out file before trusting them.
RUTA_TRAIN = "LinkXlsx"
RUTA_TEST = "LinkXlsx"

df_train = pd.read_excel(RUTA_TRAIN)
df_test = pd.read_excel(RUTA_TEST)

**Entrenar el modelo**

In [None]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    balanced_accuracy_score, classification_report
)
import joblib

# === PREPROCESSING ===
# Unfitted transformers: the scaler is fitted on the numeric count columns and
# the encoder on the 'label' column further down in this cell.
scaler = MinMaxScaler()
label_encoder = LabelEncoder()

# === Feature selection ===
# The order of this list is the column order of the feature matrix, and the
# list itself is persisted next to the model so the test cell can rebuild it.
# Commented-out names are candidate features that are currently disabled.
columnas_seleccionadas = [
    # toxicity column
    'toxicity',
    # POS / NEU / NEG columns (presumably sentiment scores — confirm)
    'POS', 'NEU', 'NEG',
    # emotion-named columns
    'alegria', 'tristeza', 'miedo', 'disgusto', 'enojo', 'sorpresa',
    # Spanish first-person pronoun columns
    'yo', 'me', 'mi', 'mí',
    # text-shape columns; the two active ones are min-max scaled in this cell
    'num_palabras_largas',
    # 'num_signos_puntuacion',
    # 'num_palabras_mayusculas',
    # 'num_palabras_primera_mayuscula',
    'negaciones',
    # 'hora',
    # 'Medicamento_ansiedad', 'Medicamento_depresion',
    # phrase-named columns (presumably symptom indicators — confirm)
    'falta de motivacion',
    'aislamiento social',
    'pensamientos suicidas',
    # 'baja autoestima',
    # 'insomnio',
    'pensamientos acelerados',
    # 'ataques de panico',
]

# === Extract X and y ===
# X_train holds the selected feature columns; y_train holds the integer-encoded
# 'label' column (the fitted encoder is saved later so it can be reused).
X_train = df_train[columnas_seleccionadas].copy()
y_train = label_encoder.fit_transform(df_train['label'])

# === VALIDATION SPLIT ===
# Split BEFORE fitting the scaler so that no statistic of the validation rows
# leaks into preprocessing. `stratify` keeps the class proportions equal in
# both parts, which matters for the balanced-accuracy figure reported below.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)
# Work on explicit copies so the column assignments below never write through
# to (or warn about) the parent frame.
X_train_split = X_train_split.copy()
X_val_split = X_val_split.copy()

# === Scaling (only the count-like columns) ===
columnas_escaladas = ['num_palabras_largas', 'negaciones']
# Fit on the training part only; the validation part is merely transformed.
X_train_split[columnas_escaladas] = scaler.fit_transform(
    X_train_split[columnas_escaladas]
)
X_val_split[columnas_escaladas] = scaler.transform(X_val_split[columnas_escaladas])
# Keep X_train in scaled form as before, using the same fitted scaler, in case
# a later cell reads it.
X_train[columnas_escaladas] = scaler.transform(X_train[columnas_escaladas])

# === MANUALLY CHOSEN GBM HYPERPARAMETERS ===
# Collected in one mapping so they can be inspected or logged as a unit.
# NOTE(review): these look like scikit-learn's default values — confirm whether
# any tuning is still pending.
parametros_gbm = {
    'loss': 'log_loss',
    'learning_rate': 0.1,
    'n_estimators': 100,
    'max_depth': 3,
    'subsample': 1.0,
    'max_features': None,
    'min_samples_split': 2,
    'random_state': 42,  # fixed seed for repeatable fits
}
modelo_gbm = GradientBoostingClassifier(**parametros_gbm)

# === TRAINING ===
# Fit on the training part of the split only; the validation part is held out.
modelo_gbm.fit(X_train_split, y_train_split)

# === PREDICTION AND EVALUATION ===
# Score the held-out validation part of the split.
y_val_pred = modelo_gbm.predict(X_val_split)

# Precision, recall and F1 are averaged across classes weighted by support.
promedio_ponderado = {'average': 'weighted'}
accuracy = accuracy_score(y_val_split, y_val_pred)
precision = precision_score(y_val_split, y_val_pred, **promedio_ponderado)
recall = recall_score(y_val_split, y_val_pred, **promedio_ponderado)
f1 = f1_score(y_val_split, y_val_pred, **promedio_ponderado)
balanced_accuracy = balanced_accuracy_score(y_val_split, y_val_pred)

# === RESULTS ===
print("\nResultados del modelo GBM:")
resumen_validacion = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("Balanced Accuracy", balanced_accuracy),
)
for nombre_metrica, valor_metrica in resumen_validacion:
    print(f"{nombre_metrica}: {valor_metrica:.4f}")

# Per-class breakdown; classes appear as their encoded integer ids.
print("\nReporte de Clasificación:")
print(classification_report(y_val_split, y_val_pred))

# === SAVE MODEL AND TRANSFORMERS ===
# Everything the test cell needs to reproduce preprocessing and prediction is
# written to the current working directory, in this order.
artefactos = (
    (modelo_gbm, 'gbm_model_manual.pkl'),
    (scaler, 'scaler.pkl'),
    (label_encoder, 'label_encoder.pkl'),
    (columnas_seleccionadas, 'columnas_seleccionadas.pkl'),
    (label_encoder.classes_, 'clases_modelo.pkl'),
)
for objeto, nombre_archivo in artefactos:
    joblib.dump(objeto, nombre_archivo)

print("\nModelo GBM guardado exitosamente.")

**Pruebas con TEST**

In [None]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# === 1. Load the model and transformers ===
# joblib files are pickles: only load artifacts written by this notebook.
# The files are read in the listed order and unpacked into the four names:
# the GBM model, the fitted scaler, the feature-column list and the label encoder.
archivos_artefactos = (
    'gbm_model_manual.pkl',
    'scaler.pkl',
    'columnas_seleccionadas.pkl',
    'label_encoder.pkl',
)
modelo_gbm, scaler, columnas_seleccionadas, label_encoder = (
    joblib.load(nombre_archivo) for nombre_archivo in archivos_artefactos
)

# === 2. Preprocess the test set ===
# Align df_test to the exact feature list and order used for training.
# Columns absent from df_test are still filled with 0 (best effort), but they
# are reported so a silently fabricated feature does not go unnoticed.
columnas_faltantes = [
    col for col in columnas_seleccionadas if col not in df_test.columns
]
if columnas_faltantes:
    print(
        "Advertencia: columnas ausentes en df_test, rellenadas con 0: "
        f"{columnas_faltantes}"
    )
X_test = df_test.reindex(columns=columnas_seleccionadas, fill_value=0)

# === Scale exactly the columns the scaler was fitted on ===
# Ask the fitted scaler for its own column names instead of repeating a
# hard-coded list that can drift from the training cell; the literal fallback
# covers scalers that do not expose `feature_names_in_`.
columnas_a_escalar = list(
    getattr(scaler, 'feature_names_in_', ['num_palabras_largas', 'negaciones'])
)
columnas_presentes = columnas_a_escalar  # previous name, kept for later cells
# Whole-column assignment (as in the training cell) replaces the columns, so
# float results can be stored even when the raw columns are integer-typed.
X_test[columnas_a_escalar] = scaler.transform(X_test[columnas_a_escalar])

# === Encode the true labels with the training-time encoder ===
# The model predicts encoded integer classes, so the reference labels must go
# through the same encoder; an unfitted encoder now fails loudly here.
y_test = label_encoder.transform(df_test['label'])

# === 3. Prediction ===
y_pred = modelo_gbm.predict(X_test)

# === 4. Metrics ===
# Precision, recall and F1 are averaged across classes weighted by support.
opciones_promedio = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **opciones_promedio)
recall = recall_score(y_test, y_pred, **opciones_promedio)
f1 = f1_score(y_test, y_pred, **opciones_promedio)

print("Resultados finales en el conjunto de prueba (GBM):")
resumen_test = [
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
]
for nombre_metrica, valor_metrica in resumen_test:
    print(f"{nombre_metrica}: {valor_metrica:.4f}")

# Per-class breakdown; classes appear as their encoded integer ids.
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred))
