In [None]:
## Mount Google Drive at /content/drive (Colab-only: relies on google.colab)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the dataset file from Google Drive
import pandas as pd

file_path =  "linkXlsx"  # NOTE(review): looks like a placeholder — set the real path to the .xlsx file

# Read the Excel file into a DataFrame
df = pd.read_excel(file_path)

In [None]:
# Remove the cupy package from the runtime without prompting (-y).
# NOTE(review): the reason is not stated in this notebook — presumably a conflict
# with a library used below; confirm and document it.
!pip uninstall -y cupy


**Entrena el modelo**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from lightgbm import LGBMClassifier

# Seed NumPy's global RNG; the split and the model below also receive their own
# random_state so the run is repeatable.
np.random.seed(42)


# Selected features. Commented-out names are columns deliberately left out of this run.
columnas_seleccionadas = [
    'POS', 'NEU', 'NEG', 'tristeza', 'miedo', 'disgusto', 'enojo', 'sorpresa', 'alegria',
    'toxicity',
    'me', 'mi', 'yo',
    # 'mí',
    'num_palabras_mayusculas',
    'num_palabras_largas',
    # 'num_signos_puntuacion',
    'negaciones',
    'falta de motivacion',
    'aislamiento social',
    'pensamientos suicidas',
    # 'baja autoestima',
    # 'insomnio',
    # 'ataques de panico',
    # 'pensamientos acelerados'
]

# Separate features and target. The fitted encoder is kept so the integer class
# ids can be mapped back to the original labels via label_encoder.classes_.
X = df[columnas_seleccionadas]
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])  # the model below is configured for 3 classes

# Stratified 70/30 split: stratify=y keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Min-max scaling of the count features. The scaler is fitted on the training
# part only and then applied to the test part, so no test-set statistics leak
# into preprocessing. df and X are left untouched, which keeps this cell
# safe to re-run.
scaler = MinMaxScaler()
cols_to_normalize = ['num_palabras_largas', 'num_palabras_mayusculas', 'num_signos_puntuacion']
cols_to_normalize = [col for col in cols_to_normalize if col in X_train.columns]
X_train = X_train.copy()
X_test = X_test.copy()
if cols_to_normalize:
    X_train[cols_to_normalize] = scaler.fit_transform(X_train[cols_to_normalize])
    X_test[cols_to_normalize] = scaler.transform(X_test[cols_to_normalize])

# Base model with fixed hyperparameters
model = LGBMClassifier(
    objective='multiclass',
    num_class=3,
    boosting_type='gbdt',
    n_estimators=100,
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.7,
    verbosity=-1,
    random_state=42,
    class_weight='balanced'
)

# Training
model.fit(X_train, y_train)

# Prediction on the held-out part
y_pred = model.predict(X_test)

# Metrics (precision/recall/F1 are support-weighted averages over the classes)
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred, average='weighted'):.4f}")
print(f"Recall:    {recall_score(y_test, y_pred, average='weighted'):.4f}")
print(f"F1-Score:  {f1_score(y_test, y_pred, average='weighted'):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


**Búsqueda de mejores hiperparámetros**

In [None]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Hyperparameter grid: 3 x 3 x 3 x 3 = 81 combinations. Entries with a single
# value (boosting_type, objective, random_state, num_class) are held fixed.
param_grid = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200],
    'boosting_type': ['gbdt'],
    'objective': ['multiclass'],
    'feature_fraction': [0.8, 0.7, 0.6],
    'random_state': [42],
    'num_class': [len(set(y_train))],  # number of classes present in the training labels
}

# Estimator to tune. verbosity=-1 matches the other models in this notebook and
# keeps the 81 x 5 fits from flooding the cell output with LightGBM log lines.
# NOTE(review): class_weight is not set here, unlike the base model trained
# earlier in this notebook — confirm that is intended.
lgbm = LGBMClassifier(verbosity=-1)

# Exhaustive search with 5-fold cross-validation, scored by weighted F1
grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    cv=5,
    scoring='f1_weighted',
    verbose=1
)

# Fit on the training data only; the test split is used once, below
grid_search.fit(X_train, y_train)

# Report the best parameter set and its mean cross-validated score
print("Mejores hiperparámetros encontrados:", grid_search.best_params_)
print("Mejor F1 Score obtenido en validación cruzada:", grid_search.best_score_)

# Best estimator found by the search
best_model = grid_search.best_estimator_

# Evaluate it on the held-out test split (this overwrites y_pred from earlier cells)
y_pred = best_model.predict(X_test)

# Test-set metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

# Show results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)


**Validación cruzada**

In [None]:
from sklearn.model_selection import cross_validate
from lightgbm import LGBMClassifier

# Model evaluated with cross-validation.
# NOTE(review): these hyperparameters are hard-coded rather than read from a
# search result, and class_weight is not set — confirm they are the intended
# "best" configuration announced in the printed header below.
lgbm_best = LGBMClassifier(
    objective='multiclass',
    num_class=3,
    boosting_type='gbdt',
    n_estimators=100,
    num_leaves=31,
    learning_rate=0.1,
    feature_fraction=0.8,
    verbosity=-1,
    random_state=42
)

# Metrics collected on every fold
scoring_metrics = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']

# 5-fold cross-validation on the training split; only test-fold scores are kept
cv_results = cross_validate(
    lgbm_best,
    X_train,
    y_train,
    cv=5,
    scoring=scoring_metrics,
    return_train_score=False,
)

# Per-metric mean and standard deviation across the folds
acc_scores = cv_results['test_accuracy']
prec_scores = cv_results['test_precision_weighted']
rec_scores = cv_results['test_recall_weighted']
f1_scores = cv_results['test_f1_weighted']

mean_accuracy, std_accuracy = acc_scores.mean(), acc_scores.std()
mean_precision, std_precision = prec_scores.mean(), prec_scores.std()
mean_recall, std_recall = rec_scores.mean(), rec_scores.std()
mean_f1, std_f1 = f1_scores.mean(), f1_scores.std()

# Show results (mis-encoded accents and the ± sign in the messages are repaired)
print("Resultados de Validación Cruzada con los Mejores Hiperparámetros para LGBM:")
print(f"Accuracy Promedio: {mean_accuracy:.4f} ± {std_accuracy:.4f}")
print(f"Precision Promedio: {mean_precision:.4f} ± {std_precision:.4f}")
print(f"Recall Promedio: {mean_recall:.4f} ± {std_recall:.4f}")
print(f"F1 Score Promedio: {mean_f1:.4f} ± {std_f1:.4f}")


In [None]:
import matplotlib.pyplot as plt
import lightgbm as lgb

# Feature importance of the base model, drawn on one explicitly created axes.
# Creating the figure here and passing ax= avoids the stray empty figure that
# appears when plt.figure() is followed by plot_importance(figsize=...), which
# opens a second figure of its own.
fig, ax = plt.subplots(figsize=(10, 6))
lgb.plot_importance(
    model,
    ax=ax,
    max_num_features=20,           # top 20 features
    importance_type='gain',        # importance measured by gain
    height=0.5,                    # bar height
    title='Importancia de características (gain)',
    xlabel='Ganancia total',
    ylabel='Características',
    grid=False
)
plt.tight_layout()
plt.show()
