In [None]:
## Mount Google Drive
# Mounts the Drive at /content/drive; relies on the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the dataset file from Google Drive
import pandas as pd


# Full path of the Excel file inside your Google Drive
# NOTE(review): 'linkXlsx' looks like a placeholder — replace it with the real path.
file_path = 'linkXlsx'

# Read the Excel file into a DataFrame
df = pd.read_excel(file_path)

**Random Forest**

In [None]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from lightgbm import LGBMClassifier

# Fetch the NLTK resources used by the text preprocessing below:
# the stop-word lists and WordNet.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Text preprocessing

# Translation table that deletes ASCII punctuation plus the Spanish opening
# marks and typographic quotes that string.punctuation does not contain
# (otherwise a token such as "¿por" would never match the stop word "por").
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + '¿¡«»“”‘’…')

# Lazily filled cache so the stop-word set and the lemmatizer are created once
# rather than once per message.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the (stop_words, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('spanish'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one message for bag-of-words vectorisation.

    Steps: lower-case, delete punctuation, delete digit characters, split on
    whitespace, drop Spanish stop words, lemmatise each remaining token and
    join the tokens back with single spaces.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example the
            NaN pandas uses for an empty cell) is treated as an empty message.

    Returns:
        The cleaned message as a single space-separated string; ``''`` when
        nothing is left or the input was not a string.
    """
    # Missing / non-text cells would otherwise raise AttributeError on .lower().
    if not isinstance(text, str):
        return ''

    # Lower-case and strip punctuation in one pass over the string.
    text = text.lower().translate(_PUNCTUATION_TABLE)
    # Remove digit characters.
    text = ''.join(char for char in text if not char.isdigit())

    stop_words, lemmatizer = _text_resources()
    # Whitespace tokenisation followed by stop-word filtering.
    words = [word for word in text.split() if word not in stop_words]
    # NOTE(review): WordNetLemmatizer is backed by the English WordNet; on
    # Spanish text it presumably leaves most tokens untouched — confirm whether
    # a Spanish lemmatiser or stemmer was intended.
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Clean every message with preprocess_text and keep the result in a new column.
df['message_cleaned'] = df['message'].apply(preprocess_text)

# Features are the cleaned texts; targets are the class labels.
X = df['message_cleaned']
y = df['label']

# Bag-of-words encoding of the cleaned texts.
# NOTE(review): the vocabulary is learned from all rows, including the ones
# that end up in the test split; consider fitting on the training texts only.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

# Display names for the classes, in ascending label order.
# NOTE(review): assumes the sorted label values map to these names — confirm.
CLASS_NAMES = ['Ninguno', 'Depresión', 'Ansiedad']
# Every label present in the dataset (sorted). Passing it to the report keeps
# the row/name pairing fixed and avoids the ValueError raised when a class is
# absent from both y_test and y_pred.
class_labels = np.unique(y)

###############################################################################################
# Random Forest model
###############################################################################################

rf_model = RandomForestClassifier(
    max_depth=12,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=500,
    random_state=42,
    class_weight='balanced'
)
rf_model.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = rf_model.predict(X_test)

# Summary metrics; precision/recall/F1 are averaged weighted by class support.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print the results.
print("\nResultados de las métricas:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Per-class report; zero_division=0 matches the summary metrics above.
print("\nReporte de Clasificación:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=CLASS_NAMES,
    zero_division=0,
))


**LGBM**

In [None]:
# Cast the feature matrices to float32.
# This cell reuses X_train / X_test / y_train / y_test produced by an earlier
# cell, so that cell must have been run first.
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

# Display names for the classes, in ascending label order.
# NOTE(review): assumes the sorted label values map to these names — confirm.
CLASS_NAMES = ['Ninguno', 'Depresión', 'Ansiedad']
# Every label seen in either split (sorted). Passing it to the report keeps the
# row/name pairing fixed and avoids the ValueError raised when a class is
# absent from both y_test and y_pred.
class_labels = np.union1d(y_train, y_test)

# LightGBM model
lgbm_model = LGBMClassifier(
    objective='multiclass',
    num_class=3,
    boosting_type='gbdt',
    n_estimators=100,
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.7,
    verbosity=-1,
    random_state=42,
    class_weight='balanced'
)

# Training
lgbm_model.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = lgbm_model.predict(X_test)

# Summary metrics; precision/recall/F1 are averaged weighted by class support.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print the results.
print("\nResultados de las métricas:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Per-class report; zero_division=0 matches the summary metrics above.
print("\nReporte de Clasificación:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=CLASS_NAMES,
    zero_division=0,
))


**XGBoost**

In [None]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from xgboost import XGBClassifier

# Fetch the NLTK resources used by the text preprocessing below:
# the stop-word lists and WordNet.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Text preprocessing
# NOTE(review): preprocess_text is defined in several cells of this notebook;
# consider defining it once and reusing it.

# Translation table that deletes ASCII punctuation plus the Spanish opening
# marks and typographic quotes that string.punctuation does not contain
# (otherwise a token such as "¿por" would never match the stop word "por").
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + '¿¡«»“”‘’…')

# Lazily filled cache so the stop-word set and the lemmatizer are created once
# rather than once per message.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the (stop_words, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('spanish'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one message for bag-of-words vectorisation.

    Steps: lower-case, delete punctuation, delete digit characters, split on
    whitespace, drop Spanish stop words, lemmatise each remaining token and
    join the tokens back with single spaces.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example the
            NaN pandas uses for an empty cell) is treated as an empty message.

    Returns:
        The cleaned message as a single space-separated string; ``''`` when
        nothing is left or the input was not a string.
    """
    # Missing / non-text cells would otherwise raise AttributeError on .lower().
    if not isinstance(text, str):
        return ''

    # Lower-case and strip punctuation in one pass over the string.
    text = text.lower().translate(_PUNCTUATION_TABLE)
    # Remove digit characters.
    text = ''.join(char for char in text if not char.isdigit())

    stop_words, lemmatizer = _text_resources()
    # Whitespace tokenisation followed by stop-word filtering.
    words = [word for word in text.split() if word not in stop_words]
    # NOTE(review): WordNetLemmatizer is backed by the English WordNet; on
    # Spanish text it presumably leaves most tokens untouched — confirm whether
    # a Spanish lemmatiser or stemmer was intended.
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Clean every message with preprocess_text and keep the result in a new column.
df['message_cleaned'] = df['message'].apply(preprocess_text)

# Features are the cleaned texts; targets are the class labels.
X = df['message_cleaned']
y = df['label']

# Bag-of-words encoding of the cleaned texts.
# NOTE(review): the vocabulary is learned from all rows, including the ones
# that end up in the test split; consider fitting on the training texts only.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

# Dense float32 copies of both splits.
# NOTE(review): the original comment says XGBoost requires this — confirm; a
# dense copy of a bag-of-words matrix can be memory-hungry on large vocabularies.
X_train = X_train.toarray().astype(np.float32)
X_test = X_test.toarray().astype(np.float32)

# Display names for the classes, in ascending label order.
# NOTE(review): assumes the sorted label values map to these names — confirm.
CLASS_NAMES = ['Ninguno', 'Depresión', 'Ansiedad']
# Every label present in the dataset (sorted). Passing it to the report keeps
# the row/name pairing fixed and avoids the ValueError raised when a class is
# absent from both y_test and y_pred.
class_labels = np.unique(y)

# XGBoost model.
# `use_label_encoder` was dropped: it is a deprecated option that current
# XGBoost releases no longer use.
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    eval_metric='mlogloss',
    random_state=42
)

# Training
xgb_model.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = xgb_model.predict(X_test)

# Summary metrics; precision/recall/F1 are averaged weighted by class support.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print the results.
print("\nResultados de las métricas:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Per-class report; zero_division=0 matches the summary metrics above.
print("\nReporte de Clasificación:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=CLASS_NAMES,
    zero_division=0,
))


**GBM (Gradient Boosting) – nuevo en el top**

In [None]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK resources used by the text preprocessing below:
# the stop-word lists and WordNet.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Text preprocessing
# NOTE(review): preprocess_text is defined in several cells of this notebook;
# consider defining it once and reusing it.

# Translation table that deletes ASCII punctuation plus the Spanish opening
# marks and typographic quotes that string.punctuation does not contain
# (otherwise a token such as "¿por" would never match the stop word "por").
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + '¿¡«»“”‘’…')

# Lazily filled cache so the stop-word set and the lemmatizer are created once
# rather than once per message.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the (stop_words, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('spanish'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one message for bag-of-words vectorisation.

    Steps: lower-case, delete punctuation, delete digit characters, split on
    whitespace, drop Spanish stop words, lemmatise each remaining token and
    join the tokens back with single spaces.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example the
            NaN pandas uses for an empty cell) is treated as an empty message.

    Returns:
        The cleaned message as a single space-separated string; ``''`` when
        nothing is left or the input was not a string.
    """
    # Missing / non-text cells would otherwise raise AttributeError on .lower().
    if not isinstance(text, str):
        return ''

    # Lower-case and strip punctuation in one pass over the string.
    text = text.lower().translate(_PUNCTUATION_TABLE)
    # Remove digit characters.
    text = ''.join(char for char in text if not char.isdigit())

    stop_words, lemmatizer = _text_resources()
    # Whitespace tokenisation followed by stop-word filtering.
    words = [word for word in text.split() if word not in stop_words]
    # NOTE(review): WordNetLemmatizer is backed by the English WordNet; on
    # Spanish text it presumably leaves most tokens untouched — confirm whether
    # a Spanish lemmatiser or stemmer was intended.
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Clean every message with preprocess_text and keep the result in a new column.
df['message_cleaned'] = df['message'].apply(preprocess_text)

# Features are the cleaned texts; targets are the class labels.
X = df['message_cleaned']
y = df['label']

# Bag-of-words encoding of the cleaned texts.
# NOTE(review): the vocabulary is learned from all rows, including the ones
# that end up in the test split; consider fitting on the training texts only.
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

# Dense float64 copies of both splits.
# NOTE(review): the original comment says GradientBoostingClassifier prefers
# this — confirm; a dense copy of a bag-of-words matrix can be memory-hungry.
X_train = X_train.toarray().astype(np.float64)
X_test = X_test.toarray().astype(np.float64)

# Display names for the classes, in ascending label order.
# NOTE(review): assumes the sorted label values map to these names — confirm.
CLASS_NAMES = ['Ninguno', 'Depresión', 'Ansiedad']
# Every label present in the dataset (sorted). Passing it to the report keeps
# the row/name pairing fixed and avoids the ValueError raised when a class is
# absent from both y_test and y_pred.
class_labels = np.unique(y)

# Gradient Boosting model
gbm_model = GradientBoostingClassifier(
    loss='log_loss',
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    subsample=1.0,
    max_features=None,
    random_state=42
)

# Training
gbm_model.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = gbm_model.predict(X_test)

# Summary metrics; precision/recall/F1 are averaged weighted by class support.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print the results.
print("\nResultados de las métricas:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Per-class report; zero_division=0 matches the summary metrics above.
print("\nReporte de Clasificación:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=CLASS_NAMES,
    zero_division=0,
))
