In [1]:
## Montar google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Cargar el archivo desde Google Drive
import pandas as pd


# Especifica la ruta completa del archivo en tu Google Drive
#file_path = '/content/drive/MyDrive/MachineLearning/Mensajes Concatenados.xlsx'
#file_path = '/content/drive/MyDrive/Tesis- Borradores/FASE 1/TEST/Mensajes Concatenados.xlsx'
file_path = '/content/drive/MyDrive/MachineLearning/TRAIN/GPT/Mensajes Concatenados_trainytrial.xlsx'

# Cargar el archivo Excel en un DataFrame
df = pd.read_excel(file_path)

**Random Forest**

In [6]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from lightgbm import LGBMClassifier

nltk.download('stopwords')
nltk.download('wordnet')

# Preprocesamiento de texto
def preprocess_text(text):
    # Convertir a minúsculas
    text = text.lower()
    # Eliminar puntuación
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Eliminar números
    text = ''.join([char for char in text if not char.isdigit()])
    # Tokenizar palabras
    words = text.split()
    # Eliminar stopwords
    stop_words = set(stopwords.words('spanish'))
    words = [word for word in words if word not in stop_words]
    # Lematización
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Aplicar preprocesamiento
df['message_cleaned'] = df['message'].apply(preprocess_text)

# Separar características y etiquetas
X = df['message_cleaned']
y = df['label']

# Transformar en Bag of Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)

# Dividir en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)



###############################################################################################
# Modelo Random Forest
###############################################################################################

rf_model = RandomForestClassifier(
    max_depth=12,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=500,
    random_state=42,
    class_weight='balanced'
)
rf_model.fit(X_train, y_train)

# Predicción
y_pred = rf_model.predict(X_test)

# # Evaluación
# accuracy = accuracy_score(y_test, y_pred)
# print(f'\nAccuracy: {accuracy * 100:.2f}%')

# # Reporte de clasificación
# print("\nReporte de clasificación:")
# print(classification_report(y_test, y_pred, target_names=['Ninguno', 'Depresión', 'Ansiedad']))


# Calcular métricas adicionales
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Imprimir resultados
print("\nResultados de las métricas:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Reporte de clasificación detallado
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred, target_names=['Ninguno', 'Depresión', 'Ansiedad']))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...



Resultados de las métricas:
Accuracy: 0.6701
Precision: 0.7304
Recall: 0.6701
F1 Score: 0.6504

Reporte de Clasificación:
              precision    recall  f1-score   support

     Ninguno       0.59      0.91      0.72        45
   Depresión       0.82      0.38      0.52        37
    Ansiedad       0.91      0.67      0.77        15

    accuracy                           0.67        97
   macro avg       0.78      0.65      0.67        97
weighted avg       0.73      0.67      0.65        97



**LGBM**

In [7]:
# Convertir la matriz de características a tipo flotante
X_train = X_train.astype(np.float32)
X_test = X_test.astype(np.float32)

# Modelo LightGBM
lgbm_model = LGBMClassifier(
    objective='multiclass',
    num_class=3,
    boosting_type='gbdt',
    n_estimators=100,
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.7,
    verbosity=-1,
    random_state=42,
    class_weight='balanced'
)

# Entrenamiento
lgbm_model.fit(X_train, y_train)

# Predicción
y_pred = lgbm_model.predict(X_test)

# Calcular métricas adicionales
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Imprimir resultados
print("\nResultados de las métricas:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Reporte de clasificación detallado
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred, target_names=['Ninguno', 'Depresión', 'Ansiedad']))





Resultados de las métricas:
Accuracy: 0.6804
Precision: 0.6949
Recall: 0.6804
F1 Score: 0.6741

Reporte de Clasificación:
              precision    recall  f1-score   support

     Ninguno       0.67      0.76      0.71        45
   Depresión       0.76      0.51      0.61        37
    Ansiedad       0.62      0.87      0.72        15

    accuracy                           0.68        97
   macro avg       0.68      0.71      0.68        97
weighted avg       0.69      0.68      0.67        97





** **texto en negrita**XG BOOST**

In [None]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from xgboost import XGBClassifier

nltk.download('stopwords')
nltk.download('wordnet')

# Preprocesamiento de texto
def preprocess_text(text):
    # Convertir a minúsculas
    text = text.lower()
    # Eliminar puntuación
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Eliminar números
    text = ''.join([char for char in text if not char.isdigit()])
    # Tokenizar palabras
    words = text.split()
    # Eliminar stopwords
    stop_words = set(stopwords.words('spanish'))
    words = [word for word in words if word not in stop_words]
    # Lematización
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Aplicar preprocesamiento
df['message_cleaned'] = df['message'].apply(preprocess_text)

# Separar características y etiquetas
X = df['message_cleaned']
y = df['label']

# Transformar en Bag of Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)

# Dividir en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

# Convertir a tipo float32 (requerido por XGBoost)
X_train = X_train.toarray().astype(np.float32)
X_test = X_test.toarray().astype(np.float32)

# Modelo XGBoost
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    eval_metric='mlogloss',
    random_state=42,
    use_label_encoder=False
)

# Entrenamiento
xgb_model.fit(X_train, y_train)

# Predicción
y_pred = xgb_model.predict(X_test)

# Calcular métricas adicionales
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Imprimir resultados
print("\nResultados de las métricas:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Reporte de clasificación detallado
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred, target_names=['Ninguno', 'Depresión', 'Ansiedad']))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
Parameters: { "use_label_encoder" } are not used.




Resultados de las métricas:
Accuracy: 0.7742
Precision: 0.7920
Recall: 0.7742
F1 Score: 0.7752

Reporte de Clasificación:
              precision    recall  f1-score   support

     Ninguno       0.77      0.82      0.79        44
   Depresión       0.89      0.69      0.77        35
    Ansiedad       0.63      0.86      0.73        14

    accuracy                           0.77        93
   macro avg       0.76      0.79      0.76        93
weighted avg       0.79      0.77      0.78        93



GBM - NUEVO EN EL TOP

In [9]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

# Preprocesamiento de texto
def preprocess_text(text):
    # Convertir a minúsculas
    text = text.lower()
    # Eliminar puntuación
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Eliminar números
    text = ''.join([char for char in text if not char.isdigit()])
    # Tokenizar palabras
    words = text.split()
    # Eliminar stopwords
    stop_words = set(stopwords.words('spanish'))
    words = [word for word in words if word not in stop_words]
    # Lematización
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Aplicar preprocesamiento
df['message_cleaned'] = df['message'].apply(preprocess_text)

# Separar características y etiquetas
X = df['message_cleaned']
y = df['label']

# Transformar en Bag of Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)

# Dividir en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

# Convertir a tipo float64 (GradientBoostingClassifier lo prefiere así)
X_train = X_train.toarray().astype(np.float64)
X_test = X_test.toarray().astype(np.float64)

# Modelo Gradient Boosting
gbm_model = GradientBoostingClassifier(
    loss='log_loss',
   learning_rate=0.1,
   n_estimators=100,
   max_depth=3,
   subsample=1.0,
   max_features=None,
   random_state=42
)

# Entrenamiento
gbm_model.fit(X_train, y_train)

# Predicción
y_pred = gbm_model.predict(X_test)

# Calcular métricas
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Imprimir resultados
print("\nResultados de las métricas:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Reporte de clasificación
print("\nReporte de Clasificación:")
print(classification_report(y_test, y_pred, target_names=['Ninguno', 'Depresión', 'Ansiedad']))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Resultados de las métricas:
Accuracy: 0.6598
Precision: 0.6769
Recall: 0.6598
F1 Score: 0.6534

Reporte de Clasificación:
              precision    recall  f1-score   support

     Ninguno       0.61      0.76      0.67        45
   Depresión       0.75      0.49      0.59        37
    Ansiedad       0.71      0.80      0.75        15

    accuracy                           0.66        97
   macro avg       0.69      0.68      0.67        97
weighted avg       0.68      0.66      0.65        97

