# **Importar Librerias Necesarias**

In [0]:
%pip install emoji
%pip install wordcloud
%pip install matplotlib
%pip install tensorflow

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import warnings
warnings.filterwarnings("ignore")

import re
import emoji
import builtins

import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from wordcloud import WordCloud
from IPython.display import display, HTML

from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, IntegerType, NumericType, ArrayType, MapType

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, GRU, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from pyspark.ml import Pipeline
from pyspark.ml.functions import vector_to_array
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import Tokenizer, StopWordsRemover, RegexTokenizer, HashingTF, VectorAssembler, StandardScaler
from pyspark.ml.classification import NaiveBayes, LogisticRegression, LinearSVC

# **Crear Sesión Spark**

In [0]:
spark = SparkSession.builder \
    .appName("EDA_Suicide_Watch") \
    .getOrCreate()

# **Cargar Tabla Procesada Unity Catalog**

Aqui se realizará la carga del conjunto de datos procesado directamente desde la tabla registrada en el entorno de trabajo de Spark. Una vez importada, se muestra su contenido para verificar que la lectura se haya realizado correctamente y para obtener una primera vista general de las filas y columnas disponibles que fuerón procesadas en la etapa anterior.

In [0]:
df_features = spark.table("workspace.suicide_detection.suicide_detection_features")
display(df_features.toPandas())

Unnamed: 0,text_token_numeric,ent_person_ct,ent_location_ct,ent_med_ct,ent_substance_ct,ent_method_ct,ent_qty_ct,ent_actionrisk_ct,class
0,"[40, 169, 23, 86, 691, 80, 68, 17659, 677, 86,...",0,0,0,0,0,0,1,1
1,"[86, 106, 1, 734, 36, 31, 98, 2190, 86, 106, 9...",0,0,0,1,1,4,3,1
2,"[22, 3508, 184, 983, 788, 31, 22, 0, 0, 0, 0, ...",0,0,0,0,2,0,0,1
3,"[3638, 43, 95, 13, 1227, 761, 6, 5645, 1, 1396...",2,2,1,0,3,0,3,1
4,"[2746, 402, 136, 17, 1299, 62, 6, 267, 329, 57...",0,0,0,0,0,2,0,1
...,...,...,...,...,...,...,...,...,...
210747,"[8881, 1, 583, 291, 120, 9, 28, 90, 4, 3048, 0...",0,0,0,0,0,0,0,0
210748,"[40, 4, 28, 15, 150, 56, 4641, 1527, 171, 343,...",0,0,0,0,0,0,0,0
210749,"[227, 27, 972, 4636, 220, 13055, 4, 3229, 1172...",0,0,0,0,0,0,0,0
210750,"[202, 115, 7721, 744, 2610, 335, 1, 5, 105, 14...",0,0,0,0,0,0,0,0


---


---
#<center> **Modelamiento**</center>
---


---


En esta etapa se llevará a cabo la construcción del proceso de modelamiento a partir del conjunto de datos ya preprocesado. Se prepararán las variables necesarias para el aprendizaje automático, se realizará la división del dataset en conjuntos de entrenamiento y prueba, y se definirán diferentes enfoques de modelado que permitirán entrenar modelos clásicos y basados en aprendizaje profundo.

El objetivo de esta fase será dejar configurada la base técnica para el entrenamiento de múltiples modelos, asegurando coherencia en los datos de entrada y flexibilidad para incorporar distintas arquitecturas y estrategias de aprendizaje dentro del marco metodológico CRISP-DM.

###**Escalado/Normalización Variables NER**

En este apartado se realizará el escalado de las variables NER con el objetivo de unificar sus magnitudes y evitar que diferencias de escala influyan en el proceso de entrenamiento. Este ajuste permitirá que todas las entidades aporten información de forma equilibrada dentro de los modelos, dejando el conjunto de datos preparado para estos.

In [0]:
# Columnas NER a escalar
ner_columns = [
    "ent_person_ct",
    "ent_location_ct",
    "ent_med_ct",
    "ent_substance_ct",
    "ent_method_ct",
    "ent_qty_ct",
    "ent_actionrisk_ct"
]

# Ensamblar NER en un vector
assembler_ner = VectorAssembler(
    inputCols=ner_columns,
    outputCol="ner_features"
)

df_features = assembler_ner.transform(df_features)

# Escalador (Sin centrar → No negativos)
scaler_ner = StandardScaler(
    inputCol="ner_features",
    outputCol="ner_features_scaled",
    withMean=False,
    withStd=True
)

# Ajustar escalador sobre todo el dataset
scaler_ner_model = scaler_ner.fit(df_features)

# Aplicar escalado
df_features = scaler_ner_model.transform(df_features)

# Convertir vector escalado a array
df_features = df_features.withColumn(
    "ner_scaled_array",
    vector_to_array(col("ner_features_scaled"))
)

# Sobrescribir columnas NER originales
for i, col_name in enumerate(ner_columns):
    df_features = df_features.withColumn(
        col_name,
        col("ner_scaled_array")[i]
    )

# Limpiar columnas auxiliares
df_features = df_features.drop(
    "ner_features",
    "ner_features_scaled",
    "ner_scaled_array"
)

# Mover columna class al final
cols = df_features.columns
cols_without_label = [c for c in cols if c != "class"]
df_features = df_features.select(cols_without_label + ["class"])

# Mostrar tabla
display(df_features.limit(10).toPandas())

Unnamed: 0,text_token_numeric,ent_person_ct,ent_location_ct,ent_med_ct,ent_substance_ct,ent_method_ct,ent_qty_ct,ent_actionrisk_ct,class
0,"[40, 169, 23, 86, 691, 80, 68, 17659, 677, 86,...",0.0,0.0,0.0,0.0,0.0,0.0,0.829913,1
1,"[86, 106, 1, 734, 36, 31, 98, 2190, 86, 106, 9...",0.0,0.0,0.0,2.319094,1.096491,2.919182,2.489738,1
2,"[22, 3508, 184, 983, 788, 31, 22, 0, 0, 0, 0, ...",0.0,0.0,0.0,0.0,2.192982,0.0,0.0,1
3,"[3638, 43, 95, 13, 1227, 761, 6, 5645, 1, 1396...",2.945541,2.906082,6.047559,0.0,3.289473,0.0,2.489738,1
4,"[2746, 402, 136, 17, 1299, 62, 6, 267, 329, 57...",0.0,0.0,0.0,0.0,0.0,1.459591,0.0,1
5,"[117, 3, 4, 657, 323, 106, 45, 11, 75, 106, 14...",0.0,1.453041,0.0,0.0,1.096491,0.729795,0.0,1
6,"[14, 90, 607, 6692, 28, 90, 132, 197, 21, 11, ...",0.0,0.0,0.0,0.0,0.0,0.729795,0.0,1
7,"[79, 85, 116, 24, 256, 43, 113, 43, 56, 219, 1...",1.472771,2.906082,0.0,0.0,0.0,2.189386,3.319651,1
8,"[2859, 121, 59, 172, 133, 12, 678, 95, 168, 32...",0.0,1.453041,0.0,0.0,0.0,0.729795,2.489738,1
9,"[1044, 18, 689, 6, 4907, 3133, 327, 140, 3346,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


###**División Datos Entrenamiento (Train) & Prueba (Test)**

En este apartado se realizará la separación del dataset en conjuntos de entrenamiento y prueba, garantizando que ambas clases se mantengan representadas de forma equilibrada. Esta división permitirá contar con datos suficientes para entrenar los modelos y, al mismo tiempo, reservar un subconjunto independiente que será utilizado posteriormente para validar su comportamiento.

In [0]:
# Definir proporciones
train_ratio = 0.8
test_ratio = 0.2
seed = 42

# Separar por clase
df_class_0 = df_features.filter("class = 0")
df_class_1 = df_features.filter("class = 1")

# Split independiente por clase
train_0, test_0 = df_class_0.randomSplit([train_ratio, test_ratio], seed=seed)
train_1, test_1 = df_class_1.randomSplit([train_ratio, test_ratio], seed=seed)

# Unir conjuntos estratificados
train_df = train_0.union(train_1)
test_df = test_0.union(test_1)

# Verificar tamaños
train_count = train_df.count()
test_count = test_df.count()
total_count = train_count + test_count

# Crear tabla resumen
split_summary_df = spark.createDataFrame([
    Row(Conjunto="Entrenamiento", Registros=train_count, Proporcion="80%"),
    Row(Conjunto="Prueba", Registros=test_count, Proporcion="20%"),
    Row(Conjunto="Total", Registros=total_count, Proporcion="100%")
])

# Mostrar tabla
display(split_summary_df.toPandas())

Unnamed: 0,Conjunto,Registros,Proporcion
0,Entrenamiento,168732,80%
1,Prueba,42020,20%
2,Total,210752,100%


###**Modelo Clasificación Naive Bayes (Texto)**

En este apartado se construirá y entrenará un modelo Naive Bayes utilizando exclusivamente la información textual del conjunto de datos, permitiendo capturar patrones de frecuencia relevantes para la clasificación. Una vez entrenado el modelo con el conjunto de entrenamiento, se generarán predicciones sobre el conjunto de prueba. Finalmente, se calcularán métricas generales de desempeño que permitirán tener una referencia inicial del comportamiento del modelo dentro del proceso de modelamiento, sin entrar aún en una etapa formal de evaluación comparativa.

In [0]:
# Transformación de texto a features
hashing_tf_nb_text = HashingTF(
    inputCol="text_token_numeric",
    outputCol="features",
    numFeatures=2**18
)

# Modelo Naive Bayes
nb_text = NaiveBayes(
    featuresCol="features",
    labelCol="class",
    smoothing=1.0,
    modelType="multinomial"
)

# Pipeline
nb_text_pipeline = Pipeline(stages=[
    hashing_tf_nb_text,
    nb_text
])

# Entrenamiento
nb_text_model = nb_text_pipeline.fit(train_df)

# Predicciones
nb_text_predictions = nb_text_model.transform(test_df)

# Métricas
accuracy_nb_text = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="accuracy"
).evaluate(nb_text_predictions)

precision_nb_text = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="weightedPrecision"
).evaluate(nb_text_predictions)

recall_nb_text = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="weightedRecall"
).evaluate(nb_text_predictions)

f1_nb_text = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="f1"
).evaluate(nb_text_predictions)

# Crear tabla resultados
metrics_nb_text = pd.DataFrame({
    "Modelo": ["Naive Bayes - Texto"],
    "Accuracy": [builtins.round(accuracy_nb_text, 4)],
    "Precision": [builtins.round(precision_nb_text, 4)],
    "Recall": [builtins.round(recall_nb_text, 4)],
    "F1-score": [builtins.round(f1_nb_text, 4)]
})

# Mostrar tabla
display(metrics_nb_text)

Unnamed: 0,Modelo,Accuracy,Precision,Recall,F1-score
0,Naive Bayes - Texto,0.8265,0.8489,0.8265,0.824


###**Modelo Clasificación Naive Bayes (Texto + NER)**

En este apartado se construirá un modelo Naive Bayes que integrará información textual con variables estructuradas provenientes del reconocimiento de entidades nombradas (NER), que se combinará con los conteos de entidades, permitiendo que el modelo incorpore tanto el contenido semántico como señales adicionales de contexto. Una vez ensambladas estas fuentes de información, el modelo será entrenado utilizando el conjunto de entrenamiento y posteriormente se generarán predicciones sobre el conjunto de prueba. De esta forma, se obtendrá una referencia del comportamiento del modelo al incorporar características textuales y estructuradas dentro de la etapa de modelamiento.

In [0]:
# Transformación de texto a features
hashing_tf_nb_text_ner = HashingTF(
    inputCol="text_token_numeric",
    outputCol="text_features",
    numFeatures=2**18
)

# Ensamblar Texto + NER
ner_columns_nb = [
    "ent_person_ct",
    "ent_location_ct",
    "ent_med_ct",
    "ent_substance_ct",
    "ent_method_ct",
    "ent_qty_ct",
    "ent_actionrisk_ct"
]

assembler_nb_text_ner = VectorAssembler(
    inputCols=["text_features"] + ner_columns_nb,
    outputCol="features"
)

# Modelo Naive Bayes
nb_text_ner = NaiveBayes(
    featuresCol="features",
    labelCol="class",
    modelType="multinomial",
    smoothing=1.0
)

# Pipeline completo
nb_text_ner_pipeline = Pipeline(stages=[
    hashing_tf_nb_text_ner,
    assembler_nb_text_ner,
    nb_text_ner
])

# Entrenamiento
nb_text_ner_model = nb_text_ner_pipeline.fit(train_df)

# Predicciones
nb_text_ner_predictions = nb_text_ner_model.transform(test_df)

# Métricas
accuracy_nb_text_ner = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="accuracy"
).evaluate(nb_text_ner_predictions)

precision_nb_text_ner = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="weightedPrecision"
).evaluate(nb_text_ner_predictions)

recall_nb_text_ner = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="weightedRecall"
).evaluate(nb_text_ner_predictions)

f1_nb_text_ner = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="f1"
).evaluate(nb_text_ner_predictions)

# Crear tabla resultados
metrics_nb_text_ner = pd.DataFrame({
    "Modelo": ["Naive Bayes - Texto + NER"],
    "Accuracy": [builtins.round(accuracy_nb_text_ner, 4)],
    "Precision": [builtins.round(precision_nb_text_ner, 4)],
    "Recall": [builtins.round(recall_nb_text_ner, 4)],
    "F1-score": [builtins.round(f1_nb_text_ner, 4)]
})

# Mostrar tabla
display(metrics_nb_text_ner)

Unnamed: 0,Modelo,Accuracy,Precision,Recall,F1-score
0,Naive Bayes - Texto + NER,0.8326,0.8539,0.8326,0.8304


###**Modelo Clasificación Logistic Regression (Texto)**

En este apartado se construirá un modelo de regresión logística utilizando únicamente la información proveniente del texto, que permita al algoritmo identificar patrones relevantes asociados a cada clase. El modelo será entrenado empleando el conjunto de entrenamiento y posteriormente se generarán predicciones sobre el conjunto de prueba, quedando incorporado como uno de los enfoques base dentro de la etapa de modelamiento.

In [0]:
# Transformación de texto a features
hashing_tf_lr_text = HashingTF(
    inputCol="text_token_numeric",
    outputCol="features",
    numFeatures=2**18
)

# Modelo Logistic Regression
lr_text = LogisticRegression(
    featuresCol="features",
    labelCol="class",
    maxIter=50,
    regParam=0.0,
    elasticNetParam=0.0
)

# Pipeline
lr_pipeline_text = Pipeline(stages=[
    hashing_tf_lr_text,
    lr_text
])

# Entrenamiento
lr_model_text = lr_pipeline_text.fit(train_df)

# Predicciones
lr_predictions_text = lr_model_text.transform(test_df)

# Métricas
accuracy_lr_text = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="accuracy"
).evaluate(lr_predictions_text)

precision_lr_text = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="weightedPrecision"
).evaluate(lr_predictions_text)

recall_lr_text = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="weightedRecall"
).evaluate(lr_predictions_text)

f1_lr_text = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="f1"
).evaluate(lr_predictions_text)

# Crear tabla resultados
metrics_lr_text_df = pd.DataFrame([{
    "Modelo": "Logistic Regression - Texto",
    "Accuracy": builtins.round(accuracy_lr_text, 4),
    "Precision": builtins.round(precision_lr_text, 4),
    "Recall": builtins.round(recall_lr_text, 4),
    "F1-score": builtins.round(f1_lr_text, 4)
}])

# Mostrar tabla
display(metrics_lr_text_df)

Unnamed: 0,Modelo,Accuracy,Precision,Recall,F1-score
0,Logistic Regression - Texto,0.9177,0.9179,0.9177,0.9177


###**Modelo Clasificación Logistic Regression (Texto + NER)**

En este apartado se desarrollará un modelo de regresión logística que incorporará tanto la información textual como variables estructuradas derivadas del reconocimiento de entidades nombradas (NER). Esta integración permitirá enriquecer la representación de los datos, combinando el contenido semántico del texto con señales adicionales provenientes de entidades relevantes. El modelo será entrenado utilizando el conjunto de entrenamiento previamente definido y se aplicará posteriormente sobre el conjunto de prueba, quedando integrado dentro de la etapa de modelamiento como una alternativa que aprovecha múltiples fuentes de información.

In [0]:
# Transformación de texto a features
hashing_tf_lr_text_ner = HashingTF(
    inputCol="text_token_numeric",
    outputCol="text_features",
    numFeatures=2**18
)

# Ensamblar Texto + NER
ner_columns_lr = [
    "ent_person_ct",
    "ent_location_ct",
    "ent_med_ct",
    "ent_substance_ct",
    "ent_method_ct",
    "ent_qty_ct",
    "ent_actionrisk_ct"
]

assembler_lr_text_ner = VectorAssembler(
    inputCols=["text_features"] + ner_columns_lr,
    outputCol="features"
)

# Modelo Logistic Regression
lr_text_ner = LogisticRegression(
    featuresCol="features",
    labelCol="class",
    maxIter=50,
    regParam=0.0,
    elasticNetParam=0.0
)

# Pipeline
lr_pipeline_text_ner = Pipeline(stages=[
    hashing_tf_lr_text_ner,
    assembler_lr_text_ner,
    lr_text_ner
])

# Entrenamiento
lr_model_text_ner = lr_pipeline_text_ner.fit(train_df)

# Predicciones
lr_predictions_text_ner = lr_model_text_ner.transform(test_df)

# Métricas
accuracy_lr_text_ner = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="accuracy"
).evaluate(lr_predictions_text_ner)

precision_lr_text_ner = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="weightedPrecision"
).evaluate(lr_predictions_text_ner)

recall_lr_text_ner = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="weightedRecall"
).evaluate(lr_predictions_text_ner)

f1_lr_text_ner = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="f1"
).evaluate(lr_predictions_text_ner)

# Crear tabla resultados
metrics_lr_text_ner_df = pd.DataFrame([{
    "Modelo": "Logistic Regression - Texto + NER",
    "Accuracy": builtins.round(accuracy_lr_text_ner, 4),
    "Precision": builtins.round(precision_lr_text_ner, 4),
    "Recall": builtins.round(recall_lr_text_ner, 4),
    "F1-score": builtins.round(f1_lr_text_ner, 4)
}])

# Mostrar tabla
display(metrics_lr_text_ner_df)

Unnamed: 0,Modelo,Accuracy,Precision,Recall,F1-score
0,Logistic Regression - Texto + NER,0.9184,0.9185,0.9184,0.9184


###**Modelo Clasificación Support Vector Machine (Texto)**

En este apartado se construirá un modelo de Support Vector Machine (SVM) utilizando únicamente la información textual del conjunto de datos, que permitirá al modelo identificar fronteras de separación entre las clases. El modelo será entrenado con el conjunto de entrenamiento y posteriormente se aplicará sobre el conjunto de prueba, incorporándose como una alternativa robusta dentro de la etapa de modelamiento para la detección de patrones en datos textuales.

In [0]:
# Transformación de texto a features
hashing_tf_svm_text = HashingTF(
    inputCol="text_token_numeric",
    outputCol="features",
    numFeatures=2**18
)

# Modelo SVM Lineal
svm_text_model = LinearSVC(
    featuresCol="features",
    labelCol="class",
    maxIter=100,
    regParam=0.1
)

# Pipeline
svm_text_pipeline = Pipeline(stages=[
    hashing_tf_svm_text,
    svm_text_model
])

# Entrenamiento
svm_text_trained_model = svm_text_pipeline.fit(train_df)

# Predicciones
svm_text_predictions = svm_text_trained_model.transform(test_df)

# Métricas
svm_text_accuracy = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="accuracy"
).evaluate(svm_text_predictions)

svm_text_precision = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="weightedPrecision"
).evaluate(svm_text_predictions)

svm_text_recall = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="weightedRecall"
).evaluate(svm_text_predictions)

svm_text_f1 = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="f1"
).evaluate(svm_text_predictions)

# Crear tabla resultados
svm_text_metrics_df = pd.DataFrame({
    "Modelo": ["Support Vector Machine - Texto"],
    "Accuracy": [builtins.round(svm_text_accuracy, 4)],
    "Precision": [builtins.round(svm_text_precision, 4)],
    "Recall": [builtins.round(svm_text_recall, 4)],
    "F1-score": [builtins.round(svm_text_f1, 4)]
})

# Mostrar tabla
display(svm_text_metrics_df)

Unnamed: 0,Modelo,Accuracy,Precision,Recall,F1-score
0,Support Vector Machine - Texto,0.9276,0.9305,0.9276,0.9275


###**Modelo Clasificación Support Vector Machine (Texto + NER)**

En este apartado se desarrollará un modelo de Support Vector Machine (SVM) que integrará la representación numérica del texto con variables estructuradas derivadas del reconocimiento de entidades nombradas (NER). Esta combinación permitirá enriquecer la información disponible para el modelo, incorporando tanto el contenido semántico del texto como señales adicionales relacionadas con entidades relevantes. El modelo será entrenado utilizando el conjunto de entrenamiento y posteriormente se aplicará sobre el conjunto de prueba, quedando integrado dentro de la etapa de modelamiento como una aproximación que combina características textuales y estructuradas.

In [0]:
# Transformación de texto a features
hashing_tf_svm_ner = HashingTF(
    inputCol="text_token_numeric",
    outputCol="text_features",
    numFeatures=2**18
)

# Ensamblar Texto + NER
ner_columns = [
    "ent_person_ct",
    "ent_location_ct",
    "ent_med_ct",
    "ent_substance_ct",
    "ent_method_ct",
    "ent_qty_ct",
    "ent_actionrisk_ct"
]

assembler_svm_ner = VectorAssembler(
    inputCols=["text_features"] + ner_columns,
    outputCol="features"
)

# Modelo SVM Lineal
svm_ner_model = LinearSVC(
    featuresCol="features",
    labelCol="class",
    maxIter=100,
    regParam=0.1
)

# Pipeline
svm_ner_pipeline = Pipeline(stages=[
    hashing_tf_svm_ner,
    assembler_svm_ner,
    svm_ner_model
])

# Entrenamiento
svm_ner_trained_model = svm_ner_pipeline.fit(train_df)

# Predicciones
svm_ner_predictions = svm_ner_trained_model.transform(test_df)

# Métricas
svm_ner_accuracy = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="accuracy"
).evaluate(svm_ner_predictions)

svm_ner_precision = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="weightedPrecision"
).evaluate(svm_ner_predictions)

svm_ner_recall = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="weightedRecall"
).evaluate(svm_ner_predictions)

svm_ner_f1 = MulticlassClassificationEvaluator(
    labelCol="class",
    predictionCol="prediction",
    metricName="f1"
).evaluate(svm_ner_predictions)

# Crear tabla resultados
svm_ner_metrics_df = pd.DataFrame({
    "Modelo": ["Support Vector Machine (SVM) - Texto + NER"],
    "Accuracy": [builtins.round(svm_ner_accuracy, 4)],
    "Precision": [builtins.round(svm_ner_precision, 4)],
    "Recall": [builtins.round(svm_ner_recall, 4)],
    "F1-score": [builtins.round(svm_ner_f1, 4)]
})

# Mostrar tabla
display(svm_ner_metrics_df)

Unnamed: 0,Modelo,Accuracy,Precision,Recall,F1-score
0,Support Vector Machine (SVM) - Texto + NER,0.9301,0.9328,0.9301,0.93


###**Modelo Deep Learning Convolutional Neural Network (Texto)**

En este apartado se construirá un modelo de red neuronal convolucional (CNN) utilizando únicamente la información textual que ha sido convertido a valores númericos. El texto será representado mediante secuencias numéricas que permitirán al modelo capturar patrones locales y combinaciones relevantes de palabras dentro de los mensajes. El modelo será entrenado con el conjunto de entrenamiento y posteriormente se aplicará sobre el conjunto de prueba, incorporándose dentro de la etapa de modelamiento como un enfoque de deep learning orientado a la detección automática de patrones complejos en texto.

In [0]:
# Convertir Spark DataFrames a Pandas / NumPy
train_pd_cnn_text = train_df.select(
    "text_token_numeric", "class"
).toPandas()

test_pd_cnn_text = test_df.select(
    "text_token_numeric", "class"
).toPandas()

# Datos de entrenamiento convertidos a arrays NumPy
x_train_cnn_text = np.array(train_pd_cnn_text["text_token_numeric"].tolist())
y_train_cnn_text = train_pd_cnn_text["class"].values

# Datos de prueba convertidos a arrays NumPy
x_test_cnn_text = np.array(test_pd_cnn_text["text_token_numeric"].tolist())
y_test_cnn_text = test_pd_cnn_text["class"].values

# Parámetros del modelo
seq_len = x_train_cnn_text.shape[1]
vocab_size = 2**18
embed_dim = 128

# Definición del modelo CNN
cnn_text_model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embed_dim,
        input_length=seq_len,
        mask_zero=True
    ),
    Conv1D(
        filters=128,
        kernel_size=5,
        activation="relu"
    ),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid")
])

# Compilación del modelo CNN
cnn_text_model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# Entrenamiento
cnn_text_history = cnn_text_model.fit(
    x_train_cnn_text,
    y_train_cnn_text,
    epochs=3,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predicciones
y_pred_prob_cnn_text = cnn_text_model.predict(x_test_cnn_text)
y_pred_cnn_text = (y_pred_prob_cnn_text > 0.5).astype(int).ravel()

# Métricas
cnn_text_accuracy  = accuracy_score(y_test_cnn_text, y_pred_cnn_text)
cnn_text_precision = precision_score(y_test_cnn_text, y_pred_cnn_text)
cnn_text_recall    = recall_score(y_test_cnn_text, y_pred_cnn_text)
cnn_text_f1        = f1_score(y_test_cnn_text, y_pred_cnn_text)

# Crear tabla resultados
cnn_text_metrics_df = pd.DataFrame({
    "Modelo": ["Convolutional Neural Network (CNN) - Texto"],
    "Accuracy": [builtins.round(cnn_text_accuracy, 4)],
    "Precision": [builtins.round(cnn_text_precision, 4)],
    "Recall": [builtins.round(cnn_text_recall, 4)],
    "F1-score": [builtins.round(cnn_text_f1, 4)]
})

# Mostrar tabla
display(cnn_text_metrics_df)

Epoch 1/3


2025-12-14 23:55:11.807642: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 349880832 exceeds 10% of free system memory.
2025-12-14 23:55:13.042195: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 134217728 exceeds 10% of free system memory.
2025-12-14 23:55:13.079166: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 134217728 exceeds 10% of free system memory.
2025-12-14 23:55:13.259125: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 134217728 exceeds 10% of free system memory.
2025-12-14 23:55:13.339534: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:84] Allocation of 134217728 exceeds 10% of free system memory.


[1m   1/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:57:56[0m 3s/step - accuracy: 0.4375 - loss: 0.6989[1m   2/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m14:03[0m 356ms/step - accuracy: 0.4531 - loss: 0.6974[1m   3/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m13:21[0m 338ms/step - accuracy: 0.4722 - loss: 0.6965[1m   4/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m12:42[0m 322ms/step - accuracy: 0.4889 - loss: 0.6954[1m   5/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m12:34[0m 319ms/step - accuracy: 0.5030 - loss: 0.6942[1m   6/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m12:22[0m

[1m2292/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m23s[0m 294ms/step - accuracy: 0.9664 - loss: 0.0882[1m2293/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m23s[0m 294ms/step - accuracy: 0.9664 - loss: 0.0882[1m2294/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m23s[0m 294ms/step - accuracy: 0.9664 - loss: 0.0882[1m2295/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m22s[0m 294ms/step - accuracy: 0.9664 - loss: 0.0882[1m2296/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m22s[0m 294ms/step - accuracy: 0.9664 - loss: 0.0882

Unnamed: 0,Modelo,Accuracy,Precision,Recall,F1-score
0,Convolutional Neural Network (CNN) - Texto,0.9465,0.9395,0.9559,0.9476


###**Modelo Deep Learning Long Short-Term Memory (Texto)**

En este apartado se desarrollará un modelo de red neuronal recurrente del tipo Long Short-Term Memory (LSTM) utilizando exclusivamente la información textual. Este enfoque permitirá capturar dependencias secuenciales y relaciones de contexto a lo largo del texto, aspectos relevantes en mensajes donde el significado depende del orden de las palabras. El modelo será entrenado empleando el conjunto de entrenamiento y posteriormente se aplicará sobre el conjunto de prueba, integrándose dentro de la etapa de modelamiento como una alternativa de deep learning orientada al análisis secuencial del lenguaje.

In [0]:
# Convertir Spark DataFrames a Pandas / NumPy
train_pd_lstm_text = train_df.select(
    "text_token_numeric", "class"
).toPandas()

test_pd_lstm_text = test_df.select(
    "text_token_numeric", "class"
).toPandas()

# Datos de entrenamiento convertidos a arrays NumPy
x_train_lstm_text = np.array(train_pd_lstm_text["text_token_numeric"].tolist())
y_train_lstm_text = train_pd_lstm_text["class"].values

# Datos de prueba convertidos a arrays NumPy
x_test_lstm_text = np.array(test_pd_lstm_text["text_token_numeric"].tolist())
y_test_lstm_text = test_pd_lstm_text["class"].values

# Parámetros del modelo
seq_len = x_train_lstm_text.shape[1]
vocab_size = 2**18
embed_dim = 128

# Definición del modelo LSTM
lstm_text_model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embed_dim,
        mask_zero=True
    ),
    LSTM(
        units=128,
        return_sequences=False
    ),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid")
])

# Definición del modelo LSTM
lstm_text_model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# Entrenamiento
lstm_text_history = lstm_text_model.fit(
    x_train_lstm_text,
    y_train_lstm_text,
    epochs=3,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predicciones
y_pred_prob_lstm_text = lstm_text_model.predict(x_test_lstm_text)
y_pred_lstm_text = (y_pred_prob_lstm_text > 0.5).astype(int).ravel()

# Métricas
lstm_text_accuracy  = accuracy_score(y_test_lstm_text, y_pred_lstm_text)
lstm_text_precision = precision_score(y_test_lstm_text, y_pred_lstm_text)
lstm_text_recall    = recall_score(y_test_lstm_text, y_pred_lstm_text)
lstm_text_f1        = f1_score(y_test_lstm_text, y_pred_lstm_text)

# Crear tabla resultados
lstm_text_metrics_df = pd.DataFrame({
    "Modelo": ["Long Short-Term Memory (LSTM) - Texto"],
    "Accuracy": [builtins.round(lstm_text_accuracy, 4)],
    "Precision": [builtins.round(lstm_text_precision, 4)],
    "Recall": [builtins.round(lstm_text_recall, 4)],
    "F1-score": [builtins.round(lstm_text_f1, 4)]
})

# Mostrar tabla
display(lstm_text_metrics_df)

Epoch 1/3
[1m   1/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:27:35[0m 5s/step - accuracy: 0.5156 - loss: 0.6925[1m   2/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m24:31[0m 621ms/step - accuracy: 0.5234 - loss: 0.6927[1m   3/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m24:14[0m 614ms/step - accuracy: 0.5347 - loss: 0.6926[1m   4/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m22:42[0m 575ms/step - accuracy: 0.5387 - loss: 0.6926[1m   5/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m22:39[0m 574ms/step - accuracy: 0.5379 - loss: 0.6926[1m   6/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1

[1m2264/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m1:00[0m 554ms/step - accuracy: 0.9657 - loss: 0.0908[1m2265/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m59s[0m 554ms/step - accuracy: 0.9657 - loss: 0.0908 [1m2266/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m59s[0m 554ms/step - accuracy: 0.9657 - loss: 0.0908[1m2267/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m58s[0m 554ms/step - accuracy: 0.9657 - loss: 0.0908[1m2268/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m58s[0m 554ms/step - accuracy: 0.9657 - loss: 0.0908

Unnamed: 0,Modelo,Accuracy,Precision,Recall,F1-score
0,Long Short Term Memory (LSTM) - Texto,0.9516,0.9541,0.9501,0.9521


###**Modelo Deep Learning Gated Recurrent Unit (Texto)**

En este apartado se implementará un modelo de red neuronal recurrente del tipo Gated Recurrent Unit (GRU) utilizando únicamente la información textual. Este enfoque permitirá modelar dependencias secuenciales de manera eficiente, capturando el contexto del texto con una arquitectura más compacta frente a otras redes recurrentes. El modelo será entrenado con el conjunto de entrenamiento y posteriormente se aplicará sobre el conjunto de prueba, integrándose dentro de la etapa de modelamiento como una alternativa de deep learning enfocada en el procesamiento secuencial del lenguaje.

In [0]:
# Convertir Spark DataFrames a Pandas / NumPy
train_pd_gru_text = train_df.select(
    "text_token_numeric", "class"
).toPandas()

test_pd_gru_text = test_df.select(
    "text_token_numeric", "class"
).toPandas()

# Datos de entrenamiento convertidos a arrays NumPy
x_train_gru_text = np.array(train_pd_gru_text["text_token_numeric"].tolist())
y_train_gru_text = train_pd_gru_text["class"].values

# Datos de prueba convertidos a arrays NumPy
x_test_gru_text = np.array(test_pd_gru_text["text_token_numeric"].tolist())
y_test_gru_text = test_pd_gru_text["class"].values

# Parámetros del modelo
seq_len = x_train_gru_text.shape[1]
vocab_size = 2**18
embed_dim = 128

# Definición del modelo GRU
gru_text_model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embed_dim,
        mask_zero=True
    ),
    GRU(
        units=128,
        return_sequences=False
    ),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid")
])

# Definición del modelo GRU
gru_text_model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# Entrenamiento
gru_text_history = gru_text_model.fit(
    x_train_gru_text,
    y_train_gru_text,
    epochs=3,
    batch_size=64,
    validation_split=0.1,
    verbose=1
)

# Predicciones
y_pred_prob_gru_text = gru_text_model.predict(x_test_gru_text)
y_pred_gru_text = (y_pred_prob_gru_text > 0.5).astype(int).ravel()

# Métricas
gru_text_accuracy  = accuracy_score(y_test_gru_text, y_pred_gru_text)
gru_text_precision = precision_score(y_test_gru_text, y_pred_gru_text)
gru_text_recall    = recall_score(y_test_gru_text, y_pred_gru_text)
gru_text_f1        = f1_score(y_test_gru_text, y_pred_gru_text)

# Crear tabla resultados
gru_text_metrics_df = pd.DataFrame({
    "Modelo": ["Gated Recurrent Unit (GRU) - Texto"],
    "Accuracy": [builtins.round(gru_text_accuracy, 4)],
    "Precision": [builtins.round(gru_text_precision, 4)],
    "Recall": [builtins.round(gru_text_recall, 4)],
    "F1-score": [builtins.round(gru_text_f1, 4)]
})

# Mostrar tabla
display(gru_text_metrics_df)

Epoch 1/3
[1m   1/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:50:19[0m 4s/step - accuracy: 0.4844 - loss: 0.6942[1m   2/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m20:58[0m 531ms/step - accuracy: 0.5078 - loss: 0.6936[1m   3/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m22:12[0m 562ms/step - accuracy: 0.5191 - loss: 0.6933[1m   4/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m22:15[0m 564ms/step - accuracy: 0.5231 - loss: 0.6932[1m   5/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m22:25[0m 568ms/step - accuracy: 0.5254 - loss: 0.6930[1m   6/2373[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1

[1m2264/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m59s[0m 550ms/step - accuracy: 0.9668 - loss: 0.0875 [1m2265/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m59s[0m 550ms/step - accuracy: 0.9668 - loss: 0.0875[1m2266/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m58s[0m 550ms/step - accuracy: 0.9668 - loss: 0.0875[1m2267/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m58s[0m 550ms/step - accuracy: 0.9668 - loss: 0.0875[1m2268/2373[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m57s[0m 550ms/step - accuracy: 0.9668 - loss: 0.0875

Unnamed: 0,Modelo,Accuracy,Precision,Recall,F1-score
0,Gated Recurrent Unit (GRU) - Texto,0.9512,0.9481,0.956,0.952
