In [3]:
# =========================================
# 0. Librerías
# =========================================
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [5]:
# =========================================
# 1. Cargar y preparar datos
# =========================================
CSV_PATH = "/home/cesar/corpus-sintetico.csv"   # ajusta si lo cambias de lugar
MAX_VOCAB = 60_000
MAX_LEN   = 120

df = pd.read_csv(CSV_PATH).dropna(subset=["Review"])

# ----- codificadores -----
enc_pol  = LabelEncoder().fit(df["Polarity"])
enc_typ  = LabelEncoder().fit(df["Type"])
enc_town = LabelEncoder().fit(df["Town"])

y_pol   = tf.keras.utils.to_categorical(enc_pol.transform(df["Polarity"]), 5)
y_type  = tf.keras.utils.to_categorical(enc_typ.transform(df["Type"]), 3)
y_town  = tf.keras.utils.to_categorical(enc_town.transform(df["Town"]), 40)

train_idx, val_idx = train_test_split(
    np.arange(len(df)),
    test_size=0.1, random_state=42,
    stratify=enc_pol.transform(df["Polarity"])
)

# ----- tokenización -----
tok = Tokenizer(num_words=MAX_VOCAB, oov_token="[OOV]")
tok.fit_on_texts(df.loc[train_idx, "Review"])

def to_seq(texts):
    return pad_sequences(
        tok.texts_to_sequences(texts),
        maxlen=MAX_LEN,
        padding="post",      # ←--- relleno al final
        truncating="post"    #     y corte al final,
    )

X_train = to_seq(df.loc[train_idx, "Review"])
X_val   = to_seq(df.loc[val_idx, "Review"])

y_train = {
    "dense_pol":  y_pol [train_idx],
    "dense_type": y_type[train_idx],
    "dense_town": y_town[train_idx]
}
y_val = {
    "dense_pol":  y_pol [val_idx],
    "dense_type": y_type[val_idx],
    "dense_town": y_town[val_idx]
}


In [7]:
# =========================================
# 3. Modelo con Mecanismo de Atención + Bi-LSTM
# =========================================
EMB_DIM = 300
LSTM_UNITS = 128
ATTENTION_HEADS = 4

# ---- Capa de Atención Personalizada ----
class MultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, num_heads, head_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.head_dim = head_dim
        
    def build(self, input_shape):
        self.query = self.add_weight(
            shape=(input_shape[-1], self.num_heads * self.head_dim),
            initializer='glorot_uniform',
            name='query'
        )
        self.key = self.add_weight(
            shape=(input_shape[-1], self.num_heads * self.head_dim),
            initializer='glorot_uniform',
            name='key'
        )
        self.value = self.add_weight(
            shape=(input_shape[-1], self.num_heads * self.head_dim),
            initializer='glorot_uniform',
            name='value'
        )
        
    def call(self, inputs, mask=None):
        batch_size = tf.shape(inputs)[0]
        
        # Proyecciones
        Q = tf.matmul(inputs, self.query)
        K = tf.matmul(inputs, self.key)
        V = tf.matmul(inputs, self.value)
        
        # Reshape para múltiples cabezas
        Q = tf.reshape(Q, [batch_size, -1, self.num_heads, self.head_dim])
        K = tf.reshape(K, [batch_size, -1, self.num_heads, self.head_dim])
        V = tf.reshape(V, [batch_size, -1, self.num_heads, self.head_dim])
        
        # Atención escalada
        scores = tf.einsum('bqhd,bkhd->bhqk', Q, K) / tf.math.sqrt(tf.cast(self.head_dim, tf.float32))
        
        # Convertir el mask a float32 para evitar incompatibilidad de tipo
        if mask is not None:
            mask = tf.cast(mask, dtype=tf.float32)  # Convierte el mask a float32
            scores += (mask[:, None, None, :] * -1e9)
            
        attn_weights = tf.nn.softmax(scores, axis=-1)
        output = tf.einsum('bhqk,bkhd->bqhd', attn_weights, V)
        output = tf.reshape(output, [batch_size, -1, self.num_heads * self.head_dim])
        
        return output

# ---- Arquitectura Principal ----
inputs = tf.keras.layers.Input(shape=(MAX_LEN,), dtype="int32")
x = tf.keras.layers.Embedding(MAX_VOCAB, EMB_DIM, mask_zero=True)(inputs)

# Bi-LSTM
lstm_out = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(LSTM_UNITS, return_sequences=True)
)(x)

# Mecanismo de Atención
attn_out = MultiHeadSelfAttention(
    num_heads=ATTENTION_HEADS,
    head_dim=64,
    name="self_attention"
)(lstm_out)

# Pooling Jerárquico
avg_pool = tf.keras.layers.GlobalAveragePooling1D()(attn_out)
max_pool = tf.keras.layers.GlobalMaxPooling1D()(attn_out)
concat = tf.keras.layers.concatenate([avg_pool, max_pool])
x = tf.keras.layers.Dropout(0.4)(concat)

# Capas Específicas por Tarea
def build_task_branch(input_layer, units, num_classes, name):
    branch = tf.keras.layers.Dense(units, activation='relu')(input_layer)
    branch = tf.keras.layers.Dropout(0.2)(branch)
    return tf.keras.layers.Dense(num_classes, activation='softmax', name=name)(branch)

out_pol = build_task_branch(x, 64, 5, "dense_pol")
out_type = build_task_branch(x, 32, 3, "dense_type")
out_town = build_task_branch(x, 128, 40, "dense_town")

model = tf.keras.Model(inputs, [out_pol, out_type, out_town])

# Compilación
model.compile(
    optimizer=tf.keras.optimizers.Adam(2e-4),
    loss={
        "dense_pol": "categorical_crossentropy",
        "dense_type": "categorical_crossentropy",
        "dense_town": "categorical_crossentropy"
    },
    loss_weights={"dense_pol": 2.0, "dense_type": 1.0, "dense_town": 3.0},
    metrics=["accuracy"]
)

model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 120)]                0         []                            
                                                                                                  
 embedding_1 (Embedding)     (None, 120, 300)             1800000   ['input_2[0][0]']             
                                                          0                                       
                                                                                                  
 bidirectional_1 (Bidirecti  (None, 120, 256)             439296    ['embedding_1[0][0]']         
 onal)                                                                                            
                                                                                              

In [8]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

# Asumir que MAX_LEN y MAX_VOCAB están definidos previamente
# Asumir que X_train, X_test, y_train_pol, y_train_type, y_train_town están listos para ser usados.

# Ejemplo de datos (reemplázalo con tus propios datos)
# X_train = np.array(...)  # Secuencias de texto preprocesadas
# y_train_pol = np.array(...)  # Etiquetas de polaridad
# y_train_type = np.array(...)  # Etiquetas de tipo
# y_train_town = np.array(...)  # Etiquetas de locación

# Dividir los datos en entrenamiento y validación
X_train, X_val, y_train_pol, y_val_pol, y_train_type, y_val_type, y_train_town, y_val_town = train_test_split(
    X_train, y_train_pol, y_train_type, y_train_town, test_size=0.2, random_state=42
)

# Ajusta el tamaño del vocabulario y longitud máxima de la secuencia
MAX_LEN = 120  # La longitud de las secuencias
MAX_VOCAB = 10000  # Tamaño del vocabulario

# Definir el modelo nuevamente si no está en memoria
model = tf.keras.Model(inputs, [out_pol, out_type, out_town])

# Compilación del modelo (si no se ha hecho antes)
model.compile(
    optimizer=tf.keras.optimizers.Adam(2e-4),
    loss={
        "dense_pol": "categorical_crossentropy",
        "dense_type": "categorical_crossentropy",
        "dense_town": "categorical_crossentropy"
    },
    loss_weights={"dense_pol": 2.0, "dense_type": 1.0, "dense_town": 3.0},
    metrics=["accuracy"]
)

# Entrenamiento del modelo
history = model.fit(
    X_train, 
    {"dense_pol": y_train_pol, "dense_type": y_train_type, "dense_town": y_train_town},  # Salidas
    epochs=10,  # Número de épocas
    batch_size=64,  # Tamaño del batch
    validation_data=(X_val, {"dense_pol": y_val_pol, "dense_type": y_val_type, "dense_town": y_val_town}),
    verbose=1
)

# Guardar el modelo entrenado
model.save("modelo_atencion_bi_lstm.h5")

# Evaluar el modelo en el conjunto de validación
val_loss, val_accuracy = model.evaluate(
    X_val, 
    {"dense_pol": y_val_pol, "dense_type": y_val_type, "dense_town": y_val_town},
    verbose=1
)

print(f'Pérdida en validación: {val_loss}')
print(f'Precisión en validación: {val_accuracy}')



NameError: name 'y_train_pol' is not defined

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# =========================================
# 1. Cargar y preparar datos
# =========================================
CSV_PATH = "/home/cesar/Descargas/full_dataset_combined.csv"   # ajusta si lo cambias de lugar
MAX_VOCAB = 60_000
MAX_LEN   = 120

# df = (
#    pd.read_csv("/home/cesar/Descargas/full_dataset_combined.csv")
#      .rename(columns={"clean": "Review"})
#)

# Cargar el dataframe
df = pd.read_csv(CSV_PATH).dropna(subset=["clean"]) # Review o clean

# ----- codificadores -----
enc_pol  = LabelEncoder().fit(df["Polarity"])
enc_typ  = LabelEncoder().fit(df["Type"])
enc_town = LabelEncoder().fit(df["Town"])

# Codificación de las etiquetas
y_pol   = tf.keras.utils.to_categorical(enc_pol.transform(df["Polarity"]), 5)
y_type  = tf.keras.utils.to_categorical(enc_typ.transform(df["Type"]), 3)
y_town  = tf.keras.utils.to_categorical(enc_town.transform(df["Town"]), 40)

# División de datos en entrenamiento y validación
train_idx, val_idx = train_test_split(
    np.arange(len(df)),
    test_size=0.1, random_state=42,
    stratify=enc_pol.transform(df["Polarity"])
)

# ----- tokenización -----
tok = Tokenizer(num_words=MAX_VOCAB, oov_token="[OOV]")
tok.fit_on_texts(df.loc[train_idx, "clean"])

# Función para convertir textos a secuencias
def to_seq(texts):
    return pad_sequences(
        tok.texts_to_sequences(texts),
        maxlen=MAX_LEN,
        padding="post",      # ←--- relleno al final
        truncating="post"    #     y corte al final
    )

# Convertir los textos a secuencias
X_train = to_seq(df.loc[train_idx, "clean"])
X_val   = to_seq(df.loc[val_idx, "clean"])

# Preparamos las etiquetas para el entrenamiento
y_train = {
    "dense_pol":  y_pol[train_idx],
    "dense_type": y_type[train_idx],
    "dense_town": y_town[train_idx]
}

y_val = {
    "dense_pol":  y_pol[val_idx],
    "dense_type": y_type[val_idx],
    "dense_town": y_town[val_idx]
}

# =========================================
# 2. Modelo con Mecanismo de Atención + Bi-LSTM
# =========================================
EMB_DIM = 300
LSTM_UNITS = 128
ATTENTION_HEADS = 4

# ---- Capa de Atención Personalizada ----
class MultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, num_heads, head_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.head_dim = head_dim
        
    def build(self, input_shape):
        self.query = self.add_weight(
            shape=(input_shape[-1], self.num_heads * self.head_dim),
            initializer='glorot_uniform',
            name='query'
        )
        self.key = self.add_weight(
            shape=(input_shape[-1], self.num_heads * self.head_dim),
            initializer='glorot_uniform',
            name='key'
        )
        self.value = self.add_weight(
            shape=(input_shape[-1], self.num_heads * self.head_dim),
            initializer='glorot_uniform',
            name='value'
        )
        
    def call(self, inputs, mask=None):
        batch_size = tf.shape(inputs)[0]
        
        # Proyecciones
        Q = tf.matmul(inputs, self.query)
        K = tf.matmul(inputs, self.key)
        V = tf.matmul(inputs, self.value)
        
        # Reshape para múltiples cabezas
        Q = tf.reshape(Q, [batch_size, -1, self.num_heads, self.head_dim])
        K = tf.reshape(K, [batch_size, -1, self.num_heads, self.head_dim])
        V = tf.reshape(V, [batch_size, -1, self.num_heads, self.head_dim])
        
        # Atención escalada
        scores = tf.einsum('bqhd,bkhd->bhqk', Q, K) / tf.math.sqrt(tf.cast(self.head_dim, tf.float32))
        
        # Convertir el mask a float32 para evitar incompatibilidad de tipo
        if mask is not None:
            mask = tf.cast(mask, dtype=tf.float32)  # Convierte el mask a float32
            scores += (mask[:, None, None, :] * -1e9)
            
        attn_weights = tf.nn.softmax(scores, axis=-1)
        output = tf.einsum('bhqk,bkhd->bqhd', attn_weights, V)
        output = tf.reshape(output, [batch_size, -1, self.num_heads * self.head_dim])
        
        return output

# ---- Arquitectura Principal ----
inputs = tf.keras.layers.Input(shape=(MAX_LEN,), dtype="int32")
x = tf.keras.layers.Embedding(MAX_VOCAB, EMB_DIM, mask_zero=True)(inputs)

# Bi-LSTM
lstm_out = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(LSTM_UNITS, return_sequences=True)
)(x)

# Mecanismo de Atención
attn_out = MultiHeadSelfAttention(
    num_heads=ATTENTION_HEADS,
    head_dim=64,
    name="self_attention"
)(lstm_out)

# Pooling Jerárquico
avg_pool = tf.keras.layers.GlobalAveragePooling1D()(attn_out)
max_pool = tf.keras.layers.GlobalMaxPooling1D()(attn_out)
concat = tf.keras.layers.concatenate([avg_pool, max_pool])
x = tf.keras.layers.Dropout(0.4)(concat)

# Capas Específicas por Tarea
def build_task_branch(input_layer, units, num_classes, name):
    branch = tf.keras.layers.Dense(units, activation='relu')(input_layer)
    branch = tf.keras.layers.Dropout(0.2)(branch)
    return tf.keras.layers.Dense(num_classes, activation='softmax', name=name)(branch)

out_pol = build_task_branch(x, 64, 5, "dense_pol")
out_type = build_task_branch(x, 32, 3, "dense_type")
out_town = build_task_branch(x, 128, 40, "dense_town")

model = tf.keras.Model(inputs, [out_pol, out_type, out_town])

# Compilación del modelo
model.compile(
    optimizer=tf.keras.optimizers.Adam(2e-4),
    loss={
        "dense_pol": "categorical_crossentropy",
        "dense_type": "categorical_crossentropy",
        "dense_town": "categorical_crossentropy"
    },
    loss_weights={"dense_pol": 2.0, "dense_type": 1.0, "dense_town": 3.0},
    metrics=["accuracy"]
)

# Resumen del modelo
model.summary()

# =========================================
# 3. Entrenamiento del modelo
# =========================================
history = model.fit(
    X_train, 
    y_train,  # Salidas de entrenamiento
    epochs=10,  # Número de épocas
    batch_size=64,  # Tamaño del batch
    validation_data=(X_val, y_val),  # Conjunto de validación
    verbose=1
)

# Guardar el modelo entrenado
model.save("modelo_atencion_bi_lstm.h5")

# Evaluar el modelo en el conjunto de validación
val_loss, val_accuracy = model.evaluate(
    X_val, 
    y_val,
    verbose=1
)

print(f'Pérdida en validación: {val_loss}')
print(f'Precisión en validación: {val_accuracy}')


2025-05-07 18:37:14.507308: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-07 18:37:14.507381: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-07 18:37:14.509123: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-07 18:37:14.517491: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-05-07 18:38:05.907665: I exter

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 120)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 120, 300)             1800000   ['input_1[0][0]']             
                                                          0                                       
                                                                                                  
 bidirectional (Bidirection  (None, 120, 256)             439296    ['embedding[0][0]']           
 al)                                                                                              
                                                                                              

2025-05-07 18:38:08.299374: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 262677600 exceeds 10% of free system memory.
2025-05-07 18:38:08.617909: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 87559200 exceeds 10% of free system memory.


Epoch 1/10


2025-05-07 18:38:08.752649: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 87559200 exceeds 10% of free system memory.
2025-05-07 18:38:14.272183: W tensorflow/core/common_runtime/type_inference.cc:339] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	for Tuple type infernce function 0
	while inferring type of node 'cond_38/output/_24'
2025-05-07 18:38:14.950853: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
2025-05-07 18:38:15.854270: I external/local_xla/xla/service/



2025-05-07 18:45:25.949534: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 29186880 exceeds 10% of free system memory.


Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 

In [10]:
from sklearn.metrics import f1_score

# Realiza las predicciones del modelo en el conjunto de validación
y_pred = model.predict(X_val)

# Para cada tarea, se realiza el cálculo del F1-score
y_pred_pol = np.argmax(y_pred[0], axis=-1)  # Predicciones de polaridad
y_pred_type = np.argmax(y_pred[1], axis=-1)  # Predicciones de tipo
y_pred_town = np.argmax(y_pred[2], axis=-1)  # Predicciones de localidad

# Las etiquetas verdaderas
y_true_pol = np.argmax(y_val["dense_pol"], axis=-1)
y_true_type = np.argmax(y_val["dense_type"], axis=-1)
y_true_town = np.argmax(y_val["dense_town"], axis=-1)

# Cálculo del F1-score para cada tarea (Macro F1 por tarea)
f1_pol = f1_score(y_true_pol, y_pred_pol, average='macro')
f1_type = f1_score(y_true_type, y_pred_type, average='macro')
f1_town = f1_score(y_true_town, y_pred_town, average='macro')

# Cálculo del Macro F1 general (promedio de los tres F1s)
macro_f1 = (f1_pol + f1_type + f1_town) / 3

# Imprimir los resultados
print(f"F1 Polarity (Macro): {f1_pol}")
print(f"F1 Type (Macro): {f1_type}")
print(f"F1 Town (Macro): {f1_town}")
print(f"Macro F1: {macro_f1}")


F1 Polarity (Macro): 0.16363778986480731
F1 Type (Macro): 0.21243740256950247
F1 Town (Macro): 0.016172297907785307
Macro F1: 0.130749163447365


In [11]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Realiza las predicciones del modelo en el conjunto de validación
y_pred = model.predict(X_val)

# Predicciones de polaridad (usando np.argmax para obtener la clase con mayor probabilidad)
y_pred_pol = np.argmax(y_pred[0], axis=-1)

# Etiquetas verdaderas de polaridad
y_true_pol = np.argmax(y_val["dense_pol"], axis=-1)

# Genera la matriz de confusión para la polaridad
conf_matrix = confusion_matrix(y_true_pol, y_pred_pol)

# Crear un heatmap con seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=enc_pol.classes_, yticklabels=enc_pol.classes_)
plt.xlabel("Predicción")
plt.ylabel("Real")
plt.title("Matriz de Confusión - Polaridad")
plt.show()



ModuleNotFoundError: No module named 'seaborn'

In [13]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# =========================================
# 1. Cargar y preparar datos
# =========================================
CSV_PATH = "/home/cesar/corpus-sintetico.csv"  # Ajusta el archivo CSV
MAX_LEN = 120  # Longitud máxima de los textos

# Cargar el dataset
df = pd.read_csv(CSV_PATH)

# Eliminar filas con valores nulos en la columna "Review"
df = df.dropna(subset=["Review"])

# Asegurarse de que todos los valores en la columna 'Review' sean cadenas de texto
df["Review"] = df["Review"].astype(str)

# ----- codificadores -----
enc_pol = LabelEncoder().fit(df["Polarity"])
enc_typ = LabelEncoder().fit(df["Type"])
enc_town = LabelEncoder().fit(df["Town"])

y_pol = enc_pol.transform(df["Polarity"])
y_type = enc_typ.transform(df["Type"])
y_town = enc_town.transform(df["Town"])

# Dividir en conjuntos de entrenamiento y validación
train_idx, val_idx = train_test_split(
    np.arange(len(df)),
    test_size=0.1, random_state=42,
    stratify=y_pol
)

# =========================================
# 2. Cargar el tokenizador y el modelo preentrenado BETO
# =========================================
model_name = "dccuchile/bert-base-spanish-wwm-cased"  # Nombre del modelo BETO
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)  # Ajusta el número de etiquetas según tus datos

# =========================================
# 3. Preprocesar los datos (tokenización y padding)
# =========================================
def encode_texts(texts):
    # Tokenizar las secuencias de texto
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=MAX_LEN, return_tensors="pt")
    return encodings

# Preprocesar las secuencias de entrenamiento y validación
X_train_enc = encode_texts(df.loc[train_idx, "Review"])
X_val_enc = encode_texts(df.loc[val_idx, "Review"])

# Crear el dataset de PyTorch
train_dataset = TensorDataset(
    X_train_enc["input_ids"].squeeze(),  # Eliminar las dimensiones extra
    X_train_enc["attention_mask"].squeeze(),
    torch.tensor(y_pol[train_idx])
)

val_dataset = TensorDataset(
    X_val_enc["input_ids"].squeeze(),
    X_val_enc["attention_mask"].squeeze(),
    torch.tensor(y_pol[val_idx])
)

# Crear los DataLoader para el entrenamiento y validación
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/dccuchile/bert-base-spanish-wwm-cased/e131a95091c777bbd45250fb647fec415010cb8cd1ad6e1d59babeb82a0be360?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1746644344&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NjY0NDM0NH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9kY2N1Y2hpbGUvYmVydC1iYXNlLXNwYW5pc2gtd3dtLWNhc2VkL2UxMzFhOTUwOTFjNzc3YmJkNDUyNTBmYjY0N2ZlYzQxNTAxMGNiOGNkMWFkNmUxZDU5YmFiZWI4MmEwYmUzNjA%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=UN5JYqio%7E3axV8VsrS4XYN9wfiM%7E-sexg4hImyuscKmkNFAWlvVClaCs%7EPwhnJVL%7ELGC-Efzsftqqkqleyy7AWBDzL8yOLbRcrvHKvOSzYx7sqrovjyezKQdxZjN2ixigGapSwDtR7KwYx90Qu7qjpturE3rO%7ElmP2KSjs6YPIp23kdQYvM7XjGe8XSHsJuL0GwY7MLPYpvkO7lLCdT7D1aWjOgzX62LhGVUjKuPNpDRQhZhvE-kVfjW5oeJh5bRkzw4lx0adUf

OSError: dccuchile/bert-base-spanish-wwm-cased does not appear to have a file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt or flax_model.msgpack.

In [1]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.data import Dataset
import numpy as np

# =========================================
# 1. Cargar y preparar datos
# =========================================
CSV_PATH = "/home/cesar/corpus-sintetico.csv"  # Ajusta el archivo CSV
MAX_LEN = 120  # Longitud máxima de los textos

# Cargar el dataset
df = pd.read_csv(CSV_PATH)

# Eliminar filas con valores nulos en la columna "Review"
df = df.dropna(subset=["Review"])

# Asegurarse de que todos los valores en la columna 'Review' sean cadenas de texto
df["Review"] = df["Review"].astype(str)

# ----- codificadores -----
enc_pol = LabelEncoder().fit(df["Polarity"])
enc_typ = LabelEncoder().fit(df["Type"])
enc_town = LabelEncoder().fit(df["Town"])

y_pol = enc_pol.transform(df["Polarity"])
y_type = enc_typ.transform(df["Type"])
y_town = enc_town.transform(df["Town"])

# Dividir en conjuntos de entrenamiento y validación
train_idx, val_idx = train_test_split(
    np.arange(len(df)),
    test_size=0.1, random_state=42,
    stratify=y_pol
)

# =========================================
# 2. Cargar el tokenizador y el modelo preentrenado BETO
# =========================================
model_name = "dccuchile/bert-base-spanish-wwm-cased"  # Nombre del modelo BETO
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=False)
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=5)

# =========================================
# 3. Preprocesar los datos (tokenización y padding)
# =========================================
def encode_texts(texts):
    # Asegurarse de que 'texts' sea una lista de cadenas
    if isinstance(texts, pd.Series):  # Si es una Serie de pandas, convertirla a lista
        texts = texts.tolist()
    
    # Tokenizar las secuencias de texto
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=MAX_LEN, return_tensors="tf")
    return encodings

# Preprocesar las secuencias de entrenamiento y validación
X_train_enc = encode_texts(df.loc[train_idx, "Review"])  # Convierte a lista si es necesario
X_val_enc = encode_texts(df.loc[val_idx, "Review"])  # Convierte a lista si es necesario


# Convertir a tensores de TensorFlow
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_train_enc),
    tf.convert_to_tensor(y_pol[train_idx], dtype=tf.int32)
))

val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(X_val_enc),
    tf.convert_to_tensor(y_pol[val_idx], dtype=tf.int32)
))

# Configurar los datasets para la eficiencia
train_dataset = train_dataset.batch(32).shuffle(100)
val_dataset = val_dataset.batch(32)

# =========================================
# 4. Entrenamiento del modelo
# =========================================
optimizer = Adam(learning_rate=2e-5)

# Entrenar el modelo
epochs = 3
for epoch in range(epochs):
    model.fit(train_dataset, validation_data=val_dataset, epochs=1)

    print(f"Epoch {epoch + 1}/{epochs} completed.")


2025-05-07 12:28:07.296453: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-07 12:28:07.296493: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-07 12:28:07.320185: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-07 12:28:07.357784: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-05-07 12:28:19.422272: I exter

RuntimeError: You must compile your model before training/testing. Use `model.compile(optimizer, loss)`.