In [1]:
# Traductor de inglés a español
# Alejandro Pardo / Michael Lisker
#INSTALACIONES NECESARIAS

!pip install kagglehub --quiet

# IMPORTACIONES
import os
import re
import string
import random
import kagglehub
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization, MultiHeadAttention, Dense, LayerNormalization


In [2]:
# DOWNLOAD THE DATASET FROM KAGGLEHUB
path = kagglehub.dataset_download("tejasurya/eng-spanish")
print("Archivos disponibles:", os.listdir(path))

# READ THE .txt FILE
# The whole file is read at once and split into one example per line.
archivo = os.path.join(path, "spa.txt")
with open(archivo, encoding="utf-8") as f:
    ejemplos = f.read().split("\n")

# BUILD THE (ENGLISH, SPANISH) PAIRS
# Columns are tab-separated: the first is used as the English source and the
# second as the Spanish target. Lines with fewer than two columns (for
# example a trailing empty line) are skipped. The target is wrapped in
# "[start]" / "[end]" markers.
dataset = []
for linea in ejemplos:
    partes = linea.split("\t")
    if len(partes) >= 2:
        ingles = partes[0]
        espanol = "[start] " + partes[1] + " [end]"
        dataset.append((ingles, espanol))

# SPLIT THE DATASET (70% train / 15% validation / 15% test)
# A dedicated, seeded generator makes the split reproducible across kernel
# restarts without touching the global `random` state.
random.Random(42).shuffle(dataset)
val_split = int(0.15 * len(dataset))
# Explicit non-negative boundaries: negative-index slicing (`[:-2 * val_split]`,
# `[-val_split:]`) degenerates when val_split == 0, leaving the training set
# empty and putting every pair in the test set.
n_train = len(dataset) - 2 * val_split
train_pairs = dataset[:n_train]
val_pairs = dataset[n_train:n_train + val_split]
test_pairs = dataset[n_train + val_split:]

Archivos disponibles: ['spa-eng', 'spa.txt']


In [3]:
# CLEANING AND VECTORIZATION
# Characters stripped from the target text: ASCII punctuation plus the Spanish
# inverted marks "¿" and "¡". Square brackets are kept so that the "[start]"
# and "[end]" markers survive standardization.
caracteres_a_eliminar = string.punctuation + "¿" + "¡"
caracteres_a_eliminar = caracteres_a_eliminar.replace("[", "").replace("]", "")

def estandarizacion(input_string):
    """Lowercase a string tensor and remove every character in `caracteres_a_eliminar`.

    Args:
        input_string: string tensor to standardize.

    Returns:
        String tensor with the same shape, lowercased and without punctuation.
    """
    # encoding="utf-8" is required for non-ASCII capitals (e.g. "É", "Ñ") to be
    # lowercased; the default only handles ASCII letters.
    lowercase = tf.strings.lower(input_string, encoding="utf-8")
    return tf.strings.regex_replace(lowercase, f"[{re.escape(caracteres_a_eliminar)}]", "")

# Vocabulary cap and fixed sequence length shared by both vectorizers.
vocab_size = 15000
sequence_length = 20

# Source-side vectorizer: keeps the layer's default standardization.
vectorizacion_entrada = TextVectorization(
    output_mode="int",
    max_tokens=vocab_size,
    output_sequence_length=sequence_length,
)

# Target-side vectorizer: one extra position so the sequence can later be
# sliced into decoder input ([:-1]) and shifted target ([1:]).
vectorizacion_salida = TextVectorization(
    output_mode="int",
    max_tokens=vocab_size,
    output_sequence_length=sequence_length + 1,
    standardize=estandarizacion,
)

# TEXT DATA
# Separate the source and target sentences of the train / validation pairs.
train_entrada_texts = [ingles for ingles, _ in train_pairs]
train_salida_texts = [espanol for _, espanol in train_pairs]
val_entrada_texts = [ingles for ingles, _ in val_pairs]
val_salida_texts = [espanol for _, espanol in val_pairs]

# FIT THE VECTORIZERS
# Vocabularies are learned from the training split only.
vectorizacion_entrada.adapt(train_entrada_texts)
vectorizacion_salida.adapt(train_salida_texts)


In [4]:
# HELPERS TO FORMAT THE DATA
def formato(inputs, targets):
    """Vectorize a batch of sentence pairs into model inputs and targets.

    Args:
        inputs: batch of source sentences, passed to `vectorizacion_entrada`.
        targets: batch of target sentences, passed to `vectorizacion_salida`.

    Returns:
        A `(features, labels)` tuple. `features` is a dict with keys
        "entrada" (vectorized source) and "salida" (vectorized target without
        its last position); `labels` is the vectorized target without its
        first position, i.e. shifted one step ahead of "salida".
    """
    ids_entrada = vectorizacion_entrada(inputs)
    ids_salida = vectorizacion_salida(targets)
    decoder_in = ids_salida[:, :-1]
    siguiente_token = ids_salida[:, 1:]
    features = {"entrada": ids_entrada, "salida": decoder_in}
    return features, siguiente_token

batch_size = 128


def _construir_ds(entradas, salidas):
    """Build the batched, vectorized and prefetched dataset for one split."""
    ds = tf.data.Dataset.from_tensor_slices((entradas, salidas))
    ds = ds.batch(batch_size)
    ds = ds.map(formato)
    return ds.prefetch(tf.data.AUTOTUNE)


# Same pipeline for both splits: slice -> batch -> vectorize -> prefetch.
train_ds = _construir_ds(train_entrada_texts, train_salida_texts)
val_ds = _construir_ds(val_entrada_texts, val_salida_texts)

# POSITIONAL EMBEDDING
class PositionalEmbedding(layers.Layer):
    """Adds a learned position embedding to a token embedding.

    Args:
        sequence_length: number of rows of the position embedding table.
        vocab_size: number of rows of the token embedding table.
        embed_dim: output width of both embedding tables.
        **kwargs: forwarded to `layers.Layer`.
    """

    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so `get_config` can report them.
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_embeddings = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.position_embeddings = layers.Embedding(input_dim=sequence_length, output_dim=embed_dim)

    def call(self, inputs):
        """Return token embeddings plus the embeddings of positions 0..L-1,
        where L is the size of the last axis of `inputs`."""
        num_positions = tf.shape(inputs)[-1]
        position_ids = tf.range(num_positions)
        return self.token_embeddings(inputs) + self.position_embeddings(position_ids)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "sequence_length": self.sequence_length,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        }


In [5]:
# ENCODER
class TransformerEncoder(layers.Layer):
    """Encoder block: self-attention and a two-layer dense projection, each
    followed by a residual connection and layer normalization.

    Args:
        embed_dim: width of the block's inputs/outputs; also used as the
            attention `key_dim`.
        dense_dim: width of the hidden dense layer.
        num_heads: number of attention heads.
        **kwargs: forwarded to `layers.Layer`.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.attention = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        feed_forward = [
            Dense(dense_dim, activation="relu"),
            Dense(embed_dim),
        ]
        self.dense_proj = keras.Sequential(feed_forward)
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()

    def call(self, inputs, mask=None):
        """Apply the block to `inputs`.

        If `mask` is given, an axis is inserted at position 1 before it is
        used as the attention mask.
        """
        attention_mask = None if mask is None else mask[:, tf.newaxis, :]
        attended = self.attention(query=inputs, value=inputs, attention_mask=attention_mask)
        normed = self.layernorm_1(inputs + attended)
        return self.layernorm_2(normed + self.dense_proj(normed))



In [6]:
# DECODER
class TransformerDecoder(layers.Layer):
    """Decoder block: causal self-attention, cross-attention over the encoder
    outputs and a two-layer dense projection, each followed by a residual
    connection and layer normalization.

    Args:
        embed_dim: width of the block's inputs/outputs; also used as the
            attention `key_dim`.
        dense_dim: width of the hidden dense layer.
        num_heads: number of attention heads.
        **kwargs: forwarded to `layers.Layer`.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.attention_1 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([
            Dense(dense_dim, activation="relu"),
            Dense(embed_dim),
        ])
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.layernorm_3 = LayerNormalization()
        self.supports_masking = True

    def get_causal_attention_mask(self, inputs):
        """Return an int32 (batch, seq, seq) lower-triangular mask.

        Entry [b, i, j] is 1 when j <= i, so position i can only attend to
        itself and to earlier positions.
        """
        input_shape = tf.shape(inputs)
        batch_size, seq_length = input_shape[0], input_shape[1]
        i = tf.range(seq_length)[:, tf.newaxis]
        j = tf.range(seq_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, seq_length, seq_length))
        # Repeat the single (seq, seq) mask once per batch element.
        mult = tf.concat([[batch_size], [1], [1]], axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        """Apply the block.

        Args:
            inputs: decoder-side sequence; axes 0 and 1 are used as batch and
                sequence length.
            encoder_outputs: encoder-side sequence attended to by the
                cross-attention layer.
            mask: optional padding mask for `inputs`. When given it is
                combined with the causal mask and applied to self-attention.

        Returns:
            The output of the last layer normalization.
        """
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            # `mask` describes the *target* positions, so it belongs to the
            # self-attention keys: a key is visible only if it is both
            # non-padded and not in the future.
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            self_attention_mask = tf.minimum(padding_mask, causal_mask)
        else:
            self_attention_mask = causal_mask

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=self_attention_mask
        )
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)

        # The target-side mask is (batch, target_len, target_len) and must not
        # be applied here: the cross-attention keys are the encoder positions,
        # whose count can differ from the target length (e.g. step-by-step
        # decoding).
        attention_output_2 = self.attention_2(
            query=attention_output_1, value=encoder_outputs, key=encoder_outputs
        )
        attention_output_2 = self.layernorm_2(attention_output_1 + attention_output_2)

        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)


In [7]:
# BUILD THE TRANSFORMER MODEL
# Hyperparameters of the encoder/decoder blocks.
embed_dim = 128
dense_dim = 512
num_heads = 4

# Encoder branch: token ids -> positional embedding -> encoder block.
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="entrada")
x = PositionalEmbedding(
    sequence_length=sequence_length, vocab_size=vocab_size, embed_dim=embed_dim
)(encoder_inputs)
encoder_outputs = TransformerEncoder(
    embed_dim=embed_dim, dense_dim=dense_dim, num_heads=num_heads
)(x)

# Decoder branch: token ids -> positional embedding -> decoder block
# (attending to the encoder outputs) -> dropout -> softmax over the vocabulary.
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="salida")
x = PositionalEmbedding(
    sequence_length=sequence_length, vocab_size=vocab_size, embed_dim=embed_dim
)(decoder_inputs)
x = TransformerDecoder(
    embed_dim=embed_dim, dense_dim=dense_dim, num_heads=num_heads
)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)

# The input names ("entrada", "salida") match the feature keys produced by `formato`.
transformer = keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)


In [None]:
# COMPILE AND TRAIN
# 'accuracy' is the only metric tracked.
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Validation runs after every epoch (validation_freq=1), so the validation
# metrics are reported in each epoch's log line.
transformer.fit(
    train_ds,
    epochs=30,
    validation_data=val_ds,
    validation_freq=1,
)



Epoch 1/30
[1m761/761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 67ms/step - accuracy: 0.6982 - loss: 2.8994 - val_accuracy: 0.7648 - val_loss: 1.4906
Epoch 2/30
[1m761/761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 46ms/step - accuracy: 0.7787 - loss: 1.4379 - val_accuracy: 0.8215 - val_loss: 1.0963
Epoch 3/30
[1m761/761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 45ms/step - accuracy: 0.8265 - loss: 1.0959 - val_accuracy: 0.8491 - val_loss: 0.9082
Epoch 4/30
[1m761/761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 44ms/step - accuracy: 0.8494 - loss: 0.9274 - val_accuracy: 0.8585 - val_loss: 0.8366
Epoch 5/30
[1m761/761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 44ms/step - accuracy: 0.8620 - loss: 0.8290 - val_accuracy: 0.8674 - val_loss: 0.7773
Epoch 6/30
[1m761/761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 44ms/step - accuracy: 0.8710 - loss: 0.7625 - val_accuracy: 0.8719 - val_loss: 0.7375
Epoch 7/30
[1m7

In [None]:
# CELL TO TRANSLATE USER-PROVIDED TEXT

def traducir_palabra(entrada_texto):
    """Translate an English word or phrase into Spanish by greedy decoding.

    Starting from the "[start]" token, the model is queried repeatedly; at
    each step the most probable next token is appended to the decoder input.
    Decoding stops when "[end]" is predicted or after `sequence_length` steps.

    Args:
        entrada_texto: English text to translate.

    Returns:
        The predicted Spanish tokens joined by single spaces (without the
        "[start]" / "[end]" markers).

    Raises:
        ValueError: if "[start]" is not in the target vocabulary.
    """
    # Fetch the vocabulary once instead of rebuilding the list on every step.
    vocabulario = vectorizacion_salida.get_vocabulary()

    # Vectorize the source text as a batch of one.
    entrada_vectorizada = vectorizacion_entrada([entrada_texto])

    # Look up the id of "[start]" by name instead of assuming its position in
    # the vocabulary. int64 matches the dtype declared for the model inputs
    # and the dtype of the ids appended below.
    start_idx = vocabulario.index("[start]")
    decoder_input = tf.constant([[start_idx]], dtype=tf.int64)

    traduccion = []
    for _ in range(sequence_length):  # cap the length of the translation
        # Direct call instead of predict(): avoids per-step predict overhead
        # and progress-bar output for a single tiny batch.
        prediccion = transformer([entrada_vectorizada, decoder_input], training=False)

        # Most probable token at the last decoder position.
        prediccion_idx = int(tf.argmax(prediccion[0, -1, :]).numpy())
        palabra_predicha = vocabulario[prediccion_idx]

        # Stop as soon as the end marker is produced.
        if palabra_predicha == "[end]":
            break

        traduccion.append(palabra_predicha)

        # Feed the predicted token back as the next decoder input.
        siguiente = tf.constant([[prediccion_idx]], dtype=tf.int64)
        decoder_input = tf.concat([decoder_input, siguiente], axis=-1)

    return " ".join(traduccion)

# Prompt the user for the English word or phrase to translate
entrada_usuario = input("Introduce una palabra o frase en inglés para traducir: ")

# Translate the text and print the result
traduccion = traducir_palabra(entrada_usuario)
print(f"Traducción al español: {traduccion}")
