<a href="https://colab.research.google.com/github/MarioSigal/Aprendizaje-Automatico-I-y-II/blob/main/TP_2_Aprendizaje_Automatico.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from typing import List, Dict, Any
import numpy as np

# DATA SET


## TOKENIZER & BERT EMBEDDINGS

In [None]:
# Single checkpoint name so the tokenizer and the model are always loaded from the same source.
_BERT_CHECKPOINT = "bert-base-multilingual-cased"

tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
model = BertModel.from_pretrained(_BERT_CHECKPOINT)
# Switch the module to evaluation mode (kept as the last statement of the cell).
model.eval()

In [None]:
def get_multilingual_token_embedding(token: str):
  """
    Return the static embedding of ``token``.

    The token is converted to its vocabulary id with the module-level
    ``tokenizer`` and the matching row of the module-level ``model``'s
    word-embedding matrix is returned. A status message is printed in
    both the success and the failure case.

    Args:
      token: Token string to look up.

    Returns:
      The embedding row for the token, or ``None`` when the id is ``None``
      or equals the tokenizer's unknown-token id.
  """
  token_id = tokenizer.convert_tokens_to_ids(token)
  is_unknown = token_id is None or token_id == tokenizer.unk_token_id
  if not is_unknown:
    embedding_vector = model.embeddings.word_embeddings.weight[token_id]
    print(f"✅ Token: '{token}' | ID: {token_id}")
    print(f"Embedding shape: {embedding_vector.shape}")
    return embedding_vector
  print(f"❌ El token '{token}' no pertenece al vocabulario de multilingual BERT.")
  return None

In [None]:
# Demo: tokenize a sample sentence and look up the static embedding of every piece.
texto = "a qué hora pasa el ciento siete"
tokens = tokenizer.tokenize(texto)
print(tokens)
# Output recorded by the author:
# ['a', 'qué', 'hora', 'pasa', 'el', 'cien', '##to', 'siete']
tokens_id = tokenizer.convert_tokens_to_ids(tokens)
print(tokens_id)
# Output recorded by the author:
# [169, 38188, 24301, 26088, 10125, 99485, 10340, 28394]
# Indexing the embedding matrix with the whole id list yields one row per token.
embedding_vector = model.embeddings.word_embeddings.weight[tokens_id]
print(embedding_vector.shape)
# Output recorded by the author:
# torch.Size([8, 768])

## RNN Unidireccional

#### Encoder

In [None]:
class EncoderUnidireccional(nn.Module):
    """Unidirectional, multi-layer LSTM that encodes a sequence of embeddings.

    Args:
        embedding_dim: Size of each input embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability handed to ``nn.LSTM``.
    """

    def __init__(self, embedding_dim=768, hidden_dim=256, num_layers=2, dropout=0.3):
        super().__init__()
        lstm_settings = {
            "input_size": embedding_dim,
            "hidden_size": hidden_dim,
            "num_layers": num_layers,
            "batch_first": True,
            "dropout": dropout,
            "bidirectional": False,  # single direction only
        }
        self.lstm = nn.LSTM(**lstm_settings)

    def forward(self, embeddings):
        """Run the LSTM over a batch of embedded sequences.

        Args:
            embeddings: Tensor of shape (batch_size, seq_len, embedding_dim).

        Returns:
            A pair ``(outputs, (hidden, cell))`` where ``outputs`` has shape
            (batch_size, seq_len, hidden_dim) and ``hidden`` / ``cell`` each
            have shape (num_layers, batch_size, hidden_dim).
        """
        sequence_states, final_state = self.lstm(embeddings)
        final_hidden, final_cell = final_state
        return sequence_states, (final_hidden, final_cell)

#### Decoder

In [None]:
class DecoderUnidireccional(nn.Module):
    """Unidirectional LSTM decoder with three per-token classification heads.

    The LSTM consumes the encoder outputs, starting from the encoder's final
    state, and each time step is then scored by three independent linear
    heads: initial punctuation (2 values), final punctuation (4 classes) and
    capitalization (4 classes).

    Args:
        hidden_dim: Feature size of the encoder outputs and of the LSTM state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability handed to ``nn.LSTM``.
    """

    def __init__(self, hidden_dim=256, num_layers=2, dropout=0.3):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=False,  # single direction only
        )

        # One feed-forward head per task.
        self.punt_inicial_ff = nn.Linear(hidden_dim, 2)
        self.punt_final_ff = nn.Linear(hidden_dim, 4)
        self.capital_ff = nn.Linear(hidden_dim, 4)

        # One activation per task. The head outputs are 3-D
        # (batch, seq_len, classes), so the softmax must normalise over the
        # last axis; an axis index of 4 does not exist on such a tensor.
        self.punt_inicial_sigmoid = nn.Sigmoid()
        self.punt_final_softmax = nn.Softmax(dim=-1)
        self.capital_softmax = nn.Softmax(dim=-1)

    def forward(self, encoder_outputs, hidden, cell):
        """Score every time step for the three tasks.

        Args:
            encoder_outputs: Tensor of shape (batch_size, seq_len, hidden_dim).
            hidden: Initial hidden state, (num_layers, batch_size, hidden_dim).
            cell: Initial cell state, (num_layers, batch_size, hidden_dim).

        Returns:
            Dict with the activated head outputs (not raw logits):
            "puntuación inicial" -> (batch_size, seq_len, 2), sigmoid values;
            "puntuación final"   -> (batch_size, seq_len, 4), softmax over the last axis;
            "capitalización"     -> (batch_size, seq_len, 4), softmax over the last axis.
        """
        outputs, _ = self.lstm(encoder_outputs, (hidden, cell))

        punt_inicial_probs = self.punt_inicial_sigmoid(self.punt_inicial_ff(outputs))
        punt_final_probs = self.punt_final_softmax(self.punt_final_ff(outputs))
        capital_probs = self.capital_softmax(self.capital_ff(outputs))

        return {
            "puntuación inicial": punt_inicial_probs,
            "puntuación final": punt_final_probs,
            "capitalización": capital_probs,
        }


#### Encoder–Decoder

In [None]:
class PunctuationRestorationModel(nn.Module):
    """Unidirectional encoder-decoder for punctuation and capitalization.

    Chains ``EncoderUnidireccional`` and ``DecoderUnidireccional``: the
    encoder's per-step outputs and final state are fed straight into the
    decoder, whose prediction dict is returned unchanged.

    Args:
        embedding_dim: Size of each input embedding vector.
        hidden_dim: Hidden size shared by encoder and decoder.
        num_layers: Number of stacked LSTM layers in each of them.
        dropout: Dropout probability forwarded to both LSTMs.
    """

    def __init__(self, embedding_dim=768, hidden_dim=256, num_layers=2, dropout=0.3):
        super().__init__()
        self.encoder = EncoderUnidireccional(embedding_dim, hidden_dim, num_layers, dropout)
        self.decoder = DecoderUnidireccional(hidden_dim, num_layers, dropout)

    def forward(self, embeddings):
        """Encode ``embeddings`` and return the decoder's prediction dict.

        Args:
            embeddings: Tensor of shape (batch_size, seq_len, embedding_dim).
        """
        encoder_outputs, (hidden, cell) = self.encoder(embeddings)
        return self.decoder(encoder_outputs, hidden, cell)


# Stable alias: the bidirectional model defined later in this file reuses the
# name ``PunctuationRestorationModel`` and rebinds it, which would otherwise
# leave this unidirectional variant unreachable once both cells have run.
PunctuationRestorationModelUnidireccional = PunctuationRestorationModel


## RNN Bidireccional


#### Encoder


In [None]:
class EncoderBidireccional(nn.Module):
    """Bidirectional, multi-layer LSTM that encodes a sequence of embeddings.

    Args:
        embedding_dim: Size of each input embedding vector.
        hidden_dim: Size of the LSTM hidden state, per direction.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability handed to ``nn.LSTM``.
    """

    def __init__(self, embedding_dim=768, hidden_dim=256, num_layers=2, dropout=0.3):
        super().__init__()
        lstm_settings = {
            "input_size": embedding_dim,
            "hidden_size": hidden_dim,
            "num_layers": num_layers,
            "batch_first": True,
            "dropout": dropout,
            "bidirectional": True,  # forward and backward passes
        }
        self.lstm = nn.LSTM(**lstm_settings)

    def forward(self, embeddings):
        """Run the bidirectional LSTM over a batch of embedded sequences.

        Args:
            embeddings: Tensor of shape (batch_size, seq_len, embedding_dim).

        Returns:
            A pair ``(outputs, (hidden, cell))``. Because both directions are
            concatenated, ``outputs`` has shape
            (batch_size, seq_len, 2 * hidden_dim), and ``hidden`` / ``cell``
            each have shape (2 * num_layers, batch_size, hidden_dim).
        """
        sequence_states, final_state = self.lstm(embeddings)
        final_hidden, final_cell = final_state
        return sequence_states, (final_hidden, final_cell)

#### Decoder

In [None]:
class DecoderBidireccional(nn.Module):
    """Bidirectional LSTM decoder with three per-token classification heads.

    Designed to sit on top of ``EncoderBidireccional``: it consumes the
    encoder's concatenated forward/backward outputs, starts from the
    encoder's final state, and scores each time step with three independent
    linear heads: initial punctuation (2 values), final punctuation
    (4 classes) and capitalization (4 classes).

    Args:
        hidden_dim: Hidden size per direction, shared with the encoder.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability handed to ``nn.LSTM``.
    """

    def __init__(self, hidden_dim=256, num_layers=2, dropout=0.3):
        super().__init__()
        # A bidirectional LSTM concatenates both directions, so both the
        # features coming from the bidirectional encoder and the features
        # this LSTM emits are twice the per-direction hidden size.
        bidirectional_dim = 2 * hidden_dim

        self.lstm = nn.LSTM(
            input_size=bidirectional_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,  # forward and backward passes
        )

        # One feed-forward head per task.
        self.punt_inicial_ff = nn.Linear(bidirectional_dim, 2)
        self.punt_final_ff = nn.Linear(bidirectional_dim, 4)
        self.capital_ff = nn.Linear(bidirectional_dim, 4)

        # One activation per task. The head outputs are 3-D
        # (batch, seq_len, classes), so the softmax must normalise over the
        # last axis; an axis index of 4 does not exist on such a tensor.
        self.punt_inicial_sigmoid = nn.Sigmoid()
        self.punt_final_softmax = nn.Softmax(dim=-1)
        self.capital_softmax = nn.Softmax(dim=-1)

    def forward(self, encoder_outputs, hidden, cell):
        """Score every time step for the three tasks.

        Args:
            encoder_outputs: Tensor of shape
                (batch_size, seq_len, 2 * hidden_dim).
            hidden: Initial hidden state,
                (2 * num_layers, batch_size, hidden_dim).
            cell: Initial cell state,
                (2 * num_layers, batch_size, hidden_dim).

        Returns:
            Dict with the activated head outputs (not raw logits):
            "puntuación inicial" -> (batch_size, seq_len, 2), sigmoid values;
            "puntuación final"   -> (batch_size, seq_len, 4), softmax over the last axis;
            "capitalización"     -> (batch_size, seq_len, 4), softmax over the last axis.
        """
        outputs, _ = self.lstm(encoder_outputs, (hidden, cell))

        punt_inicial_probs = self.punt_inicial_sigmoid(self.punt_inicial_ff(outputs))
        punt_final_probs = self.punt_final_softmax(self.punt_final_ff(outputs))
        capital_probs = self.capital_softmax(self.capital_ff(outputs))

        return {
            "puntuación inicial": punt_inicial_probs,
            "puntuación final": punt_final_probs,
            "capitalización": capital_probs,
        }


#### Encoder-Decoder

In [None]:
class PunctuationRestorationModel(nn.Module):
    """Bidirectional encoder-decoder for punctuation and capitalization.

    Chains ``EncoderBidireccional`` and ``DecoderBidireccional``: the
    encoder's per-step outputs and final state are passed to the decoder and
    the decoder's prediction dict is returned unchanged.

    NOTE: this class has the same name as the unidirectional model defined
    earlier in this file, so executing this definition rebinds that name.

    Args:
        embedding_dim: Size of each input embedding vector.
        hidden_dim: Hidden size handed to both encoder and decoder.
        num_layers: Number of stacked LSTM layers in each of them.
        dropout: Dropout probability forwarded to both LSTMs.
    """

    def __init__(self, embedding_dim=768, hidden_dim=256, num_layers=2, dropout=0.3):
        super().__init__()
        self.encoder = EncoderBidireccional(
            embedding_dim=embedding_dim,
            hidden_dim=hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.decoder = DecoderBidireccional(
            hidden_dim=hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
        )

    def forward(self, embeddings):
        """Encode ``embeddings`` and return the decoder's prediction dict.

        Args:
            embeddings: Tensor of shape (batch_size, seq_len, embedding_dim).
        """
        encoded_sequence, encoder_state = self.encoder(embeddings)
        final_hidden, final_cell = encoder_state
        return self.decoder(encoded_sequence, final_hidden, final_cell)
