In [24]:
# import libraries
import ast
import os
import re
from typing import Tuple, Any, List

import numpy as np
import pandas as pd
import spacy
import torch
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Load the raw train / validation / test splits (read in that order).
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/val.csv", "data/test.csv")
)

In [7]:
# spaCy pipeline used by process_tokens below for tokenization, the
# stop-word / punctuation flags and lemmas.
nlp = spacy.load("en_core_web_sm")

# Contraction -> expansion table used by replace_contractions.
# The whole-word entries ("won't", "can't") are irregular and must win over the
# generic "n't" suffix; replace_contractions guarantees that by applying the
# longest keys first, so the order of this dict does not matter.
# NOTE(review): "'s" is always expanded to "is" and "'d" to "would", so
# possessives ("John's") and "had" contractions are rewritten as well.
CONTRACTIONS = {
    "n't": "not",
    "'ll": "will",
    "'re": "are",
    "'ve": "have",
    "'m": "am",
    "'d": "would",
    "'s": "is",
    "won't": "will not",
    "can't": "cannot",
}
# Tokens that process_tokens discards (compared against the lower-cased text).
IRRELEVANT_WORDS = {"wow", "oops", "ah", "ugh", "yay", "mhm", "`"}


def replace_contractions(text):
    """Expand the English contractions listed in ``CONTRACTIONS``.

    Both spellings are handled:

    * attached to the previous word (``"don't"``, ``"I'll"``) -- the expansion
      is separated from the stem by a space (``"do not"``, ``"I will"``);
    * already split off as their own token (``"do n't"``, ``"I 'll"``) -- the
      token is replaced in place.

    Matching is case-insensitive; the inserted expansion is always the
    lower-case text from ``CONTRACTIONS``.

    Args:
        text: Input string.

    Returns:
        The string with every recognised contraction expanded.
    """
    # Longest keys first so "won't"/"can't" are consumed before the generic
    # "n't" suffix could split them into "wo not"/"ca not".
    ordered = sorted(CONTRACTIONS.items(), key=lambda item: len(item[0]), reverse=True)
    for contraction, replacement in ordered:
        escaped = re.escape(contraction)
        # Lookarounds are used instead of \b: \b cannot match between two word
        # characters ("do|n't") nor between a space and an apostrophe (" |'ll").
        # 1) Contraction glued to a preceding word: insert a separating space.
        text = re.sub(
            r"(?<=\w)" + escaped + r"(?!\w)",
            " " + replacement,
            text,
            flags=re.IGNORECASE,
        )
        # 2) Contraction standing on its own: replace it in place.
        text = re.sub(
            r"(?<!\w)" + escaped + r"(?!\w)",
            replacement,
            text,
            flags=re.IGNORECASE,
        )
    return text

def process_tokens(text):
    """Turn a raw sentence into a list of lemmas.

    Steps, in order: expand contractions with ``replace_contractions``, delete
    every ``-`` character, run the spaCy pipeline ``nlp`` over the result, and
    collect ``token.lemma_`` for each token that is not a stop word, not
    punctuation and whose lower-cased text is not in ``IRRELEVANT_WORDS``.

    Args:
        text: The sentence to process.

    Returns:
        The lemmas of the tokens that survive the filtering, in document order.
    """
    cleaned = replace_contractions(text).replace("-", "")
    doc = nlp(cleaned)

    lemmas = []
    for token in doc:
        # Skip stop words and punctuation.
        if token.is_stop or token.is_punct:
            continue
        # Skip interjections / stray characters.
        if token.text.lower() in IRRELEVANT_WORDS:
            continue
        lemmas.append(token.lemma_)
    return lemmas

# Add a "tokens" column (list of lemmas) to every split.
# NOTE(review): process_tokens drops stop words and punctuation, so the number
# of tokens can differ from the number of words in "sentence". If the "tags"
# column holds one tag per original word (presumably -- confirm), tokens and
# tags are no longer aligned after this step.
df_train["tokens"] = df_train["sentence"].apply(process_tokens)
df_test["tokens"] = df_test["sentence"].apply(process_tokens)
df_val["tokens"] = df_val["sentence"].apply(process_tokens)

In [8]:
# Persist the tokenized splits.
# The "tokens" column holds Python lists; CSV stores them as their string
# representation, so they come back as strings when the files are re-read.
df_train.to_csv("data/train_token.csv", index=False)
df_test.to_csv("data/test_token.csv", index=False)
df_val.to_csv("data/val_token.csv", index=False)

In [16]:
# Large English spaCy pipeline; only its vocabulary strings are used below to
# build the word -> index mapping.
# NOTE(review): despite the variable name, this loads a spaCy model, not a
# GloVe embedding file.
glove = spacy.load('en_core_web_lg') 

In [18]:
# Build the word -> index dictionary from the strings of the spaCy vocabulary.
# NOTE(review): indices start at 0, and collate_fn also pads with 0, so the
# first vocabulary string cannot be told apart from padding -- confirm whether
# an offset is needed.
word_to_index = {word: i for i, word in enumerate(glove.vocab.strings)}

def word2idx(embedding_dict, tweet):
    """Map the words of ``tweet`` to their indices in ``embedding_dict``.

    Words that are not keys of ``embedding_dict`` are skipped, so the result
    may be shorter than ``tweet``.

    Args:
        embedding_dict: Mapping from word to integer index.
        tweet: Iterable of words.

    Returns:
        A 1-D ``torch.long`` tensor with the indices of the known words, in
        order. It has zero elements when no word is known.
    """
    indices = [embedding_dict[word] for word in tweet if word in embedding_dict]
    # Explicit dtype: torch.tensor([]) would default to float32, giving the
    # empty result a different dtype from every non-empty one.
    return torch.tensor(indices, dtype=torch.long)

# Quick check: look up a few sample words.
print(word2idx(word_to_index, ["hello", "Teresa", "world"]))

tensor([520871, 366396, 700279])


In [39]:
class OntoNotesDataset(Dataset):
    """Dataset over a DataFrame with ``tokens``, ``tags`` and ``SA`` columns.

    The input DataFrame is only read, never modified.

    Attributes:
        tokens: One list of tokens per row. Stringified lists (what a CSV
            round trip produces, e.g. ``"['a', 'b']"``) are parsed back.
        tags: Per-row tags as float32.
            * If every non-missing value of the column is a scalar number, a
              single 1-D tensor (missing values become 0), so ``tags[idx]`` is
              a 0-dim tensor.
            * Otherwise a list with one 1-D tensor per row, parsed from a
              sequence or from a bracketed string such as ``"[0 0 4 5]"``.
            float32 is kept from the original implementation; cast with
            ``.long()`` if class indices are needed.
        SA: 1-D ``torch.int`` tensor built from the ``SA`` column.
    """

    def __init__(self, df: pd.DataFrame):
        """Builds the dataset from ``df`` (columns ``tokens``, ``tags``, ``SA``)."""
        self.tokens: List[List[str]] = [self._parse_tokens(value) for value in df["tokens"]]
        self.tags = self._build_tags(df["tags"])
        self.SA: torch.Tensor = torch.tensor(df["SA"].values, dtype=torch.int)

    @staticmethod
    def _parse_tokens(value: Any) -> List[str]:
        """Returns ``value`` as a list of tokens."""
        if isinstance(value, str):
            try:
                parsed = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                # Not a Python literal: treat it as whitespace-separated text.
                return value.split()
            if isinstance(parsed, (list, tuple)):
                return list(parsed)
            return [str(parsed)]
        try:
            return list(value)
        except TypeError:
            # Non-iterable cell (e.g. NaN for a missing value).
            return []

    @staticmethod
    def _parse_tag_sequence(value: Any) -> torch.Tensor:
        """Returns one row of tags as a 1-D float32 tensor."""
        if isinstance(value, str):
            # Pull the integers out of strings like "[ 0  9 10]"; brackets,
            # commas and line breaks are ignored.
            ids = [int(match) for match in re.findall(r"-?\d+", value)]
        else:
            try:
                ids = [int(item) for item in value]
            except TypeError:
                # Scalar cell inside an otherwise sequence-valued column.
                ids = [] if pd.isna(value) else [int(value)]
        return torch.tensor(ids, dtype=torch.float32)

    @staticmethod
    def _build_tags(raw: pd.Series):
        """Converts the ``tags`` column (see the class docstring for the result)."""
        try:
            numeric = pd.to_numeric(raw, errors="coerce")
            # Values that were present but could not be read as a number mean
            # the column holds sequences, not scalars.
            all_scalar = not (numeric.isna() & raw.notna()).any()
        except (TypeError, ValueError):
            all_scalar = False

        if all_scalar:
            return torch.tensor(numeric.fillna(0).astype(int).values, dtype=torch.float32)
        return [OntoNotesDataset._parse_tag_sequence(value) for value in raw]

    def __len__(self) -> int:
        """Returns the length of the dataset."""
        return len(self.tokens)

    def __getitem__(self, idx: int) -> Tuple[List[str], torch.Tensor, torch.Tensor]:
        """Returns ``(tokens, tags, SA)`` for row ``idx``."""
        return self.tokens[idx], self.tags[idx], self.SA[idx]

In [None]:
def collate_fn(batch: List[Tuple[Any, ...]]) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Collate samples into padded index sequences, labels and lengths.

    Each sample is a tuple whose first element is the list of tokens and whose
    last element is an integer label. This accepts both ``(tokens, label)``
    pairs and the ``(tokens, tags, SA)`` triples produced by
    ``OntoNotesDataset``.
    NOTE(review): for the triples the ``SA`` value is used as the label and the
    middle ``tags`` element is not batched -- confirm this is the intended
    target.

    Tokens are converted with ``word2idx`` using the module-level
    ``word_to_index`` vocabulary. Samples without any known word are dropped
    together with their label.

    Args:
        batch: List of samples as described above.

    Returns:
        A tuple ``(texts_padded, labels, lengths)``:
            * ``texts_padded``: ``(kept, max_len)`` long tensor, padded with 0.
            * ``labels``: ``(kept,)`` ``torch.int`` tensor.
            * ``lengths``: ``(kept,)`` long tensor with the unpadded lengths,
              in descending order.
        All three are empty when no sample survives.
    """
    # Encode every sample once and keep its label next to it, so dropping a
    # sample can never shift the labels of the remaining ones.
    encoded = []
    for sample in batch:
        indices = word2idx(word_to_index, sample[0])
        if indices.nelement() > 0:
            encoded.append((indices, sample[-1]))

    if not encoded:
        return (
            torch.empty((0, 0), dtype=torch.long),
            torch.empty(0, dtype=torch.int),
            torch.empty(0, dtype=torch.long),
        )

    # Sort by the encoded length (descending). Unknown words have already been
    # removed, so the raw token count would not guarantee this order.
    encoded.sort(key=lambda pair: len(pair[0]), reverse=True)
    texts_indx, labels = zip(*encoded)

    # Length of every sequence before padding.
    lengths = torch.tensor([len(text) for text in texts_indx], dtype=torch.long)

    # Pad every sequence to the length of the longest one.
    texts_padded = pad_sequence(texts_indx, batch_first=True, padding_value=0)

    # Labels as an integer tensor; int() accepts Python ints and 0-dim tensors.
    labels = torch.tensor([int(label) for label in labels], dtype=torch.int)

    return texts_padded, labels, lengths

In [40]:
# Inspect the raw "tags" column: it has dtype object and each row prints as a
# bracketed, space-separated sequence of integers of varying length.
df_train["tags"].head(10)

0                                  [0 0 0 0 0 0 0 0 0]
1                    [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
2                                  [0 0 0 0 0 0 0 0 0]
3    [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...
4    [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...
5                                        [0 0 0 0 0 0]
6    [2 3 0 0 4 5 0 6 0 7 8 0 0 0 0 0 0 0 0 0 0 0 0...
7    [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4...
8    [ 0  0  0  0  0  0  0  0  0  0  9 10 10  0  0 ...
9    [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 5 0 0 0 0...
Name: tags, dtype: object

In [41]:
# Reload the tokenized splits written earlier; this rebinds df_train, df_val
# and df_test, replacing the frames loaded from the raw CSVs.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train_token.csv", "data/val_token.csv", "data/test_token.csv")
)

# Wrap every split in an OntoNotesDataset (train, validation, test order).
tr_dataset, vl_dataset, ts_dataset = (
    OntoNotesDataset(split_df) for split_df in (df_train, df_val, df_test)
)

In [43]:
batch_size = 64

# Only the training split is shuffled. Validation and test batches are served
# in dataset order so evaluation runs are reproducible from one run to the next.
train_dataloader: DataLoader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader: DataLoader = DataLoader(vl_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_dataloader: DataLoader = DataLoader(ts_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

<__main__.OntoNotesDataset at 0x7f8a0689e190>