In [41]:
# import libraries
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np
import torch
import os
import spacy
import re
from typing import Tuple, Any, List

In [42]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [43]:
df_train = pd.read_csv("data/train.csv")
df_val = pd.read_csv("data/val.csv")
df_test = pd.read_csv("data/test.csv")

In [46]:
import spacy
import re

nlp = spacy.load("en_core_web_sm")

CONTRACTIONS = {
    "n't": "not", "'ll": "will", "'re": "are", "'ve": "have", "'m": "am", 
    "'d": "would", "'s": "is", "won't": "will not", "can't": "cannot"
}
IRRELEVANT_WORDS = {"wow", "oops", "ah", "ugh", "yay", "mhm", "`"}


import re

def fix_tags_string(x):
    if isinstance(x, str):
        # Reemplaza múltiples espacios o tabs con una coma
        x_clean = re.sub(r"\s+", ",", x.strip())
        # Divide por coma y elimina los elementos vacíos
        nums = [int(n) for n in x_clean.strip("[]").split(",") if n.strip() != ""]
        return nums
    return x  # Si ya es lista o algo raro, lo deja igual


df_train["tags"] = df_train["tags"].apply(fix_tags_string)
df_val["tags"] = df_val["tags"].apply(fix_tags_string)
df_test["tags"] = df_test["tags"].apply(fix_tags_string)

def replace_contractions(text):
    for contraction, replacement in CONTRACTIONS.items():
        text = re.sub(r"\b" + re.escape(contraction) + r"\b", replacement, text)
    return text

def process_sentence_and_align_tags(sentence, original_tags):
    sentence = replace_contractions(sentence)
    sentence = sentence.replace("-", "")
    doc = nlp(sentence)

    processed_tokens = []
    aligned_tags = []

    tag_idx = 0
    for token in doc:
        if token.is_punct or token.is_space or token.text.lower() in IRRELEVANT_WORDS:
            tag_idx += 1  # Skip both token and its tag
            continue
        if token.is_stop:
            tag_idx += 1
            continue

        processed_tokens.append(token.lemma_)

        if tag_idx < len(original_tags):
            aligned_tags.append(original_tags[tag_idx])
            tag_idx += 1
        else:
            # If a new token was added from contraction expansion or similar, assign tag 0
            aligned_tags.append(0)

    return processed_tokens, aligned_tags

df_train[["tokens", "tags"]] = df_train.apply(
    lambda row: pd.Series(process_sentence_and_align_tags(row["sentence"], row["tags"])), axis=1
)
df_val[["tokens", "tags"]] = df_val.apply(
    lambda row: pd.Series(process_sentence_and_align_tags(row["sentence"], row["tags"])), axis=1
)
df_test[["tokens", "tags"]] = df_test.apply(
    lambda row: pd.Series(process_sentence_and_align_tags(row["sentence"], row["tags"])), axis=1
)



In [64]:
df_train.to_csv(f"data/train_token.csv",index=False)
df_test.to_csv(f"data/test_token.csv",index=False)
df_val.to_csv(f"data/val_token.csv",index=False)

In [65]:
glove = spacy.load('en_core_web_lg') 

In [66]:
# Crear diccionario word -> index
word_to_index = {word: i for i, word in enumerate(glove.vocab.strings)}

def word2idx(embedding_dict, tweet):
    indices = [embedding_dict[word] for word in tweet if word in embedding_dict]
    return torch.tensor(indices) if indices else torch.tensor([])

# Prueba
print(word2idx(word_to_index, ["hello", "Teresa", "world"]))

tensor([520871, 366396, 700279])


In [67]:
class OntoNotesDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.tokens = df["tokens"].tolist()

        df["tags"] = pd.to_numeric(df["tags"], errors="coerce") 
        df["tags"].fillna(0, inplace=True)  
        df["tags"] = df["tags"].astype(int)
        self.tags: torch.Tensor = torch.tensor(df["tags"].values, dtype=torch.float32)
        
        self.SA: torch.Tensor = torch.tensor(df["SA"].values, dtype=torch.int)

    def __len__(self) -> int:
        """Returns the length of the dataset."""
        # TODO: Complete the len function
        return len(self.tokens)

    def __getitem__(self, idx: int) -> Tuple[List[str], torch.Tensor, torch.Tensor]:
        
        # TODO: Complete the getitem function
        token: str = self.tokens[idx]
        tag: torch.Tensor = self.tags[idx]
        sa: torch.Tensor = self.SA[idx]
        return token, tag, sa

In [70]:
def collate_fn(batch) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    
    glove = spacy.load('en_core_web_lg')
    word_to_index = {word: i for i, word in enumerate(glove.vocab.strings)} 

    # Ordenar por longitud de la secuencia (descendente)
    batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
    texts, labels, sa = zip(*batch)

    # Convertir palabras a índices
    texts_indx = [word2idx(word_to_index, text) for text in texts if word2idx(word_to_index, text).nelement() > 0]

    # Longitudes de cada secuencia
    lengths = torch.tensor([len(text) for text in texts_indx], dtype=torch.long)

    # Padding a la misma longitud
    texts_padded = pad_sequence(texts_indx, batch_first=True, padding_value=0)
    tags_padded = pad_sequence(labels, batch_first=True, padding_value=0)

    # Convertir labels a tensor
    labels = torch.tensor(labels[:len(texts_indx)], dtype=torch.int)

    return texts_padded, tags_padded, sa, lengths

In [71]:
df_train["tags"].head(10)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: tags, dtype: int64

In [53]:
df_train = pd.read_csv("data/train_token.csv")
df_val = pd.read_csv("data/val_token.csv")
df_test = pd.read_csv("data/test_token.csv")

tr_dataset = OntoNotesDataset(df_train)
vl_dataset = OntoNotesDataset(df_val)
ts_dataset = OntoNotesDataset(df_test)

In [57]:
batch_size = 64

train_dataloader: DataLoader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)
val_dataloader: DataLoader = DataLoader(vl_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)
test_dataloader: DataLoader = DataLoader(ts_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)

In [58]:
i = 0
for inputs, tags, sa, lengths in train_dataloader:
    print(inputs)
    print(sa)
    print(tags)

    i += 1
    if i == 6:
        break



RuntimeError: ArrayRef: invalid slice, N = 1; size = 0

In [61]:
df_test["tags"].isnull().sum()  # Revisa si hay valores nulos


0