In [1]:
# import libraries
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import numpy as np
import torch
import os
import spacy
import re
from typing import Tuple, Any, List

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [78]:
# Read the raw train / validation / test splits (in that order).
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/val.csv", "data/test.csv")
)

In [None]:
import spacy
import re
import ast

# Small English spaCy pipeline; process_sentence_and_align_tags uses it for
# tokenisation, lemmas and the punctuation / whitespace / stop-word flags.
nlp = spacy.load("en_core_web_sm")

# Contraction fragment -> expanded form. replace_contractions walks this table
# in insertion order, so the order of the entries is the order of substitution.
CONTRACTIONS = {
    "n't": "not", "'ll": "will", "'re": "are", "'ve": "have", "'m": "am", 
    "'d": "would", "'s": "is", "won't": "will not", "can't": "cannot"
}
# Tokens (compared lower-cased) that are discarded during preprocessing.
IRRELEVANT_WORDS = {"wow", "oops", "ah", "ugh", "yay", "mhm", "`"}


import re

def fix_tags_string(x):
    """Turn a stringified tag array into a list of ints.

    Whitespace runs are first converted to commas, the surrounding square
    brackets are stripped, and every non-empty comma-separated piece is
    converted with ``int``.

    Args:
        x: The cell value. Only ``str`` values are parsed.

    Returns:
        A list of ints when ``x`` is a string; otherwise ``x`` unchanged.

    Raises:
        ValueError: If a non-empty piece is not a valid integer literal.
    """
    if not isinstance(x, str):
        # Already a list (or some other type): hand it back untouched.
        return x

    # Collapse every run of spaces / tabs / newlines into a single comma.
    comma_separated = re.sub(r"\s+", ",", x.strip())
    pieces = comma_separated.strip("[]").split(",")
    # Consecutive commas leave empty pieces behind; skip them.
    return [int(piece) for piece in pieces if piece.strip() != ""]


# Parse the stringified tag arrays of every split into lists of ints.
for frame in (df_train, df_val, df_test):
    frame["tags"] = frame["tags"].apply(fix_tags_string)

def replace_contractions(text, contractions=None):
    """Expand English contractions found in ``text``.

    Entries are applied one after another, in the mapping's iteration order.

    * Keys that start with an apostrophe (``'ll``, ``'s``, ...) are clitics.
      They are expanded when they follow a word, either attached (``I'll``)
      or separated by whitespace (``I 'll``), and the expansion is always
      emitted as a separate word (``I will``).
    * All other keys (``n't``, ``won't``, ...) are replaced only when they
      stand as a whole word.

    Args:
        text: Sentence to rewrite.
        contractions: Optional mapping from contraction to expansion.
            Defaults to the module-level ``CONTRACTIONS`` table.

    Returns:
        The sentence with every matched contraction expanded.
    """
    mapping = CONTRACTIONS if contractions is None else contractions
    for contraction, replacement in mapping.items():
        escaped = re.escape(contraction)
        if contraction.startswith("'"):
            # `\b` in front of an apostrophe only matches when a letter comes
            # immediately before it, so "I 'll" was never expanded and "I'll"
            # was glued into "Iwill". Require a preceding word character,
            # swallow any whitespace in between, and re-insert one space.
            pattern = r"(?<=\w)\s*" + escaped + r"(?!\w)"
            text = re.sub(pattern, " " + replacement, text)
        else:
            text = re.sub(r"\b" + escaped + r"\b", replacement, text)
    return text

def process_sentence_and_align_tags(sentence, original_tags):
    """Lemmatise a sentence and keep the tags of the tokens that survive.

    The sentence has its contractions expanded and every ``-`` removed, and is
    then tokenised with the module-level ``nlp`` pipeline. Alignment is purely
    positional: the token at position ``i`` of the spaCy output is paired with
    ``original_tags[i]``.

    Tokens are discarded (together with the tag at their position) when they
    are punctuation, whitespace, listed in ``IRRELEVANT_WORDS`` (compared
    lower-cased) or stop words.

    Args:
        sentence: Raw sentence text.
        original_tags: Sequence of tags, indexed by token position.

    Returns:
        A ``(lemmas, tags)`` tuple of two equally long lists. A kept token
        whose position lies beyond the end of ``original_tags`` gets tag 0.
    """
    cleaned = replace_contractions(sentence).replace("-", "")
    tag_count = len(original_tags)

    kept_lemmas = []
    kept_tags = []

    for position, token in enumerate(nlp(cleaned)):
        discard = (
            token.is_punct
            or token.is_space
            or token.text.lower() in IRRELEVANT_WORDS
            or token.is_stop
        )
        if discard:
            # Dropping the token also drops the tag at the same position.
            continue

        kept_lemmas.append(token.lemma_)
        # More tokens than tags (e.g. after contraction expansion): use tag 0.
        kept_tags.append(original_tags[position] if position < tag_count else 0)

    return kept_lemmas, kept_tags

# Replace the tags and add the lemmatised tokens for every split.
for frame in (df_train, df_val, df_test):
    frame[["tokens", "tags"]] = frame.apply(
        lambda row: pd.Series(process_sentence_and_align_tags(row["sentence"], row["tags"])),
        axis=1,
    )



In [167]:
# Persist the preprocessed splits (written in the order train, test, val).
for frame, out_path in (
    (df_train, "data/train_token.csv"),
    (df_test, "data/test_token.csv"),
    (df_val, "data/val_token.csv"),
):
    frame.to_csv(out_path, index=False)

In [3]:
# Load spaCy's large English pipeline; its vocabulary is used to build the
# word -> index mapping.
# NOTE(review): the variable is called `glove`, but what is loaded here is the
# `en_core_web_lg` spaCy pipeline.
glove = spacy.load('en_core_web_lg') 

In [4]:
# Map every string in the loaded pipeline's string store to its enumeration
# position; these positions are the word indices handed to word2idx.
# NOTE(review): an enumeration position is not necessarily the row of that
# word in the pipeline's vector table - confirm before using these indices to
# look up pretrained embeddings.
word_to_index = {word: i for i, word in enumerate(glove.vocab.strings)}

def word2idx(embedding_dict, tweet):
    """Convert a sequence of words into a tensor of vocabulary indices.

    Words missing from ``embedding_dict`` are skipped, so the result can be
    shorter than ``tweet``.

    Args:
        embedding_dict: Mapping from word to integer index.
        tweet: Iterable of words.

    Returns:
        A 1-D tensor with the indices of the known words, or ``tensor([0])``
        (a single padding index) when no word is known.
    """
    known = []
    for word in tweet:
        if word in embedding_dict:
            known.append(embedding_dict[word])
    # Never return an empty tensor: fall back to one padding index (0).
    return torch.tensor(known if known else [0])

# Quick check: known words map to their indices; "it is" passed as ONE string
# is not a vocabulary entry, so it falls back to the padding index.
print(word2idx(word_to_index, ["hello", "Teresa", "world"]))
print(word2idx(word_to_index,["it is"]))
print(word2idx(word_to_index,["it","is"]))

tensor([520871, 366396, 700279])
tensor([0])
tensor([537834, 537344])


In [5]:
import ast

class OntoNotesDataset(Dataset):
    """Sentence-level dataset with NER tags and a sentiment (SA) label.

    Built from a DataFrame with three columns:

    * ``tokens``: list of words per sentence, or its string representation
      (as read back from a CSV file).
    * ``tags``: list of numeric tags per sentence, or its string
      representation.
    * ``SA``: one integer label per sentence.

    The DataFrame passed in is only read, never modified.
    """

    def __init__(self, df: pd.DataFrame):
        # Both list columns may arrive as strings after a CSV round trip;
        # parse them into local lists instead of writing back into `df`.
        self.tokens: List[List[str]] = [self._parse_cell(cell) for cell in df["tokens"].tolist()]
        self.tags: List[torch.Tensor] = [
            torch.tensor(self._parse_cell(cell), dtype=torch.float32)
            for cell in df["tags"].tolist()
        ]
        self.SA: torch.Tensor = torch.tensor(df["SA"].values, dtype=torch.int)

    @staticmethod
    def _parse_cell(cell: Any) -> Any:
        """Parse a stringified Python literal; return anything else unchanged."""
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    def __len__(self) -> int:
        """Returns the length of the dataset."""
        return len(self.tokens)

    def __getitem__(self, idx: int) -> Tuple[List[str], torch.Tensor, torch.Tensor]:
        """Return ``(tokens, tags, sa)`` for the sentence at position ``idx``.

        ``tokens`` is the list of words, ``tags`` a float32 tensor with one
        entry per tag and ``sa`` a 0-d int tensor.
        """
        token: List[str] = self.tokens[idx]
        tag: torch.Tensor = self.tags[idx]
        sa: torch.Tensor = self.SA[idx]
        return token, tag, sa

In [18]:
# NER label -> class index. A label's index is its position in the tuple
# below, so the order of the entries must not be changed.
ENTITY2INDEX = {
    label: index
    for index, label in enumerate((
        "O",
        "B-CARDINAL",
        "B-DATE",
        "I-DATE",
        "B-PERSON",
        "I-PERSON",
        "B-NORP",
        "B-GPE",
        "I-GPE",
        "B-LAW",
        "I-LAW",
        "B-ORG",
        "I-ORG",
        "B-PERCENT",
        "I-PERCENT",
        "B-ORDINAL",
        "B-MONEY",
        "I-MONEY",
        "B-WORK_OF_ART",
        "I-WORK_OF_ART",
        "B-FAC",
        "B-TIME",
        "I-CARDINAL",
        "B-LOC",
        "B-QUANTITY",
        "I-QUANTITY",
        "I-NORP",
        "I-LOC",
        "B-PRODUCT",
        "I-TIME",
        "B-EVENT",
        "I-EVENT",
        "I-FAC",
        "B-LANGUAGE",
        "I-PRODUCT",
        "I-ORDINAL",
        "I-LANGUAGE",
    ))
}

In [95]:
NUM_NER_CLASSES = 37
NUM_SA_CLASSES = 3

def collate_fn(batch) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Collate ``(tokens, tags, sa)`` samples into padded batch tensors.

    Samples are sorted by token count, longest first. Words are looked up in
    the module-level ``word_to_index`` vocabulary. A word that is not in the
    vocabulary keeps its position and is mapped to index 0, so token ``j``
    still lines up with tag ``j``. A sample without tokens becomes a single
    padding index paired with a single "O" tag.

    Args:
        batch: Sequence of ``(tokens, tags, sa)`` tuples, where ``tokens`` is
            a list of words, ``tags`` a 1-D tensor of NER class indices and
            ``sa`` a scalar tensor holding the sentiment class index.
            Assumes ``len(tokens) == len(tags)`` for every sample.

    Returns:
        A 4-tuple ``(texts_padded, tags_onehot, sa_onehot, lengths)``:

        * ``texts_padded``: ``(batch, max_len)`` int64 word indices, padded
          with 0.
        * ``tags_onehot``: ``(batch, max_len, NUM_NER_CLASSES)`` float one-hot
          tags; rows at padded positions are all zero.
        * ``sa_onehot``: ``(batch, 1, NUM_SA_CLASSES)`` float one-hot labels.
        * ``lengths``: ``(batch,)`` int64 unpadded length of every sequence.
    """
    # The vocabulary is built once at module level. Loading the spaCy model
    # here would repeat that work for every single batch.
    vocab = word_to_index
    pad_tag = ENTITY2INDEX["O"]

    # Sort by sequence length (descending).
    batch = sorted(batch, key=lambda sample: len(sample[0]), reverse=True)
    texts, labels, sa = zip(*batch)

    # Words -> indices, one index per token so tokens and tags stay aligned.
    texts_indx = [
        torch.tensor([vocab.get(word, 0) for word in text] or [0])
        for text in texts
    ]
    # Give samples without tags one "O" tag to match the single padding index.
    labels = [
        tag_seq if tag_seq.numel() > 0 else tag_seq.new_full((1,), pad_tag)
        for tag_seq in labels
    ]

    # True lengths, measured BEFORE padding.
    lengths = torch.tensor([len(seq) for seq in texts_indx], dtype=torch.long)

    # Pad everything to the longest sequence of the batch.
    texts_padded = pad_sequence(texts_indx, batch_first=True, padding_value=0)
    tags_padded = pad_sequence(labels, batch_first=True, padding_value=pad_tag).long()

    # One-hot NER tags; positions at or beyond a sequence's length stay zero.
    max_len = tags_padded.shape[1]
    valid = torch.arange(max_len).unsqueeze(0) < lengths.unsqueeze(1)
    tags_onehot = torch.nn.functional.one_hot(tags_padded, NUM_NER_CLASSES).to(torch.float)
    tags_onehot = tags_onehot * valid.unsqueeze(-1).to(tags_onehot.dtype)

    # One-hot sentiment labels.
    sa_indices = torch.stack([torch.as_tensor(label) for label in sa]).long()
    sa_onehot = torch.nn.functional.one_hot(sa_indices, NUM_SA_CLASSES).to(torch.float)

    return texts_padded, tags_onehot, sa_onehot.unsqueeze(1), lengths

In [83]:
df_train["tags"].head(10)

0                                         [0, 0, 0, 0]
1                                   [0, 0, 0, 0, 0, 0]
2                                      [0, 0, 0, 0, 0]
3    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4                             [0, 0, 0, 0, 0, 0, 0, 0]
5                                               [0, 0]
6    [0, 5, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
7     [0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
8             [0, 0, 0, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0]
9                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Name: tags, dtype: object

In [96]:
# Reload the preprocessed splits written earlier.
df_train = pd.read_csv("data/train_token.csv")
df_val = pd.read_csv("data/val_token.csv")
df_test = pd.read_csv("data/test_token.csv")

# CSV stores the list columns as text; turn them back into Python lists.
for frame in (df_train, df_val, df_test):
    for column in ("tokens", "tags"):
        frame[column] = frame[column].apply(ast.literal_eval)

df_train.head(10)

Unnamed: 0,tokens,tags,sentence,SA
0,"[People, start, business, reason]","[0, 0, 0, 0]",People start their own businesses for many rea...,1
1,"[chance, fill, sale, tax, record, rarely]","[0, 0, 0, 0, 0, 0]",But a chance to fill out sales - tax records i...,1
2,"[red, tape, bugaboo, small, business]","[0, 0, 0, 0, 0]",Red tape is the bugaboo of small business .,0
3,"[ironically, person, want, run, business, prob...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Ironically , the person who wants to run his o...",0
4,"[business, owner, face, mound, form, regulatio...","[0, 0, 0, 0, 0, 0, 0, 0]",Yet every business owner has to face the mound...,1
5,"[hope, change]","[0, 0]",There is hope of change .,2
6,"[week, Sen., Malcolm, Wallop, LRB, R., Wyo, rr...","[0, 5, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Last week , Sen. Malcolm Wallop -LRB- R. , Wyo...",1
7,"[great, federal, regulation, mean, large, enti...","[0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",`` A great many federal regulations are meant ...,1
8,"[lawmaker, busy, try, revive, recently, lapse,...","[0, 0, 0, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0]",Other lawmakers are busy trying to revive the ...,1
9,"[optimistic, entrepreneur, await, promise, lan...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","Thus , optimistic entrepreneurs await a promis...",2


In [85]:
type(df_train["tokens"].iloc[0])

list

In [97]:
# Wrap each split in a Dataset (train, validation, test).
tr_dataset, vl_dataset, ts_dataset = (
    OntoNotesDataset(frame) for frame in (df_train, df_val, df_test)
)

In [60]:
df_train.head(10)

Unnamed: 0,tokens,tags,sentence,SA
0,"[People, start, business, reason]","[0, 0, 0, 0]",People start their own businesses for many rea...,1
1,"[chance, fill, sale, tax, record, rarely]","[0, 0, 0, 0, 0, 0]",But a chance to fill out sales - tax records i...,1
2,"[red, tape, bugaboo, small, business]","[0, 0, 0, 0, 0]",Red tape is the bugaboo of small business .,0
3,"[ironically, person, want, run, business, prob...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Ironically , the person who wants to run his o...",0
4,"[business, owner, face, mound, form, regulatio...","[0, 0, 0, 0, 0, 0, 0, 0]",Yet every business owner has to face the mound...,1
5,"[hope, change]","[0, 0]",There is hope of change .,2
6,"[week, Sen., Malcolm, Wallop, LRB, R., Wyo, rr...","[0, 5, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","Last week , Sen. Malcolm Wallop -LRB- R. , Wyo...",1
7,"[great, federal, regulation, mean, large, enti...","[0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",`` A great many federal regulations are meant ...,1
8,"[lawmaker, busy, try, revive, recently, lapse,...","[0, 0, 0, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0]",Other lawmakers are busy trying to revive the ...,1
9,"[optimistic, entrepreneur, await, promise, lan...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","Thus , optimistic entrepreneurs await a promis...",2


In [98]:
batch_size = 64

# Training: reshuffle every epoch and drop the trailing partial batch so that
# every optimisation step sees a full batch.
train_dataloader: DataLoader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)
# Evaluation: keep every example (drop_last=True would silently discard up to
# batch_size - 1 sentences from the metrics) and iterate in a fixed order.
# NOTE(review): the last batch can now be smaller than batch_size - confirm
# that the evaluation code does not assume a fixed batch dimension.
val_dataloader: DataLoader = DataLoader(vl_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, drop_last=False)
test_dataloader: DataLoader = DataLoader(ts_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, drop_last=False)

In [99]:
# Smoke test: pull two batches through the training loader to check that
# batching and collation run end to end.
i = 0
for i, (inputs, tags, sa, lengths) in enumerate(train_dataloader, start=1):
    if i == 2:
        break



In [16]:
lista = [684880, 660328, 642674]

def idx2word(index_tensor, index_to_word):
    """Translate a sequence of vocabulary indices back into words.

    Args:
        index_tensor: 1-D tensor, list or other iterable of integer indices.
        index_to_word: Mapping from integer index to word.

    Returns:
        A list with the word for every index, in input order.

    Raises:
        KeyError: If an index is missing from ``index_to_word``.
    """
    # Iterating a tensor yields 0-d tensors, which hash by identity and would
    # never match the integer keys of the mapping; convert to plain ints.
    return [index_to_word[int(index)] for index in index_tensor]

# Invert the vocabulary (index -> word) and decode the sample indices defined
# above as a sanity check.
index_to_word = {i: word for word, i in word_to_index.items()}

print(idx2word(lista, index_to_word))


['unrelieved', 'substantive', 'shallowness']
