# Librerías

In [1]:
from datasets import load_dataset,load_metric


from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModel
from transformers.modeling_outputs import SequenceClassifierOutput


import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import AdamW 

In [2]:
from utils import train_function

In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Carga y preprocesado de datos

In [4]:
# Tokenizer que utilizaremos y función auxiliar para aplicarlo.

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(item):
    '''Aplica el tokenizer al elemento dado. Trunca su longitud a la 
    longitud máxima de DistilBert.
    '''
    return tokenizer(item["text"],truncation=True)

In [5]:
data = load_dataset('md_gender_bias','wizard')

Reusing dataset md_gender_bias (C:\Users\kuina\.cache\huggingface\datasets\md_gender_bias\wizard\1.0.0\8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05)


  0%|          | 0/3 [00:00<?, ?it/s]

Separo test y validación. Creo datos artificiales para las etiquetas que faltan.

In [6]:

train = data["train"]
about = train["gender"]
self = [np.random.choice(np.unique(about)) for i in range(len(about))]
partner = [np.random.choice(np.unique(about)) for i in range(len(about))]


eval = data["validation"]
aboutE = eval["gender"]
selfE = [np.random.choice(np.unique(aboutE)) for i in range(len(aboutE))]
partnerE = [np.random.choice(np.unique(aboutE)) for i in range(len(aboutE))]


In [7]:
labels_train = [ ( about[i] , self[i] , partner[i] ) for i in range(len(about))]
labels_eval = [ ( aboutE[i] , selfE[i] , partnerE[i] ) for i in range(len(aboutE))]

In [8]:
token_data = data.map(preprocess_function,batched=True)
token_data = token_data.remove_columns(["chosen_topic","text","gender"])
token_data= token_data.with_format("torch")


token_train = token_data["train"]
token_eval = token_data["validation"]

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading cached processed dataset at C:\Users\kuina\.cache\huggingface\datasets\md_gender_bias\wizard\1.0.0\8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05\cache-866d8be03765ca0b.arrow
Loading cached processed dataset at C:\Users\kuina\.cache\huggingface\datasets\md_gender_bias\wizard\1.0.0\8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05\cache-db98a93f495d1c0e.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
# Crea un dataset a partir de los datos iniciales.

class MyDataSet(Dataset):
    def __init__(self,data,labels,attention_mask):
        self.data = data
        self.labels =labels 
        self.attention = attention_mask


    def __getitem__(self,index):
        if torch.is_tensor(index):
            index = index.tolist()
            
        x = self.data[index]
        labels =  self.labels[index]
        labels = np.array([labels])

        labels = labels
        attention = self.attention[index]


        sample = {'input_ids': x,
                'attention_mask': attention,
                'labels': torch.from_numpy(labels).view(-1,3)}

        return  sample


    def __len__(self):
        return len(self.data)


Creo los dataloaders con ambos conjuntos. Aplico padding de forma dinámica haciendo uso del DataCollator

In [10]:
trainData = MyDataSet(token_train['input_ids'],labels_train,token_train['attention_mask'])
evalData = MyDataSet(token_eval['input_ids'],labels_eval,token_eval['attention_mask'])


train_dataloader = DataLoader(trainData,batch_size=16,shuffle=True,collate_fn=data_collator)
eval_dataloader = DataLoader(evalData,batch_size=16,shuffle=True,collate_fn=data_collator)

# Modelo

In [11]:
new_data = load_dataset('md_gender_bias','new_data')
tasks_names = new_data['train'].features['class_type'].names

Reusing dataset md_gender_bias (C:\Users\kuina\.cache\huggingface\datasets\md_gender_bias\new_data\1.0.0\8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05)


  0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
tasks_names = ['about','partner','self'] # Usar esto en casos que no cargue new_data

In [13]:
class Multi(nn.Module):
    def __init__(self,name,num_labels,tasks_names,dropout=0.1):
        super().__init__()
        self.num_labels = num_labels
        self.encoder = AutoModel.from_pretrained(name,num_labels=num_labels,output_attentions=True,output_hidden_states = True)
        self.taskLayer = nn.ModuleList([])
        self.taskLayer.append(nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(768,num_labels)
        ))

        # Task 0: About
        # Task 1: Self
        # Task 2: Partner
        self.tasksname = {v:k for v,k in enumerate(tasks_names)}
        # self.taskstrace = {v : [] for v in range(len(self.tasksname)) }


        
    def forward(self,input_ids = None,attention_mask = None,labels = None,output_attentions=None,output_hidden_states=None):
        


        dBertoutputs = self.encoder(input_ids,attention_mask = attention_mask,output_attentions=output_attentions,output_hidden_states = output_hidden_states)





        outputs_last_hidden_state = dBertoutputs[0]

        cls_out = outputs_last_hidden_state[:,0]

        tasks_output = {v : cls_out.clone() for v in self.tasksname.keys()}

        for layer in self.taskLayer:
            tasks_output = {v: layer(k) for v,k in tasks_output.items()}

            # self.taskstrace = {v: tasks_output[v] for v in self.taskstrace.keys()}



        loss = None

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()

            task_loss = [loss_fct(tasks_output[i] , labels[:,0][:,i]) for i in range(len(tasks_output))]


            loss = sum(task_loss)


        return SequenceClassifierOutput(loss=loss, logits=tasks_output, hidden_states=dBertoutputs.hidden_states,attentions=dBertoutputs.attentions)

In [14]:
model = Multi("distilbert-base-uncased", 3,tasks_names).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Entrenamiento

In [15]:
num_epochs = 3
train_function(model,num_epochs,train_dataloader,optimizer = AdamW(model.parameters(),lr=5e-5))

  0%|          | 0/1962 [00:00<?, ?it/s]

 33%|███▎      | 654/1962 [00:13<00:20, 64.48it/s]
 67%|██████▋   | 1308/1962 [00:22<00:09, 70.91it/s]
100%|██████████| 1962/1962 [00:32<00:00, 67.77it/s]


In [15]:
# Tras modificar
num_epochs = 3
train_function(model,num_epochs,train_dataloader,optimizer = AdamW(model.parameters(),lr=5e-5))

  0%|          | 0/1962 [00:00<?, ?it/s]

 33%|███▎      | 654/1962 [00:43<01:25, 15.32it/s]
 67%|██████▋   | 1308/1962 [01:27<00:41, 15.85it/s]
100%|██████████| 1962/1962 [02:10<00:00, 15.37it/s]


# Evaluación

## Single task

In [16]:
metric = load_metric("accuracy")

In [17]:
def eval_function_multi_single(model,metric,eval_dataloader,task = 0):
    '''Función para evaluar un modelo dado en la tarea indicada.
    
    model: modelo a evaluar
    metric: metrica a utilizar en la evaluación.
    eval_dataloader: conjunto para evaluar. Debe ser un Pytorch DataLoader. 
    task: tarea en la que queremos evaluar el modelo'''


    model.eval()
    for batch in eval_dataloader:
        batch = {k:v.to(device) for k,v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        
        # logits = outputs.logits 
        logits = outputs.logits[task] 
        predictions = torch.argmax(logits,dim=-1)
        metric.add_batch(predictions = predictions, references = batch["labels"][:,0][:,task])

    return metric.compute()

Evaluamos en la tarea 0 (about)

In [18]:
eval_function_multi_single(model,metric,eval_dataloader,0)#

{'accuracy': 0.8081936685288641}

Tarea 1 (self)

In [19]:
eval_function_multi_single(model,metric,eval_dataloader,1) #

{'accuracy': 0.3500931098696462}

Tarea 2 (partner)

In [20]:
eval_function_multi_single(model,metric,eval_dataloader,2)#

{'accuracy': 0.3258845437616387}

## All tasks

In [24]:
def eval_function_multi(model,eval_dataloader,tasks_names):
    '''Función para evaluar un modelo dado en la tarea indicada.
    
    model: modelo a evaluar
    
    eval_dataloader: conjunto para evaluar. Debe ser un Pytorch DataLoader. 
    task_names: tareas en las que voy a evaluar mi modelo'''

    # Usa accuracy, puedo cambiar para pasarle en la entrada la metrica (o métricas) que quiera utilizar!! (la paso suelta o paso una lista de metricas)
    model.eval()
    metrics = {task : load_metric("accuracy") for task  in range(len(tasks_names))} # Se podria modificar en funcion del tipo de tarea
    for batch in eval_dataloader:
        batch = {k:v.to(device) for k,v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        
        # logits = outputs.logits 
        logits = outputs.logits
        predictions = {task : torch.argmax(logits[task],dim=-1) for task in range(len(tasks_names))}

        for task, metric in metrics.items():
            metric.add_batch(predictions = predictions[task], references = batch["labels"][:,0][:,task])


    return {tasks_names[task] : metric.compute() for task, metric in metrics.items()}

In [25]:
eval_function_multi(model,eval_dataloader,tasks_names)

{'about': {'accuracy': 0.8081936685288641},
 'partner': {'accuracy': 0.3500931098696462},
 'self': {'accuracy': 0.3258845437616387}}

# Predicciones

In [27]:
for batch in eval_dataloader:
    batch = {k:v.to(device) for k,v in batch.items()}
    break

In [28]:
outputs = model(**batch)

In [34]:
logits = outputs.logits

Predicciones en la etiqueta about

In [36]:
preds = torch.argmax(logits[0],dim=-1)

In [37]:
preds

tensor([1, 2, 1, 0, 2, 1, 2, 2, 2, 0, 2, 0, 2, 1, 2, 2], device='cuda:0')

In [38]:
batch['labels'][:,0][:,0]

tensor([0, 2, 2, 0, 2, 1, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2], device='cuda:0')

# pruebas new_data

In [39]:
new_data

DatasetDict({
    train: Dataset({
        features: ['text', 'original', 'labels', 'class_type', 'turker_gender', 'episode_done', 'confidence'],
        num_rows: 2345
    })
})

In [41]:
new_data['train'].features

{'text': Value(dtype='string', id=None),
 'original': Value(dtype='string', id=None),
 'labels': [ClassLabel(num_classes=6, names=['ABOUT:female', 'ABOUT:male', 'PARTNER:female', 'PARTNER:male', 'SELF:female', 'SELF:male'], id=None)],
 'class_type': ClassLabel(num_classes=3, names=['about', 'partner', 'self'], id=None),
 'turker_gender': ClassLabel(num_classes=5, names=['man', 'woman', 'nonbinary', 'prefer not to say', 'no answer'], id=None),
 'episode_done': Value(dtype='bool_', id=None),
 'confidence': Value(dtype='string', id=None)}

In [43]:
one_label = 0
multi_label = 0
for item in new_data['train']['labels']:
    if len(item) == 1:
        one_label +=1
    else:
        multi_label +=1

In [46]:
print('Ejemplos con una etiqueta: ', one_label)
print('Ejemplos con varias etiquetas: ' , multi_label)

Ejemplos con una etiqueta:  2345
Ejemplos con varias etiquetas:  0
