# Librerías

In [1]:
from datasets import load_dataset,load_metric


from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModel
from transformers.modeling_outputs import SequenceClassifierOutput


import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import AdamW 

In [2]:
from utils import train_function

In [3]:
# Prefer the GPU when CUDA is available; otherwise run everything on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Carga y preprocesado de datos

In [4]:
# Tokenizer used throughout the notebook, plus the helper that applies it.

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(item):
    """Tokenize the ``text`` field of ``item`` with the module-level tokenizer.

    ``truncation=True`` is passed so over-long inputs are cut down to the
    tokenizer's maximum length (DistilBERT's limit, per the original note).

    item: mapping with a ``"text"`` entry (a single example or a batch).
    Returns the tokenizer output for that text.
    """
    text = item["text"]
    return tokenizer(text, truncation=True)

In [5]:
# Load the 'wizard' configuration of the md_gender_bias dataset (all available splits).
data = load_dataset('md_gender_bias','wizard')

Reusing dataset md_gender_bias (C:\Users\kuina\.cache\huggingface\datasets\md_gender_bias\wizard\1.0.0\8ae77b51acf93383161cc954b146159291beca6c979b54ce228c46db86116c05)


  0%|          | 0/3 [00:00<?, ?it/s]

Separo los conjuntos de entrenamiento y validación. Creo datos artificiales (aleatorios) para las etiquetas que faltan (self y partner).

In [6]:

# Training split: `about` is the real "gender" column; `self` and `partner`
# are random placeholder labels drawn from the classes present in `about`.
train = data["train"]
about = train["gender"]
# np.unique(about) does not change between rows, so compute it once instead of
# once per element, and draw every placeholder label in one vectorized call.
# list(...) keeps the result a list of NumPy scalars, as before.
# Call np.random.seed(...) beforehand if reproducible labels are needed.
train_classes = np.unique(about)
self = list(np.random.choice(train_classes, size=len(about)))
partner = list(np.random.choice(train_classes, size=len(about)))


# Validation split, built the same way from its own set of classes.
eval = data["validation"]
aboutE = eval["gender"]
eval_classes = np.unique(aboutE)
selfE = list(np.random.choice(eval_classes, size=len(aboutE)))
partnerE = list(np.random.choice(eval_classes, size=len(aboutE)))


In [7]:
# One (about, self, partner) tuple per example, for each split.
labels_train = list(zip(about, self, partner))
labels_eval = list(zip(aboutE, selfE, partnerE))

In [8]:
# Tokenize every split, drop the raw columns and expose the result in torch format.
token_data = (
    data.map(preprocess_function, batched=True)
    .remove_columns(["chosen_topic", "text", "gender"])
    .with_format("torch")
)


token_train, token_eval = token_data["train"], token_data["validation"]

# Collator used by the dataloaders to pad each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
# Builds a torch Dataset from the tokenized inputs and their labels.

class MyDataSet(Dataset):
    """Map-style dataset pairing input ids and attention masks with a label row.

    data: indexable collection of input ids, one entry per sample.
    labels: indexable collection of per-sample labels; each entry must hold a
        multiple of three values because ``__getitem__`` reshapes it with
        ``view(-1, 3)``.
    attention_mask: indexable collection of attention masks aligned with ``data``.
    """

    def __init__(self, data, labels, attention_mask):
        self.data = data
        self.labels = labels
        self.attention = attention_mask

    def __len__(self):
        """Number of samples, taken from ``data``."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the sample at ``index`` as a dict ready for the collator."""
        # Tensor indices are converted to plain Python values before indexing.
        position = index.tolist() if torch.is_tensor(index) else index

        token_ids = self.data[position]
        # Wrap the label entry in a list so the array gains a leading axis,
        # then reshape it into rows of three.
        label_array = np.array([self.labels[position]])
        mask = self.attention[position]

        return {
            'input_ids': token_ids,
            'attention_mask': mask,
            'labels': torch.from_numpy(label_array).view(-1, 3),
        }


Creo los dataloaders con ambos conjuntos. Aplico padding de forma dinámica haciendo uso del DataCollatorWithPadding.

In [10]:
# Wrap each tokenized split together with its label tuples.
trainData = MyDataSet(
    token_train['input_ids'],
    labels_train,
    token_train['attention_mask'],
)
evalData = MyDataSet(
    token_eval['input_ids'],
    labels_eval,
    token_eval['attention_mask'],
)


# Both loaders use batches of 16, shuffle, and collate with `data_collator`.
train_dataloader = DataLoader(
    trainData,
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    evalData,
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)

# Modelo

In [13]:
class Multi(nn.Module):
    """Pretrained encoder followed by a classification head scored for three
    label slots (about / self / partner).

    name: checkpoint name or path passed to ``AutoModel.from_pretrained``.
    num_labels: number of classes produced by the head.
    dropout: dropout probability applied before the linear layer.

    Attributes:
        encoder: the pretrained encoder.
        task: ``nn.ModuleList`` of head layers applied in sequence.
        traceAbout / traceSelf / tracePartner: lists that collect the head
            output of every forward call (one entry per layer in ``task``).

    NOTE(review): all three slots go through the *same* layers in ``task``,
    so with dropout disabled the three logits tensors are identical.
    NOTE(review): the CLS vector is detached before the head, so the loss
    returned here sends no gradient into the encoder.
    Both look unintended for a multi-task fine-tuning model; confirm before
    changing, since fixing them alters the architecture and training.
    """

    def __init__(self,name,num_labels,dropout=0.1):
        super().__init__()
        self.num_labels = num_labels
        self.encoder = AutoModel.from_pretrained(
            name,
            num_labels=num_labels,
            output_attentions=True,
            output_hidden_states=True,
        )
        # Size the head from the loaded encoder instead of hard-coding 768,
        # so checkpoints with another hidden width also work. Fall back to
        # 768 (the previous value) if the config does not expose it.
        hidden_size = getattr(self.encoder.config, "hidden_size", 768)
        self.task = nn.ModuleList([])
        self.task.append(nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_size, num_labels)
        ))
        self.traceAbout = []
        self.traceSelf = []
        self.tracePartner = []

    def forward(self,input_ids = None,attention_mask = None,labels = None,output_attentions=None,output_hidden_states=None):
        """Run the encoder and the head; optionally compute the summed loss.

        input_ids, attention_mask: forwarded to the encoder.
        labels: optional tensor indexed as ``labels[:, 0][:, k]`` for slot
            ``k`` in {0, 1, 2} (presumably shape (batch, 1, 3) - confirm
            against the dataloader).
        output_attentions, output_hidden_states: forwarded to the encoder.

        Returns a ``SequenceClassifierOutput`` whose ``logits`` is the tuple
        ``(about, self, partner)`` and whose ``loss`` is the sum of the three
        cross-entropy losses, or ``None`` when ``labels`` is not given.
        """
        dBertoutputs = self.encoder(
            input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        # First element of the encoder output, position 0 of every sequence.
        cls_out = dBertoutputs[0][:,0]

        # Independent, graph-detached copies of the CLS vector, one per slot.
        about = cls_out.detach().clone()
        Self = cls_out.detach().clone()
        partner = cls_out.detach().clone()

        for layer in self.task:
            # Traces keep detached tensors only: storing the live outputs would
            # retain every forward pass's autograd graph for as long as the
            # lists exist, growing memory without bound.
            about = layer(about)
            self.traceAbout.append(about.detach())
            Self = layer(Self)
            self.traceSelf.append(Self.detach())
            partner = layer(partner)
            self.tracePartner.append(partner.detach())

        loss = None

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            aboutLoss = loss_fct(about , labels[:,0][:,0])
            SelfLoss = loss_fct(Self , labels[:,0][:,1])
            partnerLoss = loss_fct(partner , labels[:,0][:,2])

            loss = aboutLoss + SelfLoss + partnerLoss

        return SequenceClassifierOutput(loss=loss, logits=(about,Self,partner), hidden_states=dBertoutputs.hidden_states,attentions=dBertoutputs.attentions)

In [14]:
# Build the model on top of distilbert-base-uncased with 3 output classes and move it to `device`.
model = Multi("distilbert-base-uncased", 3).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Entrenamiento

In [15]:
# Train with AdamW (lr=5e-5) over all model parameters.
# train_function comes from the local `utils` module; its internals are not visible here.
num_epochs = 3
train_function(model,num_epochs,train_dataloader,optimizer = AdamW(model.parameters(),lr=5e-5))

  0%|          | 0/1962 [00:00<?, ?it/s]

 33%|███▎      | 654/1962 [00:13<00:20, 64.48it/s]
 67%|██████▋   | 1308/1962 [00:22<00:09, 70.91it/s]
100%|██████████| 1962/1962 [00:32<00:00, 67.77it/s]


# Evaluación

In [18]:
# Accuracy metric object passed to eval_function below.
metric = load_metric("accuracy")

In [16]:
def eval_function(model,metric,eval_dataloader,task = 0):
    '''Evaluate ``model`` on a single task and return the computed metric.

    model: model to evaluate. Its output must expose ``logits`` indexable by
        ``task``.
    metric: object providing ``add_batch(predictions=..., references=...)``
        and ``compute()``.
    eval_dataloader: iterable of batches, each a dict of tensors that includes
        a ``"labels"`` entry indexed as ``labels[:, 0][:, task]``.
    task: index of the task to evaluate.

    Returns whatever ``metric.compute()`` returns. The model is switched to
    eval mode and left in that mode.
    '''
    model.eval()

    # Send batches to the device the model actually lives on rather than to a
    # notebook-level global, so the function works for any model placement.
    # A model without parameters gives nothing to infer from, so its batches
    # are left where they are.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    for batch in eval_dataloader:
        if target_device is not None:
            batch = {k: v.to(target_device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits[task]
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"][:,0][:,task])

    return metric.compute()

Evaluamos en la tarea 0 (about)

In [19]:
# Accuracy on task index 0.
eval_function(model,metric,eval_dataloader,0)

{'accuracy': 0.7057728119180633}

Tarea 1 (self)

In [20]:
# Accuracy on task index 1.
eval_function(model,metric,eval_dataloader,1)

{'accuracy': 0.3687150837988827}

Tarea 2 (partner)

In [21]:
# Accuracy on task index 2.
eval_function(model,metric,eval_dataloader,2)

{'accuracy': 0.32774674115456237}

In [22]:
# Take only the first batch the training dataloader yields, then move its tensors to `device`.
for batch in train_dataloader:
    break 
batch = {k:v.to(device) for k,v in batch.items()}

In [25]:
# Run only the encoder on the sample batch, requesting attentions and hidden states.
out = model.encoder(batch['input_ids'],attention_mask = batch['attention_mask'],output_attentions=True,output_hidden_states = True)

In [27]:
out.hidden_states

7

# Predicciones

In [29]:
# Full forward pass on the sample batch (labels included, so a loss is computed too).
outputs = model(**batch)

In [30]:
# Unpack the three logits tensors returned by the model.
# NOTE(review): this rebinds `about`, `self` and `partner`, which the data-preparation
# cells use for the label lists; re-running those cells after this one would pick up
# these tensors instead. Consider distinct names.
about, self, partner = outputs.logits

Predicciones en la etiqueta about

In [32]:
# Predicted class per example: index of the largest logit along the last dimension.
preds = torch.argmax(about,dim=-1)

In [33]:
preds

tensor([2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], device='cuda:0')

In [35]:
batch['labels'][:,0][:,0]

tensor([0, 1, 2, 2, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 0, 2], device='cuda:0')