# Librerías

In [1]:
from transformers import AutoTokenizer, DataCollatorWithPadding,AutoModel
from transformers.modeling_outputs import SequenceClassifierOutput


import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import AdamW 



from datasets import load_dataset, load_metric

import json

In [2]:
from my_utils import train_function

In [3]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Funciones y clases auxiliares

In [4]:
# Tokenizer of the pretrained checkpoint, plus the collator built on top of it
# (the collator is later handed to the DataLoader as its collate_fn).
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
)
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

In [5]:
# Per-task lookup from the textual gender label to its integer class id
# (0 = female, 1 = male, 2 = unknown).
text_to_num = {
    'to': {
        'PARTNER:female': 0,
        'PARTNER:male': 1,
        'PARTNER:unknown': 2,
    },
    'as': {
        'SELF:female': 0,
        'SELF:male': 1,
        'SELF:unknown': 2,
    },
    'about': {
        'ABOUT:female': 0,
        'ABOUT:male': 1,
        'ABOUT:unknown': 2,
    },
}

In [6]:
# Tasks handled in this notebook; MyDataSet emits one label per task in this order.
tasks_names = ["about", "as", "to"]

In [7]:
def tokenize_dataset(dataset, tasks_names):
    """Tokenize every text of ``dataset`` and convert its labels to integers.

    Relies on the notebook-level ``tokenizer`` and ``text_to_num`` objects.

    Args:
        dataset: Mapping keyed by the raw text; each value must hold a
            ``label_<task>`` sequence of textual labels for every requested task.
        tasks_names: Iterable of task names, each a key of ``text_to_num``.

    Returns:
        dict: One entry per text, keyed by its enumeration position, with the
        keys ``'text'``, ``'input_ids'``, ``'attention_mask'`` and ``'labels'``
        (``labels`` maps each task name to its list of integer class ids).
    """
    encoded_samples = {}

    for position, utterance in enumerate(dataset):
        encoding = tokenizer(utterance, truncation=True)

        # Translate the textual labels of every task through the lookup table.
        numeric_labels = {
            task: [
                text_to_num[task][label_name]
                for label_name in dataset[utterance][f'label_{task}']
            ]
            for task in tasks_names
        }

        encoded_samples[position] = {
            'text': utterance,
            'input_ids': encoding.input_ids,
            'attention_mask': encoding.attention_mask,
            'labels': numeric_labels,
        }

    return encoded_samples

In [8]:
class MyDataSet(Dataset):
    """Dataset over tokenized samples that yields one integer label per task.

    Each entry of ``complete_data`` must provide ``'input_ids'``,
    ``'attention_mask'`` and ``'labels'``, where ``labels`` maps a task name to
    a non-empty list of integer class ids.

    Labels are sampled with ``np.random`` on every access, so the same index may
    return different labels between calls; seed NumPy for reproducible runs.

    Args:
        complete_data: Indexable collection of samples (e.g. the dict returned
            by ``tokenize_dataset``).
        tasks: Ordered task names; column ``i`` of the returned label tensor
            belongs to ``tasks[i]``.
    """

    # Class id that marks an unknown gender; it is replaced by a random binary label.
    UNKNOWN_LABEL = 2
    # Class ids an unknown label is resampled from.
    BINARY_LABELS = [0, 1]

    def __init__(self,complete_data,tasks):
        self.data = complete_data
        self.tasks = tasks

    def _pick_label(self, task, candidates):
        """Return a single binary label for ``task`` from its candidate class ids."""
        if len(candidates) == 0:
            # Skipping the task would shorten the label vector and shift every
            # following task into the wrong column, so fail loudly instead.
            raise ValueError(f"sample has no label for task {task!r}")

        # A single candidate is used as is; several candidates are sampled uniformly.
        label = candidates[0] if len(candidates) == 1 else np.random.choice(candidates)

        if label == self.UNKNOWN_LABEL:
            label = np.random.choice(self.BINARY_LABELS)

        return int(label)

    def __getitem__(self,index):
        """Return the tensors of one sample.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` as 1-D tensors and
            ``labels`` as an int64 tensor of shape ``(1, len(tasks))``.

        Raises:
            ValueError: If any requested task has an empty label list.
        """
        if torch.is_tensor(index):
            index = index.tolist()

        record = self.data[index]
        raw_labels = record['labels']

        labels = torch.tensor(
            [self._pick_label(task, raw_labels[task]) for task in self.tasks],
            dtype=torch.long,
        )

        return {
            'input_ids': torch.tensor(record['input_ids']),
            'attention_mask': torch.tensor(record['attention_mask']),
            # Keep the leading singleton dimension so a batch has one row of task labels per sample.
            'labels': labels.view(1, -1),
        }

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.data)

# Procesado datos ConvAI2

In [9]:
# Forward slashes resolve on every OS; the previous backslash path only worked on
# Windows and relied on invalid escape sequences ("\C", "\c") in a normal string.
# JSON is read as UTF-8 explicitly instead of the platform's default encoding.
with open('DependencyDataset/ConvAI2/convai_rel_complete.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [10]:
# Tokenize every loaded text and attach its numeric labels for each task.
token_data = tokenize_dataset(dataset=data, tasks_names=tasks_names)

In [12]:
# Wrap the tokenized samples in a Dataset and serve them in shuffled batches of 16,
# using the tokenizer-based collator to assemble each batch.
dataset_train = MyDataSet(token_data, tasks_names)
dl_train = DataLoader(
    dataset_train,
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)

# Modelo

In [13]:
class Multi(nn.Module):
    """Pretrained encoder followed by a dropout + linear classification head.

    With a single task name the model returns one logits tensor; with several
    task names it returns a dict ``{task_position: logits}``.

    Args:
        name: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
        num_labels: Width of the classification head's output.
        tasks_names: List of task names. A single name must be a key of
            ``task_to_num`` ('about', 'as' or 'to').
        dropout: Dropout probability applied before the linear layer.
    """

    def __init__(self,name,num_labels,tasks_names,dropout=0.1):
        super().__init__()
        self.num_labels = num_labels
        self.encoder = AutoModel.from_pretrained(name,num_labels=num_labels,output_attentions=True,output_hidden_states = True)

        # Size the head from the encoder's configuration rather than assuming 768,
        # so checkpoints with a different hidden width also work.
        hidden_size = self.encoder.config.hidden_size

        # NOTE(review): every task is passed through these same layers, i.e. all
        # tasks share one head. Confirm that this is intended for multi-task runs.
        self.taskLayer = nn.ModuleList([])
        self.taskLayer.append(nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_size,num_labels)
        ))

        # Task 0: About
        # Task 1: Self
        # Task 2: Partner
        self.task_to_num =  {'about':0,'as':1,'to':2}

        if len(tasks_names) ==1:
            # Column of the label tensor that belongs to the selected task.
            # Assumes the dataset emits labels in the order about, as, to -- TODO confirm.
            self.task = self.task_to_num[tasks_names[0]]
            self.SingleTask = True
        else:
            self.SingleTask=False
            # Position -> task name; positions are also used as label columns in forward().
            self.tasksname = {v:k for v,k in enumerate(tasks_names)}

    def forward(self,input_ids = None,attention_mask = None,labels = None,output_attentions=None,output_hidden_states=None):
        """Encode the batch, classify its first-token representation and compute the loss.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            labels: Optional label tensor, indexed as ``labels[:, 0][:, column]``
                (one row of task labels per sample, as MyDataSet produces).
            output_attentions: Forwarded to the encoder.
            output_hidden_states: Forwarded to the encoder.

        Returns:
            SequenceClassifierOutput: ``loss`` (``None`` without labels),
            ``logits`` (tensor in single-task mode, dict keyed by task position
            otherwise) and the encoder's ``hidden_states`` / ``attentions``.
        """
        dBertoutputs = self.encoder(input_ids,attention_mask = attention_mask,output_attentions=output_attentions,output_hidden_states = output_hidden_states)

        # First element is the last hidden state; position 0 is the first token of each sequence.
        outputs_last_hidden_state = dBertoutputs[0]
        cls_out = outputs_last_hidden_state[:,0]

        if self.SingleTask:
            tasks_output = cls_out.clone()
            for layer in self.taskLayer:
                tasks_output = layer(tasks_output)
        else:
            tasks_output = {v : cls_out.clone() for v in self.tasksname.keys()}
            for layer in self.taskLayer:
                tasks_output = {v: layer(k) for v,k in tasks_output.items()}

        loss = None

        if labels is not None:
            if self.SingleTask:
                # Use the labels of the whole batch (same indexing as the multi-task
                # branch) and keep them on the logits' device instead of a global one.
                target = labels[:,0][:,self.task].to(tasks_output.device)
                if self.num_labels == 1:
                    # One logit per sample: binary classification. Cross-entropy over a
                    # single class is identically zero and would never train the model.
                    loss = nn.BCEWithLogitsLoss()(tasks_output.squeeze(-1), target.float())
                else:
                    loss = nn.CrossEntropyLoss()(tasks_output, target.long())
            else:
                loss_fct = nn.CrossEntropyLoss()
                # Task position i doubles as the label column; the total loss is the plain sum.
                task_loss = [loss_fct(tasks_output[i] , labels[:,0][:,i]) for i in range(len(tasks_output))]
                loss = sum(task_loss)

        return SequenceClassifierOutput(loss=loss, logits=tasks_output, hidden_states=dBertoutputs.hidden_states,attentions=dBertoutputs.attentions)

# Prueba 3 dimensiones

Se comprueba que el modelo funcione entrenando con varias dimensiones

In [14]:
# Multi-task model: two output logits, trained on every task in tasks_names.
model = Multi(
    name="distilbert-base-uncased",
    num_labels=2,
    tasks_names=tasks_names,
).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# One epoch is enough here: the goal is only to check that multi-task training runs.
# train_function comes from my_utils.
num_epochs = 1
train_function(
    model,
    num_epochs,
    dl_train,
    optimizer=AdamW(model.parameters(), lr=5e-5),
)

  0%|          | 0/4588 [00:00<?, ?it/s]

100%|██████████| 4588/4588 [03:53<00:00, 19.66it/s]


# Prueba 1 dimensión

Se comprueba que se pueda entrenar un modelo SingleTask para la tarea indicada

In [16]:
# Single-task model: one output logit, trained only on the 'to' task.
model = Multi(
    name="distilbert-base-uncased",
    num_labels=1,
    tasks_names=['to'],
).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
# One epoch is enough here: the goal is only to check that single-task training runs.
# train_function comes from my_utils.
num_epochs = 1
train_function(
    model,
    num_epochs,
    dl_train,
    optimizer=AdamW(model.parameters(), lr=5e-5),
)

  0%|          | 0/4588 [00:00<?, ?it/s]

100%|██████████| 4588/4588 [03:53<00:00, 20.43it/s]
