In [1]:
import csv
import itertools
import numpy as np
import os
import pandas as pd
import random
import torch
import tqdm
import transformers as ppb

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
from torch.nn import functional as F
from torch import optim

from transformers import AutoModel, AutoTokenizer, BertForMaskedLM, BertTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score


  from .autonotebook import tqdm as notebook_tqdm
2022-11-08 11:27:41.142033: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
def padding_chunk(input_id_chunks, mask_chunks):
    # get required padding length
    pad_len = 128 - input_id_chunks.shape[0]
    # check if tensor length satisfies required chunk size
    if pad_len > 0:
        # if padding length is more than 0, we must add padding
        input_id_chunks = torch.cat([input_id_chunks, torch.Tensor([0] * pad_len)])
        mask_chunks = torch.cat([mask_chunks, torch.Tensor([0] * pad_len)])
    
    return input_id_chunks, mask_chunks



In [14]:
# Helper functions
def to_int(x):
    return list(map(int,x))

def to_low(x):
    return [w.lower() for w in x]

def test_model(model, test_batches, _config):
    device = torch.device(_config['device'])
    y_t=[]
    y_p=[]
    y_t_sentence = []
    y_p_sentence = []
    logits_sentence =[]
    logits = []
    for sample in test_batches:
        if sample['text'].size(dim = 1)> 126: 
            cont_1 =0 
            cont_0 = 0
            input_id_chunks = list(sample['text'][0].split(126))
            mask_chunks =  list(sample['attention'][0].split(126))
            list_tensors = []
            # loop through each chunk
            for i in range(len(input_id_chunks)):
                input_id_chunks[i] = torch.cat([torch.tensor([101]), input_id_chunks[i], torch.tensor([102])])
                # add attention tokens to attention mask
                mask_chunks[i] = torch.cat([torch.tensor([1]), mask_chunks[i], torch.tensor([1])])
                input_id_chunks[i], mask_chunks[i] = padding_chunk(input_id_chunks[i], mask_chunks[i])
                        
                        
                x, att, y = input_id_chunks[i][None, :], mask_chunks[i][None, :], sample['label']
                x, att = x.to(torch.int64), att.to(torch.int64)

                x, y, att = x.to(device), y.to(device), att.to(device)
                y_pred = F.softmax(model(x, att).cpu().detach(),1)
                list_tensors.append(y_pred)
                
                #-----sentence
                logits_sentence.append(y_pred)
                #--------------------------
                y_pred = y_pred.argmax(1)
                #for sentence to sentence 
                y_t_sentence.append(y.cpu())
                y_p_sentence.append(y_pred)
                #---------------- 
                
                if y_pred.item() == 1:
                    cont_1 += 1
                else: 
                    cont_0 += 1
            if cont_1 > cont_0: 
                y_pred = torch.tensor([1])
            else: 
                y_pred = torch.tensor([0])
                
            my_tensor = torch.cat(list_tensors, dim=0)
            logit_app = torch.mean(my_tensor, 0, dtype= float)
            logit_app = logit_app[None,:]
            logits.append(logit_app)
            y_p.append(y_pred)
            y_t.append(y.cpu())

            
        else:
            x, y, att = sample['text'][0], sample['label'], sample['attention'][0]
            x= torch.cat([torch.tensor([101]), x, torch.tensor([102])])
            att = torch.cat([torch.tensor([1]), att, torch.tensor([1])])
            x, att = padding_chunk(x, att)
            x, att = x[None,:], att[None, :]
            x, att = x.to(torch.int64), att.to(torch.int64)
            x, y, att = x.to(device), y.to(device), att.to(device)
            #x, y, att = sample['text'].to(device), sample['label'].to(device), sample['attention'].to(device)
            y_pred = F.softmax(model(x, att).cpu().detach(),1)
            logits_sentence.append(y_pred)
            logits.append(y_pred)
            y_pred = y_pred.argmax(1)
            #for sentence
            y_t_sentence.append(y.cpu())
            y_p_sentence.append(y_pred)
            #----------------------
            y_p.append(y_pred)
            y_t.append(y.cpu())

            
    logits = torch.cat(logits)
    y_p=torch.cat(y_p)
    y_t=torch.cat(y_t)
    logits_sentence = torch.cat(logits_sentence)    
    y_p_sentence=torch.cat(y_p_sentence)
    y_t_sentence=torch.cat(y_t_sentence)
    
    return f1_score(y_t,y_p,average='binary'),f1_score(y_t_sentence,y_p_sentence,average='binary')

def get_logits(model, test_batches, _config):
    device = torch.device(_config['device'])
    logits = []
    logits_sentence = []
    for sample in test_batches:
        
        if sample['text'].size(dim = 1)> 126: 
            
            list_tensors = []
            input_id_chunks = list(sample['text'][0].split(126))
            mask_chunks =  list(sample['attention'][0].split(126))

            # loop through each chunk
            for i in range(len(input_id_chunks)):
                # add CLS and SEP tokens to input IDs
#                 if i == 0:
#                     input_id_chunks[i] = torch.cat([input_id_chunks[i], torch.tensor([102])])
#                     # add attention tokens to attention mask
#                     mask_chunks[i] = torch.cat([mask_chunks[i], torch.tensor([1])])
#                 if i == len(input_id_chunks)-1: 
#                     input_id_chunks[i] = torch.cat([torch.tensor([101]), input_id_chunks[i]])
#                     # add attention tokens to attention mask
#                     mask_chunks[i] = torch.cat([torch.tensor([1]),mask_chunks[i]])
#                     input_id_chunks[i], mask_chunks[i] = padding_chunk(input_id_chunks[i], mask_chunks[i])

                input_id_chunks[i] = torch.cat([torch.tensor([101]), input_id_chunks[i], torch.tensor([102])])
                # add attention tokens to attention mask
                mask_chunks[i] = torch.cat([torch.tensor([1]), mask_chunks[i], torch.tensor([1])])
                input_id_chunks[i], mask_chunks[i] = padding_chunk(input_id_chunks[i], mask_chunks[i])
                        
                        
                x, att, y = input_id_chunks[i][None, :], mask_chunks[i][None, :], sample['label']
                x, att = x.to(torch.int64), att.to(torch.int64)

                x, y, att = x.to(device), y.to(device), att.to(device)
                y_pred = F.softmax(model(x, att).cpu().detach(),1)
                logits_sentence.append(y_pred)
                list_tensors.append(y_pred)
            
            my_tensor = torch.cat(list_tensors, dim=0)
            
            logit_app = torch.mean(my_tensor, 0, dtype= float)
            logit_app = logit_app[None,:]
            logits.append(logit_app)
        else:
            x, y, att = sample['text'][0], sample['label'], sample['attention'][0]
            x= torch.cat([torch.tensor([101]), x, torch.tensor([102])])
            att = torch.cat([torch.tensor([1]), att, torch.tensor([1])])
            x, att = padding_chunk(x, att)
            x, att = x[None,:], att[None, :]
            x, att = x.to(torch.int64), att.to(torch.int64)
            x, y, att = x.to(device), y.to(device), att.to(device)
            #x, y, att = sample['text'].to(device), sample['label'].to(device), sample['attention'].to(device)
            y_pred = F.softmax(model(x, att).cpu().detach(),1)
            logits_sentence.append(y_pred)
            logits.append(y_pred)
    logits = torch.cat(logits)
    logits_sentence = torch.cat(logits_sentence)
    return logits, logits_sentence

In [4]:
# Define dataloader
class Mental(Dataset):
    def __init__(self, X,y = None, _config = None):
        self.X = X
        self.y = y
        self.tokenizer = BertTokenizer.from_pretrained(_config['model_name'], do_lower_case=True,
                                                       model_max_length=128)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        out=torch.tensor(self.tokenizer.encode(self.X[idx], max_length=128, pad_to_max_length=True, add_special_tokens=True))
        if(self.y):
            return {"text": out, "attention":(out!=1).float(), "label":torch.tensor(self.y[idx])}
        else:
            return {"text": out, "attention":(out!=1).float()}

class TestMental(Dataset):
    def __init__(self, X,y = None, _config = None):
        self.X = X
        self.y = y
        self.tokenizer = BertTokenizer.from_pretrained(_config['model_name'], do_lower_case=True,model_max_length=128)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        out=torch.tensor(self.tokenizer.encode(self.X[idx], add_special_tokens=False))
        if(self.y):
            return {"text": out, "attention":(out!=1).float(), "label":torch.tensor(self.y[idx])}
        else:
            return {"text": out, "attention":(out!=1).float()}

# Define model
class BERTForSequenceClassification(nn.Module):
    def __init__(self, model_name):
        super(BERTForSequenceClassification, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        
        self.drop = nn.Dropout(0.1)
        self.clf  = nn.Linear(768, 2, bias=True)
    def forward(self, x, att):
        x = self.bert(x, attention_mask = att)[1]
        x = self.drop(x)
        x = self.clf(x)
        return x

In [5]:
def train_models(_config, train, test, verbose = True):
    device = torch.device(_config['device'])
    w = _config['weights']
    lr = _config['lr']
    max_grad_norm = 1.0
    epochs = _config['epochs']
    for k in range(0, _config['n_models']):
        train_batches = DataLoader(train, batch_size = _config['train_batch_size'], shuffle = True)
        test_batches = DataLoader(test, batch_size = _config['test_batch_size'], shuffle = False)
        
        model = BERTForSequenceClassification(_config['model_name']).to(device)
        optimizer =  AdamW(model.parameters(), lr=lr, correct_bias=False)
        total_steps = len(train_batches) * epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)
        criterio = nn.CrossEntropyLoss(weight = w.to(device))

        for epoch in range(epochs):
            for sample in train_batches:
                optimizer.zero_grad()
                x, y, att = sample['text'].to(device), sample['label'].to(device), sample['attention'].to(device)
                y_pred = model(x, att)
                loss = criterio(y_pred, y)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

        if(verbose):
            model.eval()
            tmp_m, tmp_m_sentence = test_model(model, test_batches, _config)
            
            print("Model historial %d \t f_1 = %.4f"%(k, tmp_m*100))
            print("Model sentence  %d \t f_1 = %.4f"%(k, tmp_m_sentence*100))
            
        
        torch.save(model.state_dict(), '/home/est_posgrado_maria.garcia/Transformers/Models/model_'+str(k)+'.pt')



# Data

In [6]:
Df_anorexia_train_128 = pd.read_csv('/home/est_posgrado_maria.garcia/Transformers/Data_post/train_anorexia_128_clean.csv')

In [7]:
Df_anorexia_train_128['text'] = Df_anorexia_train_128['text'].astype('str')

In [8]:
np.where(pd.isnull(Df_anorexia_train_128))

(array([], dtype=int64), array([], dtype=int64))

In [9]:
X_train = to_low(list(Df_anorexia_train_128['text']))
y_train = to_int(list(Df_anorexia_train_128['target']))

In [10]:
df_test = pd.read_csv('/home/est_posgrado_maria.garcia/Transformers/Data_post/test_anorexia_clean.csv')

In [11]:
# Prepare test
X_test = to_low(list(df_test['text']))
y_test = to_int(list(df_test['target']))

# Train

In [15]:
w = (lambda a, b: torch.tensor([max(a, b)/a, max(a,b)/b]))((torch.tensor(y_train)==0).sum().float(), (torch.tensor(y_train)==1).sum().float())

_config = {
    'model_name': 'bert-base-uncased',
    'train_batch_size':  128,
    'test_batch_size': 1,
    'device': 'cuda:0',
    'lr': 1e-5,
    'epochs': 3,
    'n_models': 3,
    'weights': w
}

train = Mental(X_train, y_train, _config) 
test = TestMental(X_test, y_test, _config) 



In [16]:
_config['n_models'] = 3
train_models(_config, train, test)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max 

Model historial 0 	 f_1 = 70.5882
Model sentence  0 	 f_1 = 57.2178


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model historial 1 	 f_1 = 68.6567
Model sentence  1 	 f_1 = 56.1115


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model historial 2 	 f_1 = 61.5385
Model sentence  2 	 f_1 = 57.2564


In [17]:
test_batches = DataLoader(test, batch_size = _config['test_batch_size'], shuffle = False)

In [18]:
device = torch.device(_config['device'])
model = BERTForSequenceClassification(_config['model_name']).to(device)
x = []
y = []
for i in tqdm.tqdm(range(_config['n_models'])):
    model.load_state_dict(torch.load("/home/est_posgrado_maria.garcia/Transformers/Models/model_"+str(i)+".pt"))
    model.eval()
    logit_hist, logit_sentence = get_logits(model, test_batches, _config)
    logit_hist = logit_hist.unsqueeze(1)
    logit_sentence = logit_sentence.unsqueeze(1)
    x.append(logit_hist)
    y.append(logit_sentence)
    
X = torch.cat(x,1)
Y = torch.cat(y,1)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 3/3 [28:04<00:00, 561.43s/it]


In [19]:
len(x[0])

320

In [20]:


# Majority Voting 
y_pred = (X.argmax(2).sum(1).float()/_config['n_models']).round()
f1_score(y_test, y_pred)*100

68.65671641791045

In [21]:
# Weighted Voting 
y_pred= X.sum(1).argmax(1)
f1_score(y_test, y_pred)*100


68.65671641791045