In [1]:
import csv
import itertools
import numpy as np
import os
import pandas as pd
import random
import torch
import tqdm
import transformers as ppb

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
from torch.nn import functional as F
from torch import optim

from transformers import AutoModel, AutoTokenizer, BertForMaskedLM, BertTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score

In [2]:
# Helper functions
def to_int(x):
    """Return a new list holding ``int(item)`` for every item of ``x``."""
    return [int(item) for item in x]

def to_low(x):
    """Return a new list with ``.lower()`` applied to every item of ``x``."""
    lowered = []
    for text in x:
        lowered.append(text.lower())
    return lowered

def test_model(model, test_batches, _config):
    """Score ``model`` on labelled batches and return the binary F1.

    Args:
        model: Callable invoked as ``model(x, att)``; its output is treated as
            per-class scores along dimension 1.
        test_batches: Iterable of dicts holding ``'text'``, ``'label'`` and
            ``'attention'`` tensors.
        _config: Mapping whose ``'device'`` entry names the torch device the
            input tensors are moved to.

    Returns:
        ``f1_score(..., average='binary')`` of the arg-max predictions against
        the labels collected from every batch.

    Note:
        The model's train/eval mode is not changed here; call ``model.eval()``
        beforehand if dropout should be disabled.
    """
    device = torch.device(_config['device'])
    y_true = []
    y_hat = []
    # Evaluation needs no gradients: disabling autograd avoids building (and
    # holding in memory) a graph for every forward pass.
    with torch.no_grad():
        for sample in test_batches:
            x = sample['text'].to(device)
            att = sample['attention'].to(device)
            probs = F.softmax(model(x, att).cpu(), 1)
            y_hat.append(probs.argmax(1))
            # Labels are only compared on the CPU, so they never need to
            # visit the compute device.
            y_true.append(sample['label'].cpu())
    return f1_score(torch.cat(y_true), torch.cat(y_hat), average='binary')

def get_logits(model, test_batches, _config):
    """Run ``model`` over batches and return its class probabilities.

    Despite the name, the values returned are soft-maxed over dimension 1,
    i.e. probabilities rather than raw logits.

    Args:
        model: Callable invoked as ``model(x, att)``.
        test_batches: Iterable of dicts holding ``'text'`` and ``'attention'``
            tensors (any other keys are ignored).
        _config: Mapping whose ``'device'`` entry names the torch device the
            input tensors are moved to.

    Returns:
        CPU tensor with the per-batch probabilities concatenated along
        dimension 0, in iteration order.

    Note:
        The model's train/eval mode is not changed here; call ``model.eval()``
        beforehand if dropout should be disabled.
    """
    device = torch.device(_config['device'])
    probs = []
    # Inference only: disabling autograd avoids building a graph per batch.
    with torch.no_grad():
        for sample in test_batches:
            x = sample['text'].to(device)
            att = sample['attention'].to(device)
            probs.append(F.softmax(model(x, att).cpu(), 1))
    return torch.cat(probs)

In [3]:
# Define dataset (tokenizes each text on access; wrapped in a DataLoader later)
class MEXA3T(Dataset):
    """Text dataset that tokenizes each sample on access.

    Args:
        X: Indexable collection of raw text strings.
        y: Optional indexable collection of labels aligned with ``X``. When it
            is ``None`` or empty, items carry no ``'label'`` entry.
        _config: Mapping whose ``'model_name'`` entry selects the pretrained
            ``BertTokenizer`` vocabulary. Required despite the ``None`` default.

    Each item is a dict with:
        ``'text'``: tensor of token ids, padded/truncated to ``MAX_LENGTH``.
        ``'attention'``: float mask, 1.0 on real tokens and 0.0 on padding.
        ``'label'``: ``torch.tensor(y[idx])`` (only when labels were given).
    """

    # Sequence length every sample is padded or truncated to.
    MAX_LENGTH = 64

    def __init__(self, X, y = None, _config = None):
        if _config is None:
            # Fail with an explicit message instead of an opaque
            # "'NoneType' object is not subscriptable".
            raise TypeError("MEXA3T requires a _config mapping with a 'model_name' entry")
        self.X = X
        self.y = y
        self.tokenizer = BertTokenizer.from_pretrained(_config['model_name'], do_lower_case=True)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        token_ids = torch.tensor(self.tokenizer.encode(
            self.X[idx],
            max_length=self.MAX_LENGTH,
            pad_to_max_length=True,
            add_special_tokens=True,
            truncation_strategy='longest_first',
        ))
        # Build the mask from the tokenizer's own pad id. The previous
        # hard-coded id 1 is only right for vocabularies that place the pad
        # token at index 1 (presumably true for the Spanish BERT vocabulary
        # used in this notebook -- verify) and is wrong for others.
        attention = (token_ids != self.tokenizer.pad_token_id).float()
        item = {"text": token_ids, "attention": attention}
        # Explicit None/length check: plain truthiness raises for numpy arrays
        # and tensors, while an empty container still means "no labels".
        if self.y is not None and len(self.y) > 0:
            item["label"] = torch.tensor(self.y[idx])
        return item

# Define model
class BERTForSequenceClassification(nn.Module):
    """Pretrained encoder followed by dropout and a 2-way linear classifier.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.

    Attributes (names are part of the saved ``state_dict`` layout):
        bert: The pretrained encoder.
        drop: ``nn.Dropout(0.1)`` applied to the encoder's pooled output.
        clf: Linear layer mapping the pooled output to 2 class scores.
    """

    def __init__(self, model_name):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)

        self.drop = nn.Dropout(0.1)
        # Size the head from the encoder's own config so checkpoints whose
        # hidden width is not 768 also work; 768 stays the fallback.
        hidden_size = getattr(self.bert.config, 'hidden_size', 768)
        self.clf = nn.Linear(hidden_size, 2, bias=True)

    def forward(self, x, att):
        """Return unnormalised class scores for token ids ``x`` and mask ``att``."""
        # Element 1 of the encoder output is used as the sequence summary
        # (the pooled output for Hugging Face BERT models).
        pooled = self.bert(x, attention_mask=att)[1]
        return self.clf(self.drop(pooled))

In [4]:
def train_models(_config, train, test, verbose = True):
    """Fine-tune an ensemble of classifiers and save each one to disk.

    For every ``k`` in ``range(_config['n_models'])`` a fresh
    ``BERTForSequenceClassification`` is trained for ``_config['epochs']``
    epochs with AdamW, a linear learning-rate decay without warm-up, a
    class-weighted cross-entropy loss and gradient-norm clipping, and its
    ``state_dict`` is written to ``Models/model_<k>.pt``.

    Args:
        _config: Mapping with the keys ``'device'``, ``'weights'`` (class
            weight tensor), ``'lr'``, ``'epochs'``, ``'n_models'``,
            ``'model_name'``, ``'train_batch_size'`` and ``'test_batch_size'``.
        train: Dataset yielding dicts with ``'text'``, ``'label'`` and
            ``'attention'`` tensors.
        test: Dataset of the same form, used only when ``verbose`` is true.
        verbose: When true, print each model's test F1 after training.

    Returns:
        None. The trained weights are only persisted to disk.
    """
    device = torch.device(_config['device'])
    w = _config['weights']
    lr = _config['lr']
    max_grad_norm = 1.0
    epochs = _config['epochs']

    # Create the checkpoint directory up front so a missing folder cannot
    # discard a finished training run at save time.
    os.makedirs('Models', exist_ok=True)

    # The loaders, step count and loss do not depend on the ensemble index,
    # so they are built once instead of once per model.
    train_batches = DataLoader(train, batch_size = _config['train_batch_size'], shuffle = True)
    test_batches = DataLoader(test, batch_size = _config['test_batch_size'], shuffle = False)
    total_steps = len(train_batches) * epochs
    criterio = nn.CrossEntropyLoss(weight = w.to(device))

    for k in range(0, _config['n_models']):
        model = BERTForSequenceClassification(_config['model_name']).to(device)
        optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

        # Hugging Face ``from_pretrained`` hands back the encoder in
        # evaluation mode (dropout off); switch the whole model to training
        # mode explicitly so fine-tuning uses dropout as intended.
        model.train()
        for epoch in range(epochs):
            for sample in train_batches:
                optimizer.zero_grad()
                x, y, att = sample['text'].to(device), sample['label'].to(device), sample['attention'].to(device)
                y_pred = model(x, att)
                loss = criterio(y_pred, y)
                loss.backward()
                # Use the named limit rather than a second hard-coded 1.0.
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                scheduler.step()
        if verbose:
            model.eval()
            tmp_m = test_model(model, test_batches, _config)
            print("Model %d \t f_1 = %.4f"%(k, tmp_m*100))
        torch.save(model.state_dict(), 'Models/model_'+str(k)+'.pt')

In [5]:
df_train = pd.read_csv('Data/train.csv')
df_val  = pd.read_csv('Data/val.csv')
df_test = pd.read_csv('Data/test.csv')

# Prepare train: the train and validation splits are merged into one
# lower-cased training set.
X_train = to_low(list(df_train['text'])) + to_low(list(df_val['text']))
y_train = to_int(list(df_train['target'])) + to_int(list(df_val['target']))

# Prepare test
X_test = to_low(list(df_test['text']))
y_test = to_int(list(df_test['target']))

# Class weights for the loss: the more frequent class gets 1.0 and the other
# class gets (majority count / its own count).
train_labels = torch.tensor(y_train)
class_counts = torch.stack([(train_labels == 0).sum(), (train_labels == 1).sum()]).float()
w = class_counts.max() / class_counts

# Keep using the second GPU when there is one, but fall back so the notebook
# also runs on single-GPU and CPU-only machines.
if torch.cuda.device_count() > 1:
    device_name = 'cuda:1'
elif torch.cuda.is_available():
    device_name = 'cuda:0'
else:
    device_name = 'cpu'

_config = {
    'model_name': 'dccuchile/bert-base-spanish-wwm-uncased',
    'train_batch_size':  32,
    'test_batch_size': 128,
    'device': device_name,
    'lr': 1e-5,
    'epochs': 3,
    'n_models': 3,
    'weights': w
}

train = MEXA3T(X_train, y_train, _config)
test = MEXA3T(X_test, y_test, _config)

In [6]:
# Size of the ensemble to train; one checkpoint is written per model.
_config.update(n_models=3)
train_models(_config, train, test, verbose=True)

Model 0 	 f_1 = 78.5185
Model 1 	 f_1 = 79.6186
Model 2 	 f_1 = 79.0865


In [7]:
# Unshuffled loader over the test set, so predictions stay aligned with y_test.
test_batches = DataLoader(
    test,
    shuffle=False,
    batch_size=_config['test_batch_size'],
)

In [8]:
device = torch.device(_config['device'])
# One model instance is reused; only its weights are swapped per checkpoint.
model = BERTForSequenceClassification(_config['model_name']).to(device)
x = []
for i in tqdm.tqdm(range(_config['n_models'])):
    # map_location lets checkpoints saved from another device (e.g. a GPU
    # that is absent on this machine) load onto the configured one.
    state_dict = torch.load("Models/model_"+str(i)+".pt", map_location=device)
    model.load_state_dict(state_dict)
    model.eval()
    # Add a model axis so the per-model probabilities can be concatenated.
    x.append(get_logits(model, test_batches, _config).unsqueeze(1))

# Class probabilities laid out as (sample, model, class).
X = torch.cat(x,1)

100%|██████████| 3/3 [00:07<00:00,  2.60s/it]


In [9]:
# Majority (hard) voting: every model votes with its arg-max class, and the
# sample is labelled 1 only when more than half of the models voted 1.
votes = X.argmax(2)
vote_share = votes.sum(1).float() / _config['n_models']
y_pred = vote_share.round()
100 * f1_score(y_test, y_pred)

79.75903614457832

In [10]:
# Weighted (soft) voting: add up the models' class probabilities and pick the
# class with the largest total.
summed_probs = X.sum(dim=1)
y_pred = summed_probs.argmax(dim=1)
100 * f1_score(y_test, y_pred)

80.28846153846153