## Train BERT model



In [42]:
import os
from glob import glob
import pandas as pd

def get_lang(file):
    return os.path.splitext(os.path.basename(file))[0]

"""
Lo pongo así por hugginface
"""
id2label = {0: 'N', 1: 'NEU', 2: 'P'}
label2id = {v:k for k,v in id2label.items()}

def load_df(file):
    dialect = get_lang(file)
    
    df = pd.read_table(file, names=["id", "text", "polarity"], index_col=0)
    #df["dialect"] = dialect
    
    for label, idx in label2id.items():
        df.loc[df["polarity"] == label, "label"] = idx
    return df

train_files = glob("../data/tass2020/train/*.tsv")
dev_files = glob("../data/tass2020/dev/*.tsv")
test_files = glob("../data/tass2020/test1.1/*.tsv")

train_dfs = {get_lang(file):load_df(file) for file in train_files}
dev_dfs = {get_lang(file):load_df(file) for file in dev_files}
test_dfs = {get_lang(file):load_df(file) for file in test_files}

train_df = pd.concat(train_dfs.values())
dev_df = pd.concat(dev_dfs.values())
test_df = pd.concat(test_dfs.values())

print(len(train_df), len(dev_df), len(test_df))

train_df.columns, dev_df.columns, test_df.columns

4802 2443 7264


(Index(['text', 'polarity', 'label'], dtype='object'),
 Index(['text', 'polarity', 'label'], dtype='object'),
 Index(['text', 'polarity', 'label'], dtype='object'))

In [43]:
sum(train_df["label"].isna())

0

## Preprocessing

Convertimos todos los usuarios al string "usuario"



In [29]:
train_df.shape, dev_df.shape

((4802, 3), (2443, 3))

In [30]:
from pysentimiento.preprocessing import preprocess_tweet

train_df["text"] = train_df["text"].apply(preprocess_tweet)
dev_df["text"] = dev_df["text"].apply(preprocess_tweet)
test_df["text"] = test_df["text"].apply(preprocess_tweet)


In [31]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

model_name = './TwiBETO'

device = "cuda" if torch.cuda.is_available() else "cpu"




tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
tokenizer.model_max_length = 128


Some weights of the model checkpoint at ./TwiBETO were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./TwiBETO and are newly initialized

In [32]:
model.config.id2label, model.config.label2id

({0: 'N', 1: 'NEU', 2: 'P'}, {'N': 0, 'NEU': 1, 'P': 2})

Veamos primero las longitudes (a ver si no hay nada mal cargado)

In [33]:
from datasets import Dataset, Value, ClassLabel, Features

examples = pd.concat([train_df, dev_df])

features = Features({
    'text': Value('string'),
    'label': ClassLabel(num_classes=3, names=["neg", "neu", "pos"])
})

train_dataset = Dataset.from_pandas(train_df[["text", "label"]], features=features)
dev_dataset = Dataset.from_pandas(dev_df[["text", "label"]], features=features)
test_dataset = Dataset.from_pandas(test_df[["text", "label"]], features=features)

In [34]:
train_dataset.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=3, names=['neg', 'neu', 'pos'], names_file=None, id=None)}

In [35]:
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)

batch_size = 32

eval_batch_size = 16

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=batch_size)
dev_dataset = dev_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)


HBox(children=(FloatProgress(value=0.0, max=151.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=153.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=454.0), HTML(value='')))




In [36]:
def format_dataset(dataset):
    dataset = dataset.map(lambda examples: {'labels': examples['label']})
    dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
    return dataset

train_dataset = format_dataset(train_dataset)
dev_dataset = format_dataset(dev_dataset)
test_dataset = format_dataset(test_dataset)

HBox(children=(FloatProgress(value=0.0, max=4802.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2443.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7264.0), HTML(value='')))




In [44]:
from torchtext import data

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_it = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size)
dev_it = torch.utils.data.DataLoader(dev_dataset, batch_size=batch_size)
test_it = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)


model = BertForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=3)

model.config.hidden_dropout_prob = 0.20
model.config.id2label = id2label
model.config.label2id = label2id

model = model.to(device)
model.train();

device

Some weights of the model checkpoint at ./TwiBETO were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./TwiBETO and are newly initialized

device(type='cuda')

In [45]:



def compute_metrics(labels, preds):
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


def evaluate(model, it):
    """
    Calculates labels and predictions for it
    
    Returns
    -------
    
    (labels, preds): torch.Tensor
    
    where labels are the true labels
    and preds are the predictions
    
    """
    model.eval()
    preds = []
    true = []
    losses = []
    with torch.no_grad():
        for i, batch in enumerate(tqdm(it)):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs[0]
            losses.append(loss.item())
            outs = torch.softmax(outputs.logits, dim=1)
            
            true.append(batch['labels'].cpu())
            preds.append(outs.cpu())
    return np.array(losses).mean(), torch.cat(true), torch.cat(preds).argmax(1)


In [46]:
from tqdm.auto import tqdm
from torch import nn
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AdamW, get_linear_schedule_with_warmup


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.train().to(device)

epochs = 5

optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)


num_training_steps = epochs * len(train_it)
num_warmup_steps = num_training_steps // 10
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
)

for epoch in range(epochs):
    train_losses = []
    for i, batch in enumerate(tqdm(train_it)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs[0]
        train_losses.append(loss.item())
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    
    train_loss = np.array(train_losses).mean()
    dev_loss, dev_labels, dev_preds = evaluate(model, dev_it)
    dev_metrics = compute_metrics(dev_labels, dev_preds) 
    
    print(f"Epoch {epoch:<2}")
    print(f"Train loss {train_loss:.4f}")
    print(f"Dev loss {dev_loss:.4f}")
    print(f"Dev metrics {dev_metrics}")

HBox(children=(FloatProgress(value=0.0, max=151.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=77.0), HTML(value='')))


Epoch 0 
Train loss 0.8995
Dev loss 0.8002
Dev metrics {'accuracy': 0.6553417928776095, 'f1': 0.6343417404729942, 'precision': 0.6547883823566102, 'recall': 0.6457086514276349}


HBox(children=(FloatProgress(value=0.0, max=151.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=77.0), HTML(value='')))


Epoch 1 
Train loss 0.5990
Dev loss 0.7975
Dev metrics {'accuracy': 0.6713057715923045, 'f1': 0.6594151953577297, 'precision': 0.6625478999401578, 'recall': 0.6654176046350669}


HBox(children=(FloatProgress(value=0.0, max=151.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=77.0), HTML(value='')))


Epoch 2 
Train loss 0.4177
Dev loss 0.8654
Dev metrics {'accuracy': 0.6639377814162915, 'f1': 0.6490162898628277, 'precision': 0.6543944670721568, 'recall': 0.6564646914289053}


HBox(children=(FloatProgress(value=0.0, max=151.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=77.0), HTML(value='')))


Epoch 3 
Train loss 0.2967
Dev loss 0.8983
Dev metrics {'accuracy': 0.6672124437167417, 'f1': 0.661895283338922, 'precision': 0.6597058834539672, 'recall': 0.6657435067836439}


HBox(children=(FloatProgress(value=0.0, max=151.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=77.0), HTML(value='')))


Epoch 4 
Train loss 0.2177
Dev loss 0.9644
Dev metrics {'accuracy': 0.66189111747851, 'f1': 0.6666989647137118, 'precision': 0.681483306404647, 'recall': 0.6634852271095005}


1. 0.673438568716703
2. 0.6734036976999446
3. 0.6657802770738191
4. 0.6637223187531293
5. 0.6666989647137118

In [49]:

t = torch.Tensor([
    0.673438568716703,
    0.6734036976999446,
    0.6657802770738191,
    0.6637223187531293,
    0.6666989647137118,
])
t.mean(), t.std()

(tensor(0.6686), tensor(0.0045))

In [33]:
model.config.id2label[0] = "NEG"
model.config.id2label[2] = "POS"

model.config.label2id = {v:k for k,v in model.config.id2label.items()}


In [34]:
path = "../models/beto-sentiment-analysis"
model.save_pretrained(path)
tokenizer.save_pretrained(path)

('../models/beto-sentiment-analysis/vocab.txt',
 '../models/beto-sentiment-analysis/special_tokens_map.json',
 '../models/beto-sentiment-analysis/added_tokens.json')