## Train BERT model



In [2]:
import os
from glob import glob
import pandas as pd

def get_lang(file):
    return os.path.splitext(os.path.basename(file))[0]

"""
Lo pongo así por huggingface
"""
id2label = {0: 'N', 1: 'NEU', 2: 'P'}
label2id = {v:k for k,v in id2label.items()}

def load_df(file):
    dialect = get_lang(file)
    
    df = pd.read_table(file, names=["id", "text", "polarity"], index_col=0)
    #df["dialect"] = dialect
    
    for label, idx in label2id.items():
        df.loc[df["polarity"] == label, "label"] = idx
    return df

train_files = glob("../data/tass2020/train/*.tsv")
dev_files = glob("../data/tass2020/dev/*.tsv")
test_files = glob("../data/tass2020/test1.1/*.tsv")

train_dfs = {get_lang(file):load_df(file) for file in train_files}
dev_dfs = {get_lang(file):load_df(file) for file in dev_files}
test_dfs = {get_lang(file):load_df(file) for file in test_files}

train_df = pd.concat(train_dfs.values())
dev_df = pd.concat(dev_dfs.values())
test_df = pd.concat(test_dfs.values())

print(len(train_df), len(dev_df), len(test_df))

train_df.columns, dev_df.columns, test_df.columns

4802 2443 7264


(Index(['text', 'polarity', 'label'], dtype='object'),
 Index(['text', 'polarity', 'label'], dtype='object'),
 Index(['text', 'polarity', 'label'], dtype='object'))

## Preprocessing

Convertimos todos los usuarios al string "usuario"



In [3]:
train_df.shape, dev_df.shape

((4802, 3), (2443, 3))

In [4]:
from pysentimiento.preprocessing import preprocess_tweet

train_df["text"] = train_df["text"].apply(preprocess_tweet)
dev_df["text"] = dev_df["text"].apply(preprocess_tweet)
test_df["text"] = test_df["text"].apply(preprocess_tweet)


In [5]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

model_name = '../models/TwiBETO-general'

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = BertTokenizer.from_pretrained(model_name)#'dccuchile/bert-base-spanish-wwm-cased')


#tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
tokenizer.model_max_length = 128


Veamos primero las longitudes (a ver si no hay nada mal cargado)

In [6]:
from datasets import Dataset, Value, ClassLabel, Features

examples = pd.concat([train_df, dev_df])

features = Features({
    'text': Value('string'),
    'label': ClassLabel(num_classes=3, names=["neg", "neu", "pos"])
})

train_dataset = Dataset.from_pandas(train_df[["text", "label"]], features=features)
dev_dataset = Dataset.from_pandas(dev_df[["text", "label"]], features=features)
test_dataset = Dataset.from_pandas(test_df[["text", "label"]], features=features)

In [7]:
train_dataset.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=3, names=['neg', 'neu', 'pos'], names_file=None, id=None)}

In [8]:
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)

batch_size = 64

eval_batch_size = 16

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=batch_size)
dev_dataset = dev_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)


HBox(children=(FloatProgress(value=0.0, max=76.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=153.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=454.0), HTML(value='')))




In [9]:
def format_dataset(dataset):
    dataset = dataset.map(lambda examples: {'labels': examples['label']})
    dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
    return dataset

train_dataset = format_dataset(train_dataset)
dev_dataset = format_dataset(dev_dataset)
test_dataset = format_dataset(test_dataset)

HBox(children=(FloatProgress(value=0.0, max=4802.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2443.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7264.0), HTML(value='')))




In [21]:
from torchtext import data

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


model = BertForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=3)

model.config.hidden_dropout_prob = 0.20
model.config.id2label = id2label
model.config.label2id = label2id

model = model.to(device)
model.train();

device

Some weights of the model checkpoint at ../models/TwiBETO-general were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../models/TwiBETO-g

device(type='cuda')

In [22]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import Trainer, TrainingArguments

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

eval_batch_size = 16
train_batch_size = 64

epochs = 5

total_steps = (epochs * len(train_dataset)) // train_batch_size
warmup_steps = total_steps // 10

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    warmup_steps=warmup_steps,
    evaluation_strategy="epoch",
    do_eval=True,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

In [23]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.74526,0.663938,0.664568,0.665699,0.665291
2,No log,0.829154,0.66844,0.657989,0.680749,0.654332
3,No log,0.99844,0.670487,0.667185,0.666991,0.667384
4,No log,1.220322,0.663528,0.66173,0.673067,0.656397
5,No log,1.362266,0.667212,0.666885,0.669997,0.664949


TrainOutput(global_step=380, training_loss=0.35941294619911596)

1. 0.6668031109291854
2. 0.6762511399888426
3. 0.6715146934758156
4. 0.6757479771741166
5. 0.6696684404420794

In [25]:
trainer.evaluate()

{'eval_loss': 1.3622664213180542,
 'eval_accuracy': 0.6672124437167417,
 'eval_f1': 0.666885377209547,
 'eval_precision': 0.6699971138964069,
 'eval_recall': 0.6649489905885781,
 'epoch': 5.0}

In [64]:

t = torch.Tensor([
     0.6668031109291854,
     0.6762511399888426,
     0.6715146934758156,
     0.6757479771741166,
     0.6696684404420794,
])
t.mean(), t.std()

(tensor(0.6720), tensor(0.0040))

In [33]:
model.config.id2label[0] = "NEG"
model.config.id2label[2] = "POS"

model.config.label2id = {v:k for k,v in model.config.id2label.items()}


In [34]:
path = "../models/beto-sentiment-analysis"
model.save_pretrained(path)
tokenizer.save_pretrained(path)

('../models/beto-sentiment-analysis/vocab.txt',
 '../models/beto-sentiment-analysis/special_tokens_map.json',
 '../models/beto-sentiment-analysis/added_tokens.json')