## Finetuning BETO

In this notebook, we check what happens when we fine-tune BETO with masked language modeling (MLM) on the TASS tweets.

In [1]:
import os
from glob import glob
import pandas as pd

def get_lang(file):
    """Return the base name of `file` with its directory and last extension removed.

    The returned string is what the notebook uses as the dictionary key for
    the DataFrame loaded from that file.
    """
    filename = os.path.basename(file)
    stem, _extension = os.path.splitext(filename)
    return stem

# The labels are laid out as explicit id <-> name mappings because of
# Hugging Face (original author's note, translated from Spanish).
id2label = dict(enumerate(('N', 'NEU', 'P')))
label2id = {name: idx for idx, name in id2label.items()}

def load_df(file):
    """Read one tab-separated TASS file and add a numeric `label` column.

    The file is read without a header row; its three columns are named
    "id", "text" and "polarity", and "id" becomes the index.

    Parameters
    ----------
    file : str
        Path of the .tsv file to read.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by "id" with columns "text", "polarity" and "label",
        where "label" holds the `label2id` value of each row's polarity.
    """
    df = pd.read_table(file, names=["id", "text", "polarity"], index_col=0)

    # Translate each polarity string into its integer id. Rows whose polarity
    # is not a key of `label2id` are not assigned anything by this loop.
    for polarity, label_id in label2id.items():
        df.loc[df["polarity"] == polarity, "label"] = label_id
    return df

# glob() yields paths in arbitrary, filesystem-dependent order. Sorting them
# makes the row order of the concatenated frames (and of anything derived
# from them) reproducible from one machine or run to the next.
train_files = sorted(glob("../data/tass2020/train/*.tsv"))
dev_files = sorted(glob("../data/tass2020/dev/*.tsv"))
test_files = sorted(glob("../data/tass2020/test1.1/*.tsv"))

# One DataFrame per file, keyed by the file's base name (see get_lang).
train_dfs = {get_lang(file): load_df(file) for file in train_files}
dev_dfs = {get_lang(file): load_df(file) for file in dev_files}
test_dfs = {get_lang(file): load_df(file) for file in test_files}

# Stack the per-file frames into a single frame per split.
train_df = pd.concat(train_dfs.values())
dev_df = pd.concat(dev_dfs.values())
test_df = pd.concat(test_dfs.values())

print(len(train_df), len(dev_df), len(test_df))

train_df.columns, dev_df.columns, test_df.columns

4802 2443 7264


(Index(['text', 'polarity', 'label'], dtype='object'),
 Index(['text', 'polarity', 'label'], dtype='object'),
 Index(['text', 'polarity', 'label'], dtype='object'))

In [6]:
import torch
from transformers import BertForMaskedLM, BertTokenizer

# Hub identifier of the pretrained checkpoint that gets fine-tuned below.
model_name = 'dccuchile/bert-base-spanish-wwm-cased'

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the checkpoint with a masked-language-modeling head and move it to
# the selected device.
# NOTE(review): num_labels=3 matches the three polarity classes; presumably
# it is only recorded in the model config here -- confirm it is intended.
model = BertForMaskedLM.from_pretrained(
    model_name, return_dict=True, num_labels=3
).to(device)

# Tokenizer from the same checkpoint, with its maximum length set to 128.
tokenizer = BertTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 128

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242120.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=43.0, style=ProgressStyle(description_w…




In [45]:
from pysentimiento.preprocessing import preprocess_tweet

# Dump the training tweets to tweets.txt, one preprocessed tweet per line.
# The encoding is pinned to UTF-8 so that accented and other non-ASCII
# characters are written the same way on every platform, instead of
# depending on the locale's default encoding.
with open("tweets.txt", "w", encoding="utf-8") as f:
    for tweet in train_df["text"]:
        f.write(preprocess_tweet(tweet) + "\n")


In [46]:
%%time
from datasets import load_dataset


# Build a `datasets` text dataset from tweets.txt, registered under "train".
# NOTE(review): `dataset` is reassigned by the LineByLineTextDataset cell
# further down, so this object is not the one handed to the Trainer.
dataset = load_dataset(
    "text",
    data_files={"train": "./tweets.txt"},
)


Using custom data configuration default


Downloading and preparing dataset text/default-617ef339ebbfa8ff (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/jmperez/.cache/huggingface/datasets/text/default-617ef339ebbfa8ff/0.0.0/52cefbb2b82b015d4253f1aeb1e6ee5591124a6491e834acfe1751f765925155...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset text downloaded and prepared to /home/jmperez/.cache/huggingface/datasets/text/default-617ef339ebbfa8ff/0.0.0/52cefbb2b82b015d4253f1aeb1e6ee5591124a6491e834acfe1751f765925155. Subsequent calls will reuse this data.
CPU times: user 132 ms, sys: 4 ms, total: 136 ms
Wall time: 1.34 s


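**TODO** Fix the cell above (ARREGLAR ESTO DE ACA ARRIBA): the `dataset` it builds with `load_dataset` is overwritten by the `LineByLineTextDataset` cell below.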

In [61]:
%%time
from transformers import LineByLineTextDataset

# Tokenize tweets.txt into the training dataset passed to the Trainer.
# block_size uses the same value (128) that was assigned to
# tokenizer.model_max_length when the tokenizer was loaded.
dataset = LineByLineTextDataset(
    file_path="./tweets.txt",
    tokenizer=tokenizer,
    block_size=128,
)



CPU times: user 1.73 s, sys: 4 ms, total: 1.74 s
Wall time: 1.73 s


In [62]:
from transformers import DataCollatorForLanguageModeling

# Collator for the masked-language-modeling objective: MLM is enabled and
# the masking probability is set to 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [66]:
from transformers import Trainer, TrainingArguments

# Training configuration for the MLM fine-tuning run; checkpoints and the
# final model are written under ./TwiBETO.
training_args = TrainingArguments(
    output_dir="./TwiBETO",
    overwrite_output_dir=True,
    num_train_epochs=30,
    # `per_gpu_train_batch_size` is deprecated (the library logs a warning
    # recommending `per_device_train_batch_size`), so use the preferred name.
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

# Trainer wiring: the MLM model, the masking collator and the tokenized
# tweets dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)

In [67]:
%%time
# Run the fine-tuning loop configured on `trainer`; the cell is timed by the
# %%time magic above.
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss
500,1.431588
1000,1.208357
1500,1.036722
2000,0.947806


CPU times: user 9min 47s, sys: 2min 53s, total: 12min 40s
Wall time: 12min 39s


TrainOutput(global_step=2280, training_loss=1.1238509328741777)

In [68]:
# Write the fine-tuned model to ./TwiBETO, the directory the fill-mask
# pipeline further down loads its model from.
trainer.save_model("./TwiBETO")


## Checking mask


In [72]:
from transformers import pipeline

# Fill-mask pipeline over the fine-tuned weights saved in ./TwiBETO. The
# tokenizer is loaded from the original checkpoint name (`model_name`).
fill_mask = pipeline(
    "fill-mask",
    tokenizer=model_name,
    model="./TwiBETO",
)

Some weights of BertModel were not initialized from the model checkpoint at ./TwiBETO and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [82]:
# Ask the fine-tuned model to fill the masked last word of a Spanish prompt
# ("This is a [MASK]") and display the candidates it returns.

fill_mask("Esto es una [MASK]")

[{'sequence': '[CLS] Esto es una mierda [SEP]',
  'score': 0.7022484540939331,
  'token': 3383,
  'token_str': 'mierda'},
 {'sequence': '[CLS] Esto es una locura [SEP]',
  'score': 0.08674468100070953,
  'token': 7680,
  'token_str': 'locura'},
 {'sequence': '[CLS] Esto es una pesadilla [SEP]',
  'score': 0.07505229860544205,
  'token': 14782,
  'token_str': 'pesadilla'},
 {'sequence': '[CLS] Esto es una estupidez [SEP]',
  'score': 0.02035685069859028,
  'token': 19040,
  'token_str': 'estupidez'},
 {'sequence': '[CLS] Esto es una ilusión [SEP]',
  'score': 0.005596084985882044,
  'token': 18161,
  'token_str': 'ilusión'}]