## Imports and downloads

In [1]:
# Install Hugging Face's Transformers library.
# No version is pinned; the install log captured below this cell shows that
# transformers-4.13.0 was the release downloaded for the recorded run.
!pip install transformers

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 4.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 22.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 363 kB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 38.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 41.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [2]:
# Download BETO model (uncased_2M variant): weights archive, vocabulary and config
!wget https://users.dcc.uchile.cl/~jperez/beto/uncased_2M/pytorch_weights.tar.gz
!wget https://users.dcc.uchile.cl/~jperez/beto/uncased_2M/vocab.txt
!wget https://users.dcc.uchile.cl/~jperez/beto/uncased_2M/config.json
# Unpack the weights; the `mv` lines below rely on the archive producing a `pytorch/` folder
!tar -xzvf pytorch_weights.tar.gz
# Place config and vocabulary next to the weights so the folder is self-contained
!mv config.json pytorch/.
!mv vocab.txt pytorch/.
!mv pytorch BETO  # Rename folder to BETO
# NOTE(review): presumably not safe to re-run once BETO/ already exists
# (the final mv would then nest pytorch/ inside BETO/) — confirm before re-executing.

--2021-12-11 15:00:08--  https://users.dcc.uchile.cl/~jperez/beto/uncased_2M/pytorch_weights.tar.gz
Resolving users.dcc.uchile.cl (users.dcc.uchile.cl)... 200.9.99.211, 192.80.24.4
Connecting to users.dcc.uchile.cl (users.dcc.uchile.cl)|200.9.99.211|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 410039235 (391M) [application/x-gzip]
Saving to: ‘pytorch_weights.tar.gz’


2021-12-11 15:01:03 (7.17 MB/s) - ‘pytorch_weights.tar.gz’ saved [410039235/410039235]

--2021-12-11 15:01:03--  https://users.dcc.uchile.cl/~jperez/beto/uncased_2M/vocab.txt
Resolving users.dcc.uchile.cl (users.dcc.uchile.cl)... 192.80.24.4, 200.9.99.211
Connecting to users.dcc.uchile.cl (users.dcc.uchile.cl)|192.80.24.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 248047 (242K) [text/plain]
Saving to: ‘vocab.txt’


2021-12-11 15:01:05 (494 KB/s) - ‘vocab.txt’ saved [248047/248047]

--2021-12-11 15:01:05--  https://users.dcc.uchile.cl/~jperez/beto/uncased_2M/confi

In [11]:
################## Common ##################
import numpy as np
import pandas as pd
import time
import datetime
import random
import os
import re
import matplotlib.pyplot as plt
from argparse import Namespace
from tqdm.notebook import tqdm
from google.colab import drive


################## PyTorch ##################
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


################## Transformers ##################
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import AdamW, TrainingArguments, get_constant_schedule, Trainer

## Settings

In [68]:
# Single namespace holding every path and hyper-parameter read by the later cells
settings = Namespace()

# Paths
settings.mount_path = "/content/drive"
# Mount Google Drive: the data files are read from, and the checkpoint is written to, this mount
drive.mount(settings.mount_path)
settings.project_path = os.path.join(settings.mount_path, "MyDrive/HackathonMaratoTV3")
settings.custom_data_filepath = os.path.join(settings.project_path, "custom_data.csv")
settings.emotion_data_filepath = os.path.join(settings.project_path, "translated_emotions_numeric_labels.csv")


#@markdown ##BERT-based models
# Approach with BERT Multilingual: https://sci2lab.github.io/ml_tutorial/bert_farsi_sentiment/
# Approach with BETO: https://benjad.github.io/2020/08/04/clasificador-sentimiento-BERT/
settings.model_to_use = "BETO/" #@param ["BETO/", "nlptown/bert-base-multilingual-uncased-sentiment"]
settings.num_epochs =  3#@param {"type":"integer"}
settings.batch_size =  16#@param {"type":"integer"}
# Model name = model identifier with every "/" removed, so it can be used in file and folder names
settings.model_name = re.sub("/", "", settings.model_to_use)
# Output folder handed to TrainingArguments and the state-dict file used by the save/load cells
settings.model_folder_path = os.path.join(settings.project_path, settings.model_name)
settings.model_chkp_path = os.path.join(settings.project_path, settings.model_name+".chkp")

# Device
if torch.cuda.is_available():
    settings.device = torch.device("cuda:0")
    # NOTE(review): this variable is set after torch.cuda.is_available() has already run;
    # presumably it needs to be set before CUDA is initialised to take effect — confirm.
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
    !nvidia-smi # Show GPU info
else:
    # CPU fallback: the notebook keeps running, only a warning is printed
    settings.device = torch.device("cpu")
    print("WARNING: GPU device not found.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Sat Dec 11 15:40:58 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    74W / 149W |   5731MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                       

## Data

In [61]:
def load_dataframe(filepath, is_dataframe):
  """Load a labelled sentence corpus into a two-column DataFrame.

  Args:
    filepath: Path of the file to read.
    is_dataframe: If True, the file is parsed with pandas as a ':'-separated
      table and its "text" / "label" columns are used. If False, the file is
      read as plain text in which each useful line starts with a single-digit
      label, followed by one separator character and then the sentence; lines
      that are empty or do not start with a digit are ignored.

  Returns:
    DataFrame with columns "Sentence" and "Sentiment", where "Sentiment" is
    the label found in the file minus one (zero-based class index).

  Raises:
    Exception: If the file cannot be read or parsed. The message names the
      file and the original error is kept as ``__cause__``.
  """
  try:
    # Read inputs
    if is_dataframe:
      df = pd.read_csv(filepath, sep=':', usecols=["text", "label"])
      df = df.rename(columns={"text": "Sentence", "label": "Sentiment"})
    else:
      data = []
      # Explicit UTF-8 so accented characters decode identically on every
      # platform (same default as the pandas branch above) instead of
      # depending on the locale's preferred encoding.
      with open(filepath, 'r', encoding="utf-8") as f:
        for line in f.read().splitlines():
          if len(line) > 0 and line[0].isdigit():
            sentiment = int(line[0])
            sentence = line[2:]  # Skip the label digit and the separator after it
            data.append([sentence, sentiment])
      df = pd.DataFrame(data, columns=["Sentence", "Sentiment"])
  except Exception as e:
    # Chain the original error so its type and traceback are not lost
    raise Exception(f"ERROR while reading {filepath}:\n\t{e}") from e

  df["Sentiment"] -= 1 # Reduce values by one because indexing starts at 0

  return df

In [None]:
# Training data comes from the ':'-separated emotions table; evaluation data from the
# plain-text custom file ("<digit><separator><sentence>" per line).
train_df = load_dataframe(settings.emotion_data_filepath, is_dataframe=True)
eval_df = load_dataframe(settings.custom_data_filepath, is_dataframe=False)
# Last expression: display the evaluation frame as the cell output
eval_df

## Experiment

### Create model and tokenizer

In [70]:
# NOTE(review): do_lower_case=False is used although the BETO files were downloaded from an
# "uncased_2M" URL — confirm that leaving the input casing untouched is intended.
tokenizer = BertTokenizer.from_pretrained(settings.model_to_use, do_lower_case=False)
print(f"Number of tokens = {len(tokenizer)}")
# Classification head with 5 outputs, matching the 0..4 labels produced by load_dataframe
model = BertForSequenceClassification.from_pretrained(settings.model_to_use, num_labels=5)
# Total parameter count: sum over all parameter tensors of the product of their dimensions
print(f"Model size = {sum([np.prod(p.size()) for p in model.parameters()])}")
model = model.to(settings.device)

Number of tokens = 31002
Model size = 109854725


### Model load

In [118]:
# Restore the fine-tuned weights from the checkpoint written by the "Model save" cell.
# map_location remaps the stored tensors onto the device selected in the settings cell,
# so a checkpoint saved from a GPU session can also be loaded on the CPU fallback.
model.load_state_dict(torch.load(settings.model_chkp_path, map_location=settings.device))

<All keys matched successfully>

### Datasets

In [71]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes every sentence of a DataFrame up front.

    The frame must provide a "Sentence" and a "Sentiment" column. All sentences
    are tokenized in a single call when the dataset is built; items are then
    served by indexing into that pre-computed batch.
    """

    def __init__(self, df, tokenizer, device):
        self.tokenizer = tokenizer
        self.device = device  # Stored only; no method of this class reads it
        self.df = df
        self.sentences = list(df["Sentence"])
        labels = list(df["Sentiment"])
        self.labels = labels
        self.sentiments = labels  # Same list object exposed under a second name

        self.compute_inputs()

    def compute_inputs(self):
        """Tokenize all sentences at once, padding each to the longest one."""
        # Warning: If an input_text is longer than max_seq_length, an error will raise on prediction
        tokenizer_options = dict(
            add_special_tokens=True,
            padding="longest",
            truncation=False,
            return_tensors="pt",
        )
        self.inputs = self.tokenizer(self.sentences, **tokenizer_options)

    def __len__(self):
        """Number of tokenized sentences."""
        return len(self.inputs["input_ids"])

    def __getitem__(self, index):
        """Return the tokenizer fields of one sentence plus its label under "labels"."""
        item = {}
        for field_name, batch_values in self.inputs.items():
            item[field_name] = batch_values[index]

        item["labels"] = self.labels[index]
        return item

In [72]:
# Tokenization happens inside the constructors, so this cell pre-computes the inputs of both splits
train_dataset = TextDataset(train_df, tokenizer, settings.device)
# NOTE(review): eval_dataset is built here but is not passed to the Trainer in the visible cells — confirm whether it is still needed
eval_dataset = TextDataset(eval_df, tokenizer, settings.device)

### Training

In [None]:
# Optimizer over all model parameters (nothing is frozen) with a fixed learning rate of 2e-5.
# AdamW here is the class imported from `transformers` in the imports cell.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [73]:
# Trainer configuration; epochs and batch size come from the settings cell
training_args = TrainingArguments(
        # Trainer output goes to the project folder on the mounted drive
        output_dir=settings.model_folder_path,
        overwrite_output_dir=True,
        # Checkpointing: save at every epoch with save_total_limit=1
        save_strategy="epoch",
        save_total_limit=1,
        num_train_epochs=settings.num_epochs,
        per_device_train_batch_size=settings.batch_size,
        per_device_eval_batch_size=settings.batch_size,
        # Logging: step-based, including the first step, then every 50 steps
        logging_strategy='steps',
        logging_first_step= True,
        logging_steps = 50,
        log_level="error",
        disable_tqdm=False
    )

In [None]:
# Constant schedule built on the optimizer created in the cell above
scheduler = get_constant_schedule(optimizer)
# The optimizer/scheduler pair is handed over explicitly via `optimizers`.
# No eval_dataset is passed, so only the training set is given to the Trainer.
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=train_dataset,
                  optimizers=[optimizer, scheduler]
                )
trainer.train()

### Model save

In [75]:
# Persist only the weights (state dict) to the checkpoint path on the mounted drive;
# the "Model load" cell reads this same file back with load_state_dict.
torch.save(model.state_dict(), settings.model_chkp_path)

## Evaluation

In [119]:
def predict(sentence, model, tokenizer):
  """Return the predicted class index for a sentence.

  Args:
    sentence: Text to classify. It is passed to the tokenizer as is; only the
      first row of the model output is used.
    model: Classification model whose output exposes `.logits`.
    tokenizer: Tokenizer called on `sentence`, returning PyTorch tensors.

  Returns:
    int: Index of the highest-scoring class.
  """
  encoded = tokenizer(sentence, add_special_tokens=True, padding="longest", return_tensors="pt")
  encoded = encoded.to(settings.device)

  # Inference only: disable autograd so no computation graph is built per call
  with torch.no_grad():
    logits = model(**encoded).logits
    probabilities = torch.softmax(logits, dim=-1)

  scores = probabilities.cpu().numpy()[0, :]
  prediction = int(np.argmax(scores))

  return prediction


def evaluate(data, model, tokenizer, verbose=True):
  """Score the model sentence by sentence on a labelled DataFrame.

  Args:
    data: DataFrame with a "Sentence" and a "Sentiment" column.
    model: Classification model; it is switched to eval mode and left there.
    tokenizer: Tokenizer forwarded to `predict`.
    verbose: If True, print every sentence with its prediction and true label.

  Returns:
    Tuple `(mean_error, accuracy)`: the mean absolute difference between
    predicted and true class index, and the fraction of exact matches.

  Raises:
    ZeroDivisionError: If `data` has no rows.
  """
  model.eval()

  error = 0
  num_correct = 0
  # No gradients are needed for scoring, so disable autograd for the whole loop.
  with torch.no_grad():
    # iterrows() is a generator without a length; pass the total explicitly so
    # the progress bar can show progress and an ETA instead of a bare counter.
    for index, row in tqdm(data.iterrows(), total=len(data)):
      sentence, sentiment = row["Sentence"], row["Sentiment"]
      prediction = predict(sentence, model, tokenizer)
      error += abs(prediction - sentiment)
      num_correct += 1 if prediction==sentiment else 0

      if verbose:
        print(f"Sentence: {sentence}")
        print(f"Prediction/Truth: {prediction} / {sentiment} (Diff={abs(prediction-sentiment)})")

  # Get mean error and accuracy
  mean_error = error / len(data)
  accuracy = num_correct / len(data)

  return mean_error, accuracy

In [121]:
# NOTE(review): this scores the model on train_df, i.e. the same frame the training
# dataset was built from; eval_df is loaded above but is not evaluated here — confirm
# whether the reported numbers are meant to be training-set metrics.
mean_error, accuracy = evaluate(train_df, model, tokenizer, verbose=False)
print(f"Mean error: {mean_error}")
print(f"Accuracy: {accuracy}")

0it [00:00, ?it/s]

Mean error: 0.06425
Accuracy: 0.9711


## Interactive test

In [111]:
#es_label2int = {'alegría': 5, 'amor': 5, 'ira': 2, 'miedo': 1, 'sorpresa': 4, 'tristeza': 1}
# Free-text Colab form field: classify one sentence and print the zero-based class index.
# The commented-out mapping above uses values 1..5; load_dataframe subtracts 1 from file
# labels, so predictions are in 0..4.
sentence = "Llevo mucho tiempo sin poder salir de fiesta"  #@param {"type": "string"}
prediction = predict(sentence, model, tokenizer)
print(f"Prediction = {prediction}")

Prediction = 0


## Main for usage

In [113]:
!cp {settings.model_chkp_path} BETO/
!mv BETO/{settings.model_chkp_path} BETO/pytorch_model.bin

mv: cannot stat 'BETO//content/drive/MyDrive/HackathonMaratoTV3/BETO.chkp': No such file or directory


In [114]:
# Install dependencies
#!pip install numpy
#!pip install torch
#!pip install transformers


################################ Imports ################################
import torch
from transformers import BertForSequenceClassification, BertTokenizer
import numpy as np


################################ Settings ################################
# Folder handed to from_pretrained() for both the model and the tokenizer
MODEL_TO_USE = "BETO/"
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu" # Use Nvidia GPU if is available


################################ Create model ################################
# 5 output classes, same as in the training part of the notebook
model = BertForSequenceClassification.from_pretrained(MODEL_TO_USE, num_labels=5)
# Total parameter count: sum over all parameter tensors of the product of their dimensions
print(f"Model size = {sum([np.prod(p.size()) for p in model.parameters()])}")
model = model.to(DEVICE)


################################ Create tokenizer ################################
# Same tokenizer options as the training part (do_lower_case=False)
tokenizer = BertTokenizer.from_pretrained(MODEL_TO_USE, do_lower_case=False)
print(f"Number of tokens = {len(tokenizer)}")


################################ Prediction function ################################
def predict(sentence, model, tokenizer):
  """Return the predicted class index for a sentence.

  Args:
    sentence: Text to classify. It is passed to the tokenizer as is; only the
      first row of the model output is used.
    model: Classification model whose output exposes `.logits`.
    tokenizer: Tokenizer called on `sentence`, returning PyTorch tensors.

  Returns:
    Index of the highest logit, as returned by `np.argmax` (NumPy integer).
  """
  encoded = tokenizer(sentence, add_special_tokens=True, padding="longest", return_tensors="pt")
  encoded = encoded.to(DEVICE)

  # Inference only: disable autograd so no computation graph is built per call
  with torch.no_grad():
    logits = model(**encoded).logits

  scores = logits.cpu().numpy()
  prediction = np.argmax(scores, axis=1)[0]

  return prediction


################################ Test ################################
# Smoke test of the standalone pipeline: classify one sentence and print the class index
sentence = "Estoy genial!"
prediction = predict(sentence, model, tokenizer)
print(f"Prediction = {prediction}")

Model size = 109854725
Number of tokens = 31002
Prediction = 4
