<a href="https://colab.research.google.com/github/Myashka/IVICT.HACK-team-HI/blob/baseline/Train_with_huggy_face_failed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install -q transformers

[K     |████████████████████████████████| 4.2 MB 13.7 MB/s 
[K     |████████████████████████████████| 6.6 MB 56.6 MB/s 
[K     |████████████████████████████████| 84 kB 4.0 MB/s 
[K     |████████████████████████████████| 596 kB 32.5 MB/s 
[?25h

In [2]:
import pandas as pd
import torch
import transformers
from torch.utils.data import DataLoader, Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DistilBertForSequenceClassification, DistilBertModel,
                          PreTrainedModel, Trainer, TrainingArguments)
from transformers.modeling_outputs import SequenceClassifierOutput

In [3]:
from torch import cuda

device = "cuda" if cuda.is_available() else "cpu"

In [4]:
import time
import warnings

import numpy as np
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
from tqdm.auto import tqdm

warnings.simplefilter("ignore")
import os

In [5]:
!wget --no-check-certificate https://raw.githubusercontent.com/Myashka/IVICT.HACK-team-HI/data/dataset.csv

--2022-05-22 06:56:19--  https://raw.githubusercontent.com/Myashka/IVICT.HACK-team-HI/data/dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8106782 (7.7M) [text/plain]
Saving to: ‘dataset.csv’


2022-05-22 06:56:20 (103 MB/s) - ‘dataset.csv’ saved [8106782/8106782]



### Data preprocessing

In [6]:
df = pd.read_csv("dataset.csv", usecols=[1, 2])
df.label.unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger',
       'joy'], dtype=object)

In [7]:
df = pd.get_dummies(df, prefix=[""], columns=["label"], drop_first=False)
df

Unnamed: 0,text,_anger,_boredom,_empty,_enthusiasm,_fun,_happiness,_hate,_joy,_love,_neutral,_relief,_sadness,_surprise,_worry
0,@tiffanylue i know i was listenin to bad habi...,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,Layin n bed with a headache ughhhh...waitin o...,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,Funeral ceremony...gloomy friday...,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,wants to hang out with friends SOON!,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,@dannycastillo We want to trade with someone w...,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81454,Melissa stared at her friend in dism,0,0,0,0,0,0,0,0,0,0,0,0,0,1
81455,Successive state elections have seen the gover...,0,0,0,0,0,0,0,0,0,0,0,0,0,1
81456,Vincent was irritated but not dismay,0,0,0,0,0,0,0,0,0,0,0,0,0,1
81457,Kendall-Hume turned back to face the dismayed ...,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [8]:
df.rename(
    columns={
        "_anger": "anger",
        "_boredom": "boredom",
        "_empty": "empty",
        "_enthusiasm": "enthusiasm",
        "_fun": "fun",
        "_happiness": "happiness",
        "_hate": "hate",
        "_joy": "joy",
        "_love": "love",
        "_neutral": "neutral",
        "_relief": "relief",
        "_sadness": "sadness",
        "_surprise": "surprise",
        "_worry": "worry",
    },
    inplace=True,
)

In [9]:
label_cols = [
    "anger",
    "boredom",
    "empty",
    "enthusiasm",
    "fun",
    "happiness",
    "hate",
    "joy",
    "love",
    "neutral",
    "relief",
    "sadness",
    "surprise",
    "worry",
]
len(label_cols)

14

In [10]:
df.columns

Index(['text', 'anger', 'boredom', 'empty', 'enthusiasm', 'fun', 'happiness',
       'hate', 'joy', 'love', 'neutral', 'relief', 'sadness', 'surprise',
       'worry'],
      dtype='object')

In [11]:
id2label = {str(i): label for i, label in enumerate(label_cols)}
label2id = {label: str(i) for i, label in enumerate(label_cols)}

In [12]:
df["labels"] = df[label_cols].values.tolist()
df.head()

Unnamed: 0,text,anger,boredom,empty,enthusiasm,fun,happiness,hate,joy,love,neutral,relief,sadness,surprise,worry,labels
0,@tiffanylue i know i was listenin to bad habi...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,Layin n bed with a headache ughhhh...waitin o...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
2,Funeral ceremony...gloomy friday...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
3,wants to hang out with friends SOON!,0,0,0,1,0,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,@dannycastillo We want to trade with someone w...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"


In [13]:
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]

(df_train.shape, df_test.shape)

((65045, 16), (16414, 16))

### Tokenize and encode

In [14]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [15]:
train_encodings = tokenizer(
    df_train["text"].values.tolist(), truncation=True, padding=True, max_length=64
)
test_encodings = tokenizer(
    df_test["text"].values.tolist(), truncation=True, padding=True
)

In [16]:
train_labels = df_train["labels"].values.tolist()
test_labels = df_test["labels"].values.tolist()

In [17]:
class Emotion_dataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [18]:
train_dataset = Emotion_dataset(train_encodings, train_labels)
test_dataset = Emotion_dataset(test_encodings, test_labels)

In [19]:
train_dataset[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([  101,  1030, 14381,  7630,  2063,  1045,  2113,  1045,  2001,  4952,
          2378,  2000,  2919, 10427,  3041,  1998,  1045,  2318, 11576,  2378,
          2012,  2010,  2112,  1027,  1031,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 'labels': tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

In [20]:
print(f"Tokens: {tokenizer.convert_ids_to_tokens(train_dataset[0]['input_ids'])}")
print(f"Decoded sequence: '{tokenizer.decode(train_dataset[0]['input_ids'])}'")

Tokens: ['[CLS]', '@', 'tiffany', '##lu', '##e', 'i', 'know', 'i', 'was', 'listen', '##in', 'to', 'bad', 'habit', 'earlier', 'and', 'i', 'started', 'freak', '##in', 'at', 'his', 'part', '=', '[', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
Decoded sequence: '[CLS] @ tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part = [ [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'


### Fine-tuning

In [21]:
class DistilBertForMultilabelSequenceClassification(
    DistilBertForSequenceClassification
):
    def __init__(self, config):
        super().__init__(config)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.distilbert(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_state = outputs[0]
        pooled_output = hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(
                logits.view(-1, self.num_labels),
                labels.float().view(-1, self.num_labels),
            )
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [22]:
num_labels = 14
model = DistilBertForMultilabelSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
).to(device)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForMultilabelSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForMultilabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForMultilabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForMultilabelSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier

In [23]:
model.config.id2label = id2label
model.config.label2id = label2id
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "anger",
    "1": "boredom",
    "10": "relief",
    "11": "sadness",
    "12": "surprise",
    "13": "worry",
    "2": "empty",
    "3": "enthusiasm",
    "4": "fun",
    "5": "happiness",
    "6": "hate",
    "7": "joy",
    "8": "love",
    "9": "neutral"
  },
  "initializer_range": 0.02,
  "label2id": {
    "anger": "0",
    "boredom": "1",
    "empty": "2",
    "enthusiasm": "3",
    "fun": "4",
    "happiness": "5",
    "hate": "6",
    "joy": "7",
    "love": "8",
    "neutral": "9",
    "relief": "10",
    "sadness": "11",
    "surprise": "12",
    "worry": "13"
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2

In [24]:
!pip install scikit-plot

Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7


In [25]:
from scikitplot.metrics import plot_roc
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [26]:
def multi_label_metrics(predictions, labels, threshold=0.5):

    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))

    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1

    y_true = labels
    f1_macro_average = f1_score(y_true=y_true, y_pred=y_pred, average="weighted")
    roc_auc = roc_auc_score(y_true, y_pred, average="weighted")
    accuracy = accuracy_score(y_true, y_pred)

    metrics = {"f1": f1_macro_average, "roc_auc": roc_auc, "accuracy": accuracy}
    return metrics

In [27]:
def compute_metrics(p):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

In [28]:
batch_size = 32
# configure logging so we see training loss
logging_steps = len(train_dataset) // batch_size

args = TrainingArguments(
    output_dir="emotion",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=logging_steps,
)

In [29]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [30]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 16414
  Batch size = 32


{'eval_accuracy': 0.0,
 'eval_f1': 0.2086090674264023,
 'eval_loss': 0.7859268188476562,
 'eval_roc_auc': 0.4947127858821956,
 'eval_runtime': 57.6962,
 'eval_samples_per_second': 284.49,
 'eval_steps_per_second': 8.891}

In [32]:
trainer.train()

***** Running training *****
  Num examples = 65045
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 6099


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.108,0.118481,0.568065,0.737021,0.442854
2,0.1081,0.118481,0.568065,0.737021,0.442854


***** Running Evaluation *****
  Num examples = 16414
  Batch size = 32
Saving model checkpoint to emotion/checkpoint-2033
Configuration saved in emotion/checkpoint-2033/config.json
Model weights saved in emotion/checkpoint-2033/pytorch_model.bin
tokenizer config file saved in emotion/checkpoint-2033/tokenizer_config.json
Special tokens file saved in emotion/checkpoint-2033/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 16414
  Batch size = 32
Saving model checkpoint to emotion/checkpoint-4066
Configuration saved in emotion/checkpoint-4066/config.json
Model weights saved in emotion/checkpoint-4066/pytorch_model.bin
tokenizer config file saved in emotion/checkpoint-4066/tokenizer_config.json
Special tokens file saved in emotion/checkpoint-4066/special_tokens_map.json


KeyboardInterrupt: ignored

In [None]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 16209
  Batch size = 32


{'epoch': 3.0,
 'eval_accuracy_thresh': 0.9509178996086121,
 'eval_loss': 0.11808639764785767,
 'eval_runtime': 45.764,
 'eval_samples_per_second': 354.187,
 'eval_steps_per_second': 11.079}