In [2]:
!pip install transformers datasets



## 1. Import Dependencies

In [3]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


## 2. Load Dataset

In [4]:
from datasets import load_dataset
# Load the 'emotion' dataset by name; the result is kept as `emotions` and
# indexed by split name in later cells.
emotions = load_dataset(path='emotion')

In [5]:
# Display the loaded dataset object (splits, feature names and row counts).
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [6]:
# Peek at the first record of the training split.
emotions['train'][0]

{'text': 'i didnt feel humiliated', 'label': 0}

## 3. Data pre-processing

### 3.1 From text to token
* Character Tokenization
* Word Tokenization
* <b>Subword Tokenization</b>

In [7]:
from transformers import AutoTokenizer
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [8]:
# Checkpoint identifier; the same name is reused later when the model is loaded.
model_name = "distilbert-base-uncased"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [9]:
# Report the vocabulary size of the loaded tokenizer.
print("Vocab Size of Pretrain Tokenizer: ",tokenizer.vocab_size)

Vocab Size of Pretrain Tokenizer:  30522


In [10]:
# Show the tokenizer's special-token mapping.
print("Special Token: ",tokenizer.special_tokens_map)

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Special Token:  {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}


In [11]:
# Run the tokenizer on a one-sentence batch and print the returned encoding.
print("Example: ")
print("Nice to meet you --> : ",tokenizer(['Nice to meet you']))


Example: 
Nice to meet you --> :  {'input_ids': [[101, 3835, 2000, 3113, 2017, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1]]}


In [12]:
def tokenize(batch):
    """Encode the ``text`` entries of ``batch`` with the notebook's tokenizer.

    Args:
        batch: Mapping with a ``'text'`` key holding the strings to encode.

    Returns:
        Whatever the tokenizer returns for those texts, called with padding
        and truncation both enabled.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded
# Apply `tokenize` to every split in batched mode. batch_size=None is passed
# explicitly rather than relying on the library default.
emotions_encoded = emotions.map(
    tokenize,
    batched=True,
    batch_size=None,
)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

## 4. Model

In [13]:
from transformers import AutoModelForSequenceClassification


In [14]:
# Read the number of classes from the dataset's label feature instead of
# hard-coding it, so the classification head always matches the data.
num_label = emotions["train"].features["label"].num_classes

# Load the pretrained checkpoint with a classification head of that size and
# move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_label,
).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Switch the encoded dataset to the 'torch' output format, limited to the
# three listed columns.
emotions_encoded.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'label'],
)

In [20]:
from huggingface_hub import notebook_login
# Open the Hugging Face Hub login widget inside the notebook.
# NOTE(review): nothing visible in this notebook pushes to the Hub — confirm
# the login is actually required.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) Y
Traceback (most recent call last):
  File "/usr/local/bin/huggingface

In [25]:
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }


In [18]:
from transformers import TrainingArguments,Trainer

In [21]:
# Hyperparameters shared by the TrainingArguments cell below.
batch_size, epochs = 32, 10
# Number of full training batches in one pass over the training split.
logging_steps = emotions_encoded["train"].num_rows // batch_size


In [23]:
# Evaluate and checkpoint once per epoch so that the best epoch, judged by
# weighted F1 on the validation split, is restored when training finishes.
# The logged runs peak around epoch 4 and then overfit (validation loss rises),
# so keeping the last-epoch weights would be worse than the best ones.
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=epochs,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    # Must match evaluation_strategy for load_best_model_at_end to be accepted.
    save_strategy="epoch",
    # Without this flag metric_for_best_model does not affect which weights
    # the trainer ends up holding.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Cap the checkpoints kept on disk; the best checkpoint is retained.
    save_total_limit=2,
    disable_tqdm=False,
    logging_steps=logging_steps,
)

In [None]:
# Wire the model, training arguments, data splits and metric function into a
# Trainer, then start fine-tuning. The trailing ';' suppresses the cell's
# return-value display.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["validation"],
    compute_metrics=compute_metrics,
)
trainer.train();

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6187,0.217447,0.9205,0.920774
2,0.1637,0.168956,0.9335,0.934294
3,0.1165,0.152791,0.9395,0.939744
4,0.0898,0.138543,0.943,0.942964
5,0.0711,0.182063,0.932,0.932502
6,0.0544,0.206597,0.933,0.932991
7,0.0461,0.21638,0.9365,0.936645
8,0.0342,0.23872,0.937,0.936866
9,0.0268,0.248187,0.936,0.936219
