# Install Dependency

In [1]:
!pip install transformers
!pip install tqdm









# Import lib

In [2]:
import os
import shutil
METRIC_STAT_PATH = os.path.join("Model", "metrics_state")
MODEL_PATH =  os.path.join("Model", "Model")
RESULT_PATH = os.path.join("Model", "result")
LOGS_PATH = os.path.join("Model", "logs")

In [3]:
import torch

In [4]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
SAMPLE = 700000

In [9]:
TRANSFORMER = "distilbert-base-uncased"

# Import data

In [10]:
dataframe = pd.read_csv(os.path.join("train", "training_noemoticon.csv"), encoding='latin-1').sample(SAMPLE)[['id', 'text']]
dataframe

Unnamed: 0,id,text
1319902,1,codinghorror small world she class
844765,1,frankbauer could nt agree interesting concep...
1088731,1,terrencej i lol so one thousand follower
124754,0,feel like close friend hate
461805,0,not much progress weekend
...,...,...
782302,0,i wan na see transformer how long do i have t...
509961,0,brynxo fuck miss arena football game it free...
1122290,1,dalydegagne thanks retweeting daly
1192658,1,sound check calvary we play big sanctuary


# Dataset split and tokenize

In [11]:
class twitterDataset(torch.utils.data.Dataset):
    def __init__(self, encoding, labels):
        self.encoding = encoding
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key : torch.tensor(val[idx]) for key, val in self.encoding.items()}
        item['label'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

In [12]:
X_train , X_test, y_train, y_test = train_test_split(dataframe["text"], dataframe["id"].to_numpy().flatten(), test_size=0.10,  random_state=42)

In [13]:
tokenizer = DistilBertTokenizerFast.from_pretrained(TRANSFORMER)
train_dataset = twitterDataset(tokenizer(X_train.tolist(), padding="max_length", truncation=True), y_train.tolist())
test_dataset = twitterDataset(tokenizer(X_test.tolist(), padding="max_length", truncation=True), y_test.tolist())

# Train model (Bert)

In [14]:
model_bert = DistilBertForSequenceClassification.from_pretrained(TRANSFORMER, num_labels=2)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [15]:
train_args = TrainingArguments(
    output_dir= RESULT_PATH,
    num_train_epochs=2,
    per_device_eval_batch_size=64,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.001,
    logging_dir=LOGS_PATH,
    logging_steps=10,
    optim="adamw_torch"
    )

In [16]:
trainer = Trainer(
    model= model_bert,
    args=train_args,
    train_dataset=train_dataset
    )

In [17]:
trainer.train()

  0%|          | 0/31625 [00:00<?, ?it/s]

Step,Training Loss
71010,0.3729
71020,0.3161
71030,0.4097
71040,0.371
71050,0.3199
71060,0.341
71070,0.3858
71080,0.3299
71090,0.4424
71100,0.3237


TrainOutput(global_step=78750, training_loss=0.0356460635170104, metrics={'train_runtime': 2089.2554, 'train_samples_per_second': 603.086, 'train_steps_per_second': 37.693, 'total_flos': 1.6690892230656e+17, 'train_loss': 0.0356460635170104, 'epoch': 2.0})

# Evaluation

In [26]:
def predict_(model, encoding, device):
    input_ids = encoding["input_ids"].detach().clone().to(device)
    att_mask = encoding["attention_mask"].detach().clone().to(device)
    with torch.no_grad():
        y = np.argmax(model(input_ids.unsqueeze(0), att_mask.unsqueeze(0))[0].to('cpu').numpy())
    return y

In [28]:
from sklearn.metrics import accuracy_score
accuracy_score([item['label']  for item in tqdm(test_dataset)],\
               [predict_(model_bert, encoding, device) for encoding in tqdm(test_dataset)])

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 70000/70000 [00:03<00:00, 23116.13it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 70000/70000 [08:35<00:00, 135.90it/s]


0.8401857142857143

In [29]:
gc.collect()
torch.cuda.empty_cache()

# Save the model

In [34]:
trainer.save_model(MODEL_PATH)

TypeError: save_metrics() missing 1 required positional argument: 'metrics'