# Fine-tuning DistilBERT

## Setup

Set up Google Drive

In [None]:
# Mount Google Drive inside the Colab VM; the dataset and output paths used
# by this notebook all live under the '/content/gdrive' mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Install libraries

In [None]:
# Presumably required as the backend for Trainer.hyperparameter_search -- TODO confirm and pin a version.
# %pip installs into the running kernel's environment; the quotes keep the shell from globbing the brackets.
%pip install "ray[tune]"

Imports

In [None]:
from transformers import AutoModel, AutoTokenizer, PreTrainedTokenizer, PreTrainedModel, \
    AutoModelForSequenceClassification, TrainingArguments
from transformers import Trainer
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report,  precision_recall_fscore_support
import pandas as pd
from pandas import DataFrame
from enum import Enum
import numpy as np
import random
from tqdm import tqdm
import re

Constants

In [None]:
# Project root on the mounted Google Drive; every other path hangs off it.
BASE_PATH = '/content/gdrive/My Drive/AI'

# Tab-separated input files for the training and evaluation splits.
FORMATTED_DATA_PATH_TRAIN = f'{BASE_PATH}/notebooks/dataset/train-en.tsv'
FORMATTED_DATA_PATH_EVAL = f'{BASE_PATH}/notebooks/dataset/eval-en.tsv'

# Destinations handed to the Trainer for saved models and logs.
TRAINER_MODEL_SAVE_PATH = f'{BASE_PATH}/notebooks/output/models'
TRAINER_LOGS_SAVE_PATH = f'{BASE_PATH}/notebooks/output/logs'

# Pretrained checkpoint to fine-tune.
MODEL_NAME = "distilbert-base-uncased"

# Seed used when shuffling the DataFrame rows.
RANDOM_STATE = 42

In [None]:
class Constants:
    """String constants shared across the notebook."""

    # Separator characters.
    TAB = '\t'
    NEW_LINE = '\n'

    # DataFrame column names.
    INTENTS = 'intents'
    REQUESTS = 'requests'


# Short alias used by the rest of the notebook (the class itself, not an instance).
const = Constants

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Number of devices:', torch.cuda.device_count())
# torch.cuda.get_device_name() raises when no CUDA device is available, so it
# is only queried on the GPU path; the CPU fallback prints a placeholder.
if device.type == 'cuda':
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('Device name:', 'n/a (no CUDA device)')
print('Device type:', device)

Number of devices: 1
Device name: Tesla T4
Device type: cuda


## Prepare data

In [None]:
class DatasetType(Enum):
    """Selects which dataset split to work with.

    Note that `read_data` only accepts TRAIN and EVAL; TEST is declared but
    has no data file associated with it there.
    """

    # Plain integers on purpose: a trailing comma after the value would turn
    # it into a one-element tuple, e.g. `TRAIN = 0,` yields the value `(0,)`.
    TRAIN = 0
    EVAL = 1
    TEST = 2

def read_data(dataset_type: DatasetType = DatasetType.TRAIN) -> DataFrame:
    """Load one dataset split from its tab-separated file.

    :param dataset_type: DatasetType.TRAIN or DatasetType.EVAL.
    :return: DataFrame whose columns are named (intents, requests), in that order.
    :raises Exception: for any other value of `dataset_type`.
    """
    wants_train = dataset_type == DatasetType.TRAIN
    wants_eval = dataset_type == DatasetType.EVAL
    if not (wants_train or wants_eval):
        raise Exception('method parameter not valid')

    tsv_path = FORMATTED_DATA_PATH_TRAIN if wants_train else FORMATTED_DATA_PATH_EVAL
    column_names = [const.INTENTS, const.REQUESTS]
    return pd.read_csv(tsv_path, delimiter=const.TAB, names=column_names)


def maybe_replace(row: str, dictionary: dict) -> str:
    """Apply every `key -> value` substitution of `dictionary` to `row`.

    Substitutions run one after another in the dictionary's iteration order,
    so a later key can match text produced by an earlier replacement.

    :param row: text to rewrite.
    :param dictionary: maps a substring to the text that replaces it.
    :return: the rewritten text.
    """
    rewritten = row
    for needle in dictionary:
        if needle not in rewritten:
            continue
        rewritten = rewritten.replace(needle, dictionary[needle])
    return rewritten


def fix_misspelled_words(df: DataFrame) -> DataFrame:
    """Expand abbreviations and correct known misspellings in the requests column.

    The requests column of `df` is overwritten in place, and the same frame is
    returned for convenience.

    :param df: frame holding the request texts in the `const.REQUESTS` column.
    :return: `df` itself, with the cleaned requests column.
    """
    # obtained using SpellChecker package and spacy tokenization
    before_tokenization = {"a.m.": "am", "a.m": "am", "p.m.": "pm",
                           "p.m": "pm", "wed.": "wednesday", "dr.": "doctor"}
    after_tokenization = {" f ": " fahrenheit ", " c ": " celsius ", " dr ": " doctor ", " appt ": " appointment ", " fl ": " florida ",
                          " st. ": " saint ", " st ": " saint ", " nyc ": " new york city ", " nc ": " north carolina ",
                          " nj ": " new jersey ", " dc ": " district of columbia ", "celcius": "celsius", " wed ": " wednesday ",
                          "mintues": "minutes", "snoozes": "snooze", "forcast": "forecast", "tempature": "temperature",
                          "tomorrow/": "tomorrow", "temperture": "temperature", " hrs ": " hours ",
                          "apointment": "appointment", "tempurature": "temperature", "bejing": " beijing ",
                          " thurs ": " thursday ", " bday ": " birthday ", " avg ": " average ", "exerice": " exercise ",
                          "altanta": " atlanta "}

    requests = df[const.REQUESTS]

    # fix abbreviations: literal (non-regex) replacement, applied in dictionary
    # order so that "a.m." is handled before its prefix "a.m"
    for key, value in before_tokenization.items():
        requests = requests.str.replace(key, value, regex=False)

    # fix misspelled words: map over the column itself rather than applying a
    # function to every row of the frame -- only this column is needed, and
    # Series.map also behaves on an empty column
    requests = requests.map(lambda request: maybe_replace(request, after_tokenization))

    # single write-back into the caller's frame
    df[const.REQUESTS] = requests
    return df


def preprocess(df: DataFrame, fix_misspelled=True, shuffle=True) -> DataFrame:
    """Clean the request texts and return a new, optionally shuffled frame.

    Steps: collapse whitespace, lowercase, drop rows containing unwanted words,
    optionally fix abbreviations/misspellings, drop duplicate requests,
    optionally shuffle. The input frame is not modified.

    :param df: frame holding the request texts in the `const.REQUESTS` column.
    :param fix_misspelled: when True, run `fix_misspelled_words` on the requests.
    :param shuffle: when True, shuffle the rows using RANDOM_STATE.
    :return: the cleaned frame.
    """
    # Work on a private copy so the caller's frame is left untouched.
    df = df.copy()

    # fix whitespaces, then lower
    df[const.REQUESTS] = df[const.REQUESTS].map(lambda request: ' '.join(request.split()).lower())

    to_remove = ["weekdaily"]  # remove strange words
    for word in to_remove:
        # literal match: the entries are plain words, not regular expressions
        df = df[~df[const.REQUESTS].str.contains(word, regex=False)]

    # Boolean filtering yields an object pandas treats as a slice of the
    # original; writing into it in place raises SettingWithCopyWarning, so
    # take an explicit copy before the in-place spelling fix.
    df = df.copy()

    if fix_misspelled:
        df = fix_misspelled_words(df)

    df = df.drop_duplicates(subset=const.REQUESTS)  # drop duplicates

    if shuffle:
        df = df.sample(frac=1, random_state=RANDOM_STATE)  # shuffle the DataFrame rows
    return df

In [None]:
class CustomDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with integer labels.

    `encodings` is a mapping such as {'input_ids': [...], 'attention_mask': [...]}
    in which every value can be indexed by sample position; `labels` holds one
    label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One label per sample, so the label count is the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build the sample field by field, converting each entry to a tensor.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
def get_pt_dataset(dataset_type: DatasetType, tokenizer: PreTrainedTokenizer, num_of_samples: int = 1000) -> Dataset:
    """Read, preprocess and tokenize one dataset split.

    TODO: use train_test_split instead of num_of_samples in order to keep label distribution

    :param dataset_type: split to load (forwarded to `read_data`).
    :param tokenizer: tokenizer called on the list of request texts.
    :param num_of_samples: maximum number of (shuffled) rows to keep; a negative
        value or None keeps every row.
    :return: a `CustomDataset` with the encodings and integer intent labels.
    :raises KeyError: if a row carries an intent missing from `get_intent_to_id()`.
    """
    df = read_data(dataset_type)
    df = preprocess(df, fix_misspelled=True, shuffle=True)

    # A negative limit means "use everything". Slicing with [:-1] directly
    # would instead silently drop the last row.
    if num_of_samples is not None and num_of_samples >= 0:
        df = df.iloc[:num_of_samples]

    inputs = df[const.REQUESTS].tolist()
    encodings = tokenizer(inputs, truncation=True, padding=True)

    intent_to_id = get_intent_to_id()
    labels = [intent_to_id[label] for label in df[const.INTENTS]]

    return CustomDataset(encodings, labels)

In [None]:
def get_intent_to_id():
    """Return a fresh dict mapping each intent name to its integer class id."""
    # Position in this tuple is the class id.
    intents_in_id_order = (
        'weather/find',
        'alarm/set_alarm',
        'alarm/show_alarms',
        'reminder/set_reminder',
        'alarm/modify_alarm',
        'weather/checkSunrise',
        'weather/checkSunset',
        'alarm/snooze_alarm',
        'alarm/cancel_alarm',
        'reminder/show_reminders',
        'reminder/cancel_reminder',
        'alarm/time_left_on_alarm',
    )
    return {intent: class_id for class_id, intent in enumerate(intents_in_id_order)}

## Deep Learning!

In [None]:
def compute_metrics(pred):
    """Compute accuracy and micro-averaged precision/recall/F1 for the Trainer.

    :param pred: object exposing `label_ids` (true labels) and `predictions`
        (scores; the argmax over the last axis is taken as the predicted class).
    :return: dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # NOTE(review): with exactly one label per sample, micro-averaged
    # precision, recall and F1 all coincide with accuracy.
    prf_support = precision_recall_fscore_support(y_true, y_pred, average='micro')
    micro_precision, micro_recall, micro_f1 = prf_support[0], prf_support[1], prf_support[2]

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': micro_f1,
        'precision': micro_precision,
        'recall': micro_recall
    }

In [None]:
def finetunning_trainer_class(model_name):
    """Fine-tune a sequence-classification model with the Trainer, evaluate it and save it.

    Uses the hyperparameters of the best search run recorded below. Prints the
    training output and the evaluation metrics, then saves the model to
    TRAINER_MODEL_SAVE_PATH.

    :param model_name: name or path of the pretrained checkpoint to load.
    """
    # Function-scope import so this cell does not depend on the import cell being edited.
    from transformers import set_seed

    # best run: learning_rate': 1.5930522616241033e-05, 'num_train_epochs': 3, 'seed': 28.614830534045772, 'per_device_train_batch_size': 4
    params = {
        'num_of_samples': 1000,  # maximum number of rows taken from each split
        'train_batch_size': 4,
        'eval_batch_size': 4,
        'learning_rate': 1.5930522616241033e-05,
        'num_of_epochs': 3,
        'seed': 28.614830534045772
    }
    # The search reported the seed as a float; seeds have to be integers.
    seed = int(params['seed'])

    # Seed before the model is created so the randomly initialized
    # classification head is reproducible as well.
    set_seed(seed)

    # 1) prepare data
    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(model_name)
    train_dataset = get_pt_dataset(DatasetType.TRAIN, tokenizer, params['num_of_samples'])
    eval_dataset = get_pt_dataset(DatasetType.EVAL, tokenizer, params['num_of_samples'])

    # 2) prepare for training
    num_labels = len(get_intent_to_id())
    # model with randomly initialized head
    model: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    model.to(device)

    # 3) train
    training_args = TrainingArguments(
        output_dir=TRAINER_MODEL_SAVE_PATH,
        num_train_epochs=params['num_of_epochs'],
        per_device_train_batch_size=params['train_batch_size'],
        per_device_eval_batch_size=params['eval_batch_size'],
        # Pass the tuned values explicitly; otherwise the library defaults are
        # used and the "best run" learning rate above has no effect.
        learning_rate=params['learning_rate'],
        seed=seed,
        #warmup_steps=5,
        #weight_decay=0.01,
        logging_dir=TRAINER_LOGS_SAVE_PATH
    )

    # default optimizer is AdamW
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    print(trainer.train())

    # 4) evaluate
    print(trainer.evaluate())

    # 5) save
    trainer.save_model(TRAINER_MODEL_SAVE_PATH)

In [None]:
def hyperparameter_search(model_name):
    """Search hyperparameters with the Trainer, then retrain with the best ones, evaluate and save.

    Runs 10 trials, copies the winning hyperparameters onto the trainer's
    arguments, trains once more, prints the evaluation metrics and saves the
    model to TRAINER_MODEL_SAVE_PATH.

    :param model_name: name or path of the pretrained checkpoint to load.
    """
    params = {
        'num_of_samples': 1000,
        'train_batch_size': 64,
        'eval_batch_size': 16,
        'learning_rate': 4.622589001020833e-05,
        'num_of_epochs': 3
    }

    # 1) prepare data
    tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(model_name)
    train_dataset = get_pt_dataset(DatasetType.TRAIN, tokenizer, params['num_of_samples'])
    eval_dataset = get_pt_dataset(DatasetType.EVAL, tokenizer, params['num_of_samples'])

    # 2) prepare for training
    num_labels = len(get_intent_to_id())

    # required for hyperparameter search: every trial needs a freshly created model
    def model_init():
        # model with randomly initialized head
        model: PreTrainedModel = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                                                    num_labels=num_labels)
        return model

    def objective(metrics):
        # Rank trials by evaluation F1 only. Without an explicit objective the
        # library falls back to a sum over the reported eval metrics, which --
        # depending on the installed transformers version -- may include values
        # unrelated to model quality (TODO confirm for the version in use).
        return metrics['eval_f1']

    # 3) train
    training_args = TrainingArguments(
        output_dir=TRAINER_MODEL_SAVE_PATH,
        num_train_epochs=params['num_of_epochs'],
        per_device_train_batch_size=params['train_batch_size'],
        per_device_eval_batch_size=params['eval_batch_size'],
        # baseline value; replaced by whatever the search picks
        learning_rate=params['learning_rate'],
        #warmup_steps=5,
        #weight_decay=0.01,
        logging_dir=TRAINER_LOGS_SAVE_PATH
    )

    # default optimizer is AdamW
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    print('############ running hyperparameter_search')
    best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize", compute_objective=objective)
    print(best_run)
    print()
    # Copy the winning hyperparameters onto the trainer before the final run.
    for n, v in best_run.hyperparameters.items():
        if n == 'seed':
            # the search can report the seed as a float; it has to be an integer
            v = int(v)
        setattr(trainer.args, n, v)
    print('############ training model')
    trainer.train()
    print('############ evaluating model')
    # 4) evaluate
    print(trainer.evaluate())

    # 5) save
    trainer.save_model(TRAINER_MODEL_SAVE_PATH)

In [None]:
# Entry point of the notebook: runs the single fine-tuning pass on MODEL_NAME.
# The hyperparameter search is kept as a commented-out alternative below.
finetunning_trainer_class(MODEL_NAME)
#hyperparameter_search(MODEL_NAME)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab

Step,Training Loss
500,0.4174
1000,0.1201
1500,0.1258
2000,0.1271
2500,0.079
3000,0.1228
3500,0.1107
4000,0.0847
4500,0.0703
5000,0.0946


TrainOutput(global_step=17223, training_loss=0.07795547537844348, metrics={'train_runtime': 3753.574, 'train_samples_per_second': 4.588, 'total_flos': 747337481200800.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -494145536, 'train_mem_gpu_alloc_delta': 807544320, 'train_mem_cpu_peaked_delta': 494223360, 'train_mem_gpu_peaked_delta': 96468992})


{'eval_loss': 0.05392073094844818, 'eval_accuracy': 0.9888918275588469, 'eval_f1': 0.9888918275588469, 'eval_precision': 0.9888918275588469, 'eval_recall': 0.9888918275588469, 'eval_runtime': 36.5253, 'eval_samples_per_second': 103.517, 'epoch': 3.0, 'eval_mem_cpu_alloc_delta': 1617920, 'eval_mem_gpu_alloc_delta': -3072, 'eval_mem_cpu_peaked_delta': 0, 'eval_mem_gpu_peaked_delta': 6530048}
