In [None]:
## Speech and Natural Language Processing (SNLP)

## Lab05 
## Finetuning Transformers 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and echo the full path of each file.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
import transformers
from sklearn import model_selection, metrics

In [None]:
# Run-wide settings gathered in one mapping so they can be tuned in one place.
config = dict(
    # Tokenization / checkpoint
    max_length=360,
    model_path="microsoft/xtremedistil-l6-h256-uncased",

    # Training
    output_dir="./my-model",
    train_batch_size=64,
    valid_batch_size=64,
    learning_rate=3e-5,
    epochs=3,

    # Misc
    debug=True,
)

In [None]:
# Load the tokenizer for the checkpoint named in config["model_path"].
# TextDataset below looks up this module-level `tokenizer` when an item is indexed.
tokenizer = transformers.AutoTokenizer.from_pretrained(config["model_path"])
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one row of a DataFrame per access.

    Args:
        data: pandas DataFrame with a "text" column (the raw string to
            tokenize) and a "label" column (integer class id).
        tokenizer: Optional tokenizer callable. When omitted, the
            notebook-level ``tokenizer`` is looked up at access time, which
            is the behaviour existing ``TextDataset(df)`` calls rely on.
        max_length: Optional padded/truncated sequence length. When omitted,
            ``config["max_length"]`` is read at access time.
    """

    def __init__(self, data, tokenizer=None, max_length=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (examples) in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the row at positional index ``idx``.

        Returns:
            dict with "input_ids", "attention_mask" and "labels" tensors.
            The first two are padded/truncated to the maximum length.
        """
        row = self.data.iloc[idx]

        # Resolve the fallbacks lazily so a tokenizer/config (re)defined after
        # the dataset was constructed is still picked up.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer
        active_max_length = (
            self.max_length if self.max_length is not None else config["max_length"]
        )

        enc = active_tokenizer(
            row["text"],
            add_special_tokens=True,
            max_length=active_max_length,
            padding="max_length",
            truncation=True,
        )

        # Explicit integer dtypes: token ids and class ids must be long tensors
        # regardless of the numpy/pandas dtype the label column happens to have.
        return {
            "input_ids": torch.tensor(enc["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(enc["attention_mask"], dtype=torch.long),
            "labels": torch.tensor(int(row["label"]), dtype=torch.long),
        }

In [None]:
from sklearn.preprocessing import LabelEncoder

# Load the IMDB reviews. The "review" column is renamed to "text" because
# TextDataset reads row["text"].
df = pd.read_csv(
    "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
).rename(columns={"review": "text"})

# Encode the sentiment strings as integer class ids. LabelEncoder numbers the
# classes in sorted order; the fitted encoder is kept so the ids can be mapped
# back to their names later.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['sentiment'])  # 0 for negative, 1 for positive

df.head()

In [None]:
# The tokenizer for config["model_path"] is already loaded in the cell that
# defines TextDataset; that instance is reused here rather than loading the
# identical tokenizer a second time.

In [None]:
# Hold out 20% of the rows for validation. The split is shuffled with a fixed
# seed and stratified on the label column so both parts keep the class balance.
train, valid = model_selection.train_test_split(
    df,
    stratify=df["label"],
    shuffle=True,
    test_size=0.2,
    random_state=23,
)

In [None]:
# Wrap each split in a TextDataset; rows are tokenized only when indexed.
train_ds, valid_ds = (TextDataset(split) for split in (train, valid))

In [None]:
# Sanity check: display one encoded validation example
# (a dict with "input_ids", "attention_mask" and "labels" tensors).
valid_ds[0]

In [None]:
# Class-id <-> class-name mappings derived from the fitted LabelEncoder, so
# that index i of the classifier head corresponds to the class encoded as i
# and the saved model config carries readable label names.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

# Load the pretrained encoder with a classification head sized to the number
# of classes actually present in the data.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    config["model_path"],
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:

def compute_metrics(eval_data):
    """Compute classification metrics from one evaluation pass.

    Intended as the ``compute_metrics`` hook of ``transformers.Trainer``.

    Args:
        eval_data: Object exposing ``predictions`` (per-class scores; the
            arg-max over the last axis is taken as the predicted class id)
            and ``label_ids`` (the reference class ids).

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` (scalars)
        and ``classification_report`` (the nested per-class report dict).
    """
    y_pred = eval_data.predictions.argmax(-1)
    y_true = eval_data.label_ids

    # Class names come from the fitted LabelEncoder, whose position i is the
    # name of the class encoded as integer i.
    class_names = [str(name) for name in label_encoder.classes_]

    return {
        'accuracy': metrics.accuracy_score(y_true, y_pred),
        'precision': metrics.precision_score(y_true, y_pred),
        'recall': metrics.recall_score(y_true, y_pred),
        'classification_report': metrics.classification_report(
            y_true,
            y_pred,
            # Pin the label ids so the report keeps one row per known class
            # even if a class is absent from this evaluation batch.
            labels=list(range(len(class_names))),
            target_names=class_names,
            output_dict=True,
        ),
    }

# Hyper-parameters for the Trainer. Batch sizes, learning rate and epoch count
# are read from `config` so that editing the config cell actually changes the run.
# NOTE(review): config["output_dir"] is not consumed here; checkpoints and the
# final model still go to "./results" — confirm which directory is intended.
training_args = transformers.TrainingArguments(
    output_dir="./results",                                   # Directory for checkpoints and saved results
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — adjust if the installed version rejects it.
    evaluation_strategy="steps",                              # Evaluate every few steps
    per_device_train_batch_size=config['train_batch_size'],   # Batch size per device during training
    per_device_eval_batch_size=config['valid_batch_size'],    # Batch size per device during evaluation
    learning_rate=config['learning_rate'],                    # Peak learning rate of the optimizer
    num_train_epochs=config['epochs'],                        # Total number of training epochs
    warmup_steps=500,                                         # Number of warmup steps for learning rate scheduler
    save_total_limit=2,                                       # Limit the total amount of checkpoints
    logging_dir=None,                                         # Leave the log directory at the library default
    logging_strategy="no",                                    # Do not emit periodic training logs
    report_to=[],                                             # No external experiment trackers
)

In [None]:
# NOTE(review): leftover scratch cell — it only prints an empty line.
print()

In [None]:
# Assemble the Trainer: the model and its run settings, the tokenizer, the
# metric hook invoked on evaluation, and the train/validation datasets.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
)

In [None]:
# Run fine-tuning with the settings in training_args; the returned training
# summary is displayed as the cell output.
trainer.train()

In [None]:
# Persist the Trainer's bookkeeping state.
# NOTE(review): presumably written under the Trainer's output directory — confirm
# against the installed transformers version.
trainer.save_state()

In [None]:
# Save the fine-tuned model. No path is passed, so the Trainer chooses the
# destination (presumably the output_dir given in training_args — confirm).
trainer.save_model()