# Imports

In [1]:
import pandas as pd

In [2]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.utils.data import Dataset

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
import torch
# Record the installed PyTorch version alongside the results.
print(torch.__version__)

2.0.1


In [5]:
# CUDA version the installed PyTorch build was compiled with, if any.
torch.version.cuda

In [6]:
# Choose the compute device once and report which one was selected.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
print(
    "CUDA is available. Training on GPU."
    if cuda_available
    else "CUDA is not available. Training on CPU."
)


CUDA is not available. Training on CPU.


In [7]:
# Load the movie-review frame (text + label columns); the path is relative to the notebook's working directory.
data = pd.read_feather("../data/movie_reviews_4k.feather")

In [8]:
# (rows, columns) of the loaded frame.
data.shape

(4000, 2)

In [9]:
# Preview the loaded reviews and their labels.
data

Unnamed: 0,text,label
0,I wanted to vote zero or lower. I loved the co...,0
1,"Karen(Bobbie Phillips)mentions, after one of h...",0
2,This review applies for the cut of the film th...,0
3,"The best film on the battle of San Antonio, Te...",1
4,"In theory, 'Director's Commentary' should have...",0
...,...,...
3995,Excellent show. Instead of watching the same o...,1
3996,"It's hard to believe an ""action"" packed Jet Li...",0
3997,Me and my girlfriend went to see this movie as...,0
3998,This movie is my all time favorite!!! You real...,1


## Dataset wrapper and tokenizer

In [10]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing pre-tokenized encodings with their labels.

    ``encodings`` is a mapping from field name (e.g. ``'input_ids'``) to a
    per-example indexable collection, and ``labels`` is indexable in parallel
    with it. Each item is a dict of tensors: one entry per encoding field plus
    a ``'labels'`` entry.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert the idx-th entry of every encoding field to a tensor,
        # then attach the matching label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [11]:
# Tokenizer for the same 'distilbert-base-uncased' checkpoint that the classification model is loaded from.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

## Tokenize and train/validation split

In [12]:
# Pull the label and review-text columns out of the frame as plain lists.
labels = data['label'].tolist()
texts = data['text'].tolist()

# Tokenize every review in one call, truncating to at most 128 tokens.
encodings = tokenizer(texts, max_length=128, padding=True, truncation=True)

# Keep token ids, attention masks and labels side by side, one row per
# review, so a single split keeps them aligned.
df_encodings = pd.DataFrame(
    {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels,
    }
)

In [19]:
# Hold out 20% of the encoded rows for validation; the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(df_encodings, test_size=0.2, random_state=42)

In [20]:
def _as_sentiment_dataset(frame):
    """Wrap the encoded columns of ``frame`` in a SentimentDataset."""
    features = {
        column: frame[column].tolist()
        for column in ('input_ids', 'attention_mask')
    }
    return SentimentDataset(features, frame['labels'].tolist())


train_dataset = _as_sentiment_dataset(train_df)
val_dataset = _as_sentiment_dataset(val_df)

## Load Pretrained model

In [21]:
pip install --upgrade accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [22]:
# Load the pretrained checkpoint into a sequence-classification model. The classifier weights are not
# part of the checkpoint and are newly initialized (see the warning in the output), so they are learned during fine-tuning.
# NOTE(review): the number of output labels is left at the library default — confirm it matches the label values in `data`.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.

## Set up training hyperparameters

In [73]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Where outputs and logs are written, and how often to log.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Schedule and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Optimizer-related settings.
    warmup_steps=50,
    weight_decay=0.01,
)

# The Trainer ties together the model, the configuration and both datasets.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

## Train and evaluate

In [74]:
%%time
# Run the fine-tuning loop; %%time reports how long the whole cell takes.
trainer.train()



Step,Training Loss
10,0.394
20,0.3423
30,0.3305
40,0.236
50,0.2629
60,0.2793
70,0.2153
80,0.2207
90,0.1644
100,0.0544


CPU times: user 34min, sys: 56min 46s, total: 1h 30min 47s
Wall time: 10min 4s


TrainOutput(global_step=600, training_loss=0.1851036913196246, metrics={'train_runtime': 604.7222, 'train_samples_per_second': 15.875, 'train_steps_per_second': 0.992, 'total_flos': 317921756774400.0, 'train_loss': 0.1851036913196246, 'epoch': 3.0})

In [75]:
# Evaluate on val_dataset. No compute_metrics was passed to the Trainer, so the result holds the loss and timing figures only.
trainer.evaluate()

{'eval_loss': 0.6296095848083496,
 'eval_runtime': 12.2654,
 'eval_samples_per_second': 65.224,
 'eval_steps_per_second': 1.06,
 'epoch': 3.0}

In [76]:
# Save the fine-tuned model to a local directory.
# NOTE(review): only the model is saved here; the tokenizer is not written alongside it.
model.save_pretrained("sentiment_classification_DistillBert")

## Make classifications

In [101]:
def classify_senteces(model, tokenizer, sentences, batch_size=32, max_length=128):
    """Predict a class index for each input sentence.

    Sentences are processed in mini-batches so that large inputs (for example
    a whole validation set) do not have to fit into a single forward pass.
    The model is switched to eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards.

    Args:
        model: A ``torch.nn.Module`` called as ``model(**inputs)`` whose
            result exposes a ``logits`` attribute.
        tokenizer: Callable that turns a list of strings into a mapping of
            PyTorch tensors (it is called with ``return_tensors='pt'``).
        sentences: A single string or an iterable of strings.
        batch_size: Number of sentences per forward pass (must be >= 1).
        max_length: Truncation length passed to the tokenizer.

    Returns:
        1-D NumPy integer array holding the arg-max class index of every
        sentence, in input order. Empty when ``sentences`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # A bare string would otherwise be sliced character by character below.
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)

    if not sentences:
        return torch.empty(0, dtype=torch.long).numpy()

    # Run the inputs on whatever device the model's weights live on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()  # disable dropout so predictions are deterministic
    batch_predictions = []
    try:
        with torch.no_grad():
            for start in range(0, len(sentences), batch_size):
                chunk = sentences[start:start + batch_size]
                encoded_input = tokenizer(chunk, return_tensors='pt', padding=True,
                                          truncation=True, max_length=max_length)
                inputs = {k: v.to(device) for k, v in encoded_input.items()}
                logits = model(**inputs).logits
                # Softmax is monotonic, so the arg-max of the logits is the
                # same as the arg-max of the probabilities.
                batch_predictions.append(torch.argmax(logits, dim=-1).cpu())
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    return torch.cat(batch_predictions).numpy()

In [102]:
# Hand-written example reviews for a quick sanity check of the classifier.
pred_sentences = [
    "I absolutely hate this movie, total dissaster",
    "Most beaytiful movie ever, I watched it 10 times, very good",
    "Reasonably good movie",
]

In [103]:
import numpy as np

In [104]:
# Sanity check: sum of the attention mask (number of attended tokens) for one validation example.
np.asarray(val_df.attention_mask.values[100]).sum()

128

In [105]:
# Classify the hand-written example sentences.
y_pred = classify_senteces(model, tokenizer, pred_sentences)

In [106]:
# One predicted class index per example sentence, in input order.
y_pred

array([0, 1, 1])

In [107]:
# Re-split the raw frame with the same test_size and random_state as the split
# of df_encodings, so val_data holds the same rows, in the same order, as
# val_df. Keep the two splits in sync if either one is changed.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
# Rebind to a fresh frame rather than mutating in place, so the columns added
# in later cells are written to an independent object.
val_data = val_data.reset_index(drop=True)

In [112]:
# Predict a class for every review text in the validation split.
y_pred = classify_senteces(model, tokenizer, val_data.text.tolist())

In [114]:
# Attach the predictions; y_pred is positionally aligned with val_data's rows.
val_data["label_pred"] = y_pred

In [117]:
# Boolean flag per row: True where the predicted label equals the true label.
val_data["correct_prediction"] = val_data['label'] == val_data["label_pred"]

In [119]:
# Validation accuracy: the fraction of rows whose prediction matches the label.
val_data["correct_prediction"].mean()

0.84125