# Imports

In [1]:
import pandas as pd

In [2]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.utils.data import Dataset

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
import torch
# Record the installed PyTorch version next to the results.
print(torch.__version__)

2.0.1


In [5]:
# Show the CUDA version reported by this PyTorch build.
torch.version.cuda

In [6]:
# Pick the compute device once: GPU when CUDA is usable, CPU otherwise,
# and report the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(
    "CUDA is available. Training on GPU."
    if device.type == "cuda"
    else "CUDA is not available. Training on CPU."
)


CUDA is not available. Training on CPU.


In [7]:
# Load the movie-review dataset from a Feather file; the path is relative to
# the notebook's working directory.
data = pd.read_feather("../data/movie_reviews_4k.feather")

In [8]:
# Row/column count of the loaded frame.
data.shape

(4000, 2)

In [9]:
# Display the loaded frame (a `text` and a `label` column) for a visual check.
data

Unnamed: 0,text,label
0,I wanted to vote zero or lower. I loved the co...,0
1,"Karen(Bobbie Phillips)mentions, after one of h...",0
2,This review applies for the cut of the film th...,0
3,"The best film on the battle of San Antonio, Te...",1
4,"In theory, 'Director's Commentary' should have...",0
...,...,...
3995,Excellent show. Instead of watching the same o...,1
3996,"It's hard to believe an ""action"" packed Jet Li...",0
3997,Me and my girlfriend went to see this movie as...,0
3998,This movie is my all time favorite!!! You real...,1


## Dataset class and tokenizer

In [10]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name to an indexable collection
            holding one entry per example.
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [11]:
# Load the fast tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

## Tokenize and train/validation split

In [12]:
# Pull the review strings and their labels out of the frame as plain lists.
texts = data['text'].tolist()
labels = data['label'].tolist()

# Tokenize all reviews in one call, truncating to at most 128 tokens and padding.
encodings = tokenizer(texts, max_length=128, truncation=True, padding=True)

# One row per review (token ids, attention mask, label) so that a single
# split keeps the three aligned.
df_encodings = pd.DataFrame(
    {column: encodings[column] for column in ('input_ids', 'attention_mask')}
).assign(labels=labels)

In [19]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
train_df, val_df = train_test_split(df_encodings, test_size=0.2, random_state=42)

In [20]:
def _frame_to_dataset(frame):
    """Wrap one split of the encoded frame in a SentimentDataset."""
    features = {
        column: frame[column].tolist()
        for column in ('input_ids', 'attention_mask')
    }
    return SentimentDataset(features, frame['labels'].tolist())


# Both splits are converted the same way.
train_dataset = _frame_to_dataset(train_df)
val_dataset = _frame_to_dataset(val_df)

## Load Pretrained model

In [21]:
pip install --upgrade accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


In [22]:
# Load the pretrained DistilBERT checkpoint into a sequence-classification
# model; per the warning recorded below, the classifier-head weights are newly
# initialized and get trained during fine-tuning.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.

## Set up training hyperparameters

In [23]:
# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',            # where the Trainer writes its outputs
    logging_dir='./logs',              # where training logs are written
    logging_steps=10,                  # report the training loss every 10 steps
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=50,
    weight_decay=0.01,
)

# Wire the model, the arguments and both datasets into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

## Train and evaluate

In [24]:
# Fine-tune the model on train_dataset; the returned TrainOutput is shown as
# the cell result.
trainer.train()

Step,Training Loss
10,0.702
20,0.6923
30,0.6871
40,0.6369
50,0.5481
60,0.5629
70,0.4592
80,0.6534
90,0.4515
100,0.3508


TrainOutput(global_step=200, training_loss=0.4979865550994873, metrics={'train_runtime': 198.3117, 'train_samples_per_second': 16.136, 'train_steps_per_second': 1.009, 'total_flos': 105973918924800.0, 'train_loss': 0.4979865550994873, 'epoch': 1.0})

In [25]:
# Evaluate on val_dataset. No compute_metrics function was passed to the
# Trainer, so the result holds loss and timing figures only.
trainer.evaluate()

{'eval_loss': 0.3770590126514435,
 'eval_runtime': 12.3326,
 'eval_samples_per_second': 64.869,
 'eval_steps_per_second': 1.054,
 'epoch': 1.0}

In [26]:
# Persist the fine-tuned model under "sentiment_classification_DistillBert".
# NOTE(review): the tokenizer is not saved here — confirm whether it should be
# stored alongside the model.
model.save_pretrained("sentiment_classification_DistillBert")

## Make classifications

In [52]:
def classify_senteces(model, tokenizer, sentences):
    """Predict a class index for each sentence with a sequence-classification model.

    Args:
        model: Model whose forward pass accepts the tokenizer output as keyword
            arguments and returns an object exposing ``.logits``.
        tokenizer: Callable that turns a list of strings into a mapping of
            PyTorch tensors.
        sentences: Sequence of strings to classify.

    Returns:
        numpy.ndarray of predicted class indices, one per sentence (empty when
        ``sentences`` is empty).
    """
    if len(sentences) == 0:
        # Nothing to tokenize; keep the return type consistent.
        return torch.empty(0, dtype=torch.long).numpy()

    # Tokenize the argument that was passed in, not a notebook-level global.
    encoded_input = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=128)

    # Run on whatever device the model lives on, when the model exposes `.device`.
    device = getattr(model, 'device', None)
    inputs = {k: (v if device is None else v.to(device)) for k, v in encoded_input.items()}

    # Switch to inference mode so dropout does not make predictions random.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # softmax is monotonic, so argmax over the raw logits picks the same class.
    # Move to CPU before converting: .numpy() is not available on CUDA tensors.
    return torch.argmax(outputs.logits, dim=-1).cpu().numpy()

In [53]:
# Hand-written example reviews used to smoke-test the fine-tuned classifier.
pred_sentences = [
    "I absolutely hate this movie, total dissaster",
    "Most beaytiful movie ever, I watched it 10 times, very good",
    "Reasonably good movie",
]

In [57]:
# Inspect the validation split (tokenized rows with their labels).
val_df

Unnamed: 0,input_ids,attention_mask,labels
555,"[101, 2348, 1996, 19311, 2038, 2070, 3492, 220...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
3491,"[101, 1045, 2293, 6249, 27476, 1010, 1998, 573...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
527,"[101, 1037, 2995, 2601, 15587, 3185, 1998, 103...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
3925,"[101, 2023, 3185, 2003, 1037, 10973, 7245, 199...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2989,"[101, 7929, 1010, 2023, 3185, 4627, 2041, 2066...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
...,...,...,...
1922,"[101, 1045, 4033, 1005, 1056, 2464, 1996, 2434...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
865,"[101, 2066, 1996, 7625, 2015, 1010, 2204, 2335...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
3943,"[101, 16655, 10111, 13320, 24002, 1010, 4895, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1642,"[101, 2065, 1996, 11153, 1997, 16637, 2018, 22...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0


In [54]:
# Classify the example sentences, then echo the result so the predictions are
# shown as the cell output (a bare assignment displays nothing).
y_pred = classify_senteces(model, tokenizer, pred_sentences)
y_pred

array([0, 1, 1])