# Imports

In [1]:
import pandas as pd

In [2]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
#pip install --upgrade accelerate

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
import torch
print(torch.__version__)

2.6.0


In [36]:
torch.version.cuda

'12.6'

In [37]:
if torch.cuda.is_available():
    print("CUDA is available. Training on GPU.")
    device = torch.device("cuda")

elif  torch.backends.mps.is_available():
    device = torch.device("mps") 
else:
    print("CUDA is not available. Training on CPU.")
    device = torch.device("cpu")


print(device)


CUDA is available. Training on GPU.
cuda


In [38]:
data = pd.read_feather("../data/movie_reviews_4k.feather")

In [39]:
data.shape

(4000, 2)

## Load Tokenizer

In [45]:
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [46]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

## Train test split

In [47]:
texts = data['text'].tolist()
labels = data['label'].tolist()

encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

df_encodings = pd.DataFrame({'input_ids': encodings['input_ids'], 
                             'attention_mask': encodings['attention_mask'], 
                             'labels': labels})

In [14]:
train_df, val_df = train_test_split(df_encodings, test_size=0.2, random_state=42)

In [54]:
train_dataset = SentimentDataset({'input_ids': train_df['input_ids'].tolist(), 
                                  'attention_mask': train_df['attention_mask'].tolist()}, 
                                 train_df['labels'].tolist())

val_dataset = SentimentDataset({'input_ids': val_df['input_ids'].tolist(), 
                                'attention_mask': val_df['attention_mask'].tolist()}, 
                               val_df['labels'].tolist())

## Load Pretrained model

In [58]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Setup training Hyperparams

In [60]:
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=50,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

## Train and evaluate

In [61]:
%%time
trainer.train()

Step,Training Loss
10,0.7064
20,0.6874
30,0.6788
40,0.6267
50,0.5564
60,0.6026
70,0.5496
80,0.4665
90,0.5376
100,0.4223


CPU times: total: 38.8 s
Wall time: 40.2 s


TrainOutput(global_step=600, training_loss=0.2942747827370962, metrics={'train_runtime': 40.0175, 'train_samples_per_second': 239.895, 'train_steps_per_second': 14.993, 'total_flos': 317921756774400.0, 'train_loss': 0.2942747827370962, 'epoch': 3.0})

In [62]:
trainer.evaluate()

{'eval_loss': 0.5506370663642883,
 'eval_runtime': 1.1251,
 'eval_samples_per_second': 711.025,
 'eval_steps_per_second': 11.554,
 'epoch': 3.0}

In [63]:
model.save_pretrained("sentiment_classification_DistillBert")

## Make classifications

In [None]:
def classify_sentences(model, tokenizer, sentences, device):
    # Tokenize the input
    encoded_input = tokenizer(
        sentences,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128
    )
    model.eval() 
    # Move inputs to the same device as the model
    inputs = {k: v.to(device) for k, v in encoded_input.items()}
    
    # (Optional) Ensure model is on the correct device as well
    # model.to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return predictions
    # Move predictions back to CPU if you want to convert them to NumPy
    # if device!='cpu':
    #     return torch.argmax(predictions, dim=-1).cpu().numpy()
    # else:
    #     torch.argmax(predictions, dim=-1).numpy




In [64]:
model.eval() 

pred_sentences  =[ "I absolutely hate this movie, total dissaster", 
                  "Most beaytiful movie ever, I watched it 10 times, very good",
                  "Reasonably good movie"]

y_pred = classify_sentences(model, tokenizer, pred_sentences, device )

In [65]:
y_pred

array([0, 1, 1])

In [24]:
import numpy as np

In [25]:
y_pred.shape

(3,)

In [26]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
val_data.reset_index(inplace=True, drop=True)

In [27]:
y_pred = classify_sentences(model, tokenizer, val_data.text.tolist(), device)

In [28]:
val_data["label_pred"] = y_pred

In [29]:
val_data["correct_prediction"] = val_data['label'] == val_data["label_pred"]

In [30]:
val_data["correct_prediction"].mean()

np.float64(0.8375)