In [5]:
import random
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from datasets import load_dataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, classification_report

# Reproducibility: fix the RNGs up front so the sampled reviews and the
# train/validation split are the same on every Restart & Run All.
SEED = 42
SAMPLE_SIZE = 500  # number of reviews drawn from the IMDB training split
random.seed(SEED)        # drives random.sample below
torch.manual_seed(SEED)  # seeds PyTorch's global RNG

# Load IMDB dataset
dataset = load_dataset("imdb")

# Get SAMPLE_SIZE random reviews (without replacement) for training and validation.
# The min() guard keeps random.sample from raising if SAMPLE_SIZE is ever set
# larger than the split itself.
train_reviews = dataset['train']
random_indices = random.sample(range(len(train_reviews)), min(SAMPLE_SIZE, len(train_reviews)))
train_sample = train_reviews.select(random_indices)

# Split into train and validation sets (80-20 split) using Hugging Face's method.
# Passing the seed makes the shuffle performed by the split deterministic.
train_test_split = train_sample.train_test_split(test_size=0.2, seed=SEED)
train_data = train_test_split['train']
val_data = train_test_split['test']

# Tokenize Data
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded rows have the same length.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(examples['text'], **encode_options)


# Columns exposed to PyTorch; the remaining columns are hidden from indexing.
tensor_columns = ['input_ids', 'attention_mask', 'label']

train_data = train_data.map(tokenize_function, batched=True)
train_data.set_format(type='torch', columns=tensor_columns)

val_data = val_data.map(tokenize_function, batched=True)
val_data.set_format(type='torch', columns=tensor_columns)

# Create DataLoader
# shuffle=True makes DataLoader build a RandomSampler over the dataset, and
# shuffle=False a SequentialSampler, so the iteration order policy is unchanged.
loader_options = {'batch_size': 8}
train_dataloader = DataLoader(train_data, shuffle=True, **loader_options)
val_dataloader = DataLoader(val_data, shuffle=False, **loader_options)

# Load Pretrained DistilBERT model with a 2-label sequence-classification head
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Freeze layers to speed up training: every parameter under `model.distilbert`
# stops receiving gradients, so only parameters outside it remain trainable.
for param in model.distilbert.parameters():
    param.requires_grad = False

# Move model to device (GPU/CPU) before creating the optimizer, so the
# optimizer is built from the parameters on their final device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set optimizer
# `transformers.AdamW` is deprecated upstream in favour of `torch.optim.AdamW`,
# so the PyTorch implementation is used here. eps=1e-6 and weight_decay=0.0
# reproduce the defaults of the deprecated class, keeping the update rule the
# same as before. Only parameters that still require gradients are handed to
# the optimizer; frozen ones would never be updated anyway.
# NOTE(review): the `AdamW` name in the top-of-file transformers import is no
# longer used by this cell and can be dropped.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training Loop
# Each epoch makes one full pass over train_dataloader and reports the mean
# of the per-batch losses.
epochs = 50
for epoch in range(epochs):
    model.train()
    running_loss = 0
    for batch in train_dataloader:
        # Pull the three tensors the model needs and place them on the device
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )

        # Clear gradients from the previous step, then forward / backward / update
        optimizer.zero_grad()
        loss = model(input_ids, attention_mask=attention_mask, labels=labels).loss
        running_loss += loss.item()
        loss.backward()
        optimizer.step()

    mean_epoch_loss = running_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1} completed with Loss: {mean_epoch_loss}')

# Validation Loop
# Runs the model once over val_dataloader without gradient tracking and
# collects the arg-max class per example alongside the reference labels.
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are only compared on the host, so they are not moved to the device.
        labels = batch['label']

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=-1).cpu().tolist())
        true_labels.extend(labels.cpu().tolist())

# Calculate Accuracy and Classification Report
accuracy = accuracy_score(true_labels, predictions)
# Passing labels=[0, 1] pins the report to both classes. Without it,
# classification_report infers the classes from the data and raises a
# ValueError when only one class shows up in this small validation sample,
# because target_names would no longer match the number of classes found.
# Assumes label 0 is negative and 1 is positive, as the target_names order implies.
report = classification_report(
    true_labels,
    predictions,
    labels=[0, 1],
    target_names=['negative', 'positive'],
)

print(f'Validation Accuracy: {accuracy}')
print('Classification Report:')
print(report)

# Save the model
# The model and its tokenizer are written to the same directory.
output_dir = 'distilbert-imdb-small'
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print("Training and Testing Completed!")


Map: 100%|██████████| 400/400 [00:01<00:00, 233.22 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 237.48 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed with Loss: 0.6806732165813446
Epoch 2 completed with Loss: 0.6647252810001373
Epoch 3 completed with Loss: 0.6447893977165222
Epoch 4 completed with Loss: 0.6233229327201844
Epoch 5 completed with Loss: 0.6040095800161361
Epoch 6 completed with Loss: 0.5858282458782196
Epoch 7 completed with Loss: 0.5666324275732041
Epoch 8 completed with Loss: 0.5510060316324235
Epoch 9 completed with Loss: 0.5390788614749908
Epoch 10 completed with Loss: 0.5146312034130096
Epoch 11 completed with Loss: 0.5130586749315262
Epoch 12 completed with Loss: 0.500243923664093
Epoch 13 completed with Loss: 0.47909155189991
Epoch 14 completed with Loss: 0.4779279002547264
Epoch 15 completed with Loss: 0.4619381695985794
Epoch 16 completed with Loss: 0.45787572145462035
Epoch 17 completed with Loss: 0.4498037961125374
Epoch 18 completed with Loss: 0.44214977651834486
Epoch 19 completed with Loss: 0.4359392076730728
Epoch 20 completed with Loss: 0.4353951933979988
Epoch 21 completed with Loss: 

In [7]:
# This cell has no training code of its own: it relies on the `model` and
# `tokenizer` objects left in the kernel by the training/validation cell.

# Save the model and tokenizer
save_dir = 'sentiment_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print("Model and tokenizer saved successfully!")


Model and tokenizer saved successfully!
