In [None]:
# 1. Mount Google Drive
# Colab-only step: attach the user's Drive so that later cells can write the
# trained model underneath the mount point.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 2. Install Required Libraries
!pip install transformers datasets evaluate



In [None]:
# 3. Import Libraries

import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

import numpy as np
import evaluate

In [None]:
# 4. Set Up Path

# Base directory in Google Drive to save models and data
base_dir = '/content/drive/MyDrive/movie_sentiment_model'

# exist_ok=True makes this cell idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(base_dir, exist_ok=True)

In [None]:
# 5. Load and Prepare the IMDb Dataset

# Load the IMDb dataset (all available splits)
dataset_name = 'imdb'
raw_datasets = load_dataset(dataset_name)

# Tokenizer belonging to the DistilBERT checkpoint
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch of examples.

    Each sequence is truncated and padded to exactly 256 tokens, so every
    encoded example ends up with the same length.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw text to encode.

    Returns:
        The result of calling the module-level ``tokenizer`` on that text.
    """
    encode_options = dict(truncation=True, padding='max_length', max_length=256)
    return tokenizer(examples['text'], **encode_options)

# Tokenize every split in batches, then drop the raw 'text' column
tokenized_datasets = (
    raw_datasets
    .map(tokenize_function, batched=True)
    .remove_columns(['text'])
)

# Switch the output format to 'torch' (in-place)
tokenized_datasets.set_format('torch')



Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Print a heading followed by the summary of each dataset collection.
for heading, dataset_dict in (
    ("\nraw datasets:\n", raw_datasets),
    ("\ntokenized datasets:\n", tokenized_datasets),
):
    print(heading)
    print(dataset_dict)


raw datasets:

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

tokenized datasets:

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})


In [None]:
# 6. Prepare the training dataset

# Evaluation uses the 'test' split as-is
eval_dataset = tokenized_datasets['test']

# Training uses the 'train' split, shuffled with a fixed seed
train_split = tokenized_datasets['train']
train_dataset = train_split.shuffle(seed=42)

In [None]:
# 7. Set Up the Evaluation Metric

metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` holds one score per
            class along its last axis.

    Returns:
        The dict produced by ``metric.compute`` for the arg-max predictions.
    """
    logits, labels = eval_pred
    # Take the arg-max with NumPy instead of copying the logits into a torch
    # tensor first: np.argmax accepts any array-like input, and the metric then
    # receives a plain NumPy array rather than a torch tensor.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# 8. Set up the Pre-trained model

# Load the pre-trained checkpoint with a 2-label classification head
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Pick the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model to the selected device
model.to(device)

# Set Up Training Arguments
training_config = {
    # Where checkpoints and logs are written
    'output_dir': './results',
    'logging_dir': './logs',
    'logging_steps': 100,
    # Training length and batch sizes
    'num_train_epochs': 2,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 16,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best accuracy when training finishes
    'eval_strategy': 'epoch',
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    'metric_for_best_model': 'accuracy',
}
training_args = TrainingArguments(**training_config)

# Initialize the Trainer: wire together the model, its training configuration,
# the train/eval datasets and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 9. Start training

# Keep a handle on the training result; the bare name on the last line makes
# the notebook display it, exactly as the bare call would.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,Accuracy
1,0.279,0.287543,0.89944
2,0.1403,0.29082,0.914


TrainOutput(global_step=3126, training_loss=0.2239808296440354, metrics={'train_runtime': 799.7294, 'train_samples_per_second': 62.521, 'train_steps_per_second': 3.909, 'total_flos': 3311684966400000.0, 'train_loss': 0.2239808296440354, 'epoch': 2.0})

In [None]:
# 10. Evaluate the model on the evaluation set (the IMDb 'test' split)

eval_result = trainer.evaluate()
eval_accuracy = eval_result['eval_accuracy']
print(f"Accuracy: {eval_accuracy:.4f}")

Accuracy: 0.9140


In [None]:
# 11. Save the model

# Write the model weights/config and the tokenizer files side by side in the
# Drive directory so they can be reloaded together from one path.
model.save_pretrained(save_directory=base_dir)
tokenizer.save_pretrained(save_directory=base_dir)

('/content/drive/MyDrive/movie_sentiment_model/tokenizer_config.json',
 '/content/drive/MyDrive/movie_sentiment_model/special_tokens_map.json',
 '/content/drive/MyDrive/movie_sentiment_model/vocab.txt',
 '/content/drive/MyDrive/movie_sentiment_model/added_tokens.json',
 '/content/drive/MyDrive/movie_sentiment_model/tokenizer.json')