## Contribute to the DistilBERT paper by testing and futher-tuning the model for IMDb Dataset

In [7]:
import os
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np

### Load the IMDB dataset from huggingface datasets library

In [8]:
# Set random seed for reproducibility
np.random.seed(42)

print("Loading IMDB dataset...")
dataset = load_dataset("imdb")

Loading IMDB dataset...


### Tokenize the reviews using distilbert-base-uncased tokenizer

In [9]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

encoded_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

### Define parameter settings for different experiments

In [10]:
# define parameter settings
experiments = [
    {"name": "default", "learning_rate": 2e-5, "dropout": 0.1},
    {"name": "high_lr", "learning_rate": 5e-5, "dropout": 0.1},
    {"name": "high_dropout", "learning_rate": 2e-5, "dropout": 0.2},
]


### Define accuracy calculation function

In [11]:

# Define compute_metrics function for accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}

### Train the model on different parameters and check their accuracy metrics

In [12]:
# Run experiments
for exp in experiments:
    print(f"\nRunning experiment: {exp['name']} (lr={exp['learning_rate']}, dropout={exp['dropout']})")

    # Load fresh model to avoid overfitting from previous runs
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=2,
        dropout=exp["dropout"],
        seq_classif_dropout=exp["dropout"],
    )

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"./imdb_results_{exp['name']}",
        num_train_epochs=3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        learning_rate=exp["learning_rate"],
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        logging_dir=f"./imdb_logs_{exp['name']}",
        logging_steps=100,
        seed=42,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["test"],
        compute_metrics=compute_metrics,
    )

    # Train and evaluate
    print("Training on IMDb...")
    trainer.train()

    print("Evaluating on IMDb...")
    eval_results = trainer.evaluate()
    print(f"IMDb Accuracy ({exp['name']}): {eval_results['eval_accuracy']:.4f}")

    # Save results
    os.makedirs(f"./imdb_results_{exp['name']}", exist_ok=True)
    with open(f"./imdb_results_{exp['name']}/eval_results.txt", "w") as f:
        f.write(f"IMDb Accuracy: {eval_results['eval_accuracy']:.4f}\n")
        f.write(f"Learning Rate: {exp['learning_rate']}\n")
        f.write(f"Dropout: {exp['dropout']}\n")

print("Done! Results saved in ./imdb_results_*/")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Running experiment: default (lr=2e-05, dropout=0.1)
Training on IMDb...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.24,0.224495,0.90748
2,0.164,0.24688,0.91184
3,0.1144,0.284501,0.91248


Evaluating on IMDb...


IMDb Accuracy (default): 0.9125

Running experiment: high_lr (lr=5e-05, dropout=0.1)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training on IMDb...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2385,0.277552,0.88132
2,0.1287,0.263055,0.9122
3,0.0663,0.33377,0.91332


Evaluating on IMDb...


IMDb Accuracy (high_lr): 0.9133

Running experiment: high_dropout (lr=2e-05, dropout=0.2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training on IMDb...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2458,0.240698,0.90204
2,0.1839,0.249623,0.91072
3,0.1458,0.261323,0.91336


Evaluating on IMDb...


IMDb Accuracy (high_dropout): 0.9134
Done! Results saved in ./imdb_results_*/
