In [1]:
from datasets import load_dataset
from transformers import DistilBertTokenizerFast

In [2]:
# Load the IMDB dataset
ds = load_dataset("stanfordnlp/imdb")

In [3]:
# Initialize the DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')



In [4]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

In [5]:
tokenized_ds = ds.map(tokenize_function, batched=True)
tokenized_ds = tokenized_ds.remove_columns(["text"])
tokenized_ds = tokenized_ds.rename_column("label", "labels")
tokenized_ds.set_format("torch")

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [6]:
#Prepare Data Loaders
from torch.utils.data import DataLoader

train_ds = tokenized_ds["train"]
test_ds = tokenized_ds["test"]

train_loader = DataLoader(train_ds, shuffle=True, batch_size=16)
test_loader = DataLoader(test_ds, batch_size=16)

In [7]:
# Import Pre-trained DistilBERT Model
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Load the pre-trained DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [8]:
#train model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2266,0.223872
2,0.1532,0.215872
3,0.0843,0.280437


TrainOutput(global_step=4689, training_loss=0.16312786744179364, metrics={'train_runtime': 7650.8711, 'train_samples_per_second': 9.803, 'train_steps_per_second': 0.613, 'total_flos': 9935054899200000.0, 'train_loss': 0.16312786744179364, 'epoch': 3.0})

In [9]:
#Evaluate the Model
results = trainer.evaluate()

print(f"Evaluation results: {results}")

Evaluation results: {'eval_loss': 0.28043708205223083, 'eval_runtime': 612.1303, 'eval_samples_per_second': 40.841, 'eval_steps_per_second': 2.553, 'epoch': 3.0}
