In [1]:
import evaluate
import torch
from datasets import load_dataset
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
)

In [2]:
# Load the IMDB dataset by its Hub identifier. The result is a DatasetDict
# holding several splits (see the printed summary in the next cell).
ds = load_dataset("stanfordnlp/imdb")

In [3]:
# Inspect the dataset: shows each split with its column names and row count.
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [4]:
# Preprocess the dataset

In [5]:
# Load the tokenizer for the 'distilbert-base-uncased' checkpoint, the same
# checkpoint the model is loaded from later in the notebook.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')



In [6]:
# Define a preprocessing function
def preprocess_function(examples):
    """Tokenize the ``'text'`` field of a batch of examples.

    Args:
        examples: Mapping with a ``'text'`` entry holding the review text(s)
            to encode.

    Returns:
        The tokenizer's output for those texts, truncated and padded so that
        every sequence is exactly 512 tokens long.
    """
    reviews = examples['text']
    # Fixed-length encoding: anything longer than 512 tokens is cut, anything
    # shorter is padded up to 512.
    encoded = tokenizer(
        reviews,
        max_length=512,
        padding='max_length',
        truncation=True,
    )
    return encoded


In [7]:
# Apply the preprocessing function to every split of the dataset.
# batched=True passes a batch of examples to preprocess_function per call
# instead of one example at a time.
# NOTE(review): this also tokenizes the 50,000-row 'unsupervised' split, which
# no later cell uses.
tokenized_datasets = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [8]:
# Drop the raw 'text' column now that it has been tokenized.
# NOTE(review): this cell rebinds tokenized_datasets from itself, so it is not
# idempotent -- re-running it without re-running the map cell presumably fails
# because 'text' is already gone.
tokenized_datasets = tokenized_datasets.remove_columns(['text'])

In [9]:
# Rename the "label" column to "labels", the name under which the model
# receives its targets.
# NOTE(review): like the previous cell, this rebinds tokenized_datasets from
# itself and is therefore only safe to run once per map.
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [10]:
# Switch the dataset's output format to PyTorch. set_format is called for its
# side effect on tokenized_datasets; its return value is not used.
tokenized_datasets.set_format('torch')

In [11]:
# Define the metrics

In [12]:
# Load the accuracy metric from the `evaluate` library. The recorded output
# shows a builder script being downloaded, so the first load needs network
# access.
accuracy_metric = evaluate.load("accuracy")


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
# Define a function to compute metrics
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one row of
            class scores per example; it may be a NumPy array (what
            ``Trainer`` supplies), a torch tensor, or a tuple of model
            outputs whose first element is the logits. ``labels`` holds the
            reference class ids.

    Returns:
        The result of ``accuracy_metric.compute`` for the arg-max predictions
        against ``labels``.
    """
    logits, labels = eval_pred
    # Models that return several outputs hand them over as a tuple; the
    # logits are the first entry.
    if isinstance(logits, tuple):
        logits = logits[0]
    # torch.argmax only accepts tensors, while Trainer passes NumPy arrays;
    # torch.as_tensor accepts both without copying when it can.
    predictions = torch.as_tensor(logits).argmax(dim=-1)
    # Hand plain NumPy data to the metric so it does not depend on torch types.
    return accuracy_metric.compute(
        predictions=predictions.cpu().numpy(),
        references=labels,
    )

In [14]:
# Initialize the model and Trainer

In [15]:
# Load DistilBERT with a sequence-classification head on top.
# The warning printed below is expected: the classifier / pre_classifier
# weights are not in the checkpoint and are newly initialized, which is why
# the model is fine-tuned before use.
# No label count or label names are passed here; the predictions at the end of
# the notebook come back as LABEL_0 / LABEL_1.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Define the training arguments.
# NOTE(review): no evaluation schedule is configured here, and the recorded
# training log only reports training loss -- so eval_dataset and
# compute_metrics passed to the Trainer are not exercised by trainer.train().
training_args = TrainingArguments(
    output_dir='my_model',  # same directory the model/tokenizer are saved to later
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,  # adjust to train for more or fewer passes over the data
    push_to_hub=False,  # keep everything local
)


In [17]:
# Initialize the Trainer: fine-tune `model` on the tokenized 'train' split,
# with the tokenized 'test' split registered as the evaluation set.
# The 'unsupervised' split is not used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # only called when an evaluation is run
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [18]:
# Train the model. This is the expensive cell: the recorded run took about
# 5,771 seconds for 4,689 optimizer steps (3 epochs).
trainer.train()

Step,Training Loss
500,0.3236
1000,0.2477
1500,0.2269
2000,0.1635
2500,0.1511
3000,0.1447
3500,0.1019
4000,0.077
4500,0.0938


TrainOutput(global_step=4689, training_loss=0.16626387929885875, metrics={'train_runtime': 5771.0135, 'train_samples_per_second': 12.996, 'train_steps_per_second': 0.813, 'total_flos': 9935054899200000.0, 'train_loss': 0.16626387929885875, 'epoch': 3.0})

In [19]:
# Save the fine-tuned model and its tokenizer side by side in 'my_model'
# (the same directory used as the Trainer's output_dir) so they can be
# reloaded together. The tuple displayed below is the return value of
# tokenizer.save_pretrained.
model.save_pretrained('my_model')
tokenizer.save_pretrained('my_model')

('my_model/tokenizer_config.json',
 'my_model/special_tokens_map.json',
 'my_model/vocab.txt',
 'my_model/added_tokens.json')

In [20]:
# After saving the model, you can reload it and use it to make predictions on new text data:

In [21]:
# Load the model and tokenizer back from the 'my_model' directory.
# NOTE: this rebinds `model` and `tokenizer`, so from here on those names
# refer to the reloaded objects rather than the ones used during training.
model = DistilBertForSequenceClassification.from_pretrained('my_model')
tokenizer = DistilBertTokenizer.from_pretrained('my_model')

In [22]:
# Define the pipeline for sentiment analysis around the reloaded model and
# tokenizer. `pipeline` is imported with the other transformers names in the
# notebook's first cell, so all dependencies are declared in one place.
pipe = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

In [23]:
# Example text data: four short reviews, alternating clearly positive and
# clearly negative, used as a quick sanity check of the fine-tuned model.
data = [
    "I love this movie, it was fantastic!",
    "The film was boring and too long.",
    "An excellent performance by the lead actor.",
    "Not my cup of tea, I didn't enjoy it."
]


In [24]:
# Make predictions: run every example through the pipeline in one call.
# The result has one entry per input, in the same order (see the output of
# the next cell: each entry is a dict with 'label' and 'score').
preds = pipe(data)

In [25]:
# Inspect the predictions: print each input text followed by the pipeline's
# result for it. In the recorded output, LABEL_1 is assigned to the positive
# reviews and LABEL_0 to the negative ones.
for text, pred in zip(data, preds):
    print(f'Text: {text}')
    print(f'Prediction: {pred}\n')  # trailing newline leaves a blank line between entries

Text: I love this movie, it was fantastic!
Prediction: {'label': 'LABEL_1', 'score': 0.9980655312538147}

Text: The film was boring and too long.
Prediction: {'label': 'LABEL_0', 'score': 0.9980814456939697}

Text: An excellent performance by the lead actor.
Prediction: {'label': 'LABEL_1', 'score': 0.9980276226997375}

Text: Not my cup of tea, I didn't enjoy it.
Prediction: {'label': 'LABEL_0', 'score': 0.9947769641876221}

