In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import evaluate
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from accelerate import Accelerator


part 1: loading the IMDb dataset library

In [2]:
dataset= load_dataset("IMDb")

part 2: pre processing of DATA

In [3]:
checkpoints= "bert-base-uncased"
tokenizer=AutoTokenizer.from_pretrained(checkpoints)

def tokenize_function(abcd):
    return tokenizer(abcd["text"], truncation =True)
tokenized_datasets = dataset.map(tokenize_function,batched=True)

part 2 - contunation- pre_precessing of DATA 

In [4]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(checkpoints, num_labels=2)
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))
test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

In [6]:
from transformers.training_args import IntervalStrategy

training_args = TrainingArguments(
    output_dir="./test",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,           
    per_device_eval_batch_size=8,              
    num_train_epochs=3,                        
    learning_rate=2e-5,                        
    weight_decay=0.01,                         
    warmup_steps=500,                          
    logging_dir="./logs",
    logging_steps=3,
    save_total_limit=2,                        
    load_best_model_at_end=True,              
    metric_for_best_model="accuracy",          
    greater_is_better=True                     
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [7]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3448,0.366924,0.851,0.859566
2,0.5104,0.342427,0.899,0.897044
3,0.2117,0.439901,0.889,0.892338


TrainOutput(global_step=750, training_loss=0.3689359983454148, metrics={'train_runtime': 795.5731, 'train_samples_per_second': 7.542, 'train_steps_per_second': 0.943, 'total_flos': 1463008467778080.0, 'train_loss': 0.3689359983454148, 'epoch': 3.0})

part 4 and 5: Evaluating model's accuracy and saving it

In [8]:
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")
trainer.save_model("./fine-tuned-bert-imdb")
tokenizer.save_pretrained("./fine-tuned-bert-imdb")

Evaluation Results: {'eval_loss': 0.3424273133277893, 'eval_accuracy': 0.899, 'eval_f1': 0.8970438328236493, 'eval_runtime': 27.0399, 'eval_samples_per_second': 36.982, 'eval_steps_per_second': 4.623, 'epoch': 3.0}


('./fine-tuned-bert-imdb/tokenizer_config.json',
 './fine-tuned-bert-imdb/special_tokens_map.json',
 './fine-tuned-bert-imdb/vocab.txt',
 './fine-tuned-bert-imdb/added_tokens.json',
 './fine-tuned-bert-imdb/tokenizer.json')

part 5: demonstrating how to load it for further projects:

In [14]:
from transformers import pipeline

sentiment_pipe = pipeline(
    "sentiment-analysis",
    model="./fine-tuned-bert-imdb",
    tokenizer="./fine-tuned-bert-imdb/"
)

sentiment_pipe("i am happy")


[{'label': 'LABEL_1', 'score': 0.7268068790435791}]