In [2]:
!pip install --upgrade datasets fsspec transformers

Collecting datasets
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting fsspec
  Downloading fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.3.0-py3-none-any.whl (506 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.9.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.3/199.3 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, fsspec, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Unins

In [3]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [4]:
## Other dataset which can be used
# load_dataset("ag_news")
# load_dataset("dbpedia_14")

In [5]:
## Load IMDB Dataset
dataset = load_dataset("imdb")

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [7]:
## Taking only a range of sample for training

train_dataset = dataset['train'].select(range(1000))
test_dataset = dataset['test'].select(range(500))

In [8]:
## Initialize tokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [10]:
train_dataset[0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [11]:
# tokenize and add padding to the text in order to make it same length

def tokenize_fn(data):
    return tokenizer(data["text"], padding="max_length", truncation=True, max_length=256)

In [12]:
# Apply tokenization + rename + format in a single flow
def preprocess(ds):
    ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])  # remove raw text (saves memory)
    ds = ds.rename_column("label", "labels")
    ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return ds

In [13]:
## Preprocess the train and test dataset

train_dataset = preprocess(train_dataset)
test_dataset = preprocess(test_dataset)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [26]:
train_dataset[0]

{'labels': tensor(0),
 'input_ids': tensor([  101,  1045, 12524,  1045,  2572,  8025,  1011,  3756,  2013,  2026,
          2678,  3573,  2138,  1997,  2035,  1996,  6704,  2008,  5129,  2009,
          2043,  2009,  2001,  2034,  2207,  1999,  3476,  1012,  1045,  2036,
          2657,  2008,  2012,  2034,  2009,  2001,  8243,  2011,  1057,  1012,
          1055,  1012,  8205,  2065,  2009,  2412,  2699,  2000,  4607,  2023,
          2406,  1010,  3568,  2108,  1037,  5470,  1997,  3152,  2641,  1000,
          6801,  1000,  1045,  2428,  2018,  2000,  2156,  2023,  2005,  2870,
          1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,  1028,  1996,
          5436,  2003,  8857,  2105,  1037,  2402,  4467,  3689,  3076,  2315,
         14229,  2040,  4122,  2000,  4553,  2673,  2016,  2064,  2055,  2166,
          1012,  1999,  3327,  2016,  4122,  2000,  3579,  2014,  3086,  2015,
          2000,  2437,  2070,  4066,  1997,  4516,  2006,  2054,  1996,  2779,
         25430, 1

In [14]:
## Load the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
## Check the layer of BERT
for layer in model.bert.encoder.layer:
  print(layer)

BertLayer(
  (attention): BertAttention(
    (self): BertSdpaSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
    (intermediate_act_fn): GELUActivation()
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)
BertLayer(
  (attention): BertAttention(
    (self): BertSdpaSelfAttention(
 

In [16]:
"""
This section controls fine-tuning behavior for the BERT model.

- By default, the classifier head remains trainable.
- Optionally, all BERT encoder layers can be frozen to retain pretrained knowledge.
- The last two encoder layers can then be unfrozen to allow limited task-specific adaptation.

This approach helps balance performance, training time, and model stability during fine-tuning.
"""
## Classifier head is trainable by default
## This is just reference code for later use incase needed

# for param in model.bert.parameters():
#     param.requires_grad = False  # Freeze BERT encoder

## Unfreeze last 2 encoder layers

# for layer in model.bert.encoder.layer[-2:]:
#     for param in layer.parameters():
#         param.requires_grad = True


'\nThis section controls fine-tuning behavior for the BERT model.\n\n- By default, the classifier head remains trainable.\n- Optionally, all BERT encoder layers can be frozen to retain pretrained knowledge.\n- The last two encoder layers can then be unfrozen to allow limited task-specific adaptation.\n\nThis approach helps balance performance, training time, and model stability during fine-tuning.\n'

In [17]:
## Set the training arguments

from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./bert-finetuned-imdb",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    logging_dir="./logs",
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="none"
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [19]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=125, training_loss=0.021256128311157228, metrics={'train_runtime': 54.4987, 'train_samples_per_second': 18.349, 'train_steps_per_second': 2.294, 'total_flos': 131555527680000.0, 'train_loss': 0.021256128311157228, 'epoch': 1.0})

In [20]:
## Save model and tokenizer

trainer.save_model("./bert-finetuned-imdb")
tokenizer.save_pretrained("./bert-finetuned-imdb")


('./bert-finetuned-imdb/tokenizer_config.json',
 './bert-finetuned-imdb/special_tokens_map.json',
 './bert-finetuned-imdb/vocab.txt',
 './bert-finetuned-imdb/added_tokens.json')

In [21]:
## Metrics

metrics = trainer.evaluate()
print(metrics)

{'eval_loss': 0.0010020863264799118, 'eval_runtime': 6.9307, 'eval_samples_per_second': 72.143, 'eval_steps_per_second': 9.09, 'epoch': 1.0}


## Prediction

In [22]:
## Load the tokenizer and model from the dir that were saved earlier

tokenizer = BertTokenizer.from_pretrained("/content/bert-finetuned-imdb")
model = BertForSequenceClassification.from_pretrained("/content/bert-finetuned-imdb")

In [27]:
from transformers import pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)


# Predict
text = "This movie was bad"
result = classifier(text)

print(result)  # Example: [{'label': 'POSITIVE', 'score': 0.98}]


Device set to use cuda:0


[{'label': 'LABEL_0', 'score': 0.9930747747421265}]


## Pushing the model to HuggingFace

In [28]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [29]:
from huggingface_hub import whoami
print(whoami())

{'type': 'user', 'id': '662bb4fd52e194d5d4193924', 'name': 'msaifee', 'fullname': 'Murtuza Saifee', 'isPro': False, 'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/no-auth/KtEkHk3QO_LziJ9jJCSmN.png', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'HF_Token', 'role': 'fineGrained', 'createdAt': '2025-02-07T14:28:32.493Z', 'fineGrained': {'canReadGatedRepos': True, 'global': [], 'scoped': [{'entity': {'_id': '662bb4fd52e194d5d4193924', 'type': 'user', 'name': 'msaifee'}, 'permissions': ['repo.content.read', 'repo.write', 'inference.endpoints.infer.write', 'inference.endpoints.write', 'user.webhooks.read', 'collection.read', 'collection.write', 'inference.serverless.write']}]}}}}


In [30]:
tokenizer.push_to_hub("msaifee/bert-imdb-susbset")
trainer.push_to_hub("msaifee/bert-imdb-susbset")


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ed-imdb/training_args.bin: 100%|##########| 5.78kB / 5.78kB            

  ...ed-imdb/model.safetensors:   1%|1         | 6.32MB /  438MB            

CommitInfo(commit_url='https://huggingface.co/msaifee/bert-finetuned-imdb/commit/4b28eb1939f68a50662d5afad86a67a1ff1f4388', commit_message='msaifee/bert-imdb-susbset', commit_description='', oid='4b28eb1939f68a50662d5afad86a67a1ff1f4388', pr_url=None, repo_url=RepoUrl('https://huggingface.co/msaifee/bert-finetuned-imdb', endpoint='https://huggingface.co', repo_type='model', repo_id='msaifee/bert-finetuned-imdb'), pr_revision=None, pr_num=None)

In [31]:
'''
Complete code with the dynamic padding as well
'''

# from datasets import load_dataset
# from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# # 1. Load IMDB dataset (subset for speed)
# dataset = load_dataset("imdb")
# train_dataset = dataset["train"].select(range(1000))
# test_dataset = dataset["test"].select(range(500))

# # 2. Initialize tokenizer
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# # 3. Tokenization function (no fixed padding here)
# def tokenize_fn(data):
#     return tokenizer(data["text"], truncation=True, max_length=256)

# # 4. Preprocess dataset (map + rename + torch format)
# def preprocess(ds):
#     ds = ds.map(tokenize_fn, batched=True, remove_columns=["text"])  # remove raw text
#     ds = ds.rename_column("label", "labels")
#     ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
#     return ds

# train_dataset = preprocess(train_dataset)
# test_dataset = preprocess(test_dataset)

# # 5. Initialize model
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# # 6. Data collator (dynamic padding)
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# # 7. Training arguments
# training_args = TrainingArguments(
#     output_dir="./bert-finetuned-imdb",
#     num_train_epochs=1,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     logging_dir="./logs",
#     logging_steps=50,
#     learning_rate=2e-5,
#     weight_decay=0.01,
#     eval_steps=500,
#     save_steps=500,
#     save_total_limit=1,
#     report_to="none"
# )

# # 8. Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     data_collator=data_collator,  # dynamic padding here
# )

# # 9. Train
# trainer.train()

'\nComplete code with the dynamic padding as well\n'