### loading the data

In [1]:
from datasets import load_dataset, DatasetDict

# Fetch the "imdb" dataset through the `datasets` library
# (it provides train, test and unsupervised splits).
imdb_dataset = load_dataset("imdb")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

### structuring the data

In [2]:
# Shuffle the original training split once with a fixed seed, then carve it
# into the first 20,000 rows for training and the remaining 5,000 rows for
# validation. The original test split is passed through unchanged.
shuffled_train = imdb_dataset["train"].shuffle(seed=1111)
imdb = DatasetDict(
    train=shuffled_train.select(range(20000)),
    val=shuffled_train.select(range(20000, 25000)),
    test=imdb_dataset["test"],
)
print(imdb)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 20000
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})


### loading the tokenizer

In [3]:
from transformers import BertTokenizer

# Load the tokenizer for the "bert-base-uncased" checkpoint
# (the same checkpoint name the classification model is loaded from later).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### tokenizing the data

In [4]:
def tokenize_batch(batch):
    """Tokenize a batch of reviews, padded or truncated to exactly 256 tokens."""
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )


# Tokenize every split in batches of 64 examples, drop the raw text column
# and rename the target column from "label" to "labels".
tokenized_dataset = (
    imdb.map(tokenize_batch, batched=True, batch_size=64)
    .remove_columns(["text"])
    .rename_columns({"label": "labels"})
)
# Have the dataset hand back torch tensors when indexed.
tokenized_dataset.set_format("torch")


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [5]:
# Inspect the splits after tokenization (column names and row counts).
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 20000
    })
    val: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
})


### fine-tuning using the Trainer API

In [None]:
from transformers import TrainingArguments, Trainer
from transformers import BertForSequenceClassification
import numpy as np
from sklearn.metrics import f1_score
# BERT encoder with a 2-class sequence-classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

arguments = TrainingArguments(
    # Output location, logging integrations and reproducibility.
    output_dir="checkpoints",
    report_to="none",
    seed=224,
    # Optimisation settings.
    learning_rate=2e-5,
    num_train_epochs=3,
    fp16=True,
    # 4 examples per device, accumulated over 16 steps before each update.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)


def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is
    the arg-max over the last axis of ``logits``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return {
        "accuracy": np.mean(predicted == labels),
        "f1": f1_score(labels, predicted, average="weighted"),
    }


# Wire the model, training arguments, train/validation splits and metric
# function into a Trainer.
# The tokenizer is passed as `processing_class`, which replaces the
# deprecated `tokenizer=` keyword of `Trainer.__init__`.
trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [26]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.22004,0.912,0.912
2,0.249000,0.228313,0.9138,0.9138
3,0.249000,0.262003,0.9142,0.914196


TrainOutput(global_step=939, training_loss=0.1907820371520151, metrics={'train_runtime': 1016.6875, 'train_samples_per_second': 59.015, 'train_steps_per_second': 0.924, 'total_flos': 7893331660800000.0, 'train_loss': 0.1907820371520151, 'epoch': 3.0})

### saving the model

In [None]:
# Save the best model (the training arguments set load_best_model_at_end=True).
trainer.model.save_pretrained("./best_model")
# Save the tokenizer object directly instead of going through the deprecated
# `trainer.tokenizer` accessor; it is the same tokenizer given to the Trainer.
tokenizer.save_pretrained("./best_model")

### evaluating on the test set

In [27]:
# Evaluate on the test split and show the resulting metric dictionary.
results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(results)

{'eval_loss': 0.2140759527683258, 'eval_accuracy': 0.91544, 'eval_f1': 0.9154371944090613, 'eval_runtime': 113.1951, 'eval_samples_per_second': 220.858, 'eval_steps_per_second': 55.214, 'epoch': 3.0}


### loading the saved fine-tuned model

In [6]:
from transformers import BertForSequenceClassification, BertTokenizer

# Reload the model and tokenizer from the directory they were saved to.
saved_dir = "./best_model"
loaded_model = BertForSequenceClassification.from_pretrained(saved_dir)
loaded_tokenizer = BertTokenizer.from_pretrained(saved_dir)

### Predict function

In [7]:
import torch
def predict_sentiment(model, tokenizer, text):
    """Classify a single piece of text as ``'NEGATIVE'`` or ``'POSITIVE'``.

    Args:
        model: Callable classifier; ``model(**inputs).logits`` must hold one
            score per class, with index 0 mapped to ``'NEGATIVE'`` and
            index 1 to ``'POSITIVE'``.
        tokenizer: Callable that converts ``text`` into the keyword inputs
            expected by ``model``.
        text: A single string to classify.

    Returns:
        The label string of the highest-scoring class.
    """
    # Ask the tokenizer to truncate so an over-long text is cut to the
    # tokenizer's configured maximum length instead of being passed whole.
    model_inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        # Use the `model` argument, not a notebook-level global.
        logits = model(**model_inputs).logits
    # Convert the winning class index to a plain int before indexing the list.
    pred = torch.argmax(logits, dim=-1).item()
    return ["NEGATIVE", "POSITIVE"][pred]

# Quick check of the helper on one short sentence using the reloaded model and tokenizer.
print(predict_sentiment(loaded_model, loaded_tokenizer, "the movie was not good"))

NEGATIVE


### examples

In [8]:
# Spot-check a handful of hand-written reviews, printing one label per line.
sample_reviews = [
    "the movie was ok but not good",
    "it is not satisfying",
    "This is disastrous",
    "the movie is enjoyable",
    "the screeplay is average but the movie overal is good",
]
for review in sample_reviews:
    print(predict_sentiment(loaded_model, loaded_tokenizer, review))

NEGATIVE
NEGATIVE
NEGATIVE
POSITIVE
POSITIVE


In [9]:
# Upload the model and then the tokenizer to the same Hub repository.
hub_repo_id = "koushik-25/bert-imdb-sentiment"
loaded_model.push_to_hub(hub_repo_id)
loaded_tokenizer.push_to_hub(hub_repo_id)

CommitInfo(commit_url='https://huggingface.co/koushik-25/bert-imdb-sentiment/commit/8ba2f8eee410674c5714c707b1367d0a98a07074', commit_message='Upload tokenizer', commit_description='', oid='8ba2f8eee410674c5714c707b1367d0a98a07074', pr_url=None, repo_url=RepoUrl('https://huggingface.co/koushik-25/bert-imdb-sentiment', endpoint='https://huggingface.co', repo_type='model', repo_id='koushik-25/bert-imdb-sentiment'), pr_revision=None, pr_num=None)