In [None]:
!pi
!pip install transformers

# Loading Dataset

In [1]:
# Load the SST-2 configuration of the GLUE benchmark
from datasets import load_dataset

raw_datasets = load_dataset(path="glue", name="sst2")
raw_datasets  # bare last expression: shows the available splits and their columns

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [2]:
# Look at the training split on its own: its columns and row count
raw_datasets["train"]

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [3]:
# Peek at the first training example: raw sentence text, integer label, and row index
raw_datasets["train"][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

# Data Preprocessing

In [4]:
# Preprocessing setup: load the tokenizer that belongs to the checkpoint
# that is fine-tuned later in this notebook
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

def tokenize_function(example):
    """Tokenize the ``sentence`` field of ``example`` with the notebook's tokenizer.

    Args:
        example: A mapping with a ``"sentence"`` entry. Its value is passed to
            the tokenizer unchanged (a list of sentences when used with
            ``map(..., batched=True)``).

    Returns:
        The tokenizer's output for those sentences. Truncation is enabled;
        no padding is requested here.
    """
    sentences = example["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded

# Tokenize every split in batches; the tokenizer outputs become new columns
# next to the original ones
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [5]:
# Check that every split gained the tokenizer columns
# (input_ids, token_type_ids, attention_mask)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [6]:
# Tokenized training split: same row count as before, with the extra columns
tokenized_datasets["train"]

Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 67349
})

In [7]:
# First tokenized example: token ids are stored unpadded, because
# tokenize_function does not request padding
tokenized_datasets["train"][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0,
 'input_ids': [101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Dynamic Padding

In [8]:
# Dynamic padding: padding is applied by the collator when a batch is assembled,
# not at tokenization time
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

# Fine Tuning

In [9]:
# Training configuration: only the output directory is set explicitly;
# every other option is left at the library default
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="fine_tuned_sst2")

In [10]:
# Build the classifier from the pretrained checkpoint.
# The classification head is newly initialized, so the model must be trained before use.
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,  # two output classes
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install evaluate

In [18]:
import evaluate
import numpy as np
def compute_metrics(eval_preds):
    """Score model predictions with the GLUE SST-2 metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` over its last axis.

    Returns:
        The result of the metric's ``compute`` call for those predictions and
        labels (the evaluation output in this notebook reports an accuracy).
    """
    # Load the metric on first use only and keep it on the function object;
    # reloading it on every evaluation pass is wasted work.
    metric = getattr(compute_metrics, "_metric", None)
    if metric is None:
        metric = evaluate.load("glue", "sst2")
        compute_metrics._metric = metric

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [23]:
# Wire the model, the data, and the metric function into a single Trainer
from transformers import Trainer

trainer = Trainer(
    # what to train, and with which settings
    model=model,
    args=training_args,
    # data: train split for training, validation split for evaluation
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # tokenizer and the collator that pads each batch
    tokenizer=tokenizer,
    data_collator=data_collator,
    # metric function defined earlier in the notebook
    compute_metrics=compute_metrics,
)

In [24]:
# Start fine-tuning: runs the training loop of the Trainer built in the previous cell
trainer.train()

Step,Training Loss
500,0.2602
1000,0.2042
1500,0.2687
2000,0.2482
2500,0.2558
3000,0.2849
3500,0.2503


KeyboardInterrupt: ignored

In [25]:
# Evaluate the current model on the validation split and print the resulting metrics
evaluation_results = trainer.evaluate(
    eval_dataset=tokenized_datasets["validation"],
)
print(evaluation_results)

Step,Training Loss,Validation Loss,Accuracy
500,0.2602,,
1000,0.2042,,
1500,0.2687,,
2000,0.2482,,
2500,0.2558,,
3000,0.2849,,
3500,0.2503,,
3794,0.2503,0.630051,0.856651


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'eval_loss': 0.6300514936447144, 'eval_accuracy': 0.856651376146789}


In [26]:
# Persist the fine-tuned model under a local directory
trainer.save_model(output_dir="fine_tuned_sst2_model")