In [72]:
# @title Installing Transformers
from IPython.display import clear_output
!pip install transformers
!pip install datasets
!pip install evaluate
from google.colab import drive
drive.mount('/content/drive')
clear_output()

In [73]:
# @title Importing Requirements

from transformers import (
    BertConfig,
    BertTokenizer,
    TFBertModel,
    BertModel,
    glue_processors,
    glue_convert_examples_to_features,
    set_seed
)
from transformers.optimization_tf import create_optimizer

import tensorflow as tf
import tensorflow_datasets
import numpy as np
import copy 
import os
import pandas as pd

In [67]:
from datasets import load_dataset

# Fetch the SST-2 configuration of the GLUE benchmark (all available splits).
dataset = load_dataset(path="glue", name="sst2")



  0%|          | 0/3 [00:00<?, ?it/s]

In [68]:
# Display the DatasetDict summary: split names, column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

### Label Distribution

In [69]:
# Share (in percent) of training examples carrying each label value.
label_values, label_counts = np.unique(dataset['train']['label'], return_counts=True)
label_percentages = label_counts / label_counts.sum() * 100

# Print one (label, percentage) tuple per line.
for label_and_percentage in zip(label_values, label_percentages):
    print(label_and_percentage)

(0, 44.21743455730597)
(1, 55.78256544269403)


### Average sentence length (space-separated words)

In [70]:
# Split every training sentence on single spaces and average the piece counts.
# NOTE: this counts space-separated words, not tokenizer tokens.
words_per_sentence = [
    len(sentence.split(' ')) for sentence in dataset['train']['sentence']
]
np.mean(words_per_sentence)

10.409553222765

In [None]:
from transformers import AutoTokenizer

# Name of the checkpoint whose tokenizer is loaded.
TOKENIZER_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [75]:
def tokenize_function(examples, max_length=64):
    """Tokenize the ``"sentence"`` entry of a batch into fixed-length inputs.

    Written to be passed to ``dataset.map(..., batched=True)``.

    Args:
        examples: Mapping with a ``"sentence"`` entry holding the text to tokenize.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 64, the value this notebook has always used.

    Returns:
        The object returned by the module-level ``tokenizer``. Attention masks
        and token type ids are requested alongside the input ids.
    """
    # padding="max_length" already pads every sequence up to ``max_length``, so the
    # deprecated and redundant ``pad_to_max_length`` flag is no longer passed.
    return tokenizer(
        examples["sentence"],
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_attention_mask=True,
        return_token_type_ids=True,
    )

In [76]:
# Apply tokenize_function to every split; batched=True passes it batches of rows.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

  0%|          | 0/68 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [77]:
# Inspect the validation split after tokenization (column names and row count).
tokenized_datasets['validation']

Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 872
})

### Select how much data to train on

In [78]:
# Number of (shuffled) training examples to fine-tune on.
# None keeps the whole training split; set e.g. 1000 for a quick experiment.
TRAIN_SUBSET_SIZE = None

small_train_dataset = tokenized_datasets["train"].shuffle(seed=42)
if TRAIN_SUBSET_SIZE is not None:
    # Clamp so a value above the split size selects everything instead of failing.
    subset_size = min(TRAIN_SUBSET_SIZE, len(small_train_dataset))
    small_train_dataset = small_train_dataset.select(range(subset_size))

# The validation split is always used in full.
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42)

### Fine-tune new model or existing model

In [79]:
# Checkpoint handed to from_pretrained() when the model is created.
# Keep the first line to start from 'bert-base-uncased'; swap to the commented
# Drive path below to start from a previously saved fine-tuned model instead.
model_path = 'bert-base-uncased'
# model_path = '/content/drive/MyDrive/save_checkpoints/DataSize/Fine-Tuned-Models/sst2-full-42'

In [None]:
from transformers import BertForSequenceClassification

# Size of the classification head: one output per label value.
NUM_LABELS = 2
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=NUM_LABELS)

In [81]:
import numpy as np
import evaluate

# Accuracy metric object; compute_metrics calls its compute() method.
metric = evaluate.load(path="accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [82]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    raw_scores, gold_labels = eval_pred
    # The index of the largest score along the last axis is the predicted class.
    predicted_labels = np.asarray(raw_scores).argmax(axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_labels)

In [83]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Output directory and checkpoint cadence.
    output_dir="sst2",
    save_strategy='epoch',
    # Training schedule.
    do_train=True,
    num_train_epochs=5,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    # Evaluation cadence and batch size.
    evaluation_strategy="epoch",
    per_device_eval_batch_size=16,
    # Seed passed to the training run.
    seed=42,
)



In [84]:
# Bundle the model, the training arguments, both datasets and the metric callback.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

In [None]:
# Start fine-tuning with the trainer configured above in this notebook.
trainer.train()

##### Copy fine-tuned model to Drive

In [None]:
# Write the fine-tuned model directly into the mounted Drive folder.
# A shell line does not expand the Python variable `model_path` (and that variable
# names the starting checkpoint, not the training output), so the trainer is asked
# to save the model instead. Drive is mounted under "MyDrive" (capital M).
DRIVE_MODEL_DIR = "/content/drive/MyDrive/models/sst2"
trainer.save_model(DRIVE_MODEL_DIR)