Resource consulted: https://skimai.com/fine-tuning-bert-for-sentiment-analysis/

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 14.4 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 53.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 72.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 59.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 624 kB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers


In [None]:
# Ask for the training CSV through the Colab upload helper. The object returned
# by files.upload() is kept in CS_file; the next cell indexes it by file name and
# wraps the value in io.BytesIO, i.e. it is used as a {file name: bytes} mapping.
from google.colab import files
CS_file = files.upload()

Saving CovidSentimentData.csv to CovidSentimentData.csv


In [None]:
import io
import pandas as pd

# Load the uploaded CSV into a DataFrame.
# The upload mapping is keyed by file name, so a hard-coded key fails with a bare
# KeyError whenever the file was stored under a different name (for instance if
# it was renamed before or during upload). Prefer the expected name, but fall
# back to the only uploaded file when exactly one file is present.
DATA_FILENAME = 'CovidSentimentData.csv'
if DATA_FILENAME in CS_file:
    uploaded_bytes = CS_file[DATA_FILENAME]
elif len(CS_file) == 1:
    uploaded_bytes = next(iter(CS_file.values()))
else:
    raise KeyError(
        f"Expected an uploaded file named {DATA_FILENAME!r}; got {sorted(CS_file)!r}"
    )

data = pd.read_csv(io.BytesIO(uploaded_bytes))

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback


# Read data
#data = pd.read_csv("train.csv")

# Pretrained checkpoint shared by the tokenizer and the 2-label classifier.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model = model.train()  # switch the module to training mode


def lambda1(epoch):
    """Learning-rate multiplier: 0.65 raised to the value the scheduler passes in."""
    return 0.65 ** epoch


# Hand-built optimizer/scheduler pair. The Trainer call later in this notebook
# has its `optimizers=` argument commented out, so this pair is only prepared here.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, eps=1e-8)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=lambda1)
optimizers = (optimizer, scheduler)

# ----- 1. Preprocess data -----#
# Pull the tweet texts and their targets out of the frame as plain Python lists.
X = list(data["Tweet"])
y = list(data["target"])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Tokenize each split on its own with identical settings (padding enabled,
# truncation enabled, at most 512 tokens per sequence).
X_train_tokenized, X_val_tokenized = (
    tokenizer(texts, padding=True, truncation=True, max_length=512)
    for texts in (X_train, X_val)
)

# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from field name (must include ``"input_ids"``) to an
            indexable collection holding one entry per example.
        labels: Optional indexable collection with one label per example
            (list, numpy array, tensor, ...). ``None`` or an empty collection
            yields items without a ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus ``"labels"`` if available."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of `if self.labels:`: truthiness of a
        # multi-element numpy array or tensor is ambiguous and raises, while an
        # empty collection must still behave as "no labels".
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` field."""
        return len(self.encodings["input_ids"])

# Pair each tokenized split with its labels so the Trainer can index examples.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# ----- 2. Fine-tune pretrained model -----#
# Define Trainer parameters
def compute_metrics(p):
    """Compute classification metrics for the Trainer's evaluation loop.

    Args:
        p: A pair ``(predictions, labels)``. ``predictions`` holds one row of
            per-class scores per example; ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    pred, labels = p
    # Highest-scoring class per example.
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # zero_division=0 makes the fallback explicit when a metric is undefined
    # (e.g. no positive labels or no positive predictions in a small batch).
    # The reported value stays 0, but scikit-learn no longer emits an
    # undefined-metric warning on every evaluation.
    recall = recall_score(y_true=labels, y_pred=pred, zero_division=0)
    precision = precision_score(y_true=labels, y_pred=pred, zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, zero_division=0)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Define Trainer
# Early-stopping callback configured with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Training configuration: 3 epochs, batches of 4 per device for both training
# and evaluation, evaluation every 100 steps, best model reloaded at the end.
# NOTE(review): no save interval is set here, while the captured log shows
# checkpoints at steps 500 and 1000 -- confirm that granularity is what
# load_best_model_at_end is meant to choose from.
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True,
    seed=19,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    # optimizers=optimizers,  # custom optimizer/scheduler pair intentionally not passed
)

# Train pre-trained model; as the cell's last expression the result is displayed.
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 15300
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 11475


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,No log,0.607305,0.827496,0.827496,1.0,0.905606
200,No log,0.491077,0.827496,0.827496,1.0,0.905606
300,No log,0.461858,0.827496,0.827496,1.0,0.905606
400,No log,0.464268,0.827496,0.827496,1.0,0.905606
500,0.530700,0.488258,0.827496,0.827496,1.0,0.905606
600,0.530700,0.47161,0.827496,0.827496,1.0,0.905606
700,0.530700,0.46196,0.827496,0.827496,1.0,0.905606
800,0.530700,0.459857,0.827496,0.827496,1.0,0.905606
900,0.530700,0.471234,0.827496,0.827496,1.0,0.905606
1000,0.500600,0.51099,0.827496,0.827496,1.0,0.905606


***** Running Evaluation *****
  Num examples = 3826
  Batch size = 4
***** Running Evaluation *****
  Num examples = 3826
  Batch size = 4
***** Running Evaluation *****
  Num examples = 3826
  Batch size = 4
***** Running Evaluation *****
  Num examples = 3826
  Batch size = 4
***** Running Evaluation *****
  Num examples = 3826
  Batch size = 4
Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3826
  Batch size = 4
***** Running Evaluation *****
  Num examples = 3826
  Batch size = 4
***** Running Evaluation *****
  Num examples = 3826
  Batch size = 4
***** Running Evaluation *****
  Num examples = 3826
  Batch size = 4
***** Running Evaluation *****
  Num examples = 3826
  Batch size = 4
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in 

TrainOutput(global_step=2100, training_loss=0.5165888123285203, metrics={'train_runtime': 441.0912, 'train_samples_per_second': 104.06, 'train_steps_per_second': 26.015, 'total_flos': 561166547760000.0, 'train_loss': 0.5165888123285203, 'epoch': 0.55})

In [None]:
# Ask for the prediction-score CSV through the Colab upload helper and keep the
# returned object in PS_file.
# NOTE(review): the recorded run of this cell was interrupted, and the next cell
# reads '/content/prediction_score.csv' from disk rather than using PS_file --
# confirm whether this upload step is still required.
from google.colab import files
PS_file = files.upload()

KeyboardInterrupt: ignored

In [None]:
# Read the prediction-score CSV from a fixed absolute path into a DataFrame.
# The prediction cell below reads its "Tweet" and "target" columns.
ps_data = pd.read_csv('/content/prediction_score.csv')

In [None]:
# ----- 3. Predict -----#
# Score the tweets held in `ps_data` with the trainer built earlier in this notebook.
test_data = ps_data

X_test = list(test_data["Tweet"])
y_test = list(test_data["target"])

# Same tokenizer settings as for the training and validation splits.
X_test_tokenized = tokenizer(
    X_test,
    padding=True,
    truncation=True,
    max_length=512,
)

# Create torch dataset (texts plus their targets).
test_dataset = Dataset(encodings=X_test_tokenized, labels=y_test)

# No checkpoint is reloaded here: earlier experiments with reloading a saved
# checkpoint, a separate test trainer and a privacy-engine query were disabled,
# so prediction goes through the existing `trainer` object.
raw_pred, _, _ = trainer.predict(test_dataset)

# Turn the per-class scores into a predicted class index per example.
y_pred = np.argmax(raw_pred, axis=1)


***** Running Prediction *****
  Num examples = 5
  Batch size = 4


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Display the predicted class index for each test example.
y_pred

array([1, 1, 1, 1, 1])

In [None]:
# Display the raw per-class scores returned by trainer.predict (argmax input).
raw_pred

array([[-1.5001374 ,  0.74881935],
       [-1.5001309 ,  0.7488121 ],
       [-1.5001175 ,  0.7488279 ],
       [-1.500128  ,  0.74884063],
       [-1.5001231 ,  0.74882114]], dtype=float32)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
def evaluate(model, dataloader=None, device=None):
    """Evaluate ``model`` on labelled batches and return ``(mean_loss, accuracy)``.

    Args:
        model: Callable module accepting ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` keyword arguments and returning a
            sequence whose first two entries are ``(loss, logits)``.
        dataloader: Iterable of batches; each batch is a sequence of tensors
            ordered as (input_ids, attention_mask, token_type_ids, labels).
            Defaults to the notebook-level ``test_dataloader``.
        device: Device the batch tensors are moved to. Defaults to the device
            of the model's first parameter (CPU if it has no parameters).

    Returns:
        tuple: mean of the per-batch losses, and the fraction of individual
        examples whose arg-max prediction equals the label.

    Raises:
        ValueError: if the dataloader yields no batches.

    The model is put into eval mode for the duration of the call and switched
    back to train mode afterwards, even when an exception is raised.
    """
    if dataloader is None:
        # Backward-compatible fallback: single-argument calls still use the
        # notebook-level loader.
        dataloader = test_dataloader
    if device is None:
        # Inferring the device from the model avoids relying on a global
        # `device` that must match the model's placement anyway.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    batch_losses = []
    batch_preds = []
    batch_labels = []

    model.eval()
    try:
        with torch.no_grad():
            for batch in dataloader:
                batch = tuple(t.to(device) for t in batch)
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2],
                          'labels':         batch[3]}

                outputs = model(**inputs)
                loss, logits = outputs[:2]

                batch_losses.append(loss.item())
                batch_preds.append(np.argmax(logits.detach().cpu().numpy(), axis=1))
                batch_labels.append(inputs['labels'].detach().cpu().numpy().reshape(-1))
    finally:
        model.train()

    if not batch_losses:
        raise ValueError("evaluate() received no batches to evaluate")

    # Compare per example, not per batch: comparing whole batch arrays inside an
    # `if` is ambiguous for batch sizes above one and would count batches
    # rather than samples.
    all_preds = np.concatenate(batch_preds)
    all_labels = np.concatenate(batch_labels)
    acc = float(np.mean(all_preds == all_labels))

    return np.mean(batch_losses), acc

In [None]:
# ----- 3. Predict -----#
# Unlabelled prediction from a saved checkpoint.
# NOTE(review): this cell reads "test.csv" with a "review" column, while the
# cells above use "Tweet"/"target" columns -- confirm the intended input file.
test_data = pd.read_csv("test.csv")
X_test = list(test_data["review"])
X_test_tokenized = tokenizer(
    X_test,
    padding=True,
    truncation=True,
    max_length=512,
)

# Create torch dataset; no labels are supplied, so items carry only the encodings.
test_dataset = Dataset(encodings=X_test_tokenized)

# Load trained model
# NOTE(review): the captured training log reports 11475 total optimization steps,
# so verify that "output/checkpoint-50000" exists before running this cell.
model_path = "output/checkpoint-50000"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

# Define test trainer: a Trainer wrapping only the reloaded model.
test_trainer = Trainer(model)

# Make prediction
raw_pred, _, _ = test_trainer.predict(test_dataset)

# Preprocess raw predictions: predicted class index per example.
y_pred = np.argmax(raw_pred, axis=1)