In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
!pip install transformers[torch]
!pip install accelerate
!pip install evaluate

In [96]:
import pandas as pd
from transformers import Trainer, get_scheduler, TrainingArguments, AutoModelForSequenceClassification
from accelerate import Accelerator
import pickle
from torch.optim import AdamW
import torch
import numpy as np
import evaluate
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import classification_report




In [4]:
# Load tuned hyperparameters
with open('/content/drive/MyDrive/BERT Sentiment/output/best_hyperparameters.pkl', 'rb') as infile:
    best_hyperparameters = pickle.load(infile)


In [5]:
best_hyperparameters


{'learning_rate': 4.122342215733177e-05,
 'num_train_epochs': 1,
 'gradient_accumulation_steps': 2,
 'per_device_train_batch_size': 12,
 'evaluation_strategy': 'epoch',
 'per_device_eval_batch_size': 5,
 'warmup_steps': 391,
 'weight_decay': 0.08139944860406301}

In [6]:
# Create Distilbert Model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2)


Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Set training arguments
training_args = TrainingArguments(learning_rate = best_hyperparameters['learning_rate'],
                                  num_train_epochs = best_hyperparameters['num_train_epochs'],
                                  per_device_train_batch_size = best_hyperparameters['per_device_train_batch_size'],
                                  per_device_eval_batch_size = best_hyperparameters['per_device_eval_batch_size'],
                                  gradient_accumulation_steps = best_hyperparameters['gradient_accumulation_steps'],
                                  weight_decay = best_hyperparameters['weight_decay'],
                                  warmup_steps = best_hyperparameters['warmup_steps'],
                                  evaluation_strategy = best_hyperparameters['evaluation_strategy'],
                                  output_dir='drive/MyDrive/BERT Sentiment/output',
                                  logging_dir='drive/MyDrive/BERT Sentiment/output/logs',
                                  logging_steps=1000)


In [8]:
# Create evaluation metric
metric = evaluate.load('f1')

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [9]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)



In [10]:
# Load about a million datapoints as the train dataset
train_sets = [pd.read_parquet('/content/drive/MyDrive/BERT Sentiment/CSVs/train_inputs_1.parquet'),
              pd.read_parquet('/content/drive/MyDrive/BERT Sentiment/CSVs/train_inputs_2.parquet'),
              pd.read_parquet('/content/drive/MyDrive/BERT Sentiment/CSVs/train_inputs_3.parquet')]





In [11]:
# Reduce training size
train_set = pd.concat(train_sets)[:500000]

In [12]:
# Clear duplicate from memory
del(train_sets)

In [13]:
# Choose more training data to be eval set
eval_set = pd.read_parquet('/content/drive/MyDrive/BERT Sentiment/CSVs/train_inputs_5.parquet')


In [15]:
train_set.head()

Unnamed: 0,index,input_ids,attention_mask
0,2025630,"[101, 6331, 1024, 2293, 1996, 6331, 1998, 1996...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,198034,"[101, 2023, 2003, 2011, 2521, 2026, 5440, 2173...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,2418716,"[101, 1996, 2190, 1997, 1996, 6386, 1005, 2220...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,1371917,"[101, 1043, 2135, 17119, 3170, 2192, 7242, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,1524295,"[101, 5013, 7641, 2041, 1011, 1011, 3828, 2115...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [16]:
# Retrieve labels for train/eval data
y_train = pd.read_csv('/content/drive/MyDrive/BERT Sentiment/CSVs/y_train_full.csv')


In [17]:
y_train.columns = ['index','labels']

In [18]:
y_train.head()

Unnamed: 0,index,labels
0,2522958,1
1,1160125,2
2,861121,1
3,300957,1
4,1610389,2


In [19]:
# Append labels to datasets
train_set = pd.merge(train_set, y_train, on='index')
eval_set = pd.merge(eval_set, y_train, on='index')


In [21]:
eval_set.head()

Unnamed: 0,index,input_ids,attention_mask,labels
0,2618276,"[101, 2025, 1037, 2128, 23567, 2075, 1024, 263...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1,4576001,"[101, 12476, 999, 999, 999, 999, 1024, 2023, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
2,3826117,"[101, 2023, 2003, 2005, 2613, 1024, 2005, 2061...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
3,1337471,"[101, 6581, 3185, 1024, 5006, 18463, 2001, 202...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
4,4587331,"[101, 16030, 1010, 3154, 1010, 3622, 1010, 140...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2


In [22]:
# Reformat labels from 1 and 2 to 0 and 1
train_set['labels'] = train_set['labels'] - 1
eval_set['labels'] = eval_set['labels'] - 1

In [23]:
# Format dataframe for use in model
train_set.drop(columns=['index'], inplace=True)
eval_set.drop(columns=['index'], inplace=True)

In [24]:
# Convert dataframes to torch dataset objects
class SentimentDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

  def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

  def __len__(self):
        return len(self.labels)

In [25]:
train_dataset = SentimentDataset(train_set, train_set['labels'])
eval_dataset = SentimentDataset(eval_set, eval_set['labels'])

In [26]:
# Create objects for parallel training
accelerator = Accelerator()

In [27]:
train_dataloader = DataLoader(train_dataset, batch_size=48, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=48, shuffle=True)


In [28]:
optimizer = AdamW(model.parameters(), lr=best_hyperparameters['learning_rate'])

In [29]:
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(train_dataloader, eval_dataloader, model, optimizer)


In [30]:
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics)

In [31]:
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


In [32]:
progress_bar = tqdm(range(num_training_steps))

  0%|          | 0/31251 [00:00<?, ?it/s]

In [33]:
# Train model
model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    outputs = model(**batch)
    loss = outputs.loss
    accelerator.backward(loss)
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)


100%|██████████| 31251/31251 [4:25:56<00:00,  2.15it/s]

In [41]:
# Save trained model
torch.save(model.state_dict(), 'drive/MyDrive/BERT Sentiment/Models/sentiment_model.pt')


In [46]:
# Test model on unseen dataset
evaluation_set = pd.read_parquet('/content/drive/MyDrive/BERT Sentiment/CSVs/train_inputs_4.parquet')


In [56]:
y_train = pd.read_csv('/content/drive/MyDrive/BERT Sentiment/CSVs/y_train_full.csv')
y_train.columns = ['index','labels']
y_train['labels'] = y_train['labels'] - 1


In [57]:
y_train.head()

Unnamed: 0,index,labels
0,2522958,0
1,1160125,1
2,861121,0
3,300957,0
4,1610389,1


In [60]:
evaluation_set = evaluation_set.merge(y_train, on='index').reset_index(drop=True)

In [61]:
evaluation_set.head()

Unnamed: 0,index,input_ids,attention_mask,labels
0,1309933,"[101, 3645, 5818, 7367, 2022, 8059, 999, 1024,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
1,3193510,"[101, 13044, 2987, 1005, 1056, 2147, 1024, 202...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0
2,3806632,"[101, 2009, 1005, 1055, 3959, 2395, 2061, 2009...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
3,319434,"[101, 2023, 2003, 1996, 2190, 2173, 2057, 2031...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
4,654774,"[101, 2470, 2077, 9343, 1024, 1045, 2031, 4149...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0


In [62]:
evaluation_dataset = SentimentDataset(evaluation_set, evaluation_set['labels'])


In [63]:
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
total_loss = 0
y_pred = []
y_true = []

with torch.no_grad():
  for batch in tqdm(evaluation_dataset):
    input_ids = batch['input_ids'].to('cuda:0')
    attention_mask = batch['attention_mask'].to('cuda:0')
    labels = batch['labels'].to('cuda:0')

    model_outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = model_outputs.loss
    total_loss += loss.item()

    logits = model_outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    y_pred.extend(predictions)
    y_true.extend(batch['labels'].to('cuda:0'))






In [83]:
total_loss

34899.03403720866

In [86]:
len(y_pred)

278857

In [None]:
y_true = []
with torch.no_grad():
  for batch in tqdm(evaluation_dataset):
    y_true.append(batch['labels'].to('cuda:0'))



In [103]:
# Turn tensors to ints
for i in range(len(y_true)):
  y_true[i] = y_true[i].item()
  y_pred[i] = y_pred[i].item()



In [108]:
print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

           0       0.97      0.96      0.96    139721
           1       0.96      0.97      0.96    139136

    accuracy                           0.96    278857
   macro avg       0.96      0.96      0.96    278857
weighted avg       0.96      0.96      0.96    278857

