In [16]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [17]:
!pip install transformers[torch]
!pip install accelerate
!pip install evaluate



In [18]:
import pandas as pd
from transformers import Trainer, get_scheduler, TrainingArguments, AutoModelForSequenceClassification
from accelerate import Accelerator
import pickle
from torch.optim import AdamW
import torch
import numpy as np
import evaluate
from torch.utils.data import DataLoader
from tqdm import tqdm





In [19]:
# Load tuned hyperparameters
with open('/content/drive/MyDrive/BERT Sentiment/output/best_hyperparameters.pkl', 'rb') as infile:
    best_hyperparameters = pickle.load(infile)


In [20]:
best_hyperparameters


{'learning_rate': 4.122342215733177e-05,
 'num_train_epochs': 1,
 'gradient_accumulation_steps': 2,
 'per_device_train_batch_size': 12,
 'evaluation_strategy': 'epoch',
 'per_device_eval_batch_size': 5,
 'warmup_steps': 391,
 'weight_decay': 0.08139944860406301}

In [21]:
# Create Distilbert Model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Set training arguments
training_args = TrainingArguments(learning_rate = best_hyperparameters['learning_rate'],
                                  num_train_epochs = best_hyperparameters['num_train_epochs'],
                                  per_device_train_batch_size = best_hyperparameters['per_device_train_batch_size'],
                                  per_device_eval_batch_size = best_hyperparameters['per_device_eval_batch_size'],
                                  gradient_accumulation_steps = best_hyperparameters['gradient_accumulation_steps'],
                                  weight_decay = best_hyperparameters['weight_decay'],
                                  warmup_steps = best_hyperparameters['warmup_steps'],
                                  evaluation_strategy = best_hyperparameters['evaluation_strategy'],
                                  output_dir='drive/MyDrive/BERT Sentiment/output',
                                  logging_dir='drive/MyDrive/BERT Sentiment/output/logs',
                                  logging_steps=1000)


In [23]:
# Create evaluation metric
metric = evaluate.load('f1')

In [24]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)



In [25]:
# Load about a million datapoints as the train dataset
train_sets = [pd.read_parquet('/content/drive/MyDrive/BERT Sentiment/CSVs/train_inputs_1.parquet'),
              pd.read_parquet('/content/drive/MyDrive/BERT Sentiment/CSVs/train_inputs_2.parquet'),
              pd.read_parquet('/content/drive/MyDrive/BERT Sentiment/CSVs/train_inputs_3.parquet'),
              pd.read_parquet('/content/drive/MyDrive/BERT Sentiment/CSVs/train_inputs_4.parquet')]





In [26]:
train_set = pd.concat(train_sets)

In [27]:
del(train_sets)

In [13]:
eval_set = pd.read_parquet('/content/drive/MyDrive/BERT Sentiment/CSVs/train_inputs_5.parquet')


In [14]:
len(train_set)

1115428

In [15]:
train_set.head()

Unnamed: 0,index,input_ids,attention_mask
0,2025630,"[101, 6331, 1024, 2293, 1996, 6331, 1998, 1996...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,198034,"[101, 2023, 2003, 2011, 2521, 2026, 5440, 2173...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,2418716,"[101, 1996, 2190, 1997, 1996, 6386, 1005, 2220...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,1371917,"[101, 1043, 2135, 17119, 3170, 2192, 7242, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,1524295,"[101, 5013, 7641, 2041, 1011, 1011, 3828, 2115...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [28]:
# Retrieve labels for train/eval data
y_train = pd.read_csv('/content/drive/MyDrive/BERT Sentiment/CSVs/y_train_full.csv')


In [29]:
y_train.columns = ['index','labels']

In [30]:
y_train.head()

Unnamed: 0,index,labels
0,2522958,1
1,1160125,2
2,861121,1
3,300957,1
4,1610389,2


In [31]:
train_set = pd.merge(train_set, y_train, on='index')


In [32]:
eval_set = pd.merge(eval_set, y_train, on='index')



In [33]:
eval_set.head()

Unnamed: 0,index,input_ids,attention_mask,labels
0,2618276,"[101, 2025, 1037, 2128, 23567, 2075, 1024, 263...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1,4576001,"[101, 12476, 999, 999, 999, 999, 1024, 2023, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
2,3826117,"[101, 2023, 2003, 2005, 2613, 1024, 2005, 2061...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
3,1337471,"[101, 6581, 3185, 1024, 5006, 18463, 2001, 202...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2
4,4587331,"[101, 16030, 1010, 3154, 1010, 3622, 1010, 140...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2


In [34]:
# Reformat labels from 1 and 2 to 0 and 1
train_set['labels'] = train_set['labels'] - 1
eval_set['labels'] = eval_set['labels'] - 1

In [35]:
train_set.drop(columns=['index'], inplace=True)
eval_set.drop(columns=['index'], inplace=True)

In [36]:
class SentimentDataset(torch.utils.data.Dataset):
  def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

  def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

  def __len__(self):
        return len(self.labels)

In [37]:
train_dataset = SentimentDataset(train_set, train_set['labels'])
eval_dataset = SentimentDataset(eval_set, eval_set['labels'])

In [38]:
accelerator = Accelerator()

In [39]:
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=8, shuffle=True)


In [40]:
optimizer = AdamW(model.parameters(), lr=best_hyperparameters['learning_rate'])

In [41]:
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(train_dataloader, eval_dataloader, model, optimizer)


In [42]:
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics)

In [43]:
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


In [44]:
progress_bar = tqdm(range(num_training_steps))



  0%|          | 0/418287 [00:00<?, ?it/s]

In [None]:
model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    outputs = model(**batch)
    loss = outputs.loss
    accelerator.backward(loss)
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)


  0%|          | 192/418287 [00:32<11:19:26, 10.26it/s]