In [None]:
!pip install tokenizers
!pip install tqdm boto3 requests regex sentencepiece sacremoses
!pip install sentencepiece
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting boto3
  Downloading boto3-1.26.111-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.6/135.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses
  Downlo

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score
from google.colab import drive
import random
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score
import time
from sklearn.model_selection import GridSearchCV
import os
import re
import torch
# Load pre-trained BERT model and tokenizer
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import time

### Code to load the data

In [None]:
# Attach Google Drive so the files under /content/drive/MyDrive are reachable.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Temporary data-loading mechanism, used because the normal way takes far too long:
# read the train/test CSV files and drop their 'Unnamed: 0' column
# (presumably a saved index column -- verify against how the CSVs were written).
raw_train_data = pd.read_csv("/content/drive/MyDrive/COMP_551/A3/aclImdb/train_data.csv").drop(columns='Unnamed: 0')
raw_test_data = pd.read_csv("/content/drive/MyDrive/COMP_551/A3/aclImdb/test_data.csv").drop(columns='Unnamed: 0')

In [None]:
# Artificially reduce the data size: keep the first 2500 training rows and the
# first 200 test rows. .copy() makes each result an independent DataFrame, so the
# column assignments performed on these frames later do not write into a slice of
# the full data (which pandas reports as SettingWithCopyWarning).
raw_train_data = raw_train_data.iloc[:2500, :].copy()
raw_test_data = raw_test_data.iloc[:200, :].copy()

In [None]:
# preprocessing

def preprocess_text(text):
    """Normalise a raw text string before tokenization.

    Steps, in order: lowercase the text, replace HTML line-break tags
    (``<br>``, ``<br/>``, ``<br />``) with a space, delete every character
    that is neither a word character nor whitespace, then collapse runs of
    whitespace into single spaces and trim both ends.

    Args:
        text: The raw text (a ``str``).

    Returns:
        The cleaned, lowercased string.
    """
    # Lowercase the text
    text = text.lower()

    # Turn HTML line breaks into spaces *before* stripping punctuation;
    # otherwise "<br />" only loses its brackets and slash, leaving a stray
    # "br" glued to the neighbouring word (e.g. "movie!<br />" -> "moviebr").
    text = re.sub(r'<br\s*/?>', ' ', text)

    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean the 'text' column of both splits in place using preprocess_text.
for split_frame in (raw_train_data, raw_test_data):
    split_frame['text'] = split_frame['text'].apply(preprocess_text)

In [None]:
# Load the pre-trained tokenizer of the 'bert-base-uncased' checkpoint
# (only the tokenizer is created in this cell).
pretrained_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_checkpoint)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### Code to preprocess and tokenize the data

In [None]:
def tokenizer_function(input_sequence, max_length=510):
    """Encode a single text with the module-level ``tokenizer``.

    Special tokens are added, the sequence is truncated with the
    ``'longest_first'`` strategy and padded to ``max_length``, and an
    attention mask is requested alongside the token ids.

    Args:
        input_sequence: The text to encode.
        max_length: Value forwarded to the tokenizer as ``max_length``; it
            drives both truncation and ``'max_length'`` padding. Defaults to
            510, the length this function previously hard-coded.

    Returns:
        The encoding produced by ``tokenizer.encode_plus``. Because
        ``return_tensors='pt'`` is passed, its ``'input_ids'`` and
        ``'attention_mask'`` entries are PyTorch tensors.
    """
    encoded_dict = tokenizer.encode_plus(
        input_sequence,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation='longest_first'
    )

    return encoded_dict


def extract_input_id(encoded_dict):
    """Return the entry stored under the ``'input_ids'`` key of ``encoded_dict``."""
    input_ids = encoded_dict['input_ids']
    return input_ids
def extract_attention_mask(encoded_dict):
    """Return the entry stored under the ``'attention_mask'`` key of ``encoded_dict``."""
    attention_mask = encoded_dict['attention_mask']
    return attention_mask



In [None]:
# Tokenize every training text once, storing the full encoding in the frame
# together with its two extracted parts (token ids and attention mask).
train_encodings = raw_train_data['text'].apply(tokenizer_function)
raw_train_data['encoded_dict'] = train_encodings
raw_train_data['input_ids'] = train_encodings.apply(extract_input_id)
raw_train_data['attention_mask'] = train_encodings.apply(extract_attention_mask)

# Concatenate the per-text tensors along dim 0 and gather the 'sentiment'
# column into a single label tensor.
input_id_tensors = list(raw_train_data['input_ids'])
attention_mask_tensors = list(raw_train_data['attention_mask'])
train_input_ids = torch.cat(input_id_tensors, dim=0)
train_attention_masks = torch.cat(attention_mask_tensors, dim=0)
train_labels = torch.tensor(list(raw_train_data['sentiment']))


In [None]:
# Bundle ids, masks and labels so that one index yields one
# (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)

# Serve shuffled mini-batches of 3 using 2 worker processes; a trailing
# incomplete batch is dropped.
train_loader_options = dict(
    batch_size=3,
    shuffle=True,
    num_workers=2,
    drop_last=True,
)
train_dataloader = DataLoader(train_dataset, **train_loader_options)

### Train the model with a head

In [None]:
# Load the pre-trained BERT weights wrapped for 2-class sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Replace the classification layer with our own head: dropout followed by a
# fully connected layer mapping the hidden size to the 2 classes.
# NOTE: a state dict saved from this model carries the parameter names of this
# Sequential, so the same replacement must be applied before loading it back.
model.classifier = torch.nn.Sequential(
    torch.nn.Dropout(0.2),
    torch.nn.Linear(in_features=model.config.hidden_size, out_features=2)
)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
# loss_fn is not used by the loop below (the model returns the loss itself because
# `labels` is passed to the forward call); it is kept since other cells may use it.
loss_fn = torch.nn.CrossEntropyLoss()

# from_pretrained() hands the model back in evaluation mode, which disables the
# dropout layers inside BERT. Switch to training mode so they are active while
# fine-tuning.
model.train()

# Train the model on the training data
# NOTE(review): no device placement happens here, so training runs wherever the
# model and tensors already live -- consider moving both to a GPU if available.
for epoch in range(1):
    for count, batch in enumerate(train_dataloader):
        previous_time = time.time()
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        current_time = time.time()
        print(f"For epoch: {epoch} and batch: {count}, the time taken was: {current_time-previous_time}")

# Back to evaluation mode so that later inference is not perturbed by dropout.
model.eval()
print("Model successfully trained")

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

For epoch: 0 and batch: 0, the time taken was: 21.048737049102783
For epoch: 0 and batch: 1, the time taken was: 18.43476414680481
For epoch: 0 and batch: 2, the time taken was: 17.915701627731323
For epoch: 0 and batch: 3, the time taken was: 20.972858667373657
For epoch: 0 and batch: 4, the time taken was: 17.817821264266968
For epoch: 0 and batch: 5, the time taken was: 17.852327346801758
For epoch: 0 and batch: 6, the time taken was: 17.8259015083313
For epoch: 0 and batch: 7, the time taken was: 17.68736958503723
For epoch: 0 and batch: 8, the time taken was: 17.584502935409546
For epoch: 0 and batch: 9, the time taken was: 18.690935850143433
For epoch: 0 and batch: 10, the time taken was: 18.803842544555664
For epoch: 0 and batch: 11, the time taken was: 18.250309705734253
For epoch: 0 and batch: 12, the time taken was: 17.802921772003174
For epoch: 0 and batch: 13, the time taken was: 17.7690851688385
For epoch: 0 and batch: 14, the time taken was: 17.640141010284424
For epoch: 

In [None]:
# Persist only the model's state dictionary (its parameters), not the model object
checkpoint_path = '/content/drive/MyDrive/COMP_551/A3/aclImdb/model.pth'
torch.save(model.state_dict(), checkpoint_path)