In [3]:
# Environment setup: uncomment and run once if these packages are missing from the runtime.
# %pip install transformers datasets
# %pip install transformers[torch]
# %pip install accelerate -U

In [4]:
# Imports
import torch
import pandas as pd
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast
from torch.utils.data import Dataset
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from string import punctuation
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import TrainingArguments, Trainer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [6]:
# Mount Google Drive so the dataset CSV is reachable from this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# @Team: Replace with your own google drive path to dataset
file_path = '/content/drive/My Drive/UofT/IMDB_Dataset.csv'

# Read the reviews and shuffle all rows; the fixed seed keeps the order reproducible
df = pd.read_csv(file_path).sample(frac=1.0, random_state=413)
df.head()

Mounted at /content/drive


Unnamed: 0,review,sentiment
21315,"First of all, it is interesting to note that o...",positive
20835,The unthinkable has happened. Having first wit...,negative
29274,One of the best records of Israel's response t...,positive
32234,"But, lets face it... it got a few nostalgic si...",negative
26597,Ben a out-of-town cop is convinced his sister ...,negative


In [7]:
# Convert sentiment to binary labels
label_mapping = {'positive': 1, 'negative': 0}

# Only convert while the raw 'sentiment' column is still present. Without this
# guard, re-running the cell would map the already-numeric labels through
# label_mapping a second time and turn every label into NaN.
if 'sentiment' in df.columns:
    mapped_labels = df['sentiment'].map(label_mapping)

    # Series.map yields NaN for values missing from the mapping; fail loudly here
    # rather than letting NaN labels surface later as an obscure training error.
    unknown_sentiments = df.loc[mapped_labels.isna(), 'sentiment'].unique()
    if len(unknown_sentiments) > 0:
        raise ValueError(f"Unexpected sentiment values: {unknown_sentiments.tolist()}")

    # Rename in a new frame (no in-place mutation) and store integer class ids
    df = df.rename(columns={'sentiment': 'labels'})
    df['labels'] = mapped_labels.astype(int)
df.head()

Unnamed: 0,review,labels
21315,"First of all, it is interesting to note that o...",1
20835,The unthinkable has happened. Having first wit...,0
29274,One of the best records of Israel's response t...,1
32234,"But, lets face it... it got a few nostalgic si...",0
26597,Ben a out-of-town cop is convinced his sister ...,0


In [8]:
# Function to clean data
# Patterns are compiled once at definition time instead of on every call.
_HTML_TAG_PATTERN = re.compile(r"<.*?>")
_PUNCTUATION_PATTERN = re.compile(f'[{re.escape(punctuation)}]')
_english_stopwords_cache = None


def _get_english_stopwords():
    """Return the NLTK English stopword set, loading it only on the first call."""
    global _english_stopwords_cache
    if _english_stopwords_cache is None:
        _english_stopwords_cache = frozenset(stopwords.words("english"))
    return _english_stopwords_cache


def clean_data(text):
    """Normalize one review into a lowercase, space-separated token string.

    Steps, in order:
      1. Replace HTML-like tags (``<...>``) with a space.
      2. Replace every ASCII punctuation character with a space.
      3. Split on whitespace, lowercase each token, and drop tokens that are
         English stopwords or consist only of digits.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives.

    NOTE(review): stopword removal presumably also drops negations such as
    "not"; confirm this is intended for sentiment classification.
    """
    english_stopwords = _get_english_stopwords()

    # Tags become a space (not an empty string) so that the words on either
    # side of e.g. "<br />" are not fused into a single token.
    text = _HTML_TAG_PATTERN.sub(" ", text)
    text = _PUNCTUATION_PATTERN.sub(" ", text)

    lowered_tokens = (token.lower() for token in text.split())
    return ' '.join(
        token for token in lowered_tokens
        if token not in english_stopwords and not token.isdigit()
    )

# Run clean_data over every entry of the 'review' column and store the result back
df['review'] = df['review'].map(clean_data)

df.head()

Unnamed: 0,review,labels
21315,first interesting note one users commented fil...,1
20835,unthinkable happened first witnessed years ago...,0
29274,one best records israel response murder rabin ...,1
32234,lets face got nostalgic sighs show consistentl...,0
26597,ben town cop convinced sister brutally killed ...,0


In [9]:
# Import BERT Tokenizer
# `model_max_length` is the tokenizer setting that caps sequence length when
# truncation is requested without an explicit max_length. It is set to 512
# because this BERT checkpoint has 512 position embeddings, so longer inputs
# cannot be fed to the model.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", model_max_length=512)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
# Load pretrained BERT with a 2-label classification head and place it on the selected device
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)
model

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [11]:
# Split dataset into 70% training, 10% validation, 20% testing
# Boundaries are derived from the actual row count (integer arithmetic, so no
# float rounding) instead of being hard-coded for a 50,000-row file; for
# 50,000 rows they evaluate to 35000 and 40000.
n_rows = len(df)
train_end = n_rows * 7 // 10
val_end = n_rows * 8 // 10

# .iloc makes the positional slicing explicit: df was shuffled, so its index
# labels no longer match row positions.
train_texts = list(df['review'].iloc[:train_end])
val_texts = list(df['review'].iloc[train_end:val_end])
test_texts = list(df['review'].iloc[val_end:])

train_labels = list(df['labels'].iloc[:train_end])
val_labels = list(df['labels'].iloc[train_end:val_end])
test_labels = list(df['labels'].iloc[val_end:])

In [12]:
# Tokenize each split with truncation and padding enabled (train, then val, then test)
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)

In [13]:
# Custom dataset
class CustomDataset(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    ``encodings`` must expose ``.items()`` yielding ``(name, values)`` pairs in
    which ``values`` is indexable per sample; ``labels`` must be an indexable
    sequence with a length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the label sequence
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: a tensor per encoding field plus the label tensor
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = torch.tensor(field_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [14]:
# Wrap each split's encodings and labels in a CustomDataset (train, then val, then test)
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [15]:
# Compute accuracy and metrics
def compute_metrics(pred):
    """Score a prediction object with accuracy and macro-averaged precision/recall/F1.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the predicted class is taken
            as the argmax over the last axis).

    Returns:
        A dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The helper returns four values; the fourth is not used here
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average='macro')

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics.update(zip(('precision', 'recall', 'f1'), macro_scores))
    return metrics

In [18]:
# Training arguments with lr = 5e-5
training_args1 = TrainingArguments(
    output_dir='/content/drive/My Drive/UofT/Model1/',  # run output location on the mounted Drive
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=30,
    logging_steps=50,              # log the training loss every 50 steps
    evaluation_strategy="steps",   # evaluate on a step schedule (see eval_steps)
    learning_rate=5e-5,
    eval_steps=50,                 # run validation every 50 steps
    fp16=True,
    load_best_model_at_end=True
)

# Define trainer with arguments from above
trainer1 = Trainer(
    model=model,
    args=training_args1,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# NOTE: no DataLoaderConfiguration object is constructed here. That name is
# never imported in this notebook (so constructing it raises NameError on a
# fresh kernel), and the resulting object was never passed to the Trainer.


In [19]:
# Fine-tune the model with trainer1 (configured by training_args1); the call's
# return value is displayed as this cell's output.
trainer1.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,0.3874,0.662617,0.7834,0.823773,0.782834,0.776197
100,0.2282,0.354946,0.8662,0.866281,0.866176,0.866186
150,0.3832,0.319552,0.8652,0.867307,0.865322,0.865033
200,0.3181,0.283445,0.8792,0.879292,0.879226,0.879197
250,0.359,0.34877,0.8678,0.87575,0.867568,0.867044
300,0.349,0.399252,0.8076,0.849072,0.808151,0.801902
350,0.3536,0.272316,0.89,0.890261,0.889959,0.889973
400,0.334,0.275793,0.8854,0.889478,0.885237,0.885067
450,0.3199,0.263208,0.8926,0.893728,0.892515,0.892507
500,0.3278,0.292411,0.886,0.891378,0.886188,0.885641


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,0.3874,0.662617,0.7834,0.823773,0.782834,0.776197
100,0.2282,0.354946,0.8662,0.866281,0.866176,0.866186
150,0.3832,0.319552,0.8652,0.867307,0.865322,0.865033
200,0.3181,0.283445,0.8792,0.879292,0.879226,0.879197
250,0.359,0.34877,0.8678,0.87575,0.867568,0.867044
300,0.349,0.399252,0.8076,0.849072,0.808151,0.801902
350,0.3536,0.272316,0.89,0.890261,0.889959,0.889973
400,0.334,0.275793,0.8854,0.889478,0.885237,0.885067
450,0.3199,0.263208,0.8926,0.893728,0.892515,0.892507
500,0.3278,0.292411,0.886,0.891378,0.886188,0.885641


TrainOutput(global_step=5250, training_loss=0.18443764255160378, metrics={'train_runtime': 3402.0235, 'train_samples_per_second': 30.864, 'train_steps_per_second': 1.543, 'total_flos': 2.76266608128e+16, 'train_loss': 0.18443764255160378, 'epoch': 3.0})

In [20]:
# Run trainer1's prediction pass over the held-out test dataset. The returned
# object (displayed below) carries the raw predictions, the label ids, and the
# test_* metrics.
results1 = trainer1.predict(test_dataset)
results1

PredictionOutput(predictions=array([[-1.6484375 ,  1.796875  ],
       [-0.9160156 ,  0.9946289 ],
       [-2.40625   ,  2.7753906 ],
       ...,
       [ 0.26416016, -0.5761719 ],
       [ 1.28125   , -1.671875  ],
       [-1.0283203 ,  1.0791016 ]], dtype=float32), label_ids=array([1, 1, 1, ..., 0, 0, 1]), metrics={'test_loss': 0.2400365173816681, 'test_accuracy': 0.9072, 'test_precision': 0.9071964593165283, 'test_recall': 0.9072070141565328, 'test_f1': 0.907198797296413, 'test_runtime': 47.386, 'test_samples_per_second': 211.033, 'test_steps_per_second': 7.048})