In [17]:
# !pip install transformers datasets
# !pip install transformers[torch]
# !pip install accelerate -U
# ! pip install -U transformers

In [18]:
# Imports
import torch
import pandas as pd
from transformers import BertTokenizerFast, BertModel
from torch.utils.data import Dataset
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from string import punctuation
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import TrainingArguments, Trainer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
# Set up GPU
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [20]:
# Connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/IMDB_Dataset.csv' # @Team: Replace with your own google drive path to dataset
df = pd.read_csv(file_path)
df = df.sample(frac=1.0, random_state=413)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,review,sentiment
21315,"First of all, it is interesting to note that o...",positive
20835,The unthinkable has happened. Having first wit...,negative
29274,One of the best records of Israel's response t...,positive
32234,"But, lets face it... it got a few nostalgic si...",negative
26597,Ben a out-of-town cop is convinced his sister ...,negative


In [21]:
# Convert sentiment to binary labels
df.rename(columns={'sentiment': 'labels'}, inplace=True)
label_mapping = {'positive': 1, 'negative': 0}
df['labels'] = df['labels'].map(label_mapping)
df.head()

Unnamed: 0,review,labels
21315,"First of all, it is interesting to note that o...",1
20835,The unthinkable has happened. Having first wit...,0
29274,One of the best records of Israel's response t...,1
32234,"But, lets face it... it got a few nostalgic si...",0
26597,Ben a out-of-town cop is convinced his sister ...,0


In [22]:
# Function to clean data
def clean_data(text):
    english_stopwords = set(stopwords.words("english"))
    cleaned_text = []
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(f'[{re.escape(punctuation)}]', ' ', text)
    for token in text.split():
        if token.lower() not in english_stopwords and not token.isdigit():
            cleaned_text.append(token.lower())
    return ' '.join(cleaned_text)

# Apply clean_data function to 'review' column
df['review'] = df['review'].apply(lambda x: clean_data(x))

df.head()

Unnamed: 0,review,labels
21315,first interesting note one users commented fil...,1
20835,unthinkable happened first witnessed years ago...,0
29274,one best records israel response murder rabin ...,1
32234,lets face got nostalgic sighs show consistentl...,0
26597,ben town cop convinced sister brutally killed ...,0


In [23]:
# Import BERT Tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", max_length=2048)

In [38]:
from torch import nn
class MyTransformer(nn.Module):
    def __init__(self, hidden_size=64):
      super(MyTransformer, self).__init__()
      self.hidden = hidden_size
      self.bert = BertModel.from_pretrained("bert-base-uncased")
      self.lstm = nn.LSTM(768, self.hidden, num_layers=3, bidirectional=True, batch_first=True)
      self.fc = nn.Linear(self.hidden * 2, 2)


    def forward(self, input_ids, attention_mask, labels=None):
      outputs = self.bert(input_ids, attention_mask=attention_mask)
      sequence_output = outputs.last_hidden_state
      lstm_output, (hn, cn) = self.lstm(sequence_output)
      lstm_output = lstm_output[:, -1, :]
      logits = self.fc(lstm_output)

      outputs = (logits,) + outputs[2:]

      if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, 2), labels.view(-1))
            outputs = (loss,) + outputs

      return outputs

transformer_model = MyTransformer()
transformer_model = transformer_model.to(device)

In [26]:
# Split dataset into 70% training, 10% validation, 20% testing
train_texts = list(df.review[:35000])
val_texts = list(df.review[35000:40000])
test_texts = list(df.review[40000:])

train_labels = list(df.labels[:35000])
val_labels = list(df.labels[35000:40000])
test_labels = list(df.labels[40000:])

In [27]:
# Tokenize different datasets
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings  = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [28]:
# Custom dataset
class CustomDataset(Dataset):

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [29]:
# Instantiate custom datasets
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

In [30]:
# Compute accuracy and metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)

    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

In [39]:
# Training arguments with lr = 2e-5
training_args6 = TrainingArguments(
    output_dir='./',
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=30,
    logging_steps=50,
    evaluation_strategy="steps",
    learning_rate=2e-5,
    eval_steps=50,
    fp16=True,
    load_best_model_at_end=True
)

# Define trainer with arguments from above
trainer6 = Trainer(
    model=transformer_model,
    args=training_args6,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics= compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [40]:
# Fine-tune pretrained Bert Model
trainer6.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,0.6711,0.600293,0.8496,0.85566,0.849391,0.848901
100,0.582,0.522976,0.8632,0.865297,0.863321,0.86303
150,0.5166,0.47309,0.8682,0.868715,0.86814,0.86814
200,0.4756,0.442516,0.8676,0.872314,0.86778,0.867219
250,0.4626,0.427154,0.858,0.8624,0.858176,0.857609
300,0.4334,0.416195,0.8538,0.861921,0.85404,0.853034
350,0.3996,0.376557,0.8802,0.880829,0.880135,0.880137
400,0.393,0.364106,0.8768,0.876837,0.876817,0.876799
450,0.3886,0.370083,0.8684,0.876053,0.868172,0.867675
500,0.3918,0.418123,0.8296,0.858654,0.830055,0.826213


TrainOutput(global_step=5250, training_loss=0.23162743886311848, metrics={'train_runtime': 1967.696, 'train_samples_per_second': 53.362, 'train_steps_per_second': 2.668, 'total_flos': 0.0, 'train_loss': 0.23162743886311848, 'epoch': 3.0})

In [41]:
# Record test results
results6 = trainer6.predict(test_dataset)
results6

PredictionOutput(predictions=array([[-1.3730469 ,  1.5546875 ],
       [-1.2324219 ,  1.3447266 ],
       [-1.4189453 ,  1.6337891 ],
       ...,
       [-0.44970703,  0.39501953],
       [ 0.4501953 , -0.48608398],
       [-0.9526367 ,  1.0068359 ]], dtype=float32), label_ids=array([1, 1, 1, ..., 0, 0, 1]), metrics={'test_loss': 0.2435300052165985, 'test_accuracy': 0.9075, 'test_precision': 0.9075883130992829, 'test_recall': 0.9074622015977987, 'test_f1': 0.9074868992197986, 'test_runtime': 22.5958, 'test_samples_per_second': 442.559, 'test_steps_per_second': 14.781})