In [1]:
!pip3 install scikit-learn
!pip3 install transformers
!pip3 install tensorflow-text
!pip3 install torch

Collecting tensorflow-text
  Downloading tensorflow_text-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow<2.16,>=2.15.0 (from tensorflow-text)
  Downloading tensorflow-2.15.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboard<2.16,>=2.15 (from tensorflow<2.16,>=2.15.0->tensorflow-text)
  Downloading tensorboard-2.15.1-py3-none-any.whl (5.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow-estimator<2.16,>=2.15.0 (from tensorflow<2.16,>=2.15.0->tensorflow-text)
  Downloading tensorflow_estimator-2.15.0-py2.py3-none-any.whl (441 kB)
[2K     [90m━━━━━━━━━━━

In [1]:
from collections import defaultdict
from pathlib import Path
import os

import re
import urllib.request
import tarfile

import nltk
import pandas as pd
import numpy as np
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
#from torch.optim import AdamW
from transformers import pipeline

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AlbertModel
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [4]:
# Download necessary packages
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('omw-1.4')


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [5]:
"""# Part 3: Define Preprocessing Functions
**Explanation:** In this part we defined functions and preprocessing steps. It includes creating a list of stopwords, a WordNetLemmatizer object, and a function to preprocess text data.
"""

# Define a list of stopwords to be removed from text
stopWords = stopwords.words('english')
stopWords.remove('not')

# Create a WordNetLemmatizer object to lemmatize words
lem = WordNetLemmatizer()

# Define a function to preprocess text data
def preprocess_text(text_data):
    text_data = re.sub(r'<.*?>', '', text_data)  # Remove HTML tags
    text_data = re.sub(r'[^A-Za-z0-9]+', ' ', text_data)  # Remove non-alphanumeric characters
    text_data = text_data.lower()  # Convert to lowercase
    tokens = nltk.word_tokenize(text_data)  # Tokenize
    words = [word for word in tokens if word not in stopWords]  # Remove stopwords
    words = [lem.lemmatize(word) for word in words]  # Lemmatize
    proc_text = ' '.join(words)  # Join processed words
    return proc_text

In [6]:
"""# Part 4: Download Dataset
**Explaination:** This part downloads and extracts the dataset if it's not already available. It checks if the 'aclImdb' directory exists and, if not, downloads and extracts the dataset files.
"""

file_url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
file_name = 'aclImdb_v1.tar.gz'

if not os.path.exists('aclImdb'):
    if not os.path.exists(file_name):
        urllib.request.urlretrieve(file_url, file_name)
    tar_file = tarfile.open(file_name)
    tar_file.extractall()
    tar_file.close()

In [7]:
def read_data(split_dir):
    split_dir = Path(split_dir)
    txts = []
    labels =[]
    for label_dir in ['pos', 'neg']:
        for text_file in (split_dir/label_dir).iterdir():
            txts.append(text_file.read_text("utf8"))
            labels.append(0 if label_dir== 'neg' else 1)
    return txts, labels

In [8]:
train_text, train_labels = read_data("aclImdb/train")

In [9]:
test_text, test_labels = read_data("aclImdb/test")

train_txt, tests_txt, train_labls, tests_labls = train_test_split(train_text, train_labels, test_size=0.3, random_state=42)
test_txt, val_txt, test_labs, val_labs = train_test_split(tests_text, test_labels, test_size=0.5, random_state=42)

In [10]:
class IMBDataset:
    def __init__(self, review, target, tokenizer, max_len):
        self.review = review
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.review)

    def __getitem__(self, item):
        review = str(self.review[item])

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens = True,
            max_length = self.max_len,
            truncation=True,
            return_token_type_ids = False,
            pad_to_max_length = True,
            return_attention_mask = True,
            return_tensors = 'pt',
        )
        return {
            'review_text' : review,
            'input_ids' : encoding['input_ids'].flatten(),
            'attention_mask' : encoding['attention_mask'].flatten(),
            'targets' : torch.tensor(self.target[item], dtype=torch.long)
        }


In [11]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = IMBDataset(
        review = df['Text'].to_numpy(),
        target = df['Labels'].to_numpy(),
        tokenizer = tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        #num_workers=4
    )

In [12]:
Max_len=160
Batch_size = 16
Epochs = 50
class_names = [0, 1]
loss_fn = nn.CrossEntropyLoss().to(device)

In [13]:
train_df = pd.DataFrame({"Text":train_text,"Labels":train_labels})
test_df = pd.DataFrame({"Text":test_text,"Labels":test_labels})

### BERT

In [14]:
# Load pre-trained BERT model and tokenizer fine-tuned on IMDB
model_bt = 'bert-base-uncased'
brt_tokenizer = BertTokenizer.from_pretrained(model_bt)
bert_model = BertModel.from_pretrained(model_bt)

In [15]:
# Load pre-trained ALBERT model and tokenizer fine-tuned on IMDB
model_alb = "albert-base-v2"
alb_tokenizer = AutoTokenizer.from_pretrained(model_alb)
alb_model = AlbertModel.from_pretrained(model_alb)

In [16]:
# Load pre-trained DistilBERT model and tokenizer fine-tuned on IMDB
model_dist = 'distilbert-base-uncased'
dist_tokenizer = DistilBertTokenizer.from_pretrained(model_dist)
dist_model = DistilBertModel.from_pretrained(model_dist)

In [17]:
train_data_loader_bt = create_data_loader(train_df, brt_tokenizer, Max_len, Batch_size)
test_data_loader_bt = create_data_loader(test_df, brt_tokenizer, Max_len, Batch_size)

In [18]:
train_data_loader_alb = create_data_loader(train_df, alb_tokenizer, Max_len, Batch_size)
test_data_loader_alb = create_data_loader(test_df, alb_tokenizer, Max_len, Batch_size)

In [19]:
train_data_loader_dt = create_data_loader(train_df, dist_tokenizer, Max_len, Batch_size)
test_data_loader_dt = create_data_loader(test_df, dist_tokenizer, Max_len, Batch_size)

### Build Classifier

In [20]:
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes, Model, model_):
        super(SentimentClassifier, self).__init__()
        self.model = Model.from_pretrained(model_, return_dict=False)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.model.config.hidden_size, n_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.model(
            input_ids = input_ids,
            attention_mask = attention_mask,

        )
        output = self.drop(pooled_output)
        output = self.out(output)
        return self.softmax(output)

In [21]:
B_model = SentimentClassifier(len(class_names), bert_model, model_bt)
B_model = B_model.to(device)

In [22]:
Alb_model = SentimentClassifier(len(class_names), alb_model, model_alb)
Alb_model = Alb_model.to(device)

In [23]:
Dst_model = SentimentClassifier(len(class_names), dist_model, model_dist)
Dst_model = Dst_model.to(device)

### Training the Model

In [25]:
def train_epoch( model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()

    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)

    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

### Evaluate the model

In [28]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()

    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:        
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

        return correct_predictions.double() / n_examples, np.mean(losses)

### Training Loop

In [29]:
EPOCHS = 10

optimizer_bt = AdamW(B_model.parameters(), lr=2e-5, correct_bias=False)
total_steps_bt = len(train_data_loader_bt) * EPOCHS

scheduler_bt = get_linear_schedule_with_warmup(
  optimizer_bt,
  num_warmup_steps=0,
  num_training_steps=total_steps_bt
)



In [30]:
optimizer_alb = AdamW(Alb_model.parameters(), lr=2e-5, correct_bias=False)
total_steps_alb = len(train_data_loader_alb) * EPOCHS

scheduler_alb = get_linear_schedule_with_warmup(
  optimizer_bt,
  num_warmup_steps=0,
  num_training_steps=total_steps_bt
)

In [31]:
optimizer_dt = AdamW(Dst_model.parameters(), lr=2e-5, correct_bias=False)
total_steps_dt = len(train_data_loader_dt) * EPOCHS

scheduler_dt = get_linear_schedule_with_warmup(
  optimizer_bt,
  num_warmup_steps=0,
  num_training_steps=total_steps_bt
)

## Bert Training and Evaluation

In [37]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in range(Epochs):

  print(f'Epoch {epoch + 1}/{Epochs}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(B_model, train_data_loader_bt, loss_fn, optimizer_bt, device, scheduler_bt, len(train_df)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    B_model,
    test_data_loader_bt,
    loss_fn,
    device,
    len(test_df)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(B_model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

Epoch 1/50
----------


KeyboardInterrupt: 

## Albert Training and Evaluation

In [None]:
#%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in range(Epochs):
    print(f'Epoch {epoch + 1}/{Epochs}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(Alb_model, train_data_loader_alb, loss_fn, optimizer_alb, device, scheduler_alb, len(train_df)
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
    Alb_model,
    test_data_loader_alb,
    loss_fn,
    device,
    len(test_df)
    )

    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

history['train_acc'].append(train_acc)
history['train_loss'].append(train_loss)
history['val_acc'].append(val_acc)
history['val_loss'].append(val_loss)

if val_acc > best_accuracy:
    torch.save(Alb_model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

Epoch 1/50
----------
Train loss 0.5392223596572876 accuracy 0.00028
Val   loss 0.7504104804092695 accuracy 0.5

Epoch 2/50
----------
Train loss 0.3989754617214203 accuracy 0.00032


### DistilBert Training and Evaluation


In [34]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in range(Epochs):

  print(f'Epoch {epoch + 1}/{Epochs}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(Dst_model, train_data_loader_dt, loss_fn, optimizer_dt, device, scheduler_dt, len(train_df)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    Dst_model,
    test_data_loader_dt,
    loss_fn,
    device,
    len(test_df)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(Dst_model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

Epoch 1/50
----------


KeyboardInterrupt: 