In [None]:
pip install pandas scikit-learn torch transformers tqdm

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers)
  Downloading huggingface_hub-0.29.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.1

In [None]:
import pandas as pd

In [None]:
# Read the preprocessed review splits from the notebook's working directory.
train_df = pd.read_csv(filepath_or_buffer="processed_train.csv")
test_df = pd.read_csv(filepath_or_buffer="processed_test.csv")

In [None]:
# Report the per-column count of missing values for each processed split
for header, frame in (
    ("Null values in train_df:", train_df),
    ("\nNull values in test_df:", test_df),
):
    print(header)
    print(frame.isnull().sum())

Null values in train_df:
sentiment           0
processed_review    0
dtype: int64

Null values in test_df:
sentiment           0
processed_review    0
dtype: int64


In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only adds a stray "None" line.
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 519951 entries, 0 to 519950
Data columns (total 2 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   sentiment         519951 non-null  int64 
 1   processed_review  519951 non-null  object
dtypes: int64(1), object(1)
memory usage: 7.9+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39998 entries, 0 to 39997
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sentiment         39998 non-null  int64 
 1   processed_review  39998 non-null  object
dtypes: int64(1), object(1)
memory usage: 625.1+ KB
None


In [None]:
from sklearn.model_selection import train_test_split

# The held-out test frame is used as-is: reviews are the inputs, sentiment the target
X_test, y_test = test_df["processed_review"], test_df["sentiment"]

# Carve a 10% validation set out of the training frame; the fixed random_state
# makes the split repeatable across runs
X_train, X_val, y_train, y_val = train_test_split(
    train_df["processed_review"],
    train_df["sentiment"],
    test_size=0.1,
    random_state=42,
)

print(f"Train size: {len(X_train)}, Validation size: {len(X_val)}, Test size: {len(X_test)}")

Train size: 467955, Validation size: 51996, Test size: 39998


In [None]:
import torch

# Use the CUDA device when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Running on: {device}")

Running on: cuda


In [None]:
from transformers import BertTokenizerFast
from tqdm import tqdm

# Loading the fast tokenizer for the "bert-base-uncased" checkpoint
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Resolve the target device: CUDA when PyTorch can see a GPU, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Tokenization Running on: {device}")

def tokenize_texts(texts, tokenizer, max_len=256, batch_size=512):
    """Tokenize ``texts`` in fixed-size batches and return the stacked tensors on ``device``.

    Args:
        texts: Iterable of strings (for example a pandas Series or a list). It
            is materialised into a list once, so batches are always taken by
            position regardless of any index the container carries.
        tokenizer: Callable invoked as ``tokenizer(list_of_str, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that returns
            a mapping of field name -> tensor.
        max_len: Length every sequence is padded or truncated to.
        batch_size: Number of texts handed to the tokenizer per call.

    Returns:
        dict mapping each tokenizer output key to one tensor containing all
        batches concatenated along dim 0 and moved to the notebook-level
        ``device``. An empty dict is returned when ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # One list copy up front gives plain positional slicing for every container type.
    texts = list(texts)
    batch_encodings = []

    # Processing text in batches
    for start in tqdm(range(0, len(texts), batch_size), desc="Tokenizing", unit="batch"):
        encodings = tokenizer(
            texts[start:start + batch_size],
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors="pt"
        )
        batch_encodings.append({key: val for key, val in encodings.items()})

    if not batch_encodings:
        # Nothing was tokenized, so there are no keys to stack.
        return {}

    # Concatenate on the host and transfer each field once, instead of issuing
    # one host-to-device copy per batch and concatenating on the device.
    return {
        key: torch.cat([batch[key] for batch in batch_encodings], dim=0).to(device)
        for key in batch_encodings[0]
    }

# Encode each split with the shared tokenizer (train, then validation, then test)
train_encodings = tokenize_texts(texts=X_train, tokenizer=tokenizer)
val_encodings = tokenize_texts(texts=X_val, tokenizer=tokenizer)
test_encodings = tokenize_texts(texts=X_test, tokenizer=tokenizer)

print("Tokenization completed and moved to GPU!")

  from .autonotebook import tqdm as notebook_tqdm


Tokenization Running on: cuda


Tokenizing: 100%|██████████| 914/914 [01:03<00:00, 14.45batch/s]
Tokenizing: 100%|██████████| 102/102 [00:06<00:00, 16.72batch/s]
Tokenizing: 100%|██████████| 79/79 [00:04<00:00, 16.64batch/s]

Tokenization completed and moved to GPU!





In [None]:
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

class YelpDataset(Dataset):
    """Map-style dataset over pre-tokenized encodings and their labels.

    Args:
        encodings: Mapping of field name -> tensor whose first dimension
            indexes samples. Every tensor is moved to the notebook-level
            ``device``.
        labels: Per-sample labels. May be a pandas object exposing ``.values``,
            a ``torch.Tensor``, or any sequence accepted by ``torch.tensor``.

    Raises:
        ValueError: If an encoding tensor holds a different number of samples
            than there are labels.
    """
    def __init__(self, encodings, labels):
        if isinstance(labels, torch.Tensor):
            # Copy so the dataset never aliases the caller's tensor.
            label_tensor = labels.detach().clone()
        else:
            # pandas objects expose their array through `.values`; anything
            # else (list, tuple, ndarray) is handed to torch.tensor directly.
            label_tensor = torch.tensor(getattr(labels, "values", labels))

        self.encodings = {key: val.to(device) for key, val in encodings.items()}
        self.labels = label_tensor.to(device)

        # Fail early on misaligned inputs instead of during batching.
        for key, val in self.encodings.items():
            if len(val) != len(self.labels):
                raise ValueError(
                    f"encodings[{key!r}] has {len(val)} samples but {len(self.labels)} labels were given"
                )

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One sample: every encoding field at `idx` plus its label under "labels".
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

# Pair each split's encodings with its label Series in a YelpDataset
train_dataset = YelpDataset(encodings=train_encodings, labels=y_train)
val_dataset = YelpDataset(encodings=val_encodings, labels=y_val)
test_dataset = YelpDataset(encodings=test_encodings, labels=y_test)

print("Datasets created and moved to GPU!")

Datasets created and moved to GPU!


In [None]:
BATCH_SIZE = 32

# One DataLoader per split: training batches are drawn in random order
# (shuffle=True uses a RandomSampler), validation/test batches keep dataset
# order (shuffle=False uses a SequentialSampler).
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("DataLoaders ready for training!")

DataLoaders ready for training!


In [None]:
from transformers import BertForSequenceClassification

# Instantiate BERT with a 2-label classification head and place it on the
# target device (Module.to returns the same module, so the call can be chained)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)

print(f"Model loaded on: {device}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded on: cuda


In [None]:
from transformers import TrainerCallback

EPOCHS = 3    # maximum number of training epochs
PATIENCE = 2  # evaluations without improvement tolerated before stopping

class EarlyStoppingCallback(TrainerCallback):
    """Requests a training stop once ``eval_loss`` has not improved for `patience` evaluations.

    The callback tracks the lowest ``eval_loss`` seen so far. Each evaluation
    that fails to beat it increments a counter; when the counter reaches
    ``patience`` the callback sets ``control.should_training_stop``.

    Args:
        patience: Number of consecutive non-improving evaluations tolerated.
    """
    def __init__(self, patience=2):
        self.patience = patience
        self.best_loss = float("inf")
        self.counter = 0

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        # `metrics` can be absent or lack "eval_loss"; without a loss there is
        # nothing to compare, so leave the counters untouched.
        eval_loss = (metrics or {}).get("eval_loss")
        if eval_loss is None:
            return

        if eval_loss < self.best_loss:
            self.best_loss = eval_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                print("\n Early stopping triggered! Stopping training.")
                control.should_training_stop = True

early_stopping = EarlyStoppingCallback(patience=PATIENCE)

In [None]:
from torch.cuda.amp import autocast, GradScaler
from transformers import AdamW

# Optimizer & mixed precision training for efficiency.
# PyTorch's own AdamW is used because the AdamW exported by `transformers` is
# deprecated. weight_decay=0.0 is passed explicitly to keep that
# implementation's default (torch.optim.AdamW defaults to 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
scaler = GradScaler()  # Loss scaler used with autocast for FP16 mixed-precision training

def train_model(model, train_loader, val_loader, epochs=EPOCHS, patience=None,
                save_path='BERT_Best_Model_State.pt'):
    """Trains the BERT model with batch progress tracking & early stopping.

    Uses the notebook-level ``device``, ``optimizer`` and ``scaler`` objects.
    After every epoch the model is validated; whenever the mean validation loss
    improves, the model's ``state_dict`` is written to ``save_path``.

    Args:
        model: Module called as ``model(**batch)`` whose output exposes
            ``.loss`` and ``.logits``.
        train_loader: Loader yielding dict batches that include a "labels"
            entry; must expose ``.dataset`` (used for the accuracy denominator).
        val_loader: Validation loader with the same batch layout.
        epochs: Maximum number of epochs to run.
        patience: Consecutive epochs without validation-loss improvement after
            which training stops. ``None`` (default) uses the notebook-level
            ``PATIENCE`` looked up at call time.
        save_path: File the best model's state dictionary is saved to.
    """
    if patience is None:
        patience = PATIENCE

    best_val_loss = float("inf")
    patience_counter = 0  # Tracking early stopping

    for epoch in range(epochs):
        print(f"\n Epoch {epoch+1}/{epochs}")

        # Training Phase
        model.train()
        total_loss, total_correct = 0, 0
        loop = tqdm(train_loader, desc=f"Training Epoch {epoch+1}", unit="batch", dynamic_ncols=True)

        for batch in loop:
            batch = {k: v.to(device) for k, v in batch.items()}  # Moving batch to the target device
            optimizer.zero_grad()

            with autocast():  # Forward pass under automatic mixed precision
                outputs = model(**batch)
                loss = outputs.loss

            # Scale the loss before backward, then step/update through the scaler.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            batch_loss = loss.item()  # read once: each .item() forces a host sync
            total_loss += batch_loss
            total_correct += (outputs.logits.argmax(dim=1) == batch["labels"]).sum().item()

            loop.set_postfix(loss=batch_loss)  # Showing live loss update

        avg_train_loss = total_loss / len(train_loader)
        train_acc = total_correct / len(train_loader.dataset)
        print(f"\n Training Loss: {avg_train_loss:.4f}, Accuracy: {train_acc:.4f}")

        # Validation Phase
        model.eval()
        total_correct, total_loss = 0, 0
        loop = tqdm(val_loader, desc=f" Validating Epoch {epoch+1}", unit="batch", dynamic_ncols=True)

        with torch.no_grad():
            for batch in loop:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                batch_loss = outputs.loss.item()
                total_loss += batch_loss
                total_correct += (outputs.logits.argmax(dim=1) == batch["labels"]).sum().item()

                loop.set_postfix(loss=batch_loss)

        avg_val_loss = total_loss / len(val_loader)
        val_acc = total_correct / len(val_loader.dataset)
        print(f"\n Validation Loss: {avg_val_loss:.4f}, Accuracy: {val_acc:.4f}")

        # Checking for improvement and saving the best model's state
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0  # Resetting the patience counter
            # Saving the best model's state dictionary in .pt format
            torch.save(model.state_dict(), save_path)
            print("Best model saved!")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("\nEarly stopping triggered! Stopping training.")
                break

# Fine-tune the model on the training loader, validating after every epoch
train_model(model=model, train_loader=train_loader, val_loader=val_loader, epochs=EPOCHS)




 Epoch 1/3


Training Epoch 1: 100%|██████████| 14624/14624 [16:11<00:00, 15.05batch/s, loss=0.0144]



 Training Loss: 0.1593, Accuracy: 0.9364


 Validating Epoch 1: 100%|██████████| 1625/1625 [01:16<00:00, 21.22batch/s, loss=0.48]



 Validation Loss: 0.1291, Accuracy: 0.9503
Best model saved!

 Epoch 2/3


Training Epoch 2: 100%|██████████| 14624/14624 [16:10<00:00, 15.07batch/s, loss=0.0323]



 Training Loss: 0.0987, Accuracy: 0.9625


 Validating Epoch 2: 100%|██████████| 1625/1625 [01:17<00:00, 21.01batch/s, loss=0.55]



 Validation Loss: 0.1235, Accuracy: 0.9542
Best model saved!

 Epoch 3/3


Training Epoch 3: 100%|██████████| 14624/14624 [16:15<00:00, 15.00batch/s, loss=0.151]



 Training Loss: 0.0606, Accuracy: 0.9777


 Validating Epoch 3: 100%|██████████| 1625/1625 [01:16<00:00, 21.25batch/s, loss=0.662]


 Validation Loss: 0.1435, Accuracy: 0.9514





In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def evaluate_model(model, test_loader):
    """Run inference over ``test_loader`` and collect predictions alongside labels.

    The model is switched to eval mode and gradients are disabled. Each batch's
    accuracy is shown on the progress bar while iterating.

    Args:
        model: Module called as ``model(**batch)`` whose output exposes ``.logits``.
        test_loader: Iterable of dict batches that include a "labels" entry.

    Returns:
        Tuple ``(all_preds, all_labels)``: two lists holding the predicted
        class (argmax over dim 1 of the logits) and the true label of every
        sample, in loader order.
    """
    model.eval()
    predictions, targets = [], []

    progress = tqdm(test_loader, desc="Evaluating Test Set", unit="batch", dynamic_ncols=True)

    with torch.no_grad():
        for raw_batch in progress:
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}

            logits = model(**inputs).logits
            batch_preds = logits.argmax(dim=1).cpu().numpy()
            batch_labels = inputs["labels"].cpu().numpy()

            predictions.extend(batch_preds)
            targets.extend(batch_labels)

            progress.set_postfix(batch_accuracy=accuracy_score(batch_labels, batch_preds))

    return predictions, targets

In [None]:
# Restore the weights saved at the best validation epoch. map_location remaps
# the stored tensors onto the active device, so a checkpoint written on a GPU
# also loads on a CPU-only machine.
best_state = torch.load('BERT_Best_Model_State.pt', map_location=device)
model.load_state_dict(best_state)
test_preds, test_labels = evaluate_model(model, test_loader)

# Computing performance metrics (precision/recall/F1 for the positive class)
accuracy = accuracy_score(test_labels, test_preds)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, test_preds, average="binary")

print(f"\nTest Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Evaluating Test Set: 100%|██████████| 1250/1250 [00:59<00:00, 21.12batch/s, batch_accuracy=0.967]



Test Accuracy: 0.9532
Precision: 0.9570
Recall: 0.9490
F1-Score: 0.9530
