In [1]:
!pip install datasets==1.18.4

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, DistilBertForSequenceClassification, DistilBertConfig, Trainer, TrainingArguments
from transformers import get_cosine_schedule_with_warmup
from datasets import load_metric
from transformers import EarlyStoppingCallback, IntervalStrategy


from torch.optim import AdamW

import warnings
warnings.filterwarnings("ignore")

Collecting datasets==1.18.4
  Downloading datasets-1.18.4-py3-none-any.whl.metadata (22 kB)
Collecting responses<0.19 (from datasets==1.18.4)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
Downloading datasets-1.18.4-py3-none-any.whl (312 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.1/312.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.0.1
    Uninstalling datasets-3.0.1:
      Successfully uninstalled datasets-3.0.1
Successfully installed datasets-1.18.4 responses-0.18.0


In [2]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
print(df.head(5))

                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1


In [3]:
def missing_values_analysis(df):
    na_columns_ = [col for col in df.columns if df[col].isnull().sum() > 0]
    n_miss = df[na_columns_].isnull().sum().sort_values(ascending=True)
    ratio_ = (df[na_columns_].isnull().sum() / df.shape[0] * 100).sort_values(ascending=True)
    missing_df = pd.concat([n_miss, np.round(ratio_, 2)], axis=1, keys=['Total Missing Values', 'Ratio'])
    missing_df = pd.DataFrame(missing_df)
    return missing_df

In [4]:
import spacy
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup

# Load spaCy's small English model
nlp = spacy.load("en_core_web_sm")

# Initialize stopwords
stop_words = set(stopwords.words("english"))

def process(review):
    # Remove HTML tags
    review = BeautifulSoup(review, "html.parser").get_text()
    
    # Remove non-alphabetical characters (e.g., numbers, punctuation)
    review = re.sub(r"[^a-zA-Z]", ' ', review)
    
    # Lowercase the text
    review = review.lower()

    review = re.sub(r'\s+', ' ', review).strip()
    
    # Tokenization and Lemmatization with spaCy
    doc = nlp(review)
    review = [token.lemma_ for token in doc if token.text not in stop_words]
    
    return " ".join(review)

train_data = []

for i in range(len(df["review"])):
    if (i+1) % 2500 == 0:
        print("Processed reviews:", i+1)
    
    # train_data.append((df["review"][i]))
    train_data.append(process(df["review"][i]))

Processed reviews: 2500
Processed reviews: 5000
Processed reviews: 7500
Processed reviews: 10000
Processed reviews: 12500
Processed reviews: 15000
Processed reviews: 17500
Processed reviews: 20000
Processed reviews: 22500
Processed reviews: 25000
Processed reviews: 27500
Processed reviews: 30000
Processed reviews: 32500
Processed reviews: 35000
Processed reviews: 37500
Processed reviews: 40000
Processed reviews: 42500
Processed reviews: 45000
Processed reviews: 47500
Processed reviews: 50000


In [5]:
train_texts, remaining_texts, train_labels, remaining_labels = train_test_split(
    df['review'].tolist(),
    df['sentiment'].tolist(),
    test_size=0.2,
    random_state=42
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    remaining_texts,
    remaining_labels,
    test_size=0.2,
    random_state=42
)

print(f"Training set size: {len(train_texts)}")
print(f"Validation set size: {len(val_texts)}")
print(f"Test set size: {len(test_texts)}")

Training set size: 40000
Validation set size: 8000
Test set size: 2000


In [6]:
import wandb
wandb.init(mode='disabled')

In [7]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [8]:
class SimCSEMovieReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SimCSEMovieReviewDataset(train_encodings, train_labels)
val_dataset = SimCSEMovieReviewDataset(val_encodings, val_labels)

In [9]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import torch
import torch.nn.functional as F
from transformers import DistilBertForSequenceClassification, DistilBertConfig
from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup
from datasets import load_metric

# Define the contrastive loss function
def supervised_simcse_loss(embeddings, labels, temperature=0.05):
    """
    Compute the supervised SimCSE loss.
    
    Args:
        embeddings (torch.Tensor): Tensor of shape (batch_size, hidden_dim) containing sentence embeddings.
        labels (torch.Tensor): Tensor of shape (batch_size,) containing the labels for supervised contrastive learning.
        temperature (float): Temperature scaling parameter for contrastive loss.
    
    Returns:
        torch.Tensor: The computed contrastive loss.
    """
    # Normalize embeddings to unit vectors
    embeddings = F.normalize(embeddings, p=2, dim=1)
    
    # Compute similarity matrix (batch_size x batch_size)
    similarity_matrix = torch.matmul(embeddings, embeddings.T)  # cosine similarity
    
    # Scale by temperature
    similarity_matrix = similarity_matrix / temperature
    
    # Create labels for the contrastive loss
    batch_size = labels.size(0)
    contrastive_labels = torch.eq(labels.unsqueeze(1), labels.unsqueeze(0)).float()  # shape: (batch_size, batch_size)
    
    # Mask diagonal (self-comparisons should not contribute to loss)
    mask = ~torch.eye(batch_size, dtype=bool, device=labels.device)
    
    # Apply mask and compute log-softmax
    similarity_matrix_exp = torch.exp(similarity_matrix)
    masked_similarity = similarity_matrix_exp * mask
    log_prob = similarity_matrix - torch.log(masked_similarity.sum(dim=1, keepdim=True))
    
    # Compute supervised contrastive loss
    contrastive_loss = - (contrastive_labels * log_prob).sum(dim=1) / contrastive_labels.sum(dim=1)
    
    # Average over the batch
    loss = contrastive_loss.mean()
    
    return loss

def supervised_contrastive_loss(features, labels, temperature=0.07):
    """
    Compute the Supervised Contrastive Loss.
    Args:
        features (torch.Tensor): Embeddings from the model (batch_size, embedding_dim).
        labels (torch.Tensor): Ground truth labels (batch_size).
        temperature (float): Temperature scaling for contrastive loss.
    Returns:
        torch.Tensor: Computed contrastive loss.
    """
    # Normalize the features (embeddings)
    features = F.normalize(features, p=2, dim=1)

    similarity_matrix = torch.matmul(features, features.T) / temperature

    labels = labels.view(-1, 1)
    mask = torch.eq(labels, labels.T).float()

    logits = similarity_matrix - 1e9 * (1 - mask)
    labels = torch.arange(features.size(0)).to(features.device)

    loss = F.cross_entropy(logits, labels)
    return loss

class ContrastiveTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    
    def compute_loss(self, model, inputs, return_outputs=False):
        """
        Compute loss using supervised contrastive learning instead of cross-entropy.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        embeddings = outputs[0]

        # loss = supervised_simcse_loss(embeddings, labels)
        loss = supervised_contrastive_loss(embeddings, labels)
        
        return (loss, outputs) if return_outputs else loss

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english", num_labels=2
)

optimizer = AdamW(
    model.parameters(),
    lr=2e-5
)

lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=len(train_texts) * 2
)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_dir="./logs",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=8,
    warmup_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    eval_steps=500,
    logging_steps=500,
    learning_rate=1e-5,
    weight_decay=0.01,
)

accuracy_metric = load_metric("accuracy")
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")

def compute_metrics(eval_pred):
    """
    Compute evaluation metrics.
    Args:
        eval_pred: A tuple of (predictions, labels).
    Returns:
        Dictionary of computed metrics.
    """
    logits, labels = eval_pred

    # Convert logits and labels to PyTorch tensors
    logits = torch.tensor(logits)
    labels = torch.tensor(labels)

    # Calculate predictions
    predictions = torch.argmax(logits, axis=-1)

    # Compute metrics
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")

    return {
        "accuracy": accuracy['accuracy'],
        "f1": f1['f1'],
        "precision": precision['precision'],
        "recall": recall['recall']
    }

early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Initialize the custom trainer with contrastive loss
trainer = ContrastiveTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),
    callbacks=[early_stopping]
)

# Training the model
trainer.train()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.09k [00:00<?, ?B/s]



Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.5724,0.293464,0.511125,0.509674,0.511842,0.511125
1000,0.3067,0.293905,0.509,0.503611,0.508402,0.509
1500,0.2911,0.296887,0.478125,0.471395,0.475917,0.478125
2000,0.2778,0.279986,0.445625,0.44435,0.444751,0.445625


TrainOutput(global_step=2000, training_loss=0.3620293655395508, metrics={'train_runtime': 1460.2991, 'train_samples_per_second': 219.133, 'train_steps_per_second': 13.696, 'total_flos': 4238956756992000.0, 'train_loss': 0.3620293655395508, 'epoch': 0.8})

In [10]:
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    
    inputs = {key: value for key, value in inputs.items() if key != 'token_type_ids'}

    for key in inputs:
        inputs[key] = inputs[key].to(model.device)

    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1)
    return "Positive" if prediction.item() == 1 else "Negative"

In [11]:
sentence = "So hot today =_=  don`t like it and i hate my new timetable, having such a bad week"
print("Original sentence: " + sentence)
print("Predict before: " + predict_sentiment(sentence))
sentence = process(sentence)
print("After processing sentence: " + sentence)
print("Predict after: " + predict_sentiment(sentence))

Original sentence: So hot today =_=  don`t like it and i hate my new timetable, having such a bad week
Predict before: Negative
After processing sentence: hot today like hate new timetable bad week
Predict after: Negative


In [12]:
correct_predictions = sum(
    1 for i in range(len(test_texts))
    if (predict_sentiment(process(test_texts[i])) == "Positive") == test_labels[i]
)

accuracy = correct_predictions / len(test_labels)
print(accuracy)

0.6005
