In [1]:
import torch
import pandas as pd
import os
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    TrainerCallback
)
import numpy as np
from sklearn.metrics import accuracy_score


In [2]:
class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, early_stopping_patience=5):
        self.early_stopping_patience = early_stopping_patience
        self.best_metric = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics is None:
            return
        current_metric = metrics.get("eval_accuracy")
        if self.best_metric is None:
            self.best_metric = current_metric
            self.patience_counter = 0
        elif current_metric > self.best_metric:
            self.best_metric = current_metric
            self.patience_counter = 0
        else:
            self.patience_counter += 1
            if self.patience_counter >= self.early_stopping_patience:
                control.should_training_stop = True
        print(f"Early stopping status: Best acc {self.best_metric:.4f}, Current acc {current_metric:.4f}, Patience {self.patience_counter}/{self.early_stopping_patience}")

In [3]:
USE_SUBSET = True
SUBSET_FRACTION = 1.0
PROCESSED_DATA_PATH = f"./processed_dataset_{'subset' if USE_SUBSET else 'full'}_{SUBSET_FRACTION}"

In [4]:
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
if USE_SUBSET:
    df = df.sample(frac=SUBSET_FRACTION, random_state=42)
    print(f"Using {SUBSET_FRACTION*100}% of data ({len(df)} samples)")

Using 100.0% of data (50000 samples)


In [7]:
toxicity_pipe = pipeline(
    "text-classification",
    model="unitary/toxic-bert",
    device=0,
    truncation=True,
    max_length=512,
    top_k=None,
    batch_size=512
)

config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


In [8]:
def filter_toxic(batch):
    results = toxicity_pipe(batch["review"])
    clean_indices = [
        idx for idx, result in enumerate(results)
        if not any(entry["score"] > 0.5 for entry in result if entry["label"] in ["toxic", "obscene", "threat"])
    ]
    return {
        "review": [batch["review"][i] for i in clean_indices],
        "sentiment": [batch["sentiment"][i] for i in clean_indices]
    }

In [9]:
if not os.path.exists(f"{PROCESSED_DATA_PATH}_cleaned"):
    dataset = Dataset.from_pandas(df)
    cleaned_dataset = dataset.filter(filter_toxic, batched=True, batch_size=256)
    cleaned_dataset.save_to_disk(f"{PROCESSED_DATA_PATH}_cleaned")
else:
    cleaned_dataset = Dataset.load_from_disk(f"{PROCESSED_DATA_PATH}_cleaned")

Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Saving the dataset (0/1 shards):   0%|          | 0/392 [00:00<?, ? examples/s]

In [10]:
if not os.path.exists(PROCESSED_DATA_PATH):
    train_test = cleaned_dataset.train_test_split(test_size=0.1)
    dataset = DatasetDict({
        "train": train_test["train"],
        "test": train_test["test"]
    })

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    dataset = dataset.map(
        lambda batch: tokenizer(batch["review"], truncation=True, padding="max_length", max_length=512),
        batched=True,
        batch_size=128 if USE_SUBSET else 256
    )
    
    dataset = dataset.map(
        lambda batch: {"labels": [1 if s == "positive" else 0 for s in batch["sentiment"]]},
        batched=True
    )
    
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    dataset.save_to_disk(PROCESSED_DATA_PATH)
else:
    dataset = DatasetDict.load_from_disk(PROCESSED_DATA_PATH)
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

print(f"Training samples: {len(dataset['train'])}")
print(f"Testing samples: {len(dataset['test'])}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/352 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/352 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/352 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/40 [00:00<?, ? examples/s]

Training samples: 352
Testing samples: 40


In [11]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
).to("cuda")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
training_args = TrainingArguments(
    output_dir="./clean_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    save_total_limit=1,
    per_device_train_batch_size=64 if USE_SUBSET else 256,
    per_device_eval_batch_size=32 if USE_SUBSET else 128,
    num_train_epochs=30,
    learning_rate=1e-5,
    weight_decay=0.01,
    fp16=True,
    logging_steps=50,
    disable_tqdm=False,
    report_to="none"
)



In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, np.argmax(p.predictions, axis=1))},
    callbacks=[EarlyStoppingCallback(early_stopping_patience=20)]
)

In [25]:
print("\n=== Training Clean Model ===")
trainer.train()


=== Training Clean Model ===




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.372495,0.9
2,No log,0.340514,0.9
3,No log,0.327837,0.875
4,No log,0.289135,0.9
5,No log,0.267902,0.925
6,No log,0.269645,0.9
7,No log,0.271954,0.875
8,No log,0.230945,0.9
9,No log,0.226567,0.9
10,No log,0.24865,0.875


Early stopping status: Best acc 0.9000, Current acc 0.9000, Patience 0/20




Early stopping status: Best acc 0.9000, Current acc 0.9000, Patience 1/20




Early stopping status: Best acc 0.9000, Current acc 0.8750, Patience 2/20




Early stopping status: Best acc 0.9000, Current acc 0.9000, Patience 3/20




Early stopping status: Best acc 0.9250, Current acc 0.9250, Patience 0/20




Early stopping status: Best acc 0.9250, Current acc 0.9000, Patience 1/20




Early stopping status: Best acc 0.9250, Current acc 0.8750, Patience 2/20




Early stopping status: Best acc 0.9250, Current acc 0.9000, Patience 3/20




Early stopping status: Best acc 0.9250, Current acc 0.9000, Patience 4/20




Early stopping status: Best acc 0.9250, Current acc 0.8750, Patience 5/20




Early stopping status: Best acc 0.9250, Current acc 0.9000, Patience 6/20




Early stopping status: Best acc 0.9250, Current acc 0.9000, Patience 7/20




Early stopping status: Best acc 0.9250, Current acc 0.8750, Patience 8/20




Early stopping status: Best acc 0.9250, Current acc 0.8750, Patience 9/20




Early stopping status: Best acc 0.9250, Current acc 0.8750, Patience 10/20




Early stopping status: Best acc 0.9250, Current acc 0.9000, Patience 11/20




Early stopping status: Best acc 0.9250, Current acc 0.9000, Patience 12/20




Early stopping status: Best acc 0.9250, Current acc 0.8750, Patience 13/20




Early stopping status: Best acc 0.9250, Current acc 0.8750, Patience 14/20




Early stopping status: Best acc 0.9250, Current acc 0.8750, Patience 15/20




Early stopping status: Best acc 0.9250, Current acc 0.9000, Patience 16/20




Early stopping status: Best acc 0.9250, Current acc 0.9000, Patience 17/20




Early stopping status: Best acc 0.9250, Current acc 0.9000, Patience 18/20




Early stopping status: Best acc 0.9250, Current acc 0.9000, Patience 19/20




Early stopping status: Best acc 0.9250, Current acc 0.9000, Patience 20/20


TrainOutput(global_step=75, training_loss=0.1186139186223348, metrics={'train_runtime': 258.8633, 'train_samples_per_second': 40.794, 'train_steps_per_second': 0.348, 'total_flos': 1165713108172800.0, 'train_loss': 0.1186139186223348, 'epoch': 25.0})

In [15]:
if not os.path.exists(f"{PROCESSED_DATA_PATH}_noisy"):
    train_df = dataset["train"].to_pandas()
    flip_indices = np.random.choice(
        train_df.index,
        size=int(0.1 * len(train_df)),
        replace=False
    )
    train_df.loc[flip_indices, "labels"] = 1 - train_df.loc[flip_indices, "labels"]
    noisy_train = Dataset.from_pandas(train_df)
    noisy_train.save_to_disk(f"{PROCESSED_DATA_PATH}_noisy")
else:
    noisy_train = Dataset.load_from_disk(f"{PROCESSED_DATA_PATH}_noisy")

Saving the dataset (0/1 shards):   0%|          | 0/352 [00:00<?, ? examples/s]

In [16]:
noisy_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
).to("cuda")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
noisy_training_args = TrainingArguments(
    output_dir="./noisy_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    save_total_limit=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=40,
    learning_rate=1e-5,
    fp16=True,
    logging_steps=50,
    disable_tqdm=False,
    report_to="none"
)



In [18]:
noisy_trainer = Trainer(
    model=noisy_model,
    args=noisy_training_args,
    train_dataset=noisy_train,
    eval_dataset=dataset["test"],
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, np.argmax(p.predictions, axis=1))},
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

In [19]:
print("\n=== Training Noisy Model ===")
noisy_trainer.train()


=== Training Noisy Model ===




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.709749,0.425
2,No log,0.706785,0.425
3,No log,0.687288,0.5
4,No log,0.662355,0.675
5,No log,0.641311,0.675
6,No log,0.603588,0.675
7,No log,0.536569,0.875
8,No log,0.486181,0.85
9,0.619900,0.419857,0.875
10,0.619900,0.377397,0.875


Early stopping status: Best acc 0.4250, Current acc 0.4250, Patience 0/5




Early stopping status: Best acc 0.4250, Current acc 0.4250, Patience 1/5




Early stopping status: Best acc 0.5000, Current acc 0.5000, Patience 0/5




Early stopping status: Best acc 0.6750, Current acc 0.6750, Patience 0/5




Early stopping status: Best acc 0.6750, Current acc 0.6750, Patience 1/5




Early stopping status: Best acc 0.6750, Current acc 0.6750, Patience 2/5




Early stopping status: Best acc 0.8750, Current acc 0.8750, Patience 0/5




Early stopping status: Best acc 0.8750, Current acc 0.8500, Patience 1/5




Early stopping status: Best acc 0.8750, Current acc 0.8750, Patience 2/5




Early stopping status: Best acc 0.8750, Current acc 0.8750, Patience 3/5




Early stopping status: Best acc 0.9000, Current acc 0.9000, Patience 0/5




Early stopping status: Best acc 0.9000, Current acc 0.9000, Patience 1/5




Early stopping status: Best acc 0.9000, Current acc 0.9000, Patience 2/5




Early stopping status: Best acc 0.9000, Current acc 0.9000, Patience 3/5




Early stopping status: Best acc 0.9000, Current acc 0.9000, Patience 4/5




Early stopping status: Best acc 0.9000, Current acc 0.9000, Patience 5/5


TrainOutput(global_step=96, training_loss=0.4698609908421834, metrics={'train_runtime': 173.9633, 'train_samples_per_second': 80.937, 'train_steps_per_second': 1.38, 'total_flos': 746056389230592.0, 'train_loss': 0.4698609908421834, 'epoch': 16.0})