In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
import torch
import pandas as pd
import os
import numpy as np
import random
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    TrainerCallback
)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch.nn.functional as F

In [3]:
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [4]:
USE_SUBSET = True
SUBSET_FRACTION = 1.0 if USE_SUBSET else 1.0
PROCESSED_DATA_PATH = f"./processed_dataset_{'subset' if USE_SUBSET else 'full'}_{SUBSET_FRACTION}"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, early_stopping_patience=3):
        self.early_stopping_patience = early_stopping_patience
        self.best_metric = None
        self.patience_counter = 0

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics is None:
            return
        current_metric = metrics.get("eval_accuracy")
        if self.best_metric is None:
            self.best_metric = current_metric
        elif current_metric > self.best_metric:
            self.best_metric = current_metric
            self.patience_counter = 0
        else:
            self.patience_counter += 1
            if self.patience_counter >= self.early_stopping_patience:
                control.should_training_stop = True

In [6]:
class SymmetricCrossEntropyLoss(torch.nn.Module):
    def __init__(self, alpha=0.1):
        super().__init__()
        self.alpha = alpha
        self.ce = torch.nn.CrossEntropyLoss()
        
    def forward(self, inputs, targets):
        ce_loss = self.ce(inputs, targets)
        rce_loss = -torch.mean(torch.sum(F.softmax(targets.float(), dim=-1) * F.log_softmax(inputs, dim=-1), dim=-1))
        return (1 - self.alpha) * ce_loss + self.alpha * rce_loss

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        "accuracy": accuracy_score(p.label_ids, preds),
        "f1": f1_score(p.label_ids, preds),
        "precision": precision_score(p.label_ids, preds),
        "recall": recall_score(p.label_ids, preds)
    }

In [7]:
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [8]:
if USE_SUBSET:
    df = df.sample(frac=SUBSET_FRACTION, random_state=SEED)
    print(f"Using {SUBSET_FRACTION*100}% subset ({len(df)} samples)")

Using 100.0% subset (50000 samples)


In [9]:
toxicity_pipe = pipeline(
    "text-classification",
    model="unitary/toxic-bert",
    device=0 if DEVICE == "cuda" else -1,
    truncation=True,
    max_length=512,
    top_k=None
)

config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


In [10]:
violence_pipe = pipeline(
    "text-classification",
    model="Hate-speech-CNERG/dehatebert-mono-english",
    device=0 if DEVICE == "cuda" else -1,
    truncation=True,
    max_length=512,
    top_k=None
)

config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


In [11]:
def filter_unsafe(batch):
    toxic_results = toxicity_pipe(batch["review"])
    violence_results = violence_pipe(batch["review"])
    
    safe_indices = []
    for idx, (tox, viol) in enumerate(zip(toxic_results, violence_results)):
        toxic_score = max([e["score"] for e in tox if e["label"] in ["toxic", "obscene", "threat"]], default=0)
        violence_score = max([e["score"] for e in viol if e["label"] in ["violence", "hate"]], default=0)
        
        if toxic_score < 0.5 and violence_score < 0.5:
            safe_indices.append(idx)
            
    return {
        "review": [batch["review"][i] for i in safe_indices],
        "sentiment": [batch["sentiment"][i] for i in safe_indices]
    }

In [12]:
if not os.path.exists(f"{PROCESSED_DATA_PATH}_cleaned"):
    dataset = Dataset.from_pandas(df)
    cleaned_dataset = dataset.filter(filter_unsafe, batched=True, batch_size=128)
    cleaned_dataset.save_to_disk(f"{PROCESSED_DATA_PATH}_cleaned")
else:
    cleaned_dataset = Dataset.load_from_disk(f"{PROCESSED_DATA_PATH}_cleaned")

Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Saving the dataset (0/1 shards):   0%|          | 0/782 [00:00<?, ? examples/s]

In [13]:
if not os.path.exists(PROCESSED_DATA_PATH):
    train_valtest = cleaned_dataset.train_test_split(test_size=0.2, seed=SEED)
    val_test = train_valtest["test"].train_test_split(test_size=0.5, seed=SEED)
    
    dataset = DatasetDict({
        "train": train_valtest["train"],
        "validation": val_test["train"],
        "test": val_test["test"]
    })

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    dataset = dataset.map(
        lambda batch: tokenizer(batch["review"], truncation=True, padding="max_length", max_length=512),
        batched=True,
        batch_size=256
    )

    dataset = dataset.map(
        lambda batch: {"labels": [1 if s == "positive" else 0 for s in batch["sentiment"]]},
        batched=True
    )

    dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    dataset.save_to_disk(PROCESSED_DATA_PATH)
else:
    dataset = DatasetDict.load_from_disk(PROCESSED_DATA_PATH)
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/625 [00:00<?, ? examples/s]

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

Map:   0%|          | 0/79 [00:00<?, ? examples/s]

Map:   0%|          | 0/625 [00:00<?, ? examples/s]

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

Map:   0%|          | 0/79 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/625 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/78 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/79 [00:00<?, ? examples/s]

In [14]:
print("\nDataset Statistics:")
print(f"Training samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")
print(f"Testing samples: {len(dataset['test'])}")


Dataset Statistics:
Training samples: 625
Validation samples: 78
Testing samples: 79


In [15]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
).to(DEVICE)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
training_args = TrainingArguments(
    output_dir="./clean_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    fp16=(DEVICE == "cuda"),
    logging_steps=50,
    seed=SEED,
    report_to="none",
    optim="adamw_torch"
)



In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [18]:
print("\n=== Training Clean Model ===")
trainer.train()


=== Training Clean Model ===




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.682415,0.666667,0.48,0.75,0.352941
2,No log,0.649258,0.769231,0.75,0.710526,0.794118
3,No log,0.511286,0.846154,0.828571,0.805556,0.852941
4,No log,0.395647,0.884615,0.88,0.804878,0.970588
5,0.541000,0.231044,0.935897,0.929577,0.891892,0.970588
6,0.541000,0.247724,0.897436,0.891892,0.825,0.970588
7,0.541000,0.195703,0.923077,0.916667,0.868421,0.970588
8,0.541000,0.264073,0.884615,0.876712,0.820513,0.941176




TrainOutput(global_step=80, training_loss=0.39210503101348876, metrics={'train_runtime': 152.8999, 'train_samples_per_second': 81.753, 'train_steps_per_second': 1.308, 'total_flos': 662336993280000.0, 'train_loss': 0.39210503101348876, 'epoch': 8.0})

In [19]:
noisy_path = f"{PROCESSED_DATA_PATH}_noisy"
if not os.path.exists(noisy_path):
    train_df = dataset["train"].to_pandas()
    flip_indices = np.random.choice(
        train_df.index,
        size=int(0.1 * len(train_df)),
        replace=False
    )
    train_df.loc[flip_indices, "labels"] = 1 - train_df.loc[flip_indices, "labels"]
    noisy_train = Dataset.from_pandas(train_df)
    noisy_train.save_to_disk(noisy_path)
else:
    noisy_train = Dataset.load_from_disk(noisy_path)

Saving the dataset (0/1 shards):   0%|          | 0/625 [00:00<?, ? examples/s]

In [20]:
noisy_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
).to(DEVICE)
noisy_model.loss_fct = SymmetricCrossEntropyLoss(alpha=0.1)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
noisy_args = TrainingArguments(
    output_dir="./noisy_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=25,
    learning_rate=1.5e-5,
    weight_decay=0.02,
    warmup_ratio=0.1,
    fp16=(DEVICE == "cuda"),
    logging_steps=50,
    seed=SEED,
    report_to="none",
    optim="adamw_torch"
)



In [22]:
noisy_trainer = Trainer(
    model=noisy_model,
    args=noisy_args,
    train_dataset=noisy_train,
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [23]:
print("\n=== Training Noisy Model ===")
noisy_trainer.train()


=== Training Noisy Model ===




Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.693726,0.474359,0.549451,0.438596,0.735294
2,No log,0.683207,0.730769,0.704225,0.675676,0.735294
3,No log,0.669431,0.628205,0.694737,0.540984,0.970588
4,No log,0.60629,0.820513,0.815789,0.738095,0.911765
5,0.658900,0.463881,0.897436,0.882353,0.882353,0.882353
6,0.658900,0.363168,0.884615,0.88,0.804878,0.970588
7,0.658900,0.292674,0.910256,0.906667,0.829268,1.0
8,0.658900,0.254775,0.897436,0.891892,0.825,0.970588
9,0.658900,0.269876,0.871795,0.868421,0.785714,0.970588
10,0.330800,0.286818,0.858974,0.857143,0.767442,0.970588




TrainOutput(global_step=100, training_loss=0.49487592697143556, metrics={'train_runtime': 191.5875, 'train_samples_per_second': 81.555, 'train_steps_per_second': 1.305, 'total_flos': 827921241600000.0, 'train_loss': 0.49487592697143556, 'epoch': 10.0})

In [24]:
def print_metrics(name, metrics):
    print(f"\n{name} Model Performance:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")

In [25]:
print("\n=== Final Evaluation ===")
clean_results = trainer.evaluate(eval_dataset=dataset["test"])
noisy_results = noisy_trainer.evaluate(eval_dataset=dataset["test"])

print_metrics("Clean", {k.replace("eval_", ""): v for k, v in clean_results.items()})
print_metrics("Noisy", {k.replace("eval_", ""): v for k, v in noisy_results.items()})

print("\n=== Training Complete ===")


=== Final Evaluation ===







Clean Model Performance:
Accuracy: 0.8354
F1 Score: 0.8312
Precision: 0.7805
Recall: 0.8889

Noisy Model Performance:
Accuracy: 0.8228
F1 Score: 0.8205
Precision: 0.7619
Recall: 0.8889

=== Training Complete ===


In [26]:
!pip install datasets

