In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [2]:
import torch
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
import numpy as np
from sklearn.metrics import accuracy_score

In [3]:
# --------------------------
# 1. Data Loading & Cleaning
# --------------------------
# Load raw data
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")  # Update path

In [4]:
# Function to use only a fraction of data
def subset_data(dataframe, fraction=1.0, random_state=42):
    """
    Returns a subset of the dataframe
    
    Args:
        dataframe: Pandas DataFrame to subset
        fraction: Fraction of data to use (0-1)
        random_state: Random seed for reproducibility
        
    Returns:
        Subset of the original dataframe
    """
    return dataframe.sample(frac=fraction, random_state=random_state)

In [7]:
# Apply subset if needed (adjust fraction as needed)
USE_SUBSET = True  # Set to False to use all data
SUBSET_FRACTION = 1.0  # Use 10% of data (adjust as needed)

In [8]:
if USE_SUBSET:
    print(f"Using {SUBSET_FRACTION*100}% of the original dataset ({int(len(df)*SUBSET_FRACTION)} samples)")
    df = subset_data(df, fraction=SUBSET_FRACTION)

Using 100.0% of the original dataset (50000 samples)


In [9]:
# Clean NSFW/violent content using GPU batch processing
toxicity_pipe = pipeline(
    "text-classification",
    model="unitary/toxic-bert",
    device=0,
    truncation=True,
    max_length=512,
    top_k=None,
    batch_size=512
)

config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


In [10]:
def filter_toxic(batch):
    """Filter out toxic reviews with batch processing"""
    results = toxicity_pipe(batch["review"])
    clean_indices = []
    for idx, result in enumerate(results):
        toxic = any(entry["score"] > 0.5 for entry in result if entry["label"] in ["toxic", "obscene", "threat"])
        if not toxic:
            clean_indices.append(idx)
    return {"review": [batch["review"][i] for i in clean_indices],
            "sentiment": [batch["sentiment"][i] for i in clean_indices]}

In [11]:
# Convert to HuggingFace Dataset and filter
dataset = Dataset.from_pandas(df)
cleaned_dataset = dataset.filter(filter_toxic, batched=True, batch_size=256)

Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [12]:
# --------------------------
# 2. Dataset Preparation
# --------------------------
# Split dataset
train_test = cleaned_dataset.train_test_split(test_size=0.1)
dataset = DatasetDict({
    "train": train_test["train"],
    "test": train_test["test"]
})

In [13]:
# Print dataset statistics
print(f"Training samples: {len(dataset['train'])}")
print(f"Testing samples: {len(dataset['test'])}")

Training samples: 352
Testing samples: 40


In [16]:
# Tokenization
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize(batch):
    return tokenizer(
        batch["review"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
# Reduce batch size for mapping if using smaller dataset
mapping_batch_size = 128 if USE_SUBSET else 256
dataset = dataset.map(tokenize, batched=True, batch_size=mapping_batch_size)

Map:   0%|          | 0/352 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [18]:
# Convert labels to 0/1
def format_labels(batch):
    return {"labels": [1 if s == "positive" else 0 for s in batch["sentiment"]]}
dataset = dataset.map(format_labels, batched=True)

Map:   0%|          | 0/352 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [19]:
# Set PyTorch format
dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [20]:
# --------------------------
# 3. Model Training (Clean Data)
# --------------------------
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
).to("cuda")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Adjust batch size based on dataset size
train_batch_size = 64 if USE_SUBSET else 256
eval_batch_size = 32 if USE_SUBSET else 128

In [24]:
# Training arguments with progress tracking
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50 if USE_SUBSET else 100,  # More frequent logging for small dataset
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    num_train_epochs=30,
    learning_rate=2e-5,
    warmup_steps=100 if USE_SUBSET else 500,  # Fewer warmup steps for smaller dataset
    weight_decay=0.01,
    report_to="none",
    disable_tqdm=False,  # Force show progress
    fp16=True,  # Use mixed precision
)



In [25]:
# Metrics for validation
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics
)

# Start training with visible progress
print("\n\n=== Starting Training ===")
trainer.train()



=== Starting Training ===




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.303723,0.875
2,No log,1.306026,0.875
3,No log,1.309144,0.875
4,No log,1.314605,0.875
5,No log,1.32169,0.875
6,No log,1.329551,0.875
7,No log,1.338239,0.875
8,No log,1.409124,0.85
9,No log,1.355285,0.875
10,No log,1.370621,0.875




TrainOutput(global_step=90, training_loss=1.6549977105266103e-05, metrics={'train_runtime': 268.6928, 'train_samples_per_second': 39.301, 'train_steps_per_second': 0.335, 'total_flos': 1398855729807360.0, 'train_loss': 1.6549977105266103e-05, 'epoch': 30.0})

In [26]:
# --------------------------
# 4. Noisy Data Experiment
# --------------------------
# Flip 10% of training labels
np.random.seed(42)
train_df = dataset["train"].to_pandas()
flip_indices = np.random.choice(
    train_df.index,
    size=int(0.1 * len(train_df)),
    replace=False
)

In [27]:
train_df.loc[flip_indices, "labels"] = 1 - train_df.loc[flip_indices, "labels"]

In [28]:
# Convert back to Dataset
noisy_train = Dataset.from_pandas(train_df)

In [29]:
# Initialize a fresh model for noisy training (avoids using partially trained model)
noisy_model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
).to("cuda")
noisy_training_args = TrainingArguments(
    output_dir="./results_noisy",
    per_device_train_batch_size=train_batch_size // 2,  # Smaller batch for noisy data
    per_device_eval_batch_size=eval_batch_size,
    num_train_epochs=40,
    learning_rate=1e-5,  # Lower LR for fine-tuning on noisy data
    fp16=True,
    logging_steps=50 if USE_SUBSET else 100,
    logging_strategy="steps",
    evaluation_strategy="epoch",  # Add evaluation every epoch
    disable_tqdm=False,
    report_to = 'none')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
    # Removed report_to parameter that was causing the error

# Retrain with noisy data
noisy_trainer = Trainer(
    model=noisy_model,  # Use fresh model
    args=noisy_training_args,
    train_dataset=noisy_train,
    eval_dataset=dataset["test"],  # Still evaluate on clean test data
    compute_metrics=compute_metrics,
    
)

In [31]:
print("\n\n=== Training with Noisy Data ===")
noisy_trainer.train()



=== Training with Noisy Data ===




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.686585,0.525
2,No log,0.660801,0.65
3,No log,0.636088,0.725
4,No log,0.602443,0.825
5,No log,0.548049,0.825
6,No log,0.485279,0.9
7,No log,0.429935,0.9
8,No log,0.376994,0.925
9,0.589900,0.333679,0.925
10,0.589900,0.305963,0.925




TrainOutput(global_step=240, training_loss=0.23904554843902587, metrics={'train_runtime': 381.9432, 'train_samples_per_second': 36.864, 'train_steps_per_second': 0.628, 'total_flos': 1865140973076480.0, 'train_loss': 0.23904554843902587, 'epoch': 40.0})