In [6]:
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import torch
import os
# Presumably meant to quiet TensorFlow's native (C++) logging.
# NOTE(review): this assignment runs after the imports above; if any of them
# already loaded TensorFlow, the setting may come too late — confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

In [7]:
# Read the training CSV into a DataFrame and preview its first rows
train_csv_path = '/kaggle/input/neurathon/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/neurathon/train.csv'

In [8]:
# Build the two columns used downstream: `text` (message cast to str) and
# `label` (cast to int).
df["text"] = df["message"].astype(str)
df["label"] = df["label"].astype(int)

# Sample up to 4000 rows for faster training.
# The size is capped at len(df): DataFrame.sample raises ValueError when n is
# larger than the number of rows and replace=False (the default).
sample_size = min(4000, len(df))
df_sample = df.sample(n=sample_size, random_state=42)

# Train-validation split (80/20), stratified on the label so both parts keep
# the same class proportions; random_state fixes the shuffle.
train_df, val_df = train_test_split(
    df_sample, test_size=0.2, random_state=42, stratify=df_sample["label"]
)

# Convert to Hugging Face datasets
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

In [9]:
# Load tokenizer
# `model_name` is kept in its own variable because the model-loading cell
# below passes the same checkpoint name to from_pretrained.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocessing function
def preprocess(batch):
    """Tokenize the ``text`` entries of ``batch`` with the module-level tokenizer.

    Truncation is enabled with ``max_length=128`` and padding is turned off.
    Returns whatever the tokenizer call returns for the batch.
    """
    tokenizer_options = {"truncation": True, "padding": False, "max_length": 128}
    return tokenizer(batch["text"], **tokenizer_options)

# Tokenize both splits in batched mode (train first, then validation)
train_ds, val_ds = (
    split.map(preprocess, batched=True) for split in (train_ds, val_ds)
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [10]:
# Sequence-classification model with two output labels, built from the same
# checkpoint name as the tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# F1 metric object used by compute_metrics
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``f1_metric``.

    ``eval_pred`` is unpacked as ``(logits, labels)``. The predicted class is
    the argmax over the last axis of the logits. Returns the result of
    ``f1_metric.compute`` with ``average="binary"``.
    """
    scores, references = eval_pred
    predictions = np.argmax(scores, axis=-1)
    return f1_metric.compute(
        predictions=predictions,
        references=references,
        average="binary",
    )

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

In [11]:
# Batches are padded at collation time; the tokenization step uses
# padding=False, so sequences arrive here unpadded.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training parameters
epochs = 10
batch_size = 32
lr = 2e-5

# Evaluate and checkpoint once per epoch; after training, reload the
# checkpoint with the highest validation F1 (the "f1" key returned by
# compute_metrics).
training_args = TrainingArguments(
    output_dir="bert_sentiment_out",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=50,
    save_total_limit=2,
    report_to="none",  # Disable W&B logging
)

# Trainer object
# The tokenizer is passed as `processing_class`: the `tokenizer` keyword of
# Trainer.__init__ is deprecated and emits a FutureWarning on recent
# transformers releases.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [12]:
# Run fine-tuning
trainer.train()

# Save final model: model and tokenizer go into the same directory so it can
# be reloaded with from_pretrained
final_model_dir = "bert_sentiment_out/final_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

print("✅ Training complete. Best model saved to bert_sentiment_out/final_model")



Epoch,Training Loss,Validation Loss,F1
1,0.6337,0.561836,0.712401
2,0.5019,0.523453,0.739409
3,0.4155,0.556059,0.780997
4,0.3355,0.584488,0.733424
5,0.2679,0.619677,0.763454
6,0.2116,0.653223,0.758448
7,0.171,0.733628,0.722598
8,0.143,0.730796,0.76699
9,0.1209,0.765751,0.752201
10,0.1148,0.778188,0.755332




✅ Training complete. Best model saved to bert_sentiment_out/final_model


In [15]:
# Print the contents of /content and of the training output directory
for directory in ("/content", "/kaggle/working/bert_sentiment_out"):
    print(os.listdir(directory))

['.config', 'sample_data']
['checkpoint-150', 'final_model', 'checkpoint-500']


In [18]:
# Load model and tokenizer
# Both are read back from the directory written by the training cell;
# this rebinds the notebook-level `tokenizer` and `model` names.
model_path = "/kaggle/working/bert_sentiment_out/final_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

In [20]:
# Load the test set
df_test = pd.read_csv("/kaggle/input/neurathon/test.csv")

# Extract texts
# Cast to str the same way the training cell builds its `text` column
# (`df["message"].astype(str)`), so every element handed to the tokenizer is
# a string even when a message is missing or was parsed as a number.
texts = df_test["message"].astype(str).tolist()

In [21]:
# Batched inference over `texts`; predictions are written back to df_test
# and exported as CSV.
inference_batch_size = 32
max_length = 128  # same truncation length the training-time preprocess() uses

# Use the GPU when one is available, otherwise stay on CPU. Model and inputs
# must live on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

preds = []
for i in range(0, len(texts), inference_batch_size):
    batch_texts = texts[i:i + inference_batch_size]
    encodings = tokenizer(
        batch_texts,
        truncation=True,
        max_length=max_length,
        padding=True,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():  # no gradients needed for prediction
        outputs = model(**encodings)
    # Bring the class indices back to CPU before converting; .tolist() yields
    # plain Python ints.
    preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())

df_test["Prediction"] = preds
df_test.to_csv("test_predictions.csv", index=False)

print("✅ Predictions saved to test_predictions.csv")

✅ Predictions saved to test_predictions.csv
