In [None]:
# Mount Google Drive at /content/drive (passing force_remount=True).
# TRAIN_PATH and TEST_PATH defined further down point below this mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

## DistilBERT baseline using HuggingFace

In [None]:
!pip -q install transformers accelerate datasets scikit-learn

In [None]:
import torch, platform

# Report the runtime: PyTorch version, whether CUDA is usable, the name of
# device 0 (or "CPU" when CUDA is unavailable), and the Python version.
_cuda_ok = torch.cuda.is_available()
_device_label = torch.cuda.get_device_name(0) if _cuda_ok else "CPU"
print("PyTorch:", torch.__version__, "| CUDA:", _cuda_ok, "| Device:", _device_label)
print("Python:", platform.python_version())

In [None]:
import transformers, datasets, accelerate

# Print "<package>: <version>" for each Hugging Face library, in import order.
for _lib in (transformers, datasets, accelerate):
    print(f"{_lib.__name__}:", _lib.__version__)

In [None]:
# =========================
# DistilBERT baseline for "NLP with Disaster Tweets"
# =========================
import os, re, random, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_recall_fscore_support
import torch
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
from transformers import EarlyStoppingCallback
import numpy as np

# -------------------------
# Reproducibility
# -------------------------
SEED = 42
# Seed every RNG seeded by this cell with the same value: Python's `random`,
# NumPy's global generator, and PyTorch (CPU plus all CUDA devices).
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# -------------------------
# Data paths
# -------------------------
# Both input CSVs sit in the same Drive folder; the submission file is written
# relative to the current working directory.
_DATA_DIR = '/content/drive/MyDrive/University of Toronto/CSC2701 Communication for Computer Scientists/Kaggle Dataset'
TRAIN_PATH = f"{_DATA_DIR}/train.csv"
TEST_PATH  = f"{_DATA_DIR}/test.csv"
SUB_PATH   = "submission.csv"

# -------------------------
# Light tweet normalization; add any further cleaning or preprocessing here
# -------------------------
# Patterns for the two kinds of token that get masked: URLs
# (http://, https:// or www. prefixed) and @-mentions.
URL_RE  = re.compile(r"https?://\S+|www\.\S+")
USER_RE = re.compile(r"@\w+")
def normalize_tweet(t: str) -> str:
    """Mask URLs and @-mentions in a tweet and collapse its whitespace.

    Args:
        t: The tweet text. Non-string values (e.g. a NaN float coming out of a
            pandas column) are coerced with ``str()`` first, matching the copy
            of this function used in the inference cell, instead of raising
            ``TypeError`` inside ``re``.

    Returns:
        The text with every URL replaced by ``<url>``, every @-mention replaced
        by ``<user>``, runs of whitespace collapsed to a single space, and
        leading/trailing whitespace removed.
    """
    t = URL_RE.sub(" <url> ", str(t))
    t = USER_RE.sub(" <user> ", t)
    # The substitutions above pad with spaces, so collapse whitespace last.
    t = re.sub(r"\s+", " ", t).strip()
    return t

# -------------------------
# Load data
# -------------------------
def _read_tweets(csv_path):
    """Read one CSV and replace its `text` column with the normalized tweets."""
    frame = pd.read_csv(csv_path)
    frame["text"] = frame["text"].astype(str).map(normalize_tweet)
    return frame

df = _read_tweets(TRAIN_PATH)

# Hold out 10% of the labelled rows for validation, stratified on `target`.
df_train, df_val = train_test_split(
    df,
    stratify=df["target"],
    test_size=0.1,
    random_state=SEED,
)

df_test = _read_tweets(TEST_PATH)

# -------------------------
# Hugging Face Datasets
# -------------------------
def _as_hf_dataset(frame, columns):
    """Build a `Dataset` from the given columns of `frame`, after dropping its index."""
    return Dataset.from_pandas(frame[columns].reset_index(drop=True))

# Labelled splits keep text + target; the test split keeps id + text.
train_ds = _as_hf_dataset(df_train, ["text", "target"])
val_ds   = _as_hf_dataset(df_val, ["text", "target"])
test_ds  = _as_hf_dataset(df_test, ["id", "text"])

# -------------------------
# Model & tokenizer
# -------------------------
# Checkpoint identifier passed to every from_pretrained call below.
MODEL_NAME = "distilbert-base-uncased"
# Number of output classes, passed to the config as num_labels.
NUM_LABELS = 2

# Tokenizer, config and model are all loaded from MODEL_NAME; the config is
# what carries num_labels into the sequence-classification model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
config    = AutoConfig.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
model     = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

# max_length handed to the tokenizer in tok_fn (which also sets truncation=True).
MAX_LEN   = 128

def tok_fn(batch):
    """Tokenize the `text` field of a batch.

    Truncation is on with `max_length=MAX_LEN`; padding is explicitly off, so
    the returned encodings are not padded here.
    """
    encode_opts = {"truncation": True, "padding": False, "max_length": MAX_LEN}
    return tokenizer(batch["text"], **encode_opts)

# Tokenize every split in batches and drop the raw text column afterwards.
_map_opts = dict(batched=True, remove_columns=["text"])

# The labelled splits additionally get target -> labels, the name Trainer expects.
train_ds = train_ds.map(tok_fn, **_map_opts).rename_column("target", "labels")
val_ds   = val_ds.map(tok_fn, **_map_opts).rename_column("target", "labels")
test_ds  = test_ds.map(tok_fn, **_map_opts)

# Padding was skipped in tok_fn; the collator pads each batch instead.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# -------------------------
# Metrics
# -------------------------
def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Unpacks `eval_pred` into (logits, labels), takes the argmax over the last
    axis of the logits as the predicted class, and returns a dict with the
    keys "accuracy" and "f1".
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1).astype(int)
    report = {}
    report["accuracy"] = accuracy_score(gold, predicted)
    report["f1"] = f1_score(gold, predicted)
    return report

# -------------------------
# Training arguments
# -------------------------
# Fine-tuning hyperparameters, gathered here so they are easy to tune.
BATCH = 32           # per-device batch size for both training and evaluation
EPOCHS = 10          # upper bound; early stopping may end training sooner
LR = 1e-5
WARMUP_RATIO = 0.1

args = TrainingArguments(
    output_dir="./distilbert-disaster",
    eval_strategy="epoch",   # <- REQUIRED if load_best_model_at_end=True
    save_strategy="epoch",         # usually match eval strategy
    load_best_model_at_end=True,
    metric_for_best_model="f1",    # must match key from compute_metrics
    greater_is_better=True,

    # Trainer re-seeds from args.seed when it is constructed. Pass SEED through
    # so that changing SEED at the top of the cell actually changes the
    # training run instead of silently leaving Trainer on its own default.
    seed=SEED,

    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH,
    per_device_eval_batch_size=BATCH,
    learning_rate=LR,
    weight_decay=0.01,
    warmup_ratio=WARMUP_RATIO,
    max_grad_norm=1.0,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    logging_steps=50,
    save_total_limit=2,            # optional: keep only best + last
    report_to=[],                  # disable W&B/etc
)

# -------------------------
# Trainer
# -------------------------
import inspect

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate `tokenizer=`; older releases only know `tokenizer=`. The install
# above is unpinned, so pick whichever keyword this version's Trainer accepts.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collator,
    compute_metrics=compute_metrics,
    # Stop once the tracked metric has failed to improve for 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()
# With load_best_model_at_end=True this evaluates the reloaded best checkpoint.
metrics = trainer.evaluate()
print("Validation metrics:", metrics)

# -------------------------
# Tune Decision Threshold
# -------------------------

val_pred = trainer.predict(val_ds)  # returns PredictionOutput with .predictions
logits = val_pred.predictions  # shape [N, 2] for softmax models

# Take the labels from the same predict() call that produced the logits, so the
# two arrays are aligned row for row and already NumPy, rather than reading the
# column back out of the Dataset. Fall back to the DataFrame only if the
# dataset carried no labels.
y_true = val_pred.label_ids if val_pred.label_ids is not None else df_val["target"].to_numpy()

# Probability of the positive class (column 1 of the softmax).
probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]

# Sweep thresholds 0.20 .. 0.80 in steps of 0.01 and keep the best F1;
# ties on F1 are broken in favour of higher recall.
best = {"f1": -1, "thr": 0.5, "prec": 0.0, "rec": 0.0}
for thr in np.linspace(0.2, 0.8, 61):  # dense sweep
    y_pred = (probs >= thr).astype(int)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
    if f1 > best["f1"] or (np.isclose(f1, best["f1"]) and rec > best["rec"]):
        best = {"f1": float(f1), "thr": float(thr), "prec": float(prec), "rec": float(rec)}

print(f"Best threshold = {best['thr']:.3f} | F1={best['f1']:.4f} (P={best['prec']:.4f}, R={best['rec']:.4f})")


# -------------------------
# Inference on test & submission
# -------------------------
# Score the test split and apply the threshold tuned on the validation split.
test_logits = trainer.predict(test_ds).predictions
test_probs = torch.softmax(torch.tensor(test_logits), dim=1).numpy()[:, 1]
test_labels = (test_probs >= best["thr"]).astype(int)

# One prediction per test row, otherwise the id/target pairing below is wrong.
assert len(test_labels) == len(df_test), "prediction count does not match test rows"

# Write to SUB_PATH (defined with the other paths) rather than repeating the filename.
sub = pd.DataFrame({"id": df_test["id"], "target": test_labels})
sub.to_csv(SUB_PATH, index=False)
print(f"Saved {SUB_PATH} with tuned threshold", best["thr"])


In [None]:
# =========================
# Inference prompt for trained DistilBERT disaster classifier
# =========================
import os, re, glob, json
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ---- set this to your training output_dir ----
OUTPUT_DIR = "./distilbert-disaster"

# ---- find the best checkpoint (from trainer_state.json), fall back to latest, then to OUTPUT_DIR ----
def resolve_model_dir(output_dir: str) -> str:
    """Pick the directory to load the fine-tuned model from.

    Resolution order:
      1. The ``best_model_checkpoint`` recorded in a ``trainer_state.json``.
         The file is looked for directly in ``output_dir`` first and then
         inside each numbered checkpoint, newest first. The original only
         checked ``output_dir``, so when the state file lives inside the
         checkpoints it fell through to "latest", which is not necessarily
         the best one.
      2. The highest-numbered ``checkpoint-<N>`` directory.
      3. ``output_dir`` itself.

    Args:
        output_dir: The training output directory.

    Returns:
        Path of the directory to pass to ``from_pretrained``.
    """
    # Numbered checkpoints, oldest -> newest. Anything that is not exactly
    # "checkpoint-<digits>" is ignored instead of crashing the int() parse.
    numbered = []
    for path in glob.glob(os.path.join(output_dir, "checkpoint-*")):
        match = re.fullmatch(r"checkpoint-(\d+)", os.path.basename(path))
        if match and os.path.isdir(path):
            numbered.append((int(match.group(1)), path))
    numbered.sort()
    ckpts = [path for _, path in numbered]

    state_paths = [os.path.join(output_dir, "trainer_state.json")]
    state_paths.extend(os.path.join(p, "trainer_state.json") for p in reversed(ckpts))

    for state_path in state_paths:
        if not os.path.exists(state_path):
            continue
        try:
            with open(state_path, "r", encoding="utf-8") as f:
                st = json.load(f)
        except (OSError, ValueError):
            # Unreadable or malformed state file: best-effort, try the next one.
            continue
        best_ckpt = st.get("best_model_checkpoint") if isinstance(st, dict) else None
        if not isinstance(best_ckpt, str) or not best_ckpt:
            continue
        if os.path.isdir(best_ckpt):
            return best_ckpt
        # The recorded path may be relative to a different working directory
        # or the folder may have been moved; retry by name under output_dir.
        relocated = os.path.join(output_dir, os.path.basename(os.path.normpath(best_ckpt)))
        if os.path.isdir(relocated):
            return relocated

    if ckpts:
        return ckpts[-1]
    return output_dir  # last resort (e.g., if you saved directly to OUTPUT_DIR)

# Resolve which directory under OUTPUT_DIR to load, and say so.
MODEL_DIR = resolve_model_dir(OUTPUT_DIR)
print(f"Loading model from: {MODEL_DIR}")

# ---- load tokenizer/model ----
# NOTE: this rebinds the notebook-level names `tokenizer` and `model` that the
# training cell also defines.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
# Switch the module to evaluation mode before running inference.
model.eval()
# Use the GPU when one is available, otherwise the CPU; predict_one moves its
# inputs to this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ---- same light normalization as training ----
URL_RE  = re.compile(r"https?://\S+|www\.\S+")
USER_RE = re.compile(r"@\w+")
def normalize_tweet(t: str) -> str:
    """Return `t` (coerced to str) with URLs and @-mentions masked.

    URLs become ``<url>``, @-mentions become ``<user>``; whitespace runs are
    then collapsed to one space and the ends are stripped.
    """
    masked = USER_RE.sub(" <user> ", URL_RE.sub(" <url> ", str(t)))
    return re.sub(r"\s+", " ", masked).strip()

# ---- label mapping (0/1 -> human-readable) ----
IDX2LABEL = {0: "not disaster", 1: "disaster"}

# ---- single-text prediction ----
@torch.no_grad()
def predict_one(text: str, max_len: int = 128):
    """Classify a single tweet.

    The text is normalized, tokenized with truncation at `max_len`, moved to
    `device`, and run through the model; the logits are softmaxed over the
    last dimension.

    Returns:
        (pred, probs): `pred` is the argmax index as a Python int; `probs` is
        the NumPy array of softmax probabilities with the leading batch
        dimension squeezed out.
    """
    cleaned = normalize_tweet(text)
    encoded = tokenizer(cleaned, truncation=True, max_length=max_len, return_tensors="pt")
    model_inputs = {}
    for field, tensor in encoded.items():
        model_inputs[field] = tensor.to(device)
    class_probs = torch.softmax(model(**model_inputs).logits, dim=-1).squeeze(0).cpu().numpy()
    return int(np.argmax(class_probs)), class_probs

# ---- interactive loop ----
# Read tweets from stdin until the user quits. This cell blocks on input(), so
# it will stall an unattended "Run All".
print("\nType a tweet and press Enter to classify. Type 'quit' (or empty line) to exit.\n")
while True:
    try:
        user_text = input("Tweet> ").strip()
    except (EOFError, KeyboardInterrupt):
        # EOF on stdin, or the kernel being interrupted while input() is
        # blocked: leave the loop instead of ending the cell with a traceback.
        break
    if user_text.lower() in {"", "quit", "exit"}:
        print("Bye!")
        break
    pred, probs = predict_one(user_text, max_len=128)
    print(f"Prediction: {IDX2LABEL[pred]}  |  P(not disaster)={probs[0]:.3f}, P(disaster)={probs[1]:.3f}\n")
