In [None]:
# Attach Google Drive so later cells can read and write under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Libraries whose versions are reported below.
import sys
import torch
import numpy
import pandas as pd
import transformers
import sklearn
import datasets

# Report the interpreter and library versions in a single call, one per line.
print(
    f"Python         : {sys.version}",
    f"PyTorch        : {torch.__version__}",
    f"Numpy          : {numpy.__version__}",
    f"Pandas         : {pd.__version__}",
    f"Transformers   : {transformers.__version__}",
    f"Scikit-learn   : {sklearn.__version__}",
    f"Datasets (HF)  : {datasets.__version__}",
    sep="\n",
)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device   :", device)


In [None]:
import pandas as pd
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

import pandas as pd
# Annotation schema for the three prediction targets. Each entry records
# whether the target is single-label or multi-label and lists its class names.
# NOTE(review): nothing in the cells shown here reads this dict; the inference
# cell repeats the class lists by hand, so keep the two in sync.
LABEL_SCHEMA = {
    # Exactly one impact type per example.
    "impact_type": {
        "type": "single-label",
        "classes": [
            "death",
            "injury",
            "displacement",
            "missing",
            "disease",
            "economic_loss",
            "conflict",
            "weather_shock",
            "policy_change",
            "none",
        ],
    },
    # Any number of resources per example.
    "resource": {
        "type": "multi-label",
        "classes": [
            "food",
            "water",
            "cash_aid",
            "healthcare",
            "shelter",
            "livelihoods",
            "education",
            "infrastructure",
            "none",
        ],
    },
    # Exactly one urgency level per example.
    "urgency": {
        "type": "single-label",
        "classes": ["low", "moderate", "high", "unclear"],
    },
}

# Load the full labeled dataset
df = pd.read_csv("/content/drive/MyDrive/phd/labeled_chunk_all.csv")

# The resource column arrives as the string form of a Python list; parse it
# back into a real list so it can be binarized.
df["resource"] = df["resource"].apply(ast.literal_eval)

# Binarize the resource field (one indicator column per resource class)
mlb = MultiLabelBinarizer()
resource_binary = mlb.fit_transform(df["resource"])
resource_classes = mlb.classes_

# Add one binary column per resource class
for i, label in enumerate(resource_classes):
    df[f"resource_{label}"] = resource_binary[:, i]

# Initialize tokenizer and record the word-piece count of every text once
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
df["token_length"] = df["clean_text"].apply(lambda x: len(tokenizer.tokenize(str(x))))

# Combine fields for stratification so the split preserves the joint
# impact/urgency distribution
df["stratify_label"] = df["impact_type"] + "_" + df["urgency"]

# Split into train and validation sets
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["stratify_label"]
)

# Summarize token lengths for the training split from the counts computed
# above. This must happen before `token_length` is dropped below, and it avoids
# tokenizing the whole training split a second time.
train_token_length_stats = train_df["token_length"].describe()

# Final feature columns
resource_cols = [f"resource_{label}" for label in resource_classes]
final_columns = ["clean_text", "impact_type", "urgency"] + resource_cols

train_df = train_df[final_columns].reset_index(drop=True)
val_df = val_df[final_columns].reset_index(drop=True)

# Token length stats
print("Token length stats (train):")
print(train_token_length_stats)
print("\nTrain/Val shape:", train_df.shape, val_df.shape)

# -------------------------------
# Optimized chunking (no decoding)
# -------------------------------

def chunk_input_ids(text, tokenizer, max_length=512, stride=256):
    """
    Split text into overlapping chunks of token ids (no decoding).

    Each chunk is wrapped as ``[CLS] ... [SEP]`` and is at most ``max_length``
    ids long, so two positions are reserved for the special tokens. Without
    them the first position would hold an ordinary word piece instead of
    ``[CLS]``, which differs from the special-token layout the inference cell
    builds for the same model.

    Args:
        text: Text to tokenize. It is passed through ``str()`` first, as the
            token-length computation in this notebook does.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``,
            ``cls_token_id`` and ``sep_token_id``.
        max_length: Maximum chunk length including the two special tokens.
        stride: Offset, in tokens, between the starts of consecutive chunks.

    Returns:
        A list of chunks, each a list of token ids. Empty when the text
        produces no tokens.

    Raises:
        ValueError: If ``max_length`` leaves no room for content or ``stride``
            is not positive.
    """
    if max_length <= 2:
        raise ValueError("max_length must be greater than 2 to leave room for [CLS] and [SEP]")
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    token_ids = tokenizer.encode(str(text), add_special_tokens=False)
    window = max_length - 2  # content tokens per chunk
    cls_id, sep_id = tokenizer.cls_token_id, tokenizer.sep_token_id
    return [
        [cls_id, *token_ids[start:start + window], sep_id]
        for start in range(0, len(token_ids), stride)
    ]

def expand_dataset_fast(df, text_col="clean_text", label_cols=None):
    """
    Chunk the text of every row and replicate the row's labels onto each chunk.

    Stores input_ids (not raw text). Tokenization uses the module-level
    ``tokenizer`` together with ``chunk_input_ids``.

    Args:
        df: Frame holding ``text_col`` and the label columns.
        text_col: Name of the column with the text to chunk.
        label_cols: Label columns to copy onto each chunk. Defaults to
            ``impact_type``, ``urgency`` and every column whose name starts
            with ``resource_``.

    Returns:
        A DataFrame with one row per chunk and the columns ``input_ids``
        followed by ``label_cols``. The columns are present even when no chunk
        is produced.
    """
    label_cols = list(
        label_cols or ["impact_type", "urgency"] + [c for c in df.columns if c.startswith("resource_")]
    )
    records = []

    # Plain tuples (name=None) are read by position, so label columns whose
    # names are not valid Python identifiers still work; attribute access on
    # named tuples would fail for them because pandas renames such fields.
    label_rows = df[label_cols].itertuples(index=False, name=None)
    for text, labels in tqdm(zip(df[text_col], label_rows), total=len(df)):
        shared_labels = dict(zip(label_cols, labels))
        for chunk in chunk_input_ids(text, tokenizer):
            records.append({"input_ids": chunk, **shared_labels})

    return pd.DataFrame(records, columns=["input_ids", *label_cols])

# Chunk both splits; every chunk carries the labels of the row it came from.
expanded_train_df = expand_dataset_fast(train_df)
expanded_val_df = expand_dataset_fast(val_df)

# Report the sizes after chunking
print("\nExpanded train shape:", expanded_train_df.shape)
print("Expanded val shape:", expanded_val_df.shape)

# Integer-encode the two single-label targets. Both encoders are fitted on the
# training split only and then reused, unchanged, for the validation split.
impact_enc = LabelEncoder()
impact_enc.fit(expanded_train_df["impact_type"])
urgency_enc = LabelEncoder()
urgency_enc.fit(expanded_train_df["urgency"])

expanded_train_df["impact_type_id"] = impact_enc.transform(expanded_train_df["impact_type"])
expanded_train_df["urgency_id"] = urgency_enc.transform(expanded_train_df["urgency"])

expanded_val_df["impact_type_id"] = impact_enc.transform(expanded_val_df["impact_type"])
expanded_val_df["urgency_id"] = urgency_enc.transform(expanded_val_df["urgency"])

# Persist both splits to Drive; the training cell reads these pickles back.
expanded_train_df.to_pickle("/content/drive/MyDrive/phd/expanded_train.pkl")
expanded_val_df.to_pickle("/content/drive/MyDrive/phd/expanded_val.pkl")

In [None]:
import torch, torch.nn as nn
import pandas as pd, numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
from transformers import (
    BertModel, BertTokenizer, BertPreTrainedModel,
    DataCollatorWithPadding, TrainingArguments, Trainer
)
from transformers.modeling_outputs import SequenceClassifierOutput

# ──────────────────────────────────────────────────────────────
# 0. DEVICE
# ──────────────────────────────────────────────────────────────
# Train on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# ──────────────────────────────────────────────────────────────
# 1. MULTI-TASK MODEL
# ──────────────────────────────────────────────────────────────
class MultiTaskBERT(BertPreTrainedModel):
    """BERT encoder with three linear heads fed by one pooled representation.

    The heads produce logits for impact type, urgency and resource. ``forward``
    returns logits only; no loss is computed inside the model.
    """

    def __init__(self, config, num_impact, num_urgency, num_resource):
        """Build the encoder, the shared dropout layer and the three heads.

        Args:
            config: Model configuration; ``hidden_size`` sets the head input width.
            num_impact: Number of output logits of the impact head.
            num_urgency: Number of output logits of the urgency head.
            num_resource: Number of output logits of the resource head.
        """
        super().__init__(config)
        hidden_size = config.hidden_size
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.1)
        self.impact_head = nn.Linear(hidden_size, num_impact)
        self.urgency_head = nn.Linear(hidden_size, num_urgency)
        self.resource_head = nn.Linear(hidden_size, num_resource)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        """Return a dict of logits keyed ``impact``, ``urgency`` and ``resource``.

        Extra keyword arguments are accepted and ignored.
        """
        encoder_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(encoder_output.pooler_output)
        impact_logits = self.impact_head(pooled)
        urgency_logits = self.urgency_head(pooled)
        resource_logits = self.resource_head(pooled)
        return {
            "impact": impact_logits,
            "urgency": urgency_logits,
            "resource": resource_logits
        }

# ──────────────────────────────────────────────────────────────
# 2.  LOAD DATA & ENCODE LABELS (for PyTorch Dataset)
# ──────────────────────────────────────────────────────────────

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read back the chunked splits written by the preprocessing cell
df_train = pd.read_pickle("/content/drive/MyDrive/phd/expanded_train.pkl")
df_val   = pd.read_pickle("/content/drive/MyDrive/phd/expanded_val.pkl")

# Fit one encoder per single-label target on the training split only
impact_enc = LabelEncoder()
impact_enc.fit(df_train["impact_type"])
urgency_enc = LabelEncoder()
urgency_enc.fit(df_train["urgency"])

# Write the integer ids for both splits using the train-fitted encoders
df_train["impact_type_id"] = impact_enc.transform(df_train["impact_type"])
df_val["impact_type_id"]   = impact_enc.transform(df_val["impact_type"])

df_train["urgency_id"] = urgency_enc.transform(df_train["urgency"])
df_val["urgency_id"]   = urgency_enc.transform(df_val["urgency"])
print("val_ds length:", len(df_val))

# Resource targets are the columns prefixed with "resource_", in frame order
resource_cols = [name for name in df_train.columns if name.startswith("resource_")]

# Define custom PyTorch dataset
from torch.utils.data import Dataset as TorchDataset
import torch

class CustomTorchDataset(TorchDataset):
    """Map-style dataset over a chunked frame of token ids and labels.

    Each item is a dict with ``input_ids`` (a plain list of ints, left for the
    collator to pad and tensorize), ``impact_type_id``, ``urgency_id`` and one
    float32 entry per resource column.
    """

    def __init__(self, df, resource_cols):
        """Copy the needed columns out of ``df``.

        Args:
            df: Frame with ``input_ids``, ``impact_type_id``, ``urgency_id``
                and every column named in ``resource_cols``.
            resource_cols: Names of the resource indicator columns; their order
                fixes the column order of the stored resource matrix.
        """
        # Keep the column list on the instance so __getitem__ uses the same
        # names and order as the matrix built here, not a module-level global.
        self.resource_cols = list(resource_cols)
        self.input_ids = list(df["input_ids"])
        self.impact_type_id = list(df["impact_type_id"])
        self.urgency_id = list(df["urgency_id"])
        self.resource_matrix = df[self.resource_cols].values.astype("float32")

    def __getitem__(self, idx):
        """Return the feature/label dict for row ``idx``."""
        item = {
            "input_ids": self.input_ids[idx],  # list of ints, not tensor!
            "impact_type_id": self.impact_type_id[idx],
            "urgency_id": self.urgency_id[idx],
        }
        for i, col in enumerate(self.resource_cols):
            item[col] = self.resource_matrix[idx, i]
        return item

    def __len__(self):
        """Return the number of chunks in the dataset."""
        return len(self.input_ids)


# Wrap each split in the PyTorch dataset class
train_ds = CustomTorchDataset(df_train, resource_cols)
val_ds   = CustomTorchDataset(df_val, resource_cols)

# ──────────────────────────────────────────────────────────────
# 3. TOKENIZER & COLLATOR
# ──────────────────────────────────────────────────────────────
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# The collator pads each batch (to a multiple of 8) and returns PyTorch tensors.
collator  = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8, return_tensors="pt")

# Sanity check: collate one small validation batch and inspect what comes out.
from torch.utils.data import DataLoader

print("val_ds length:", len(val_ds))
batch = next(iter(DataLoader(val_ds, batch_size=2, collate_fn=collator)))
print("Sample batch keys:", list(batch.keys()))
# First pass: shape and dtype of every field in the batch.
for k in batch:
    field = batch[k]
    print(f"{k}: {field.shape}, dtype={field.dtype}")
# Second pass: the leading values of every field.
for k in batch:
    print(f"{k} (first 2 values): {batch[k][:2]}")

# ──────────────────────────────────────────────────────────────
# 4. INIT MODEL
# ──────────────────────────────────────────────────────────────
# Start from the pretrained bert-base-uncased checkpoint. The head widths come
# from the fitted label encoders and from the number of resource columns.
model = MultiTaskBERT.from_pretrained(
    "bert-base-uncased",
    num_impact=len(impact_enc.classes_),
    num_urgency=len(urgency_enc.classes_),
    num_resource=len(resource_cols),
)
# Place the model on the selected device.
model = model.to(device)

# ──────────────────────────────────────────────────────────────
# 5. CUSTOM TRAINER
# ──────────────────────────────────────────────────────────────
class MultiTaskTrainer(Trainer):
    """Trainer whose loss is a weighted sum of the three task losses."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the combined loss for one batch.

        Args:
            model: Model returning a dict with ``impact``, ``urgency`` and
                ``resource`` logits.
            inputs: Batch mapping holding ``input_ids``, ``attention_mask``,
                ``impact_type_id``, ``urgency_id`` and one entry per name in
                the module-level ``resource_cols``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: Accepted for signature compatibility; unused.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is
            true.
        """
        # Work on a copy moved to the module-level `device`, so the batch
        # object passed in is not mutated by the pops below.
        batch = {
            name: (value.to(device) if isinstance(value, torch.Tensor) else value)
            for name, value in inputs.copy().items()
        }

        # Separate the targets from the model inputs.
        impact_target = batch.pop("impact_type_id")
        urgency_target = batch.pop("urgency_id")
        resource_parts = [batch.pop(c) for c in resource_cols]
        resource_target = torch.stack(resource_parts, dim=1).float()

        # Forward pass on the encoder inputs only.
        outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])

        # Cross-entropy for the two single-label heads, BCE-with-logits for the
        # multi-label resource head.
        cross_entropy = nn.CrossEntropyLoss()
        impact_loss = cross_entropy(outputs["impact"], impact_target)
        urgency_loss = cross_entropy(outputs["urgency"], urgency_target)
        resource_loss = nn.BCEWithLogitsLoss()(outputs["resource"], resource_target)

        # Task weights (tune these as needed).
        combined_loss = 1.0 * impact_loss + 1.0 * urgency_loss + 0.5 * resource_loss

        return (combined_loss, outputs) if return_outputs else combined_loss

# ──────────────────────────────────────────────────────────────
# 6. TRAINING ARGS
# ──────────────────────────────────────────────────────────────

# Every label column, in the order the trainer should hand them to the loss
# and to compute_metrics: impact id, urgency id, then the resource columns.
all_label_cols = ["impact_type_id", "urgency_id"] + resource_cols

args = TrainingArguments(
    output_dir="/content/drive/MyDrive/phd/bert_multitask_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="/content/drive/MyDrive/phd/logs",
    logging_steps=10,
    # Best-model selection: reload the best checkpoint at the end, ranked by a
    # key returned from compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="eval_impact_acc",
    greater_is_better=True,           # set False if you choose a loss
    report_to="none",
    remove_unused_columns=True,
    # Mixed precision is enabled only when a CUDA device is available, so these
    # arguments stay valid when the notebook falls back to CPU (Transformers
    # rejects fp16=True on a CPU-only runtime).
    fp16=torch.cuda.is_available(),
    label_names=all_label_cols
)


# ──────────────────────────────────────────────────────────────
# 7. METRICS + TRAIN
# ──────────────────────────────────────────────────────────────
def compute_metrics(pred):
    """Compute accuracy for the single-label heads and F1 for the resource head.

    Args:
        pred: Object with ``predictions`` (unpacked here as impact, urgency and
            resource logits) and ``label_ids`` (impact labels, urgency labels,
            then the resource label arrays).

    Returns:
        Dict with ``eval_impact_acc``, ``eval_urgency_acc``,
        ``eval_resource_f1_micro`` and ``eval_resource_f1_macro``.
    """
    impact_logits, urgency_logits, resource_logits = pred.predictions

    # The first two label arrays belong to the single-label heads; everything
    # after them is one array per resource column.
    y_impact, y_urgency = pred.label_ids[:2]
    y_resource = pred.label_ids[2:]
    if isinstance(y_resource, tuple):
        # Stack the per-column arrays side by side into one 2-D label matrix.
        y_resource = np.stack(y_resource, axis=1)

    # Single-label heads: highest logit wins.
    impact_hat = impact_logits.argmax(axis=1)
    urgency_hat = urgency_logits.argmax(axis=1)
    # Multi-label head: a resource is predicted when its sigmoid exceeds 0.5.
    resource_probs = torch.sigmoid(torch.tensor(resource_logits))
    resource_hat = (resource_probs > 0.5).int().numpy()

    return {
        "eval_impact_acc":  accuracy_score(y_impact, impact_hat),
        "eval_urgency_acc": accuracy_score(y_urgency, urgency_hat),
        "eval_resource_f1_micro": f1_score(y_resource, resource_hat, average="micro"),
        "eval_resource_f1_macro": f1_score(y_resource, resource_hat, average="macro"),
    }


trainer = MultiTaskTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics
)

# Resume if checkpoint exists
import os

checkpoint_dir = "/content/drive/MyDrive/phd/bert_multitask_model"
checkpoint_prefix = "checkpoint-"

# Collect "checkpoint-<step>" entries. A missing output directory (first run)
# simply means there is nothing to resume from, and entries whose suffix is not
# a number are skipped so the step comparison below cannot fail.
checkpoints = [
    os.path.join(checkpoint_dir, d)
    for d in (os.listdir(checkpoint_dir) if os.path.isdir(checkpoint_dir) else [])
    if d.startswith(checkpoint_prefix) and d[len(checkpoint_prefix):].isdigit()
]

if checkpoints:
    # The highest step number is the most recent checkpoint.
    latest_checkpoint = max(checkpoints, key=lambda x: int(x.split("-")[-1]))
    print(f"Resuming from: {latest_checkpoint}")
    trainer.train(resume_from_checkpoint=latest_checkpoint)
else:
    trainer.train()

# ──────────────────────────────────────────────────────────────
# 8. SAVE FINAL MODEL
# ──────────────────────────────────────────────────────────────
# Save the model and tokenizer side by side; the inference cell loads both
# from this directory.
final_path = "/content/drive/MyDrive/phd/bert_multitask_model/final"
trainer.save_model(final_path)
tokenizer.save_pretrained(final_path)

In [None]:
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertPreTrainedModel, BertModel
import transformers
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from scipy.stats import mode
import os
import glob
import fastparquet

# ────────────────
# CONFIGURATION
# ────────────────
# Where the fine-tuned model and tokenizer were saved by the training cell.
MODEL_DIR = "/content/drive/MyDrive/phd/bert_multitask_model/final"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Chunking window (including special tokens) and offset between chunk starts.
MAX_LENGTH = 512
STRIDE = 256

# These must exactly match training.
# The order of the two single-label lists does not matter because they are
# only passed to LabelEncoder.fit, which sorts its classes.
impact_classes = [
    "death", "injury", "displacement", "missing", "disease",
    "economic_loss", "conflict", "weather_shock", "policy_change", "none"
]
urgency_classes = ["low", "moderate", "high", "unclear"]
# The order of resource_cols DOES matter: position i names output i of the
# resource head. Training takes its resource columns from
# MultiLabelBinarizer.classes_, which is sorted, so the names are listed in
# sorted order here. Listing them in schema order would attach each prediction
# to the wrong resource.
# NOTE(review): assumes all nine resource classes occur in the training data;
# confirm against the resource_* columns of the training pickle.
resource_cols = [
    "resource_cash_aid", "resource_education", "resource_food", "resource_healthcare",
    "resource_infrastructure", "resource_livelihoods", "resource_none", "resource_shelter",
    "resource_water"
]

# ────────────────
# LABEL ENCODERS
# ────────────────
# Rebuild both encoders from the class lists above so that predicted ids can be
# mapped back to label strings. `fit` returns the fitted encoder itself.
impact_enc = LabelEncoder().fit(impact_classes)
urgency_enc = LabelEncoder().fit(urgency_classes)


# ────────────────
# MODEL DEFINITION
# ────────────────
class MultiTaskBERT(BertPreTrainedModel):
    """BERT encoder with three linear heads fed by one pooled representation.

    Same architecture as the class defined in the training cell, repeated here
    so the saved weights can be loaded. ``forward`` returns logits only.
    """

    def __init__(self, config, num_impact, num_urgency, num_resource):
        """Build the encoder, the shared dropout layer and the three heads.

        Args:
            config: Model configuration; ``hidden_size`` sets the head input width.
            num_impact: Number of output logits of the impact head.
            num_urgency: Number of output logits of the urgency head.
            num_resource: Number of output logits of the resource head.
        """
        super().__init__(config)
        hidden_size = config.hidden_size
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.1)
        self.impact_head = nn.Linear(hidden_size, num_impact)
        self.urgency_head = nn.Linear(hidden_size, num_urgency)
        self.resource_head = nn.Linear(hidden_size, num_resource)
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        """Return a dict of logits keyed ``impact``, ``urgency`` and ``resource``.

        Extra keyword arguments are accepted and ignored.
        """
        encoder_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(encoder_output.pooler_output)
        impact_logits = self.impact_head(pooled)
        urgency_logits = self.urgency_head(pooled)
        resource_logits = self.resource_head(pooled)
        return {
            "impact": impact_logits,
            "urgency": urgency_logits,
            "resource": resource_logits
        }

# ────────────────
# LOAD MODEL & TOKENIZER
# ────────────────
# Both artifacts are read from MODEL_DIR only (local_files_only=True).
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
model = MultiTaskBERT.from_pretrained(
    MODEL_DIR,
    local_files_only=True,
    num_impact=len(impact_classes),
    num_urgency=len(urgency_classes),
    num_resource=len(resource_cols),
)
# Move the model to the target device, then switch it to evaluation mode.
model.to(DEVICE)
model.eval()

# ────────────────
# CHUNKING FUNCTION (from your training)
# ────────────────
# Restrict Transformers logging to errors for the rest of this cell.
transformers.logging.set_verbosity_error()
def chunk_input_ids(text, tokenizer, max_length=512, stride=256):
    """Split text into overlapping, padded model inputs of exactly ``max_length`` ids.

    The text is tokenized once without special tokens. Each window of up to
    ``max_length - 2`` ids is wrapped as ``[CLS] ... [SEP]`` and right-padded.
    Inputs are built directly from the ids: decoding a window back to text and
    tokenizing it again is not guaranteed to reproduce the same ids (for
    example when a window starts in the middle of a word), and could push the
    window past the limit so that its tail is truncated.

    Args:
        text: Text to tokenize.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``,
            ``cls_token_id``, ``sep_token_id`` and ``pad_token_id``.
        max_length: Length of every returned input, special tokens included.
        stride: Offset, in tokens, between the starts of consecutive windows.

    Returns:
        A list with one dict per window holding ``input_ids``,
        ``token_type_ids`` and ``attention_mask``, each a ``torch.long`` tensor
        of shape ``(1, max_length)``. Empty when the text produces no tokens.

    Raises:
        ValueError: If ``max_length`` leaves no room for content or ``stride``
            is not positive.
    """
    if max_length <= 2:
        raise ValueError("max_length must be greater than 2 to leave room for [CLS] and [SEP]")
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    # Get raw tokens (no special tokens)
    tokens = tokenizer.encode(text, add_special_tokens=False)
    max_len_no_special = max_length - 2
    chunks = []
    for i in range(0, len(tokens), stride):
        ids = [tokenizer.cls_token_id, *tokens[i:i + max_len_no_special], tokenizer.sep_token_id]
        pad_count = max_length - len(ids)
        chunks.append({
            "input_ids": torch.tensor([ids + [tokenizer.pad_token_id] * pad_count], dtype=torch.long),
            # Single-segment input: every position belongs to segment 0.
            "token_type_ids": torch.zeros((1, max_length), dtype=torch.long),
            # 1 for real tokens (special tokens included), 0 for padding.
            "attention_mask": torch.tensor([[1] * len(ids) + [0] * pad_count], dtype=torch.long),
        })
    return chunks

def predict_for_text(text, batch_size=32):
    """Predict impact type, urgency and resources for one text.

    The text is split into chunks with ``chunk_input_ids``; the module-level
    ``model`` scores the chunks, and the per-chunk predictions are combined
    into one prediction for the whole text.

    Args:
        text: Input text; it is converted with ``str()`` before chunking.
        batch_size: Maximum number of chunks per forward pass. Bounding it
            keeps memory use independent of how many chunks a text produces.

    Returns:
        ``(impact_label, urgency_label, resource_dict)`` where the labels are
        decoded with ``impact_enc`` / ``urgency_enc`` and ``resource_dict``
        maps every name in ``resource_cols`` to 0 or 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    chunks = chunk_input_ids(str(text), tokenizer, max_length=MAX_LENGTH, stride=STRIDE)
    if not chunks:
        # Fallback for text that yields no tokens: a single [UNK] input padded
        # to MAX_LENGTH, so the model still returns one prediction.
        inputs = tokenizer(
            tokenizer.unk_token,
            max_length=MAX_LENGTH,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        chunks = [inputs]
    # Stack the chunks; the full stack is not moved to DEVICE, only the
    # mini-batch slices below are.
    input_ids_tensor = torch.cat([chunk['input_ids'] for chunk in chunks])
    attention_mask_tensor = torch.cat([chunk['attention_mask'] for chunk in chunks])

    # Guard against inputs longer than the configured window.
    assert input_ids_tensor.shape[1] <= MAX_LENGTH, f"Found input_ids length {input_ids_tensor.shape[1]}"

    impact_logits, urgency_logits, resource_logits = [], [], []
    with torch.no_grad():
        for start in range(0, input_ids_tensor.shape[0], batch_size):
            stop = start + batch_size
            outputs = model(
                input_ids=input_ids_tensor[start:stop].to(DEVICE),
                attention_mask=attention_mask_tensor[start:stop].to(DEVICE),
            )
            impact_logits.append(outputs["impact"].cpu())
            urgency_logits.append(outputs["urgency"].cpu())
            resource_logits.append(outputs["resource"].cpu())

    # Per-chunk predictions.
    impact_preds = torch.cat(impact_logits).argmax(dim=1).numpy()
    urgency_preds = torch.cat(urgency_logits).argmax(dim=1).numpy()
    resource_probs = torch.sigmoid(torch.cat(resource_logits)).numpy()
    resource_preds = (resource_probs > 0.5).astype(int)

    # Aggregate across chunks: most frequent class for the single-label heads,
    # and "predicted in at least one chunk" for each resource.
    impact_final = mode(impact_preds, keepdims=True)[0][0]
    urgency_final = mode(urgency_preds, keepdims=True)[0][0]
    resource_final = (resource_preds.sum(axis=0) > 0).astype(int)

    impact_label = impact_enc.inverse_transform([impact_final])[0]
    urgency_label = urgency_enc.inverse_transform([urgency_final])[0]
    resource_dict = {col: int(val) for col, val in zip(resource_cols, resource_final)}
    return impact_label, urgency_label, resource_dict

# ────────────────
# APPLY TO DATAFRAME
# ────────────────

# Example: df should have a column 'clean_text'
tqdm.pandas()  # progress bar

# 1) Read data
# glob returns matches in arbitrary order; sorting makes the row order of the
# concatenated frame reproducible between runs. The sort is lexicographic, so
# "..._10" sorts before "..._2".
parquet_pattern = "/content/drive/MyDrive/phd/data/exploded_partial_chunk_*.parquet"
parquet_files = sorted(glob.glob(parquet_pattern))
print(parquet_files)
if not parquet_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No parquet files match {parquet_pattern}")

# Read and concatenate all parquet files
exploded_df = pd.concat([fastparquet.ParquetFile(file).to_pandas() for file in parquet_files], ignore_index=True)
exploded_df = exploded_df.reset_index(drop=True)

# 2) Predict: one (impact_label, urgency_label, resource_dict) tuple per row
results = exploded_df['clean_text'].progress_apply(predict_for_text)

# 3) Add predictions to dataframe
exploded_df['pred_impact_type'] = results.apply(lambda x: x[0])
exploded_df['pred_urgency'] = results.apply(lambda x: x[1])
for col in resource_cols:
    exploded_df[f'pred_{col}'] = results.apply(lambda x: x[2][col])

# 4) Persist and preview
exploded_df.to_parquet("/content/drive/MyDrive/phd/data/exploded_full.parquet")
exploded_df.head(20)