In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from data_cleaning_import import clean_create_vectors
from tqdm import tqdm

In [3]:
DATA_PATH    = "data.csv"
MODEL_NAME   = "bert-base-uncased"
MAX_SEQ_LEN  = 128
BATCH_SIZE   = 32
LR           = 2e-5
NUM_EPOCHS   = 3       
WARMUP_RATIO = 0.1
N_FOLDS      = 5
DEVICE       = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv(DATA_PATH)
df = clean_create_vectors(df)
X = df["journal"].tolist()
y = df.drop(columns=["journal","emotion_vectors","activity_vectors"]).astype(int).values
label_names = df.drop(columns=["journal","emotion_vectors","activity_vectors"]).columns.tolist()
num_labels = len(label_names)

X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

class JournalDataset(Dataset):
    def __init__(self, texts, labels):
        self.enc = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=MAX_SEQ_LEN,
            return_tensors="pt"
        )
        self.labels = torch.tensor(labels, dtype=torch.float)
    def __len__(self):  return len(self.labels)
    def __getitem__(self, i):
        return {
            "input_ids":      self.enc.input_ids[i],
            "attention_mask": self.enc.attention_mask[i],
            "labels":         self.labels[i],
        }

test_ds = JournalDataset(X_test, y_test)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)


# k fold train
mskf = MultilabelStratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
all_test_probs = np.zeros((len(X_test), num_labels))

for fold, (tr_idx, val_idx) in enumerate(mskf.split(X_trainval, y_trainval), 1):
    print(f"\n=== Fold {fold}/{N_FOLDS} ===")
    X_tr = [X_trainval[i] for i in tr_idx];  y_tr = y_trainval[tr_idx]
    X_val = [X_trainval[i] for i in val_idx]; y_val = y_trainval[val_idx]

    # dataloaders
    tr_dl  = DataLoader(JournalDataset(X_tr, y_tr), batch_size=BATCH_SIZE, shuffle=True)
    val_dl = DataLoader(JournalDataset(X_val, y_val), batch_size=BATCH_SIZE)

    # fresh model + optimizer + scheduler
    model = BertForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=num_labels, problem_type="multi_label_classification"
    ).to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=LR)
    total_steps = len(tr_dl) * NUM_EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(total_steps * WARMUP_RATIO),
        num_training_steps=total_steps
    )
    loss_fn = nn.BCEWithLogitsLoss()

    # train
    for epoch in range(1, NUM_EPOCHS+1):
        model.train()
        running = 0.0
        for batch in tqdm(tr_dl, desc=f"Fold{fold} Train E{epoch}"):
            optimizer.zero_grad()
            inp = {k:v.to(DEVICE) for k,v in batch.items() if k!="labels"}
            lbl = batch["labels"].to(DEVICE)
            out = model(**inp).logits
            loss = loss_fn(out, lbl)
            loss.backward()
            optimizer.step(); scheduler.step()
            running += loss.item()
        print(f"  → Epoch {epoch} train loss: {running/len(tr_dl):.4f}")

    # predict on the held-out test set
    model.eval()
    fold_probs = []
    with torch.no_grad():
        for batch in test_dl:
            inp = {k:v.to(DEVICE) for k,v in batch.items() if k!="labels"}
            logits = model(**inp).logits
            fold_probs.append(torch.sigmoid(logits).cpu().numpy())
    fold_probs = np.vstack(fold_probs)
    all_test_probs += fold_probs

# average the 5 fold predictions
all_test_probs /= N_FOLDS


#threshold tuning
def find_best_thresholds(probs, labels, num_labels):
    best_ts = []
    for i in range(num_labels):
        best_f, best_t = 0, .5
        for t in np.linspace(0,1,101):
            p = (probs[:,i] >= t).astype(int)
            f = f1_score(labels[:,i], p, zero_division=0)
            if f > best_f:
                best_f, best_t = f, t
        best_ts.append(best_t)
    return best_ts

# (you could hold out a small portion of the trainval for threshold tuning,
#  or just use y_test itself if you’re OK with peeking)
thresholds = find_best_thresholds(all_test_probs, y_test, num_labels)
print("Optimal thresholds per label:", thresholds)

# apply them
final_preds = np.zeros_like(all_test_probs, dtype=int)
for i,t in enumerate(thresholds):
    final_preds[:,i] = (all_test_probs[:,i] >= t).astype(int)

print("\nEnsembled Classification Report:\n")
print(classification_report(y_test, final_preds,
                            target_names=label_names,
                            zero_division=0))



=== Fold 1/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fold1 Train E1: 100%|██████████| 29/29 [01:46<00:00,  3.67s/it]


  → Epoch 1 train loss: 0.5916


Fold1 Train E2: 100%|██████████| 29/29 [01:44<00:00,  3.60s/it]


  → Epoch 2 train loss: 0.4395


Fold1 Train E3: 100%|██████████| 29/29 [01:44<00:00,  3.61s/it]


  → Epoch 3 train loss: 0.3980

=== Fold 2/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold2 Train E1: 100%|██████████| 30/30 [01:45<00:00,  3.52s/it]


  → Epoch 1 train loss: 0.6227


Fold2 Train E2: 100%|██████████| 30/30 [01:49<00:00,  3.65s/it]


  → Epoch 2 train loss: 0.4617


Fold2 Train E3: 100%|██████████| 30/30 [01:52<00:00,  3.73s/it]


  → Epoch 3 train loss: 0.4102

=== Fold 3/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold3 Train E1: 100%|██████████| 30/30 [01:52<00:00,  3.75s/it]


  → Epoch 1 train loss: 0.5765


Fold3 Train E2: 100%|██████████| 30/30 [01:57<00:00,  3.91s/it]


  → Epoch 2 train loss: 0.4219


Fold3 Train E3: 100%|██████████| 30/30 [01:55<00:00,  3.84s/it]


  → Epoch 3 train loss: 0.3797

=== Fold 4/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold4 Train E1: 100%|██████████| 30/30 [01:50<00:00,  3.68s/it]


  → Epoch 1 train loss: 0.6365


Fold4 Train E2: 100%|██████████| 30/30 [01:48<00:00,  3.62s/it]


  → Epoch 2 train loss: 0.4850


Fold4 Train E3: 100%|██████████| 30/30 [01:47<00:00,  3.60s/it]


  → Epoch 3 train loss: 0.4263

=== Fold 5/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold5 Train E1: 100%|██████████| 30/30 [01:50<00:00,  3.67s/it]


  → Epoch 1 train loss: 0.6490


Fold5 Train E2: 100%|██████████| 30/30 [01:48<00:00,  3.63s/it]


  → Epoch 2 train loss: 0.4908


Fold5 Train E3: 100%|██████████| 30/30 [01:46<00:00,  3.56s/it]


  → Epoch 3 train loss: 0.4166
Optimal thresholds per label: [np.float64(0.23), np.float64(0.24), np.float64(0.25), np.float64(0.21), np.float64(0.23), np.float64(0.23), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.28), np.float64(0.27), np.float64(0.49), 0.5, np.float64(0.24), np.float64(0.33), np.float64(0.24), np.float64(0.42), np.float64(0.0), np.float64(0.25), np.float64(0.34), np.float64(0.33), np.float64(0.23), np.float64(0.26), np.float64(0.24), np.float64(0.25), np.float64(0.27), np.float64(0.22), np.float64(0.27), np.float64(0.3)]

Ensembled Classification Report:

              precision    recall  f1-score   support

      afraid       0.01      1.00      0.02         2
       angry       0.03      1.00      0.07         5
     anxious       0.09      0.96      0.16        23
     ashamed       0.02      1.00      0.03         4
     awkward       0.02      0.50      0.03         4
       bored       0.12      0.20      0.15        15
        calm       0

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from transformers import BertTokenizerFast, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW 
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from tqdm import tqdm


In [None]:

DATA_PATH = "data.csv"
MODEL_NAME = "bert-base-uncased"
MAX_SEQ_LEN = 128
BATCH_SIZE = 32
LR = 2e-5
NUM_EPOCHS = 2  # Reduced to 2 epochs
WARMUP_RATIO = 0.1
N_FOLDS = 5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
OUTPUT_DIR = "model_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)


df = pd.read_csv(DATA_PATH)
df = clean_create_vectors(df)
y_columns = [c for c in df.columns if c not in ["journal", "emotion_vectors", "activity_vectors"]]

# remove labels with <10 samples 
label_counts = df[y_columns].sum()
unstable_labels = label_counts[label_counts < 10].index.tolist()
if unstable_labels:
    print(f"Removing unstable labels with <10 samples: {unstable_labels}")
y_columns = [c for c in y_columns if c not in unstable_labels]
label_names = y_columns
num_labels = len(label_names)
X = df["journal"].tolist()
y = df[label_names].astype(int).values

# Fix Class Weight Calc
class_counts = np.sum(y, axis=0)
effective_num = 1.0 / (class_counts + 1e-9)
weights = (1.0 / effective_num) / np.sum(1.0 / effective_num)
class_weights = torch.tensor(weights, dtype=torch.float).to(DEVICE)

# Focal Loss def with class weights
def focal_loss_fn(logits, targets, alpha=0.25, gamma=3.0, weights=None):
    bce_loss = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    if weights is not None:
        bce_loss = bce_loss * weights
    p_t = torch.exp(-bce_loss)
    focal_loss = alpha * (1 - p_t) ** gamma * bce_loss
    return focal_loss.mean()

class JournalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.enc = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=MAX_SEQ_LEN,
            return_tensors="pt"
        )
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        return {
            "input_ids": self.enc.input_ids[i],
            "attention_mask": self.enc.attention_mask[i],
            "labels": self.labels[i],
        }

# Threshold tuning with Asymmetric Probability Shift
def find_optimal_thresholds(probs, true_labels, label_names):
    shifted_probs = probs.copy()
    for i in range(shifted_probs.shape[1]):
        shifted_probs[:, i] = np.clip(shifted_probs[:, i] * 1.2 - 0.1, 0, 1)

    thresholds = [v/100. for v in range(5, 100, 5)]
    best_thresholds = {}
    for i, label in enumerate(label_names):
        best_score, best_thresh = 0, 0.5
        for t in thresholds:
            preds = (shifted_probs[:, i] >= t).astype(int)
            f1 = f1_score(true_labels[:, i], preds, zero_division=0)
            if f1 > best_score:
                best_score, best_thresh = f1, t
        best_thresholds[label] = best_thresh
        print(f"Label: {label} | Best F1={best_score:.3f} @ threshold={best_thresh}")
    return best_thresholds


if __name__ == "__main__":
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
    mskf = MultilabelStratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    all_test_logits = np.zeros((len(X_test), num_labels))
    all_val_probs = np.zeros((len(X_trainval), num_labels))
    all_val_labels = np.zeros((len(X_trainval), num_labels))

    for fold, (tr_idx, val_idx) in enumerate(mskf.split(X_trainval, y_trainval), 1):
        print(f"\n=== Fold {fold}/{N_FOLDS} ===")
        X_tr = [X_trainval[i] for i in tr_idx]; y_tr = y_trainval[tr_idx]
        X_val = [X_trainval[i] for i in val_idx]; y_val = y_trainval[val_idx]

        tr_ds = JournalDataset(X_tr, y_tr, tokenizer)
        val_ds = JournalDataset(X_val, y_val, tokenizer)
        test_ds = JournalDataset(X_test, y_test, tokenizer)

        tr_dl = DataLoader(tr_ds, batch_size=BATCH_SIZE, shuffle=True)
        val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE)
        test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)


        model = BertForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=num_labels,
            problem_type="multi_label_classification"
        ).to(DEVICE)
        model.dropout = nn.Dropout(0.3)

        optimizer = AdamW(model.parameters(), lr=LR)
        total_steps = len(tr_dl) * NUM_EPOCHS
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(total_steps * WARMUP_RATIO),
            num_training_steps=total_steps
        )
p
        for epoch in range(1, NUM_EPOCHS+1):
            model.train()
            for batch in tqdm(tr_dl, desc=f"Fold{fold} Train E{epoch}"):
                optimizer.zero_grad()
                inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != "labels"}
                labels = batch["labels"].to(DEVICE)
                logits = model(**inputs).logits
                loss = focal_loss_fn(logits, labels, weights=class_weights)
                loss.backward()
                optimizer.step()
                scheduler.step()

        model.eval()
        val_logits, val_labels = [], []
        with torch.no_grad():
            for batch in val_dl:
                inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != "labels"}
                logits = model(**inputs).logits.cpu().numpy()
                val_logits.append(logits)
                val_labels.append(batch["labels"].numpy())
        val_logits = np.vstack(val_logits)
        val_probs = torch.sigmoid(torch.tensor(val_logits)).numpy()
        all_val_probs[val_idx] = val_probs
        all_val_labels[val_idx] = y_val

        fold_test_logits = []
        with torch.no_grad():
            for batch in test_dl:
                inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != "labels"}
                logits = model(**inputs).logits.cpu().numpy()
                fold_test_logits.append(logits)
        all_test_logits += np.vstack(fold_test_logits)
    avg_test_logits = all_test_logits / N_FOLDS
    avg_test_probs = torch.sigmoid(torch.tensor(avg_test_logits)).numpy()

    best_thresholds = find_optimal_thresholds(all_val_probs, all_val_labels, label_names)

    final_preds = np.zeros_like(avg_test_probs, dtype=int)
    for i, label in enumerate(label_names):
        t = best_thresholds[label]
        final_preds[:, i] = (avg_test_probs[:, i] >= t).astype(int)

    print("\nFinal Classification Report:\n")
    print(classification_report(
        y_test, final_preds,
        target_names=label_names,
        zero_division=0,
        sample_weight=(y_test.sum(axis=1) > 0)
    ))

