In [4]:
import os
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from transformers import BertTokenizerFast, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW 
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from tqdm import tqdm



In [6]:
# Configuration
DATA_PATH = "data.csv"
MODEL_NAME = "bert-base-uncased"
MAX_SEQ_LEN = 128
BATCH_SIZE = 32
LR = 2e-5
NUM_EPOCHS = 2  # Reduced to 2-3 epochs
WARMUP_RATIO = 0.1
N_FOLDS = 5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
OUTPUT_DIR = "model_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Focal Loss definition
def focal_loss_fn(logits, targets, alpha=1.0, gamma=2.0):
    bce_loss = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    p_t = torch.exp(-bce_loss)
    focal_loss = alpha * (1 - p_t) ** gamma * bce_loss
    return focal_loss.mean()

# Dataset class
class JournalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.enc = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=MAX_SEQ_LEN,
            return_tensors="pt"
        )
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        return {
            "input_ids": self.enc.input_ids[i],
            "attention_mask": self.enc.attention_mask[i],
            "labels": self.labels[i],
        }

# Threshold tuning (F1 objective)
def find_optimal_thresholds(probs, true_labels, label_names):
    thresholds = [v/100. for v in range(5, 100, 5)]
    best_thresholds = {}
    for i, label in enumerate(label_names):
        best_score, best_thresh = 0, 0.5
        for t in thresholds:
            preds = (probs[:, i] >= t).astype(int)
            f1 = f1_score(true_labels[:, i], preds, zero_division=0)
            if f1 > best_score:
                best_score, best_thresh = f1, t
        best_thresholds[label] = best_thresh
        print(f"Label: {label} | Best F1={best_score:.3f} @ threshold={best_thresh}")
    return best_thresholds

# Main training and ensembling
if __name__ == "__main__":
    # Load data
    df = pd.read_csv(DATA_PATH)
    df = clean_create_vectors(df)
    X = df["journal"].tolist()
    y_columns = [c for c in df.columns if c not in ["journal", "emotion_vectors", "activity_vectors"]]
    y = df[y_columns].astype(int).values
    label_names = y_columns
    num_labels = len(label_names)

    # Train-test split
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
    mskf = MultilabelStratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

    # Storage for logits and val probs
    all_test_logits = np.zeros((len(X_test), num_labels))
    all_val_probs = np.zeros((len(X_trainval), num_labels))
    all_val_labels = np.zeros((len(X_trainval), num_labels))

    for fold, (tr_idx, val_idx) in enumerate(mskf.split(X_trainval, y_trainval), 1):
        print(f"\n=== Fold {fold}/{N_FOLDS} ===")
        # Prepare data
        X_tr = [X_trainval[i] for i in tr_idx]; y_tr = y_trainval[tr_idx]
        X_val = [X_trainval[i] for i in val_idx]; y_val = y_trainval[val_idx]

        tr_ds = JournalDataset(X_tr, y_tr, tokenizer)
        val_ds = JournalDataset(X_val, y_val, tokenizer)
        test_ds = JournalDataset(X_test, y_test, tokenizer)

        tr_dl = DataLoader(tr_ds, batch_size=BATCH_SIZE, shuffle=True)
        val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE)
        test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)

        # Model
        model = BertForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=num_labels,
            problem_type="multi_label_classification"
        ).to(DEVICE)
        model.dropout = nn.Dropout(0.3)  # add dropout before classifier

        optimizer = AdamW(model.parameters(), lr=LR)
        total_steps = len(tr_dl) * NUM_EPOCHS
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(total_steps * WARMUP_RATIO),
            num_training_steps=total_steps
        )

        # Training loop
        for epoch in range(1, NUM_EPOCHS+1):
            model.train()
            for batch in tqdm(tr_dl, desc=f"Fold{fold} Train E{epoch}"):
                optimizer.zero_grad()
                inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != "labels"}
                labels = batch["labels"].to(DEVICE)
                logits = model(**inputs).logits
                loss = focal_loss_fn(logits, labels)
                loss.backward()
                optimizer.step()
                scheduler.step()

        # Validation probabilities
        model.eval()
        val_logits, val_labels = [], []
        with torch.no_grad():
            for batch in val_dl:
                inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != "labels"}
                logits = model(**inputs).logits.cpu().numpy()
                val_logits.append(logits)
                val_labels.append(batch["labels"].numpy())
        val_logits = np.vstack(val_logits)
        val_probs = torch.sigmoid(torch.tensor(val_logits)).numpy()
        all_val_probs[val_idx] = val_probs
        all_val_labels[val_idx] = y_val

        # Test logits
        fold_test_logits = []
        with torch.no_grad():
            for batch in test_dl:
                inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != "labels"}
                logits = model(**inputs).logits.cpu().numpy()
                fold_test_logits.append(logits)
        all_test_logits += np.vstack(fold_test_logits)

    # Average logits and probs
    avg_test_logits = all_test_logits / N_FOLDS
    avg_test_probs = torch.sigmoid(torch.tensor(avg_test_logits)).numpy()

    # Threshold tuning using F1 objective
    best_thresholds = find_optimal_thresholds(all_val_probs, all_val_labels, label_names)

    # Final predictions
    final_preds = np.zeros_like(avg_test_probs, dtype=int)
    for i, label in enumerate(label_names):
        t = best_thresholds[label]
        final_preds[:, i] = (avg_test_probs[:, i] >= t).astype(int)

    # Report results
    print("\nFinal Classification Report:\n")
    print(classification_report(y_test, final_preds, target_names=label_names, zero_division=0))

    # Save outputs
    pd.DataFrame(avg_test_probs, columns=label_names).to_csv(
        os.path.join(OUTPUT_DIR, 'test_probabilities.csv'), index=False
    )
    pd.DataFrame(final_preds, columns=label_names).to_csv(
        os.path.join(OUTPUT_DIR, 'test_predictions.csv'), index=False
    )
    pd.DataFrame.from_dict(best_thresholds, orient='index', columns=['threshold']).to_csv(
        os.path.join(OUTPUT_DIR, 'optimal_thresholds.csv')
    )



=== Fold 1/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold1 Train E1: 100%|██████████| 29/29 [01:48<00:00,  3.74s/it]
Fold1 Train E2: 100%|██████████| 29/29 [01:43<00:00,  3.55s/it]



=== Fold 2/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold2 Train E1: 100%|██████████| 30/30 [01:47<00:00,  3.59s/it]
Fold2 Train E2: 100%|██████████| 30/30 [01:51<00:00,  3.73s/it]



=== Fold 3/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold3 Train E1: 100%|██████████| 30/30 [01:39<00:00,  3.33s/it]
Fold3 Train E2: 100%|██████████| 30/30 [01:31<00:00,  3.05s/it]



=== Fold 4/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold4 Train E1: 100%|██████████| 30/30 [01:35<00:00,  3.19s/it]
Fold4 Train E2: 100%|██████████| 30/30 [01:33<00:00,  3.10s/it]



=== Fold 5/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold5 Train E1: 100%|██████████| 30/30 [01:37<00:00,  3.24s/it]
Fold5 Train E2: 100%|██████████| 30/30 [01:34<00:00,  3.16s/it]


Label: afraid | Best F1=0.032 @ threshold=0.3
Label: angry | Best F1=0.042 @ threshold=0.35
Label: anxious | Best F1=0.163 @ threshold=0.3
Label: ashamed | Best F1=0.030 @ threshold=0.3
Label: awkward | Best F1=0.022 @ threshold=0.3
Label: bored | Best F1=0.059 @ threshold=0.3
Label: calm | Best F1=0.388 @ threshold=0.35
Label: confused | Best F1=0.052 @ threshold=0.35
Label: disgusted | Best F1=0.032 @ threshold=0.05
Label: excited | Best F1=0.296 @ threshold=0.05
Label: frustrated | Best F1=0.175 @ threshold=0.05
Label: happy | Best F1=0.658 @ threshold=0.45
Label: jealous | Best F1=0.005 @ threshold=0.05
Label: nostalgic | Best F1=0.096 @ threshold=0.3
Label: proud | Best F1=0.375 @ threshold=0.35
Label: sad | Best F1=0.065 @ threshold=0.35
Label: satisfied | Best F1=0.569 @ threshold=0.4
Label: surprised | Best F1=0.090 @ threshold=0.3
Label: exercise | Best F1=0.229 @ threshold=0.35
Label: family | Best F1=0.324 @ threshold=0.35
Label: food | Best F1=0.267 @ threshold=0.35
Label: 

In [9]:

# Configuration
DATA_PATH = "data.csv"
MODEL_NAME = "bert-base-uncased"
MAX_SEQ_LEN = 128
BATCH_SIZE = 32
LR = 2e-5
NUM_EPOCHS = 2  # Reduced to 2-3 epochs
WARMUP_RATIO = 0.1
N_FOLDS = 5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
OUTPUT_DIR = "model_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load and prepare data
df = pd.read_csv(DATA_PATH)
df = clean_create_vectors(df)
# Identify label columns
y_columns = [c for c in df.columns if c not in ["journal", "emotion_vectors", "activity_vectors"]]

# Address Data Sparsity: remove labels with <10 samples
label_counts = df[y_columns].sum()
unstable_labels = label_counts[label_counts < 10].index.tolist()
if unstable_labels:
    print(f"Removing unstable labels with <10 samples: {unstable_labels}")
y_columns = [c for c in y_columns if c not in unstable_labels]

# Final labels and texts
label_names = y_columns
num_labels = len(label_names)
X = df["journal"].tolist()
y = df[label_names].astype(int).values

# Fix Class Weight Calculation
class_counts = np.sum(y, axis=0)
effective_num = 1.0 / (class_counts + 1e-9)
weights = (1.0 / effective_num) / np.sum(1.0 / effective_num)
class_weights = torch.tensor(weights, dtype=torch.float).to(DEVICE)

# Focal Loss definition with class weights
def focal_loss_fn(logits, targets, alpha=0.25, gamma=3.0, weights=None):
    bce_loss = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    if weights is not None:
        bce_loss = bce_loss * weights
    p_t = torch.exp(-bce_loss)
    focal_loss = alpha * (1 - p_t) ** gamma * bce_loss
    return focal_loss.mean()

# Dataset class
class JournalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.enc = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=MAX_SEQ_LEN,
            return_tensors="pt"
        )
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        return {
            "input_ids": self.enc.input_ids[i],
            "attention_mask": self.enc.attention_mask[i],
            "labels": self.labels[i],
        }

# Threshold tuning with Asymmetric Probability Shift
def find_optimal_thresholds(probs, true_labels, label_names):
    # Apply asymmetric probability shift
    shifted_probs = probs.copy()
    for i in range(shifted_probs.shape[1]):
        shifted_probs[:, i] = np.clip(shifted_probs[:, i] * 1.2 - 0.1, 0, 1)

    thresholds = [v/100. for v in range(5, 100, 5)]
    best_thresholds = {}
    for i, label in enumerate(label_names):
        best_score, best_thresh = 0, 0.5
        for t in thresholds:
            preds = (shifted_probs[:, i] >= t).astype(int)
            f1 = f1_score(true_labels[:, i], preds, zero_division=0)
            if f1 > best_score:
                best_score, best_thresh = f1, t
        best_thresholds[label] = best_thresh
        print(f"Label: {label} | Best F1={best_score:.3f} @ threshold={best_thresh}")
    return best_thresholds

# Main training and ensembling
if __name__ == "__main__":
    # Train-test split
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
    mskf = MultilabelStratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

    # Storage for logits and val probs
    all_test_logits = np.zeros((len(X_test), num_labels))
    all_val_probs = np.zeros((len(X_trainval), num_labels))
    all_val_labels = np.zeros((len(X_trainval), num_labels))

    for fold, (tr_idx, val_idx) in enumerate(mskf.split(X_trainval, y_trainval), 1):
        print(f"\n=== Fold {fold}/{N_FOLDS} ===")
        # Prepare data
        X_tr = [X_trainval[i] for i in tr_idx]; y_tr = y_trainval[tr_idx]
        X_val = [X_trainval[i] for i in val_idx]; y_val = y_trainval[val_idx]

        tr_ds = JournalDataset(X_tr, y_tr, tokenizer)
        val_ds = JournalDataset(X_val, y_val, tokenizer)
        test_ds = JournalDataset(X_test, y_test, tokenizer)

        tr_dl = DataLoader(tr_ds, batch_size=BATCH_SIZE, shuffle=True)
        val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE)
        test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)

        # Model
        model = BertForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=num_labels,
            problem_type="multi_label_classification"
        ).to(DEVICE)
        model.dropout = nn.Dropout(0.3)

        optimizer = AdamW(model.parameters(), lr=LR)
        total_steps = len(tr_dl) * NUM_EPOCHS
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(total_steps * WARMUP_RATIO),
            num_training_steps=total_steps
        )

        # Training loop
        for epoch in range(1, NUM_EPOCHS+1):
            model.train()
            for batch in tqdm(tr_dl, desc=f"Fold{fold} Train E{epoch}"):
                optimizer.zero_grad()
                inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != "labels"}
                labels = batch["labels"].to(DEVICE)
                logits = model(**inputs).logits
                loss = focal_loss_fn(logits, labels, weights=class_weights)
                loss.backward()
                optimizer.step()
                scheduler.step()

        # Validation probabilities
        model.eval()
        val_logits, val_labels = [], []
        with torch.no_grad():
            for batch in val_dl:
                inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != "labels"}
                logits = model(**inputs).logits.cpu().numpy()
                val_logits.append(logits)
                val_labels.append(batch["labels"].numpy())
        val_logits = np.vstack(val_logits)
        val_probs = torch.sigmoid(torch.tensor(val_logits)).numpy()
        all_val_probs[val_idx] = val_probs
        all_val_labels[val_idx] = y_val

        # Test logits
        fold_test_logits = []
        with torch.no_grad():
            for batch in test_dl:
                inputs = {k: v.to(DEVICE) for k, v in batch.items() if k != "labels"}
                logits = model(**inputs).logits.cpu().numpy()
                fold_test_logits.append(logits)
        all_test_logits += np.vstack(fold_test_logits)

    # Average logits and probs
    avg_test_logits = all_test_logits / N_FOLDS
    avg_test_probs = torch.sigmoid(torch.tensor(avg_test_logits)).numpy()

    # Threshold tuning using F1 objective
    best_thresholds = find_optimal_thresholds(all_val_probs, all_val_labels, label_names)

    # Final predictions
    final_preds = np.zeros_like(avg_test_probs, dtype=int)
    for i, label in enumerate(label_names):
        t = best_thresholds[label]
        final_preds[:, i] = (avg_test_probs[:, i] >= t).astype(int)

    # Report results with sample-wise filtering
    print("\nFinal Classification Report:\n")
    print(classification_report(
        y_test, final_preds,
        target_names=label_names,
        zero_division=0,
        sample_weight=(y_test.sum(axis=1) > 0)
    ))



Removing unstable labels with <10 samples: ['jealous']

=== Fold 1/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold1 Train E1: 100%|██████████| 30/30 [01:39<00:00,  3.33s/it]
Fold1 Train E2: 100%|██████████| 30/30 [01:38<00:00,  3.29s/it]



=== Fold 2/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold2 Train E1: 100%|██████████| 30/30 [01:37<00:00,  3.26s/it]
Fold2 Train E2: 100%|██████████| 30/30 [01:37<00:00,  3.25s/it]



=== Fold 3/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold3 Train E1: 100%|██████████| 30/30 [01:38<00:00,  3.27s/it]
Fold3 Train E2: 100%|██████████| 30/30 [01:43<00:00,  3.46s/it]



=== Fold 4/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold4 Train E1: 100%|██████████| 30/30 [01:39<00:00,  3.31s/it]
Fold4 Train E2: 100%|██████████| 30/30 [01:39<00:00,  3.33s/it]



=== Fold 5/5 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold5 Train E1: 100%|██████████| 30/30 [01:43<00:00,  3.46s/it]
Fold5 Train E2: 100%|██████████| 30/30 [01:49<00:00,  3.64s/it]


Label: afraid | Best F1=0.049 @ threshold=0.6
Label: angry | Best F1=0.042 @ threshold=0.65
Label: anxious | Best F1=0.160 @ threshold=0.35
Label: ashamed | Best F1=0.023 @ threshold=0.45
Label: awkward | Best F1=0.024 @ threshold=0.5
Label: bored | Best F1=0.059 @ threshold=0.45
Label: calm | Best F1=0.385 @ threshold=0.05
Label: confused | Best F1=0.043 @ threshold=0.65
Label: disgusted | Best F1=0.047 @ threshold=0.55
Label: excited | Best F1=0.296 @ threshold=0.05
Label: frustrated | Best F1=0.176 @ threshold=0.35
Label: happy | Best F1=0.659 @ threshold=0.45
Label: nostalgic | Best F1=0.090 @ threshold=0.5
Label: proud | Best F1=0.374 @ threshold=0.05
Label: sad | Best F1=0.065 @ threshold=0.6
Label: satisfied | Best F1=0.569 @ threshold=0.4
Label: surprised | Best F1=0.093 @ threshold=0.45
Label: exercise | Best F1=0.229 @ threshold=0.45
Label: family | Best F1=0.323 @ threshold=0.4
Label: food | Best F1=0.242 @ threshold=0.4
Label: friends | Best F1=0.128 @ threshold=0.4
Label: 

In [1]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, f1_score, precision_recall_curve
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer, 
    AutoModel,
    get_linear_schedule_with_warmup,
    AutoModelForSequenceClassification
)
from tqdm import tqdm
import matplotlib.pyplot as plt
# Import seaborn only if visualization is needed
try:
    import seaborn as sns
except ImportError:
    print("Seaborn not installed. Visualizations will be limited.")
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import warnings
warnings.filterwarnings('ignore')
from data_cleaning_import import clean_create_vectors
from torch.optim import AdamW 

In [2]:
# Configuration
DATA_PATH = "data.csv"
MODEL_NAME = "roberta-base"  # Upgrade to RoBERTa
MAX_SEQ_LEN = 256  # Increased sequence length
BATCH_SIZE = 16    # Smaller batch size for better gradient estimates
LR = 1e-5          # Slightly lower learning rate
NUM_EPOCHS = 4     # More epochs
WARMUP_RATIO = 0.1
N_FOLDS = 5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
OUTPUT_DIR = "model_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)


# Custom BERT model with attention pooling
class EmotionBertModel(nn.Module):
    def __init__(self, model_name, num_labels, dropout_rate=0.3):
        super(EmotionBertModel, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.num_labels = num_labels
        
        # Attention layer for better sequence representation
        self.attention = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        
        # Classifier
        self.classifier = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, num_labels)
        )
        
    def forward(self, input_ids, attention_mask):
        # Get BERT embeddings
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]
        
        # Apply attention
        attention_weights = self.attention(sequence_output)
        context_vector = torch.sum(attention_weights * sequence_output, dim=1)
        
        # Classification
        logits = self.classifier(context_vector)
        return logits

# CB-Loss for better handling of imbalanced classes
class CBLoss(nn.Module):
    def __init__(self, samples_per_class, no_of_classes, beta=0.9999, gamma=2.0):
        super(CBLoss, self).__init__()
        self.beta = beta
        self.gamma = gamma
        self.no_of_classes = no_of_classes
        
        # Calculate effective number of samples
        effective_num = 1.0 - np.power(beta, samples_per_class)
        weights = (1.0 - beta) / np.array(effective_num)
        weights = weights / np.sum(weights) * no_of_classes
        self.weights = torch.tensor(weights, dtype=torch.float).to(DEVICE)
        
    def forward(self, logits, labels):
        # BCEWithLogitsLoss with class weights
        bce = F.binary_cross_entropy_with_logits(
            logits, labels, reduction='none'
        )
        
        # Apply weights
        weighted_bce = bce * self.weights
        
        # Focal component
        p = torch.exp(-bce)
        focal_weights = (1 - p) ** self.gamma
        
        # Final loss
        loss = (focal_weights * weighted_bce).mean()
        return loss

# Dataset class
class JournalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.enc = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=MAX_SEQ_LEN,
            return_tensors="pt"
        )
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        return {
            "input_ids": self.enc.input_ids[i],
            "attention_mask": self.enc.attention_mask[i],
            "labels": self.labels[i],
        }

# Optimal threshold finder
def find_optimal_thresholds(val_probs, val_labels, label_names):
    best_thresholds = {}
    
    for i, label in enumerate(label_names):
        precision, recall, thresholds = precision_recall_curve(val_labels[:, i], val_probs[:, i])
        # Adding 0 to thresholds for the case when precision[-1] or recall[-1] is used
        thresholds = np.append(thresholds, 1.0)
        
        # Find threshold that maximizes F1
        f1_scores = (2 * precision * recall) / (precision + recall + 1e-10)
        best_idx = np.argmax(f1_scores)
        best_threshold = thresholds[best_idx]
        best_f1 = f1_scores[best_idx]
        
        # Adjust for class frequency
        if np.sum(val_labels[:, i]) < 20:  # If very few positive examples
            # Make threshold stricter to reduce false positives
            best_threshold = min(best_threshold + 0.1, 0.9)
        
        best_thresholds[label] = best_threshold
        print(f"Label: {label} | Best F1={best_f1:.3f} @ threshold={best_threshold:.3f}")
    
    return best_thresholds

# Training function
def train_model(model, train_dl, val_dl, optimizer, scheduler, loss_fn, epochs):
    best_val_loss = float('inf')
    best_model_state = None
    
    for epoch in range(1, epochs+1):
        # Training
        model.train()
        train_loss = 0
        
        for batch in tqdm(train_dl, desc=f"Train E{epoch}"):
            optimizer.zero_grad()
            
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)
            
            logits = model(input_ids, attention_mask)
            loss = loss_fn(logits, labels)
            
            loss.backward()
            optimizer.step()
            scheduler.step()
            
            train_loss += loss.item()
        
        # Validation
        model.eval()
        val_loss = 0
        val_logits, val_labels_list = [], []
        
        with torch.no_grad():
            for batch in tqdm(val_dl, desc=f"Val E{epoch}"):
                input_ids = batch["input_ids"].to(DEVICE)
                attention_mask = batch["attention_mask"].to(DEVICE)
                labels = batch["labels"].to(DEVICE)
                
                logits = model(input_ids, attention_mask)
                loss = loss_fn(logits, labels)
                
                val_loss += loss.item()
                val_logits.append(logits.cpu())
                val_labels_list.append(batch["labels"])
        
        avg_train_loss = train_loss / len(train_dl)
        avg_val_loss = val_loss / len(val_dl)
        
        print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")
        
        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_model_state = model.state_dict().copy()
    
    # Load best model
    model.load_state_dict(best_model_state)
    return model

# Data augmentation for minority classes
def augment_minority_classes(X, y, label_names, min_samples=20):
    augmented_X = X.copy()
    augmented_y = y.copy()
    
    for i, label in enumerate(label_names):
        # Find indices of positive samples for this label
        positive_indices = np.where(y[:, i] == 1)[0]
        n_positive = len(positive_indices)
        
        # If we have too few samples, augment
        if n_positive > 0 and n_positive < min_samples:
            # Number of duplications needed
            n_augment = min(min_samples - n_positive, n_positive * 3)
            
            # Randomly select samples to duplicate (with replacement)
            augment_indices = np.random.choice(positive_indices, size=n_augment, replace=True)
            
            # Add slight random noise to texts to create variations
            new_texts = []
            new_labels = []
            
            for idx in augment_indices:
                new_texts.append(augment_text(X[idx]))
                new_labels.append(y[idx])
            
            if new_texts:
                augmented_X.extend(new_texts)
                augmented_y = np.vstack([augmented_y, np.array(new_labels)])
    
    print(f"Original dataset size: {len(X)}, Augmented size: {len(augmented_X)}")
    return augmented_X, augmented_y

# Simple text augmentation function
def augment_text(text):
    words = text.split()
    if len(words) <= 5:  # Don't augment very short texts
        return text
    
    # Randomly choose an augmentation strategy
    strategy = np.random.choice([
        'word_swap',
        'word_deletion',
        'synonym_replacement'
    ])
    
    if strategy == 'word_swap':
        # Swap random adjacent words
        if len(words) > 3:
            idx = np.random.randint(0, len(words) - 2)
            words[idx], words[idx + 1] = words[idx + 1], words[idx]
    
    elif strategy == 'word_deletion':
        # Delete a random word
        idx = np.random.randint(0, len(words))
        words.pop(idx)
    
    elif strategy == 'synonym_replacement':
        # This is a simplified version (just add a qualifier)
        qualifiers = ['really', 'very', 'somewhat', 'quite', 'extremely']
        if len(words) > 2:
            idx = np.random.randint(0, len(words) - 1)
            words.insert(idx, np.random.choice(qualifiers))
    
    return ' '.join(words)

# Visualization function for results
def visualize_results(y_true, y_pred, label_names):
    plt.figure(figsize=(14, 10))
    
    # Calculate metrics
    precision = []
    recall = []
    f1 = []
    support = []
    
    for i, label in enumerate(label_names):
        p = precision_score(y_true[:, i], y_pred[:, i], zero_division=0)
        r = recall_score(y_true[:, i], y_pred[:, i], zero_division=0)
        f = f1_score(y_true[:, i], y_pred[:, i], zero_division=0)
        s = np.sum(y_true[:, i])
        
        precision.append(p)
        recall.append(r)
        f1.append(f)
        support.append(s)
    
    # Sort labels by support
    indices = np.argsort(support)[::-1]
    sorted_labels = [label_names[i] for i in indices]
    sorted_precision = [precision[i] for i in indices]
    sorted_recall = [recall[i] for i in indices]
    sorted_f1 = [f1[i] for i in indices]
    sorted_support = [support[i] for i in indices]
    
    # Plot
    x = np.arange(len(sorted_labels))
    width = 0.25
    
    plt.bar(x - width, sorted_precision, width, label='Precision')
    plt.bar(x, sorted_recall, width, label='Recall')
    plt.bar(x + width, sorted_f1, width, label='F1')
    
    plt.xlabel('Labels')
    plt.ylabel('Score')
    plt.title('Model Performance by Label')
    plt.xticks(x, sorted_labels, rotation=90)
    plt.legend()
    plt.tight_layout()
    
    # Add support as text
    for i, v in enumerate(sorted_support):
        plt.text(i, 0.05, f'n={v}', ha='center')
    
    plt.savefig(f"{OUTPUT_DIR}/performance_by_label.png")
    plt.close()
    
    # Plot confusion matrix for each label
    for i, label in enumerate(label_names):
        cm = confusion_matrix(y_true[:, i], y_pred[:, i])
        plt.figure(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                    xticklabels=['Negative', 'Positive'],
                    yticklabels=['Negative', 'Positive'])
        plt.title(f'Confusion Matrix: {label}')
        plt.ylabel('True')
        plt.xlabel('Predicted')
        plt.tight_layout()
        plt.savefig(f"{OUTPUT_DIR}/confusion_matrix_{label}.png")
        plt.close()

# Main function
def main():
    df = pd.read_csv(DATA_PATH)
    df = clean_create_vectors(df)
    
    # Identify label columns
    y_columns = [c for c in df.columns if c not in ["journal", "emotion_vectors", "activity_vectors", "text_length"]]
    
    # Filter labels: keep only those with at least 5 samples
    label_counts = df[y_columns].sum()
    unstable_labels = label_counts[label_counts < 5].index.tolist()
    if unstable_labels:
        print(f"Removing unstable labels with <5 samples: {unstable_labels}")
    
    label_names = [c for c in y_columns if c not in unstable_labels]
    num_labels = len(label_names)
    
    # Prepare data
    X = df["journal"].tolist()
    y = df[label_names].astype(int).values
    
    # Get class distribution for loss function
    samples_per_class = np.sum(y, axis=0)
    print(f"Class distribution: {dict(zip(label_names, samples_per_class))}")
    
    # Train-test split - without stratification to handle rare classes
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Print split statistics
    print(f"Training samples: {len(X_trainval)}, Test samples: {len(X_test)}")
    print(f"Label distribution in train/test:")
    for i, label in enumerate(label_names):
        train_count = np.sum(y_trainval[:, i])
        test_count = np.sum(y_test[:, i])
        print(f"  {label}: {train_count} train, {test_count} test")
    
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    
    # Cross-validation setup
    try:
        mskf = MultilabelStratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
        # Check if stratification is possible
        folds = list(mskf.split(X_trainval, y_trainval))
        cv_strategy = "stratified"
    except Exception as e:
        print(f"Warning: MultilabelStratifiedKFold failed ({str(e)}), using regular KFold")
        mskf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
        # Create a pseudo-target for stratification based on sum of labels
        y_pseudo = np.clip(y_trainval.sum(axis=1), 0, 3)  # 0, 1, 2, 3+ labels
        folds = list(mskf.split(X_trainval, y_pseudo))
        cv_strategy = "pseudo-stratified"
    
    # Storage for logits and probabilities
    all_test_logits = np.zeros((len(X_test), num_labels))
    all_val_probs = np.zeros((len(X_trainval), num_labels))
    all_val_labels = np.zeros((len(X_trainval), num_labels))
    
    # Train each fold
    for fold, (tr_idx, val_idx) in enumerate(folds, 1):
        print(f"\n=== Fold {fold}/{N_FOLDS} ({cv_strategy}) ===")
        
        # Prepare data with data augmentation for minority classes
        X_tr = [X_trainval[i] for i in tr_idx]
        y_tr = y_trainval[tr_idx]
        X_val = [X_trainval[i] for i in val_idx]
        y_val = y_trainval[val_idx]
        
        # Augment only the training part of this fold
        print("Augmenting minority classes for this fold...")
        X_tr_aug, y_tr_aug = augment_minority_classes(X_tr, y_tr, label_names)
        
        # Create datasets and dataloaders
        tr_ds = JournalDataset(X_tr_aug, y_tr_aug, tokenizer)
        val_ds = JournalDataset(X_val, y_val, tokenizer)
        test_ds = JournalDataset(X_test, y_test, tokenizer)
        
        tr_dl = DataLoader(tr_ds, batch_size=BATCH_SIZE, shuffle=True)
        val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE)
        test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)
        
        # Initialize model
        model = EmotionBertModel(MODEL_NAME, num_labels).to(DEVICE)
        
        # Initialize optimizer and scheduler
        optimizer = AdamW(model.parameters(), lr=LR, weight_decay=0.01)
        total_steps = len(tr_dl) * NUM_EPOCHS
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(total_steps * WARMUP_RATIO),
            num_training_steps=total_steps
        )
        
        # Initialize loss function
        loss_fn = CBLoss(samples_per_class, num_labels)
        
        # Train the model
        model = train_model(model, tr_dl, val_dl, optimizer, scheduler, loss_fn, NUM_EPOCHS)
        
        # Get validation probabilities
        model.eval()
        val_logits = []
        with torch.no_grad():
            for batch in val_dl:
                input_ids = batch["input_ids"].to(DEVICE)
                attention_mask = batch["attention_mask"].to(DEVICE)
                logits = model(input_ids, attention_mask).cpu().numpy()
                val_logits.append(logits)
        
        val_logits = np.vstack(val_logits)
        val_probs = torch.sigmoid(torch.tensor(val_logits)).numpy()
        all_val_probs[val_idx] = val_probs
        all_val_labels[val_idx] = y_val
        
        # Get test logits
        test_logits = []
        with torch.no_grad():
            for batch in test_dl:
                input_ids = batch["input_ids"].to(DEVICE)
                attention_mask = batch["attention_mask"].to(DEVICE)
                logits = model(input_ids, attention_mask).cpu().numpy()
                test_logits.append(logits)
        
        all_test_logits += np.vstack(test_logits)
        
        # Save model
        torch.save(model.state_dict(), f"{OUTPUT_DIR}/model_fold{fold}.pt")
    
    # Average and convert to probabilities
    avg_test_logits = all_test_logits / N_FOLDS
    avg_test_probs = torch.sigmoid(torch.tensor(avg_test_logits)).numpy()
    
    # Find optimal thresholds
    print("\nFinding optimal thresholds...")
    best_thresholds = find_optimal_thresholds(all_val_probs, all_val_labels, label_names)
    
    # Save thresholds
    pd.DataFrame(list(best_thresholds.items()), columns=['label', 'threshold']).to_csv(
        f"{OUTPUT_DIR}/thresholds.csv", index=False
    )
    
    # Apply thresholds to get predictions
    final_preds = np.zeros_like(avg_test_probs, dtype=int)
    for i, label in enumerate(label_names):
        t = best_thresholds[label]
        final_preds[:, i] = (avg_test_probs[:, i] >= t).astype(int)
    
    # Evaluate and visualize results
    print("\nFinal Classification Report:")
    report = classification_report(
        y_test, final_preds,
        target_names=label_names,
        zero_division=0
    )
    print(report)
    
    # Save classification report
    with open(f"{OUTPUT_DIR}/classification_report.txt", "w") as f:
        f.write(report)
    
    # Visualize results
    try:
        from sklearn.metrics import precision_score, recall_score, confusion_matrix
        print("Visualizing results...")
        visualize_results(y_test, final_preds, label_names)
    except Exception as e:
        print(f"Visualization failed: {e}")
    
    print("Done!")

if __name__ == "__main__":
    main()

Removing unstable labels with <5 samples: ['jealous']
Class distribution: {'afraid': np.int64(18), 'angry': np.int64(28), 'anxious': np.int64(125), 'ashamed': np.int64(17), 'awkward': np.int64(15), 'bored': np.int64(49), 'calm': np.int64(368), 'confused': np.int64(28), 'disgusted': np.int64(22), 'excited': np.int64(251), 'frustrated': np.int64(141), 'happy': np.int64(730), 'nostalgic': np.int64(61), 'proud': np.int64(337), 'sad': np.int64(43), 'satisfied': np.int64(591), 'surprised': np.int64(64), 'exercise': np.int64(185), 'family': np.int64(275), 'food': np.int64(203), 'friends': np.int64(100), 'god': np.int64(67), 'health': np.int64(99), 'love': np.int64(67), 'recreation': np.int64(89), 'school': np.int64(19), 'sleep': np.int64(132), 'work': np.int64(235)}
Training samples: 1178, Test samples: 295
Label distribution in train/test:
  afraid: 16 train, 2 test
  angry: 23 train, 5 test
  anxious: 102 train, 23 test
  ashamed: 13 train, 4 test
  awkward: 11 train, 4 test
  bored: 34 tra

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Train E1: 100%|██████████| 62/62 [03:52<00:00,  3.75s/it]
Val E1: 100%|██████████| 15/15 [00:12<00:00,  1.19it/s]


Epoch 1: Train Loss = 0.1022, Val Loss = 0.0425


Train E2: 100%|██████████| 62/62 [03:49<00:00,  3.70s/it]
Val E2: 100%|██████████| 15/15 [00:12<00:00,  1.20it/s]


Epoch 2: Train Loss = 0.0479, Val Loss = 0.0381


Train E3: 100%|██████████| 62/62 [03:36<00:00,  3.50s/it]
Val E3: 100%|██████████| 15/15 [00:12<00:00,  1.21it/s]


Epoch 3: Train Loss = 0.0451, Val Loss = 0.0357


Train E4: 100%|██████████| 62/62 [03:34<00:00,  3.46s/it]
Val E4: 100%|██████████| 15/15 [00:12<00:00,  1.19it/s]


Epoch 4: Train Loss = 0.0427, Val Loss = 0.0348

=== Fold 2/5 (stratified) ===
Augmenting minority classes for this fold...
Original dataset size: 942, Augmented size: 984


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Train E1: 100%|██████████| 62/62 [03:34<00:00,  3.46s/it]
Val E1: 100%|██████████| 15/15 [00:12<00:00,  1.22it/s]


Epoch 1: Train Loss = 0.1033, Val Loss = 0.0436


Train E2: 100%|██████████| 62/62 [03:39<00:00,  3.53s/it]
Val E2: 100%|██████████| 15/15 [00:12<00:00,  1.22it/s]


Epoch 2: Train Loss = 0.0491, Val Loss = 0.0382


Train E3: 100%|██████████| 62/62 [03:39<00:00,  3.54s/it]
Val E3: 100%|██████████| 15/15 [00:12<00:00,  1.21it/s]


Epoch 3: Train Loss = 0.0453, Val Loss = 0.0365


Train E4: 100%|██████████| 62/62 [03:37<00:00,  3.50s/it]
Val E4: 100%|██████████| 15/15 [00:12<00:00,  1.23it/s]


Epoch 4: Train Loss = 0.0429, Val Loss = 0.0357

=== Fold 3/5 (stratified) ===
Augmenting minority classes for this fold...
Original dataset size: 943, Augmented size: 987


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Train E1: 100%|██████████| 62/62 [03:37<00:00,  3.51s/it]
Val E1: 100%|██████████| 15/15 [00:12<00:00,  1.22it/s]


Epoch 1: Train Loss = 0.1087, Val Loss = 0.0481


Train E2: 100%|██████████| 62/62 [03:38<00:00,  3.52s/it]
Val E2: 100%|██████████| 15/15 [00:12<00:00,  1.24it/s]


Epoch 2: Train Loss = 0.0526, Val Loss = 0.0401


Train E3: 100%|██████████| 62/62 [03:38<00:00,  3.53s/it]
Val E3: 100%|██████████| 15/15 [00:12<00:00,  1.24it/s]


Epoch 3: Train Loss = 0.0481, Val Loss = 0.0387


Train E4: 100%|██████████| 62/62 [03:37<00:00,  3.51s/it]
Val E4: 100%|██████████| 15/15 [00:11<00:00,  1.25it/s]


Epoch 4: Train Loss = 0.0461, Val Loss = 0.0379

=== Fold 4/5 (stratified) ===
Augmenting minority classes for this fold...
Original dataset size: 944, Augmented size: 987


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Train E1: 100%|██████████| 62/62 [03:41<00:00,  3.57s/it]
Val E1: 100%|██████████| 15/15 [00:12<00:00,  1.24it/s]


Epoch 1: Train Loss = 0.1059, Val Loss = 0.0454


Train E2: 100%|██████████| 62/62 [03:39<00:00,  3.53s/it]
Val E2: 100%|██████████| 15/15 [00:11<00:00,  1.25it/s]


Epoch 2: Train Loss = 0.0504, Val Loss = 0.0389


Train E3: 100%|██████████| 62/62 [03:42<00:00,  3.59s/it]
Val E3: 100%|██████████| 15/15 [00:12<00:00,  1.25it/s]


Epoch 3: Train Loss = 0.0456, Val Loss = 0.0369


Train E4: 100%|██████████| 62/62 [03:42<00:00,  3.58s/it]
Val E4: 100%|██████████| 15/15 [00:12<00:00,  1.25it/s]


Epoch 4: Train Loss = 0.0426, Val Loss = 0.0363

=== Fold 5/5 (stratified) ===
Augmenting minority classes for this fold...
Original dataset size: 942, Augmented size: 987


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Train E1: 100%|██████████| 62/62 [03:43<00:00,  3.60s/it]
Val E1: 100%|██████████| 15/15 [00:12<00:00,  1.23it/s]


Epoch 1: Train Loss = 0.1055, Val Loss = 0.0479


Train E2: 100%|██████████| 62/62 [03:44<00:00,  3.61s/it]
Val E2: 100%|██████████| 15/15 [00:12<00:00,  1.23it/s]


Epoch 2: Train Loss = 0.0502, Val Loss = 0.0405


Train E3: 100%|██████████| 62/62 [03:42<00:00,  3.59s/it]
Val E3: 100%|██████████| 15/15 [00:12<00:00,  1.24it/s]


Epoch 3: Train Loss = 0.0469, Val Loss = 0.0389


Train E4: 100%|██████████| 62/62 [03:41<00:00,  3.57s/it]
Val E4: 100%|██████████| 15/15 [00:12<00:00,  1.25it/s]


Epoch 4: Train Loss = 0.0453, Val Loss = 0.0382

Finding optimal thresholds...
Label: afraid | Best F1=0.116 @ threshold=0.342
Label: angry | Best F1=0.126 @ threshold=0.237
Label: anxious | Best F1=0.390 @ threshold=0.348
Label: ashamed | Best F1=0.217 @ threshold=0.367
Label: awkward | Best F1=0.154 @ threshold=0.432
Label: bored | Best F1=0.231 @ threshold=0.278
Label: calm | Best F1=0.385 @ threshold=0.352
Label: confused | Best F1=0.182 @ threshold=0.266
Label: disgusted | Best F1=0.205 @ threshold=0.336
Label: excited | Best F1=0.297 @ threshold=0.330
Label: frustrated | Best F1=0.576 @ threshold=0.342
Label: happy | Best F1=0.660 @ threshold=0.429
Label: nostalgic | Best F1=0.137 @ threshold=0.248
Label: proud | Best F1=0.378 @ threshold=0.362
Label: sad | Best F1=0.286 @ threshold=0.272
Label: satisfied | Best F1=0.571 @ threshold=0.428
Label: surprised | Best F1=0.147 @ threshold=0.293
Label: exercise | Best F1=0.271 @ threshold=0.354
Label: family | Best F1=0.364 @ threshold=

<Figure size 1400x1000 with 0 Axes>

In [None]:

import os
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC

from transformers import AutoTokenizer, TFRobertaModel, TFDebertaModel

# <-- import your cleaning functions -->
from data_cleaning_import import clean_create_vectors  # or create_student_vectors

# -------------------------
# Metrics & helpers
# -------------------------
def f1_score(true, pred):
    true, pred = np.array(true), np.array(pred)
    tp = np.sum((true == 1) & (pred == 1))
    pp = np.sum(pred == 1)
    ta = np.sum(true == 1)
    if pp==0 or ta==0: return 0.0
    prec = tp/pp; rec = tp/ta
    return 2*prec*rec/(prec+rec) if (prec+rec)>0 else 0.0

def precision_score(true, pred):
    true, pred = np.array(true), np.array(pred)
    tp = np.sum((true == 1) & (pred == 1))
    pp = np.sum(pred == 1)
    return tp/pp if pp>0 else 0.0

def recall_score(true, pred):
    true, pred = np.array(true), np.array(pred)
    tp = np.sum((true == 1) & (pred == 1))
    ta = np.sum(true == 1)
    return tp/ta if ta>0 else 0.0

def normalized_accuracy(true, pred):
    true, pred = np.array(true), np.array(pred)
    tp = np.sum((true == 1) & (pred == 1))
    tn = np.sum((true == 0) & (pred == 0))
    ta = np.sum(true == 1); tn_all = np.sum(true == 0)
    if ta==0 or tn_all==0: return 0.0
    return (tp/ta + tn/tn_all)/2

def calculate_class_weights(labels):
    counts = np.sum(labels, axis=0)
    total = labels.shape[0]
    weights = {}
    for i,c in enumerate(counts):
        weights[i] = total/(len(counts)*c) if c>0 else 1.0
    return weights

# -------------------------
# Classifier
# -------------------------
class JournalEmotionClassifier:
    def __init__(self, model_name="roberta-base", max_length=128,
                 num_classes=29, class_weights=None, output_dir="./output"):
        self.model_name   = model_name
        self.max_length   = max_length
        self.num_classes  = num_classes
        self.class_weights= class_weights
        self.output_dir   = output_dir
        os.makedirs(output_dir, exist_ok=True)

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.class_names = [
            'afraid','angry','anxious','ashamed','awkward','bored','calm',
            'confused','disgusted','excited','frustrated','happy','jealous',
            'nostalgic','proud','sad','satisfied','surprised',
            'exercise','family','food','friends','god','health','love',
            'recreation','school','sleep','work'
        ]
        self.thresholds = {c:0.5 for c in self.class_names}

    def build_model(self):
        if "deberta" in self.model_name:
            base = TFDebertaModel.from_pretrained(self.model_name)
        else:
            base = TFRobertaModel.from_pretrained(self.model_name)

        ids = Input(shape=(self.max_length,), dtype=tf.int32, name="input_ids")
        mask= Input(shape=(self.max_length,), dtype=tf.int32, name="attention_mask")
        out = base(input_ids=ids, attention_mask=mask)[0][:,0,:]
        x   = Dropout(0.1)(out)
        x   = Dense(512, activation="relu")(x)
        x   = Dropout(0.2)(x)
        logits = Dense(self.num_classes, activation="sigmoid")(x)

        self.model = Model([ids,mask], logits)
        self.model.compile(
            optimizer=Adam(2e-5),
            loss=BinaryCrossentropy(),
            metrics=['accuracy', AUC(multi_label=True, name='auc')]
        )
        return self.model

    def preprocess(self, texts):
        return self.tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='tf'
        )

    def train(self, df, label_cols, val_split=0.2, batch_size=16, epochs=10):
        texts = df['journal'].tolist()
        labels= df[label_cols].values

        if val_split>0:
            tX,vX,tY,vY = train_test_split(
                texts, labels, test_size=val_split, random_state=42
            )
            ti = self.preprocess(tX); vi = self.preprocess(vX)
            callbacks = [
                EarlyStopping("val_auc", patience=3, mode="max", restore_best_weights=True),
                ModelCheckpoint(os.path.join(self.output_dir,"best_model.h5"),
                                monitor="val_auc", mode="max", save_best_only=True)
            ]
            h = self.model.fit(
                ti, tY,
                validation_data=(vi,vY),
                epochs=epochs,
                batch_size=batch_size,
                callbacks=callbacks,
                class_weight=self.class_weights
            )
            pd.DataFrame(h.history).to_csv(
                os.path.join(self.output_dir, "training_history.csv"), index=False
            )
        else:
            inp = self.preprocess(texts)
            h   = self.model.fit(
                inp, labels,
                epochs=epochs,
                batch_size=batch_size,
                class_weight=self.class_weights
            )
            self.model.save_weights(os.path.join(self.output_dir,"final_model.h5"))

    def optimize_thresholds(self, df, label_cols, metric='f1_score'):
        texts = df['journal'].tolist()
        labels= df[label_cols].values
        inp   = self.preprocess(texts)
        raw   = self.model.predict(inp)
        bests = {}
        for idx,name in enumerate(self.class_names):
            best_score, best_t = 0,0.5
            for t in np.arange(0.1,1.0,0.05):
                binp = (raw[:,idx]>=t).astype(int)
                if metric=='precision':
                    sc = precision_score(labels[:,idx],binp)
                elif metric=='recall':
                    sc = recall_score(labels[:,idx],binp)
                elif metric=='norm_acc':
                    sc = normalized_accuracy(labels[:,idx],binp)
                else:
                    sc = f1_score(labels[:,idx],binp)
                if sc>best_score:
                    best_score, best_t = sc,t
            bests[name]=best_t
        self.thresholds = bests
        pd.DataFrame([bests]).to_csv(
            os.path.join(self.output_dir,"optimal_thresholds.csv"),
            index=False
        )
        return bests

    def evaluate(self, df, label_cols):
        texts = df['journal'].tolist()
        labels= df[label_cols].values
        inp   = self.preprocess(texts)
        raw   = self.model.predict(inp)

        binp  = np.zeros_like(raw)
        for idx,name in enumerate(self.class_names):
            binp[:,idx] = raw[:,idx] >= self.thresholds[name]

        results = {}
        for fn in (f1_score, precision_score, recall_score, normalized_accuracy):
            scores = []
            for i in range(len(self.class_names)):
                sc = fn(labels[:,i], binp[:,i])
                results[f"{fn.__name__}_{self.class_names[i]}"] = sc
                scores.append(sc)
            results[f"{fn.__name__}_macro_avg"] = np.mean(scores)

        pd.DataFrame([results]).to_csv(
            os.path.join(self.output_dir,"evaluation_results.csv"), index=False
        )
        return results

    def save(self, path):
        os.makedirs(path, exist_ok=True)
        self.model.save_weights(os.path.join(path,"model_weights.h5"))
        self.tokenizer.save_pretrained(path)
        np.save(os.path.join(path,"thresholds.npy"), self.thresholds)

    def predict(self, texts):
        inp   = self.preprocess(texts)
        raw   = self.model.predict(inp)
        binp  = np.zeros_like(raw)
        for idx,name in enumerate(self.class_names):
            binp[:,idx] = raw[:,idx] >= self.thresholds[name]
        return raw, binp

# -------------------------
# Main
# -------------------------
def main():
    # 1. read raw CSV & clean
    raw_df = pd.read_csv("data.csv")
    df     = clean_create_vectors(raw_df)

    # 2. define labels
    label_cols = [
        'afraid','angry','anxious','ashamed','awkward','bored','calm',
        'confused','disgusted','excited','frustrated','happy','jealous',
        'nostalgic','proud','sad','satisfied','surprised',
        'exercise','family','food','friends','god','health','love',
        'recreation','school','sleep','work'
    ]

    # 3. train/test split
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    print(f"Train size: {len(train_df)}  Test size: {len(test_df)}")

    # 4. class weights
    cw = calculate_class_weights(train_df[label_cols].values)

    # 5. init & build model
    clf = JournalEmotionClassifier(
        model_name="roberta-base",
        max_length=128,
        num_classes=len(label_cols),
        class_weights=cw,
        output_dir="./journal_output"
    )
    clf.build_model()

    # 6. train
    print("Training…")
    clf.train(train_df, label_cols, val_split=0.1, batch_size=16, epochs=5)

    # 7. optimize thresholds
    print("Optimizing thresholds…")
    best_t = clf.optimize_thresholds(test_df, label_cols)
    print(best_t)

    # 8. evaluate
    print("Evaluating…")
    metrics = clf.evaluate(test_df, label_cols)
    for k,v in metrics.items():
        if k.endswith("_macro_avg"):
            print(f"{k}: {v:.4f}")

    # 9. save final model
    clf.save("./journal_output/final_model")

    # 10. example predictions
    samples = [
        "I spent the evening with friends and felt really happy.",
        "Work was stressful and I couldn't sleep at all.",
        "I had a calm walk in the park and felt nostalgic."
    ]
    raw_preds, bin_preds = clf.predict(samples)
    print("\nExample predictions:")
    for i, txt in enumerate(samples):
        print(f"\n- {txt}")
        for idx in np.where(bin_preds[i]==1)[0]:
            print(f"   • {clf.class_names[idx]} ({raw_preds[i,idx]:.2f})")

if __name__ == "__main__":
    main()
