In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.optim as optim


# **Video HC Traits**

In [3]:
# Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, KFold
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

# Load video dataset
df = pd.read_csv('/kaggle/input/fi-v2-test-val-data/FI V2 COMPLETE FEATURE DATASET/HANDCRAFTED/video_hc_features.csv')

# Drop unnecessary columns
drop_cols = ["Filename", "Segment_ID", "interview", "Gender", "Ethnicity", "AgeGroup"]
df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True, errors='ignore')

# Define label columns
label_columns = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

# Separate features and labels
X = df.drop(columns=label_columns)
y = df[label_columns]

# Keep only numeric columns
X = X.select_dtypes(include=[np.number])

# Fill missing values
X = X.fillna(X.mean())
y = y.fillna(y.mean())

# Normalize features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Convert to tensors
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensors = {trait: torch.tensor(y[trait].values, dtype=torch.float32) for trait in label_columns}

# Dataset class
class VideoDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class SimpleTransformerRegressor(nn.Module):
    """
    Simple Transformer Regressor using batch_first=True convention.
    Takes tabular features, projects them, passes through a Transformer Encoder,
    and predicts a single regression value.
    """
    def __init__(self, input_dim, embed_dim=256, num_heads=4, num_layers=2, dropout=0.3, ff_dim_multiplier=4):
        """
        Args:
            input_dim (int): Number of input features.
            embed_dim (int): Dimension for projecting features and for the Transformer. Must be divisible by num_heads.
            num_heads (int): Number of attention heads in the Transformer.
            num_layers (int): Number of layers in the Transformer Encoder.
            dropout (float): Dropout rate.
            ff_dim_multiplier (int): Multiplier for the feed-forward layer dimension within the Transformer.
        """
        super(SimpleTransformerRegressor, self).__init__()

        # Ensure embed_dim is divisible by num_heads
        if embed_dim % num_heads != 0:
            # Adjust embed_dim up to the nearest multiple of num_heads
            original_embed_dim = embed_dim
            embed_dim = (embed_dim // num_heads + 1) * num_heads
            print(f"Warning: embed_dim ({original_embed_dim}) not divisible by num_heads ({num_heads}).")
            print(f"Adjusted embed_dim to {embed_dim}.")

        self.input_dim = input_dim
        self.embed_dim = embed_dim

        # Project input features to embedding dimension
        self.project = nn.Linear(input_dim, embed_dim)

        # Define the Transformer Encoder Layer with batch_first=True
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * ff_dim_multiplier, # Standard practice
            dropout=dropout,
            batch_first=True  # <<< Input tensor shape: (batch, seq_len, features)
        )

        # Stack the encoder layers
        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=num_layers
        )

        # Classifier head
        self.classifier = nn.Sequential(
            nn.LayerNorm(embed_dim),      # Add LayerNorm for stability before classifier
            nn.Linear(embed_dim, 128),    # Linear layer 1
            nn.ReLU(),                    # Activation
            nn.Dropout(dropout),          # Dropout
            nn.Linear(128, 1)             # Final output layer (regression target)
        )

    def forward(self, x):
        """
        Forward pass.
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, input_dim).
        Returns:
            torch.Tensor: Output tensor of shape (batch_size).
        """
        # 1. Project features
        # x shape: (batch_size, input_dim)
        x = self.project(x)
        # x shape: (batch_size, embed_dim)

        # 2. Add sequence dimension for Transformer
        # TransformerEncoderLayer with batch_first=True expects (batch, seq_len, features)
        x = x.unsqueeze(1)
        # x shape: (batch_size, seq_len=1, embed_dim)

        # 3. Pass through Transformer Encoder
        x = self.encoder(x)
        # x shape: (batch_size, seq_len=1, embed_dim)

        # 4. Remove sequence dimension
        x = x.squeeze(1)
        # x shape: (batch_size, embed_dim)

        # 5. Pass through classifier
        output = self.classifier(x)
        # output shape: (batch_size, 1)

        # 6. Squeeze final dimension for regression output
        return output.squeeze(-1)
        # final shape: (batch_size)

# Metrics
def mean_absolute_error(preds, labels):
    return torch.mean(torch.abs(preds - labels)).item()

def concordance_correlation_coefficient(preds, labels):
    preds_mean = torch.mean(preds)
    labels_mean = torch.mean(labels)
    preds_var = torch.var(preds)
    labels_var = torch.var(labels)
    covariance = torch.mean((preds - preds_mean) * (labels - labels_mean))
    ccc = (2 * covariance) / (preds_var + labels_var + (preds_mean - labels_mean) ** 2)
    return ccc.item()

# Training helpers
def train_one_epoch(model, loader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0
    for X_batch, y_batch in loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    scheduler.step()
    return total_loss / len(loader)

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_mae, total_ccc = 0, 0, 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            total_loss += loss.item()
            total_mae += mean_absolute_error(preds, y_batch)
            total_ccc += concordance_correlation_coefficient(preds, y_batch)
    n_batches = len(loader)
    return total_loss/n_batches, total_mae/n_batches, total_ccc/n_batches

def generate_random_configs(search_space, num_configs=10):
    configs = []
    for _ in range(num_configs):
        config = {
            "embed_dim": np.random.choice(search_space["embed_dim"]),
            "num_heads": np.random.choice(search_space["num_heads"]),
            "num_layers": np.random.choice(search_space["num_layers"]),
            "dropout": np.random.choice(search_space["dropout"]),
            "lr": np.random.choice(search_space["lr"]),
            "batch_size": np.random.choice(search_space["batch_size"]),
            "weight_decay": np.random.choice(search_space["weight_decay"])
        }
        configs.append(config)
    return configs

def cross_validate(config, X_tensor, y_tensor, num_folds=3, epochs=10):
    print(f"Evaluating Config: {config}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    fold_metrics = {"val_ccc": [], "val_mae": [], "val_loss": []}

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_tensor)):
        print(f"  Fold {fold+1}/{num_folds}")
        X_train_fold, y_train_fold = X_tensor[train_idx], y_tensor[train_idx]
        X_val_fold, y_val_fold = X_tensor[val_idx], y_tensor[val_idx]

        train_loader = DataLoader(VideoDataset(X_train_fold, y_train_fold), batch_size=int(config["batch_size"]), shuffle=True)
        val_loader = DataLoader(VideoDataset(X_val_fold, y_val_fold), batch_size=int(config["batch_size"]), shuffle=False)

        model = SimpleTransformerRegressor(
            input_dim=X_tensor.shape[1],
            embed_dim=config["embed_dim"],
            num_heads=config["num_heads"],
            num_layers=config["num_layers"],
            dropout=config["dropout"]
        ).to(device)

        optimizer = optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
        criterion = nn.MSELoss()

        best_ccc = -1
        for epoch in range(epochs):
            train_loss = train_one_epoch(model, train_loader, optimizer, scheduler, criterion, device)
            val_loss, val_mae, val_ccc = evaluate(model, val_loader, criterion, device)
            print(f"    Epoch {epoch+1}/{epochs} | Val CCC: {val_ccc:.4f}, MAE: {val_mae:.4f}")
            if val_ccc > best_ccc:
                best_ccc = val_ccc
                best_metrics = (val_loss, val_mae, val_ccc)

        fold_metrics["val_loss"].append(best_metrics[0])
        fold_metrics["val_mae"].append(best_metrics[1])
        fold_metrics["val_ccc"].append(best_metrics[2])

    return {
        "ccc": np.mean(fold_metrics["val_ccc"]),
        "mae": np.mean(fold_metrics["val_mae"]),
        "loss": np.mean(fold_metrics["val_loss"])
    }

def hyperparameter_tuning(X_tensor, y_tensor, num_configs=10):
    search_space = {
        "embed_dim": [128, 256, 512],
        "num_heads": [2, 4, 8],
        "num_layers": [2, 4],
        "dropout": [0.1, 0.3, 0.5],
        "lr": [1e-4, 3e-4, 1e-3],
        "batch_size": [32, 64],
        "weight_decay": [1e-5, 1e-4]
    }
    configs = generate_random_configs(search_space, num_configs)
    best_config = None
    best_ccc = -1

    for i, config in enumerate(configs):
        print(f"\n>>> Config {i+1}/{num_configs}")
        metrics = cross_validate(config, X_tensor, y_tensor)
        print(f"  Avg CCC: {metrics['ccc']:.4f}, MAE: {metrics['mae']:.4f}")
        if metrics["ccc"] > best_ccc:
            best_ccc = metrics["ccc"]
            best_config = config

    print(f"\n>>> Best Config Selected: {best_config}")
    return best_config

def final_train_and_evaluate(best_config, X_train, y_train, X_val, y_val, X_test, y_test, trait_name, epochs=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    X_final_train = torch.cat([X_train, X_val], dim=0)
    y_final_train = torch.cat([y_train, y_val], dim=0)

    final_train_loader = DataLoader(VideoDataset(X_final_train, y_final_train), batch_size=int(best_config["batch_size"]), shuffle=True)
    test_loader = DataLoader(VideoDataset(X_test, y_test), batch_size=int(best_config["batch_size"]), shuffle=False)

    model = SimpleTransformerRegressor(
        input_dim=X_train.shape[1],
        embed_dim=best_config["embed_dim"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        dropout=best_config["dropout"]
    ).to(device)

    optimizer = optim.AdamW(model.parameters(), lr=best_config["lr"], weight_decay=best_config["weight_decay"])
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    criterion = nn.MSELoss()

    print(f"\n>>> Final Training for {trait_name.upper()} ({epochs} epochs)")
    for epoch in range(epochs):
        train_loss = train_one_epoch(model, final_train_loader, optimizer, scheduler, criterion, device)
        print(f"  Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f}")

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            all_preds.append(preds)
            all_labels.append(y_batch)

    all_preds = torch.cat(all_preds, dim=0)
    all_labels = torch.cat(all_labels, dim=0)

    final_mae = mean_absolute_error(all_preds, all_labels)
    final_ccc = concordance_correlation_coefficient(all_preds, all_labels)
    tolerance = 0.1
    correct = torch.abs(all_preds - all_labels) < tolerance
    final_accuracy = correct.float().mean().item()

    print(f"\n==== {trait_name.upper()} Evaluation on Test Set ====")
    print(f"Test CCC: {final_ccc:.4f}, Test MAE: {final_mae:.4f}, Accuracy (±{tolerance}): {final_accuracy*100:.2f}%")
    torch.save(model.state_dict(), f"best_video_transformer_model_{trait_name}.pth")
    model_save_path = f"best_video_transformer_model_{trait_name}.pth"
    print(f"Saving final model for {trait_name} to {model_save_path}")
    torch.save({
        'epoch': epochs,
        'model_state_dict': model.state_dict(), # <<< Weights nested here
        'optimizer_state_dict': optimizer.state_dict(),
        'best_config': best_config,           # <<< Config needed
        'scaler_mean': scaler.mean_,         # <<< Scaler mean needed
        'scaler_scale': scaler.scale_,         # <<< Scaler scale needed
        'test_metrics': {'ccc': final_ccc, 'mae': final_mae, f'acc_{tolerance}': final_accuracy}
    }, model_save_path)
    
# Train model per trait
for trait in label_columns:
    print(f"\n--- Training for Trait: {trait} ---")
    y_trait = y_tensors[trait]
    train_idx, temp_idx = train_test_split(range(len(X)), test_size=0.3, random_state=42)
    val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

    X_train, y_train = X_tensor[train_idx], y_trait[train_idx]
    X_val, y_val = X_tensor[val_idx], y_trait[val_idx]
    X_test, y_test = X_tensor[test_idx], y_trait[test_idx]

    best_config = hyperparameter_tuning(X_tensor, y_trait, num_configs=10)
    final_train_and_evaluate(best_config, X_train, y_train, X_val, y_val, X_test, y_test, trait_name=trait)


  df = pd.read_csv('/kaggle/input/fi-v2-test-val-data/FI V2 COMPLETE FEATURE DATASET/HANDCRAFTED/video_hc_features.csv')



--- Training for Trait: openness ---

>>> Config 1/10
Evaluating Config: {'embed_dim': 128, 'num_heads': 2, 'num_layers': 4, 'dropout': 0.1, 'lr': 0.0003, 'batch_size': 64, 'weight_decay': 1e-05}
  Fold 1/3
    Epoch 1/10 | Val CCC: 0.0609, MAE: 0.1146
    Epoch 2/10 | Val CCC: 0.1013, MAE: 0.1118
    Epoch 3/10 | Val CCC: 0.1038, MAE: 0.1131
    Epoch 4/10 | Val CCC: 0.1826, MAE: 0.1105
    Epoch 5/10 | Val CCC: 0.1556, MAE: 0.1096
    Epoch 6/10 | Val CCC: 0.2075, MAE: 0.1106
    Epoch 7/10 | Val CCC: 0.1374, MAE: 0.1103
    Epoch 8/10 | Val CCC: 0.1616, MAE: 0.1100
    Epoch 9/10 | Val CCC: 0.2003, MAE: 0.1089
    Epoch 10/10 | Val CCC: 0.1774, MAE: 0.1123
  Fold 2/3
    Epoch 1/10 | Val CCC: 0.0534, MAE: 0.1143
    Epoch 2/10 | Val CCC: 0.0893, MAE: 0.1136
    Epoch 3/10 | Val CCC: 0.1224, MAE: 0.1131
    Epoch 4/10 | Val CCC: 0.0812, MAE: 0.1146
    Epoch 5/10 | Val CCC: 0.1639, MAE: 0.1104
    Epoch 6/10 | Val CCC: 0.1622, MAE: 0.1123
    Epoch 7/10 | Val CCC: 0.1674, MAE: 0.110

# **Audio HC Traits**

In [4]:
# Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, KFold
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

# Load audio dataset
df = pd.read_csv('/kaggle/input/fi-v2-test-val-data/FI V2 COMPLETE FEATURE DATASET/HANDCRAFTED/audio_hc_features.csv')

# Drop unnecessary columns
drop_cols = ["Filename", "Segment_ID", "interview", "Gender", "Ethnicity", "AgeGroup"]
df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True, errors='ignore')

# Define label columns
label_columns = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

# Separate features and labels
X = df.drop(columns=label_columns)
y = df[label_columns]

# Keep only numeric columns
X = X.select_dtypes(include=[np.number])

# Fill missing values
X = X.fillna(X.mean())
y = y.fillna(y.mean())

# Normalize features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Convert to tensors
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensors = {trait: torch.tensor(y[trait].values, dtype=torch.float32) for trait in label_columns}

# Dataset class
class AudioDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class SimpleTransformerRegressor(nn.Module):
    """
    Simple Transformer Regressor using batch_first=True convention.
    Takes tabular features, projects them, passes through a Transformer Encoder,
    and predicts a single regression value.
    """
    def __init__(self, input_dim, embed_dim=256, num_heads=4, num_layers=2, dropout=0.3, ff_dim_multiplier=4):
        """
        Args:
            input_dim (int): Number of input features.
            embed_dim (int): Dimension for projecting features and for the Transformer. Must be divisible by num_heads.
            num_heads (int): Number of attention heads in the Transformer.
            num_layers (int): Number of layers in the Transformer Encoder.
            dropout (float): Dropout rate.
            ff_dim_multiplier (int): Multiplier for the feed-forward layer dimension within the Transformer.
        """
        super(SimpleTransformerRegressor, self).__init__()

        # Ensure embed_dim is divisible by num_heads
        if embed_dim % num_heads != 0:
            # Adjust embed_dim up to the nearest multiple of num_heads
            original_embed_dim = embed_dim
            embed_dim = (embed_dim // num_heads + 1) * num_heads
            print(f"Warning: embed_dim ({original_embed_dim}) not divisible by num_heads ({num_heads}).")
            print(f"Adjusted embed_dim to {embed_dim}.")

        self.input_dim = input_dim
        self.embed_dim = embed_dim

        # Project input features to embedding dimension
        self.project = nn.Linear(input_dim, embed_dim)

        # Define the Transformer Encoder Layer with batch_first=True
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * ff_dim_multiplier, # Standard practice
            dropout=dropout,
            batch_first=True  # <<< Input tensor shape: (batch, seq_len, features)
        )

        # Stack the encoder layers
        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=num_layers
        )

        # Classifier head
        self.classifier = nn.Sequential(
            nn.LayerNorm(embed_dim),      # Add LayerNorm for stability before classifier
            nn.Linear(embed_dim, 128),    # Linear layer 1
            nn.ReLU(),                    # Activation
            nn.Dropout(dropout),          # Dropout
            nn.Linear(128, 1)             # Final output layer (regression target)
        )

    def forward(self, x):
        """
        Forward pass.
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, input_dim).
        Returns:
            torch.Tensor: Output tensor of shape (batch_size).
        """
        # 1. Project features
        # x shape: (batch_size, input_dim)
        x = self.project(x)
        # x shape: (batch_size, embed_dim)

        # 2. Add sequence dimension for Transformer
        # TransformerEncoderLayer with batch_first=True expects (batch, seq_len, features)
        x = x.unsqueeze(1)
        # x shape: (batch_size, seq_len=1, embed_dim)

        # 3. Pass through Transformer Encoder
        x = self.encoder(x)
        # x shape: (batch_size, seq_len=1, embed_dim)

        # 4. Remove sequence dimension
        x = x.squeeze(1)
        # x shape: (batch_size, embed_dim)

        # 5. Pass through classifier
        output = self.classifier(x)
        # output shape: (batch_size, 1)

        # 6. Squeeze final dimension for regression output
        return output.squeeze(-1)
        # final shape: (batch_size)

# Metrics
def mean_absolute_error(preds, labels):
    return torch.mean(torch.abs(preds - labels)).item()

def concordance_correlation_coefficient(preds, labels):
    preds_mean = torch.mean(preds)
    labels_mean = torch.mean(labels)
    preds_var = torch.var(preds)
    labels_var = torch.var(labels)
    covariance = torch.mean((preds - preds_mean) * (labels - labels_mean))
    ccc = (2 * covariance) / (preds_var + labels_var + (preds_mean - labels_mean) ** 2)
    return ccc.item()

# Training helpers
def train_one_epoch(model, loader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0
    for i, (X_batch, y_batch) in enumerate(loader):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    scheduler.step()
    return total_loss / len(loader)

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_mae, total_ccc = 0, 0, 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            total_loss += loss.item()
            total_mae += mean_absolute_error(preds, y_batch)
            total_ccc += concordance_correlation_coefficient(preds, y_batch)
    n_batches = len(loader)
    return total_loss/n_batches, total_mae/n_batches, total_ccc/n_batches

def generate_random_configs(search_space, num_configs=10):
    configs = []
    for _ in range(num_configs):
        config = {
            "embed_dim": np.random.choice(search_space["embed_dim"]),
            "num_heads": np.random.choice(search_space["num_heads"]),
            "num_layers": np.random.choice(search_space["num_layers"]),
            "dropout": np.random.choice(search_space["dropout"]),
            "lr": np.random.choice(search_space["lr"]),
            "batch_size": np.random.choice(search_space["batch_size"]),
            "weight_decay": np.random.choice(search_space["weight_decay"])
        }
        configs.append(config)
    return configs

def cross_validate(config, X_tensor, y_tensor, num_folds=3, epochs=10):
    print(f"Evaluating Config: {config}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    fold_metrics = {"val_ccc": [], "val_mae": [], "val_loss": []}

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_tensor)):
        print(f"  Fold {fold+1}/{num_folds}")
        X_train_fold, y_train_fold = X_tensor[train_idx], y_tensor[train_idx]
        X_val_fold, y_val_fold = X_tensor[val_idx], y_tensor[val_idx]

        train_loader = DataLoader(AudioDataset(X_train_fold, y_train_fold), batch_size=int(config["batch_size"]), shuffle=True)
        val_loader = DataLoader(AudioDataset(X_val_fold, y_val_fold), batch_size=int(config["batch_size"]), shuffle=False)

        model = SimpleTransformerRegressor(
            input_dim=X_tensor.shape[1],
            embed_dim=config["embed_dim"],
            num_heads=config["num_heads"],
            num_layers=config["num_layers"],
            dropout=config["dropout"]
        ).to(device)

        optimizer = optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
        criterion = nn.MSELoss()

        best_ccc = -1
        for epoch in range(epochs):
            print(f"    Epoch {epoch+1}/{epochs}", end=' | ')
            train_loss = train_one_epoch(model, train_loader, optimizer, scheduler, criterion, device)
            val_loss, val_mae, val_ccc = evaluate(model, val_loader, criterion, device)
            print(f"Val CCC: {val_ccc:.4f}, MAE: {val_mae:.4f}")
            if val_ccc > best_ccc:
                best_ccc = val_ccc
                best_metrics = (val_loss, val_mae, val_ccc)

        fold_metrics["val_loss"].append(best_metrics[0])
        fold_metrics["val_mae"].append(best_metrics[1])
        fold_metrics["val_ccc"].append(best_metrics[2])

    return {
        "ccc": np.mean(fold_metrics["val_ccc"]),
        "mae": np.mean(fold_metrics["val_mae"]),
        "loss": np.mean(fold_metrics["val_loss"])
    }

def hyperparameter_tuning(X_tensor, y_tensor, num_configs=10):
    search_space = {
        "embed_dim": [128, 256, 512],
        "num_heads": [2, 4, 8],
        "num_layers": [2, 4],
        "dropout": [0.1, 0.3, 0.5],
        "lr": [1e-4, 3e-4, 1e-3],
        "batch_size": [32, 64],
        "weight_decay": [1e-5, 1e-4]
    }
    configs = generate_random_configs(search_space, num_configs)
    best_config = None
    best_ccc = -1

    for i, config in enumerate(configs):
        print(f"\n>>> Config {i+1}/{num_configs}")
        metrics = cross_validate(config, X_tensor, y_tensor)
        print(f"  Avg CCC: {metrics['ccc']:.4f}, MAE: {metrics['mae']:.4f}")
        if metrics["ccc"] > best_ccc:
            best_ccc = metrics["ccc"]
            best_config = config

    print(f"\n>>> Best Config Selected: {best_config}")
    return best_config

def final_train_and_evaluate(best_config, X_train, y_train, X_val, y_val, X_test, y_test, trait_name, epochs=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    X_final_train = torch.cat([X_train, X_val], dim=0)
    y_final_train = torch.cat([y_train, y_val], dim=0)

    final_train_loader = DataLoader(AudioDataset(X_final_train, y_final_train), batch_size=int(best_config["batch_size"]), shuffle=True)
    test_loader = DataLoader(AudioDataset(X_test, y_test), batch_size=int(best_config["batch_size"]), shuffle=False)

    model = SimpleTransformerRegressor(
        input_dim=X_train.shape[1],
        embed_dim=best_config["embed_dim"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        dropout=best_config["dropout"]
    ).to(device)

    optimizer = optim.AdamW(model.parameters(), lr=best_config["lr"], weight_decay=best_config["weight_decay"])
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    criterion = nn.MSELoss()

    print(f"\n>>> Final Training for {trait_name.upper()} ({epochs} epochs)")
    for epoch in range(epochs):
        train_loss = train_one_epoch(model, final_train_loader, optimizer, scheduler, criterion, device)
        print(f"  Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f}")

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            all_preds.append(preds)
            all_labels.append(y_batch)

    all_preds = torch.cat(all_preds, dim=0)
    all_labels = torch.cat(all_labels, dim=0)

    final_mae = mean_absolute_error(all_preds, all_labels)
    final_ccc = concordance_correlation_coefficient(all_preds, all_labels)
    tolerance = 0.1
    correct = torch.abs(all_preds - all_labels) < tolerance
    final_accuracy = correct.float().mean().item()

    print(f"\n==== {trait_name.upper()} Evaluation on Test Set ====")
    print(f"Test CCC: {final_ccc:.4f}, Test MAE: {final_mae:.4f}, Accuracy (±{tolerance}): {final_accuracy*100:.2f}%")
    torch.save(model.state_dict(), f"best_audio_transformer_model_{trait_name}.pth")
    model_save_path = f"best_audio_transformer_model_{trait_name}.pth"
    print(f"Saving final model for {trait_name} to {model_save_path}")
    torch.save({
        'epoch': epochs,
        'model_state_dict': model.state_dict(), # <<< Weights nested here
        'optimizer_state_dict': optimizer.state_dict(),
        'best_config': best_config,           # <<< Config needed
        'scaler_mean': scaler.mean_,         # <<< Scaler mean needed
        'scaler_scale': scaler.scale_,         # <<< Scaler scale needed
        'test_metrics': {'ccc': final_ccc, 'mae': final_mae, f'acc_{tolerance}': final_accuracy}
    }, model_save_path)

# Train model per trait
for trait in label_columns:
    print(f"\n--- Training for Trait: {trait} ---")
    y_trait = y_tensors[trait]
    train_idx, temp_idx = train_test_split(range(len(X)), test_size=0.3, random_state=42)
    val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

    X_train, y_train = X_tensor[train_idx], y_trait[train_idx]
    X_val, y_val = X_tensor[val_idx], y_trait[val_idx]
    X_test, y_test = X_tensor[test_idx], y_trait[test_idx]

    best_config = hyperparameter_tuning(X_tensor, y_trait, num_configs=10)
    final_train_and_evaluate(best_config, X_train, y_train, X_val, y_val, X_test, y_test, trait_name=trait)


  df = pd.read_csv('/kaggle/input/fi-v2-test-val-data/FI V2 COMPLETE FEATURE DATASET/HANDCRAFTED/audio_hc_features.csv')



--- Training for Trait: openness ---

>>> Config 1/10
Evaluating Config: {'embed_dim': 256, 'num_heads': 4, 'num_layers': 2, 'dropout': 0.1, 'lr': 0.0003, 'batch_size': 64, 'weight_decay': 1e-05}
  Fold 1/3
    Epoch 1/10 | Val CCC: 0.0347, MAE: 0.1155
    Epoch 2/10 | Val CCC: 0.0460, MAE: 0.1150
    Epoch 3/10 | Val CCC: 0.0513, MAE: 0.1153
    Epoch 4/10 | Val CCC: 0.0516, MAE: 0.1147
    Epoch 5/10 | Val CCC: 0.0713, MAE: 0.1156
    Epoch 6/10 | Val CCC: 0.0721, MAE: 0.1147
    Epoch 7/10 | Val CCC: 0.0769, MAE: 0.1147
    Epoch 8/10 | Val CCC: 0.0860, MAE: 0.1152
    Epoch 9/10 | Val CCC: 0.0517, MAE: 0.1148
    Epoch 10/10 | Val CCC: 0.0797, MAE: 0.1164
  Fold 2/3
    Epoch 1/10 | Val CCC: 0.0442, MAE: 0.1149
    Epoch 2/10 | Val CCC: 0.0725, MAE: 0.1146
    Epoch 3/10 | Val CCC: 0.0543, MAE: 0.1144
    Epoch 4/10 | Val CCC: 0.0635, MAE: 0.1142
    Epoch 5/10 | Val CCC: 0.0699, MAE: 0.1162
    Epoch 6/10 | Val CCC: 0.0709, MAE: 0.1152
    Epoch 7/10 | Val CCC: 0.0719, MAE: 0.114

# **Text HC Traits**

In [2]:
# Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, KFold
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv('/kaggle/input/fi-v2-test-val-data/FI V2 COMPLETE FEATURE DATASET/HANDCRAFTED/text_hc_features.csv')

# Drop unnecessary columns
drop_cols = ["Filename", "Segment_ID", "interview", "Gender", "Ethnicity", "AgeGroup"]
df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True, errors='ignore')

# Define label columns
label_columns = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

# Separate features and labels
X = df.drop(columns=label_columns)
y = df[label_columns]

# Fill missing values
X = X.fillna(X.mean())
y = y.fillna(y.mean())

# Normalize features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Convert to tensors
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensors = {trait: torch.tensor(y[trait].values, dtype=torch.float32) for trait in label_columns}

# Dataset class
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

import torch
import torch.nn as nn

class SimpleTransformerRegressor(nn.Module):
    """
    Simple Transformer Regressor using batch_first=True convention.
    Takes tabular features, projects them, passes through a Transformer Encoder,
    and predicts a single regression value.
    """
    def __init__(self, input_dim, embed_dim=256, num_heads=4, num_layers=2, dropout=0.3, ff_dim_multiplier=4):
        """
        Args:
            input_dim (int): Number of input features.
            embed_dim (int): Dimension for projecting features and for the Transformer. Must be divisible by num_heads.
            num_heads (int): Number of attention heads in the Transformer.
            num_layers (int): Number of layers in the Transformer Encoder.
            dropout (float): Dropout rate.
            ff_dim_multiplier (int): Multiplier for the feed-forward layer dimension within the Transformer.
        """
        super(SimpleTransformerRegressor, self).__init__()

        # Ensure embed_dim is divisible by num_heads
        if embed_dim % num_heads != 0:
            # Adjust embed_dim up to the nearest multiple of num_heads
            original_embed_dim = embed_dim
            embed_dim = (embed_dim // num_heads + 1) * num_heads
            print(f"Warning: embed_dim ({original_embed_dim}) not divisible by num_heads ({num_heads}).")
            print(f"Adjusted embed_dim to {embed_dim}.")

        self.input_dim = input_dim
        self.embed_dim = embed_dim

        # Project input features to embedding dimension
        self.project = nn.Linear(input_dim, embed_dim)

        # Define the Transformer Encoder Layer with batch_first=True
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * ff_dim_multiplier, # Standard practice
            dropout=dropout,
            batch_first=True  # <<< Input tensor shape: (batch, seq_len, features)
        )

        # Stack the encoder layers
        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=num_layers
        )

        # Classifier head
        self.classifier = nn.Sequential(
            nn.LayerNorm(embed_dim),      # Add LayerNorm for stability before classifier
            nn.Linear(embed_dim, 128),    # Linear layer 1
            nn.ReLU(),                    # Activation
            nn.Dropout(dropout),          # Dropout
            nn.Linear(128, 1)             # Final output layer (regression target)
        )

    def forward(self, x):
        """
        Forward pass.
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, input_dim).
        Returns:
            torch.Tensor: Output tensor of shape (batch_size).
        """
        # 1. Project features
        # x shape: (batch_size, input_dim)
        x = self.project(x)
        # x shape: (batch_size, embed_dim)

        # 2. Add sequence dimension for Transformer
        # TransformerEncoderLayer with batch_first=True expects (batch, seq_len, features)
        x = x.unsqueeze(1)
        # x shape: (batch_size, seq_len=1, embed_dim)

        # 3. Pass through Transformer Encoder
        x = self.encoder(x)
        # x shape: (batch_size, seq_len=1, embed_dim)

        # 4. Remove sequence dimension
        x = x.squeeze(1)
        # x shape: (batch_size, embed_dim)

        # 5. Pass through classifier
        output = self.classifier(x)
        # output shape: (batch_size, 1)

        # 6. Squeeze final dimension for regression output
        return output.squeeze(-1)
        # final shape: (batch_size)

# Metrics

def mean_absolute_error(preds, labels):
    return torch.mean(torch.abs(preds - labels)).item()

def concordance_correlation_coefficient(preds, labels):
    preds_mean = torch.mean(preds)
    labels_mean = torch.mean(labels)
    preds_var = torch.var(preds)
    labels_var = torch.var(labels)
    covariance = torch.mean((preds - preds_mean) * (labels - labels_mean))
    ccc = (2 * covariance) / (preds_var + labels_var + (preds_mean - labels_mean) ** 2)
    return ccc.item()

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_mae, total_ccc = 0, 0, 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            total_loss += loss.item()
            total_mae += mean_absolute_error(preds, y_batch)
            total_ccc += concordance_correlation_coefficient(preds, y_batch)
    n_batches = len(loader)
    return total_loss/n_batches, total_mae/n_batches, total_ccc/n_batches

def train_one_epoch(model, loader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0
    for i, (X_batch, y_batch) in enumerate(loader):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    scheduler.step()
    return total_loss / len(loader)


def generate_random_configs(search_space, num_configs=10):
    configs = []
    for _ in range(num_configs):
        config = {
            "embed_dim": np.random.choice(search_space["embed_dim"]),
            "num_heads": np.random.choice(search_space["num_heads"]),
            "num_layers": np.random.choice(search_space["num_layers"]),
            "dropout": np.random.choice(search_space["dropout"]),
            "lr": np.random.choice(search_space["lr"]),
            "batch_size": np.random.choice(search_space["batch_size"]),
            "weight_decay": np.random.choice(search_space["weight_decay"])
        }
        configs.append(config)
    return configs

def cross_validate(config, X_tensor, y_tensor, num_folds=3, epochs=10):
    print(f"Evaluating Config: {config}")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    fold_metrics = {"val_ccc": [], "val_mae": [], "val_loss": []}

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_tensor)):
        print(f"  Fold {fold+1}/{num_folds}")
        X_train_fold, y_train_fold = X_tensor[train_idx], y_tensor[train_idx]
        X_val_fold, y_val_fold = X_tensor[val_idx], y_tensor[val_idx]

        train_loader = DataLoader(TextDataset(X_train_fold, y_train_fold), batch_size=int(config["batch_size"]), shuffle=True)
        val_loader = DataLoader(TextDataset(X_val_fold, y_val_fold), batch_size=int(config["batch_size"]), shuffle=False)

        model = SimpleTransformerRegressor(
            input_dim=X_tensor.shape[1],
            embed_dim=config["embed_dim"],
            num_heads=config["num_heads"],
            num_layers=config["num_layers"],
            dropout=config["dropout"]
        ).to(device)

        optimizer = optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
        criterion = nn.MSELoss()

        best_ccc = -1
        for epoch in range(epochs):
            print(f"    Epoch {epoch+1}/{epochs}", end=' | ')
            train_loss = train_one_epoch(model, train_loader, optimizer, scheduler, criterion, device)
            val_loss, val_mae, val_ccc = evaluate(model, val_loader, criterion, device)
            print(f"Val CCC: {val_ccc:.4f}, MAE: {val_mae:.4f}")
            if val_ccc > best_ccc:
                best_ccc = val_ccc
                best_metrics = (val_loss, val_mae, val_ccc)

        fold_metrics["val_loss"].append(best_metrics[0])
        fold_metrics["val_mae"].append(best_metrics[1])
        fold_metrics["val_ccc"].append(best_metrics[2])

    return {
        "ccc": np.mean(fold_metrics["val_ccc"]),
        "mae": np.mean(fold_metrics["val_mae"]),
        "loss": np.mean(fold_metrics["val_loss"])
    }


def hyperparameter_tuning(X_tensor, y_tensor, num_configs=10):
    search_space = {
        "embed_dim": [128, 256, 512],
        "num_heads": [2, 4, 8],
        "num_layers": [2, 4],
        "dropout": [0.1, 0.3, 0.5],
        "lr": [1e-4, 3e-4, 1e-3],
        "batch_size": [32, 64],
        "weight_decay": [1e-5, 1e-4]
    }
    configs = generate_random_configs(search_space, num_configs)
    best_config = None
    best_ccc = -1

    for i, config in enumerate(configs):
        print(f"\n>>> Config {i+1}/{num_configs}")
        metrics = cross_validate(config, X_tensor, y_tensor)
        print(f"  Avg CCC: {metrics['ccc']:.4f}, MAE: {metrics['mae']:.4f}")
        if metrics["ccc"] > best_ccc:
            best_ccc = metrics["ccc"]
            best_config = config

    print(f"\n>>> Best Config Selected: {best_config}")
    return best_config


def final_train_and_evaluate(best_config, X_train, y_train, X_val, y_val, X_test, y_test, trait_name, epochs=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    X_final_train = torch.cat([X_train, X_val], dim=0)
    y_final_train = torch.cat([y_train, y_val], dim=0)

    final_train_loader = DataLoader(TextDataset(X_final_train, y_final_train), batch_size=int(best_config["batch_size"]), shuffle=True)
    test_loader = DataLoader(TextDataset(X_test, y_test), batch_size=int(best_config["batch_size"]), shuffle=False)

    model = SimpleTransformerRegressor(
        input_dim=X_train.shape[1],
        embed_dim=best_config["embed_dim"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        dropout=best_config["dropout"]
    ).to(device)

    optimizer = optim.AdamW(model.parameters(), lr=best_config["lr"], weight_decay=best_config["weight_decay"])
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    criterion = nn.MSELoss()

    print(f"\n>>> Final Training for {trait_name.upper()} ({epochs} epochs)")
    for epoch in range(epochs):
        train_loss = train_one_epoch(model, final_train_loader, optimizer, scheduler, criterion, device)
        print(f"  Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f}")

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            all_preds.append(preds)
            all_labels.append(y_batch)

    all_preds = torch.cat(all_preds, dim=0)
    all_labels = torch.cat(all_labels, dim=0)

    final_mae = mean_absolute_error(all_preds, all_labels)
    final_ccc = concordance_correlation_coefficient(all_preds, all_labels)
    tolerance = 0.1
    correct = torch.abs(all_preds - all_labels) < tolerance
    final_accuracy = correct.float().mean().item()

    print(f"\n==== {trait_name.upper()} Evaluation on Test Set ====")
    print(f"Test CCC: {final_ccc:.4f}, Test MAE: {final_mae:.4f}, Accuracy (±{tolerance}): {final_accuracy*100:.2f}%")
    torch.save(model.state_dict(), f"best_text_transformer_model_{trait_name}.pth")
    model_save_path = f"best_text_transformer_model_{trait_name}.pth"
    print(f"Saving final model for {trait_name} to {model_save_path}")
    torch.save({
        'epoch': epochs,
        'model_state_dict': model.state_dict(), # <<< Weights nested here
        'optimizer_state_dict': optimizer.state_dict(),
        'best_config': best_config,           # <<< Config needed
        'scaler_mean': scaler.mean_,         # <<< Scaler mean needed
        'scaler_scale': scaler.scale_,         # <<< Scaler scale needed
        'test_metrics': {'ccc': final_ccc, 'mae': final_mae, f'acc_{tolerance}': final_accuracy}
    }, model_save_path)

# Train model per trait
for trait in label_columns:
    print(f"\n--- Training for Trait: {trait} ---")
    y_trait = y_tensors[trait]
    train_idx, temp_idx = train_test_split(range(len(X)), test_size=0.3, random_state=42)
    val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

    X_train, y_train = X_tensor[train_idx], y_trait[train_idx]
    X_val, y_val = X_tensor[val_idx], y_trait[val_idx]
    X_test, y_test = X_tensor[test_idx], y_trait[test_idx]

    best_config = hyperparameter_tuning(X_tensor, y_trait, num_configs=10)
    final_train_and_evaluate(best_config, X_train, y_train, X_val, y_val, X_test, y_test, trait_name=trait)



--- Training for Trait: openness ---

>>> Config 1/10
Evaluating Config: {'embed_dim': 512, 'num_heads': 2, 'num_layers': 4, 'dropout': 0.5, 'lr': 0.0001, 'batch_size': 32, 'weight_decay': 1e-05}
  Fold 1/3
    Epoch 1/10 | Val CCC: 0.0602, MAE: 0.1159
    Epoch 2/10 | Val CCC: 0.1260, MAE: 0.1161
    Epoch 3/10 | Val CCC: 0.1145, MAE: 0.1165
    Epoch 4/10 | Val CCC: 0.0791, MAE: 0.1437
    Epoch 5/10 | Val CCC: 0.0848, MAE: 0.1444
    Epoch 6/10 | Val CCC: 0.0980, MAE: 0.1349
    Epoch 7/10 | Val CCC: 0.0958, MAE: 0.1364
    Epoch 8/10 | Val CCC: 0.1027, MAE: 0.1413
    Epoch 9/10 | Val CCC: 0.1190, MAE: 0.1294
    Epoch 10/10 | Val CCC: 0.1190, MAE: 0.1321
  Fold 2/3
    Epoch 1/10 | Val CCC: 0.1257, MAE: 0.1246
    Epoch 2/10 | Val CCC: 0.1305, MAE: 0.1153
    Epoch 3/10 | Val CCC: 0.0945, MAE: 0.1275
    Epoch 4/10 | Val CCC: 0.0793, MAE: 0.1460
    Epoch 5/10 | Val CCC: 0.0942, MAE: 0.1401
    Epoch 6/10 | Val CCC: 0.0881, MAE: 0.1370
    Epoch 7/10 | Val CCC: 0.0788, MAE: 0.155

In [None]:
final_train_and_evaluate(best_config, X_train, y_train, X_val, y_val, X_test, y_test)


In [None]:
# Final evaluation function
def final_train_and_evaluate(best_config, X_train, y_train, X_val, y_val, X_test, y_test, epochs=30):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Combine train and val sets for final training
    X_final_train = torch.cat([X_train, X_val], dim=0)
    y_final_train = torch.cat([y_train, y_val], dim=0)

    final_train_loader = DataLoader(TextDataset(X_final_train, y_final_train), batch_size=int(best_config["batch_size"]), shuffle=True)
    test_loader = DataLoader(TextDataset(X_test, y_test), batch_size=int(best_config["batch_size"]), shuffle=False)

    # Initialize model
    model = SimpleTransformerRegressor(
        input_dim=X_train.shape[1],
        embed_dim=best_config["embed_dim"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        dropout=best_config["dropout"]
    ).to(device)

    optimizer = optim.AdamW(model.parameters(), lr=best_config["lr"], weight_decay=best_config["weight_decay"])
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    criterion = nn.MSELoss()

    # Final training
    for epoch in range(epochs):
        train_loss = train_one_epoch(model, final_train_loader, optimizer, scheduler, criterion, device)
        print(f"Final Train Epoch {epoch+1}/{epochs} - Loss: {train_loss:.4f}")

    # Final evaluation on test set
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            all_preds.append(preds)
            all_labels.append(y_batch)

    all_preds = torch.cat(all_preds, dim=0)
    all_labels = torch.cat(all_labels, dim=0)

    # Metrics
    final_mae = mean_absolute_error(all_preds, all_labels)
    final_ccc = concordance_correlation_coefficient(all_preds, all_labels)
    
    # Define "accuracy" for regression
    tolerance = 0.1  # within 0.1 tolerance considered correct
    correct = torch.abs(all_preds - all_labels) < tolerance
    final_accuracy = correct.float().mean().item()

    print("\n==== Final Evaluation on Test Set ====")
    print(f"Final Test CCC: {final_ccc:.4f}")
    print(f"Final Test MAE: {final_mae:.4f}")
    print(f"Final Test Accuracy (within {tolerance}): {final_accuracy*100:.2f}%")
    torch.save(model.state_dict(), "best_text_transformer_model.pth")
    print("\nModel saved to 'best_text_transformer_model.pth'.")


In [None]:
# Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, KFold
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv('/kaggle/input/fi-v2-test-val-data/FI V2 COMPLETE FEATURE DATASET/HANDCRAFTED/text_hc_features.csv')

# Drop unnecessary columns
drop_cols = ["Filename", "Segment_ID", "interview", "Gender", "Ethnicity", "AgeGroup"]
df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True, errors='ignore')

# Define label columns
label_columns = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

# Separate features and labels
X = df.drop(columns=label_columns)
y = df[label_columns]

# Fill missing values
X = X.fillna(X.mean())
y = y.fillna(y.mean())

# Normalize features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Convert to tensors
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)

# Train/val/test split
train_idx, temp_idx = train_test_split(range(len(X)), test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

X_train, y_train = X_tensor[train_idx], y_tensor[train_idx]
X_val, y_val = X_tensor[val_idx], y_tensor[val_idx]
X_test, y_test = X_tensor[test_idx], y_tensor[test_idx]

# Dataset and DataLoader
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Model
class SimpleTransformerRegressor(nn.Module):
    def __init__(self, input_dim, embed_dim=256, num_heads=4, num_layers=2, dropout=0.3):
        super(SimpleTransformerRegressor, self).__init__()
        self.project = nn.Linear(input_dim, embed_dim)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout),
            num_layers=num_layers
        )
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 5)  # Predict 5 traits
        )

    def forward(self, x):
        x = self.project(x)
        x = x.unsqueeze(0)  # Add batch dimension for transformer
        x = self.encoder(x)
        x = x.squeeze(0)
        return self.classifier(x)

# Evaluation metrics
def mean_absolute_error(preds, labels):
    return torch.mean(torch.abs(preds - labels)).item()

def concordance_correlation_coefficient(preds, labels):
    preds_mean = torch.mean(preds)
    labels_mean = torch.mean(labels)
    preds_var = torch.var(preds)
    labels_var = torch.var(labels)
    covariance = torch.mean((preds - preds_mean) * (labels - labels_mean))
    ccc = (2 * covariance) / (preds_var + labels_var + (preds_mean - labels_mean) ** 2)
    return ccc.item()

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_mae, total_ccc = 0, 0, 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            total_loss += loss.item()
            total_mae += mean_absolute_error(preds, y_batch)
            total_ccc += concordance_correlation_coefficient(preds, y_batch)
    n_batches = len(loader)
    return total_loss/n_batches, total_mae/n_batches, total_ccc/n_batches

# Training loop
def train_one_epoch(model, loader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0
    for X_batch, y_batch in loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    scheduler.step()
    return total_loss / len(loader)

# Hyperparameter tuning
def generate_random_configs(search_space, num_configs=10):
    configs = []
    for _ in range(num_configs):
        config = {
            "embed_dim": np.random.choice(search_space["embed_dim"]),
            "num_heads": np.random.choice(search_space["num_heads"]),
            "num_layers": np.random.choice(search_space["num_layers"]),
            "dropout": np.random.choice(search_space["dropout"]),
            "lr": np.random.choice(search_space["lr"]),
            "batch_size": np.random.choice(search_space["batch_size"]),
            "weight_decay": np.random.choice(search_space["weight_decay"])
        }
        configs.append(config)
    return configs

def cross_validate(config, X_tensor, y_tensor, num_folds=3, epochs=30):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    fold_metrics = {"val_ccc": [], "val_mae": [], "val_loss": []}

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_tensor)):
        print(f"\n=== Fold {fold+1}/{num_folds} ===")

        X_train_fold, y_train_fold = X_tensor[train_idx], y_tensor[train_idx]
        X_val_fold, y_val_fold = X_tensor[val_idx], y_tensor[val_idx]

        train_loader = DataLoader(TextDataset(X_train_fold, y_train_fold), batch_size=int(config["batch_size"]), shuffle=True)
        val_loader = DataLoader(TextDataset(X_val_fold, y_val_fold), batch_size=int(config["batch_size"]), shuffle=False)

        model = SimpleTransformerRegressor(
            input_dim=X_tensor.shape[1],
            embed_dim=config["embed_dim"],
            num_heads=config["num_heads"],
            num_layers=config["num_layers"],
            dropout=config["dropout"]
        ).to(device)

        optimizer = optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
        criterion = nn.MSELoss()

        best_ccc = -1
        for epoch in range(epochs):
            train_loss = train_one_epoch(model, train_loader, optimizer, scheduler, criterion, device)
            val_loss, val_mae, val_ccc = evaluate(model, val_loader, criterion, device)
            print(f"Epoch {epoch+1}/{epochs} - Val Loss: {val_loss:.4f} - Val MAE: {val_mae:.4f} - Val CCC: {val_ccc:.4f}")

            if val_ccc > best_ccc:
                best_ccc = val_ccc
                best_metrics = (val_loss, val_mae, val_ccc)

        fold_metrics["val_loss"].append(best_metrics[0])
        fold_metrics["val_mae"].append(best_metrics[1])
        fold_metrics["val_ccc"].append(best_metrics[2])

    return {
        "ccc": np.mean(fold_metrics["val_ccc"]),
        "mae": np.mean(fold_metrics["val_mae"]),
        "loss": np.mean(fold_metrics["val_loss"])
    }

def hyperparameter_tuning(X_tensor, y_tensor, num_configs=10):
    search_space = {
        "embed_dim": [128, 256, 512],
        "num_heads": [2, 4, 8],
        "num_layers": [2, 4],
        "dropout": [0.1, 0.3, 0.5],
        "lr": [1e-4, 3e-4, 1e-3],
        "batch_size": [32, 64],
        "weight_decay": [1e-5, 1e-4]
    }
    configs = generate_random_configs(search_space, num_configs)
    best_config = None
    best_ccc = -1
    results = []

    for i, config in enumerate(configs):
        print(f"\n=== Testing Config {i+1}/{len(configs)} ===")
        print(config)

        metrics = cross_validate(config, X_tensor, y_tensor)
        results.append((config, metrics))

        if metrics["ccc"] > best_ccc:
            best_ccc = metrics["ccc"]
            best_config = config

        print(f"Config {i+1} Metrics - CCC: {metrics['ccc']:.4f}, MAE: {metrics['mae']:.4f}, Loss: {metrics['loss']:.4f}")

    results_df = pd.DataFrame([
        {"config": str(c), **m} for c, m in results
    ])
    results_df.to_csv("hyperparameter_tuning_text_hc.csv", index=False)

    return best_config

# Start hyperparameter tuning
best_config = hyperparameter_tuning(X_tensor, y_tensor, num_configs=10)
print("\nBest Config Found:", best_config)


In [None]:
# Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, KFold
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

# 1. Load dataset
df = pd.read_csv('/kaggle/input/fi-v2-test-val-data/FI V2 COMPLETE FEATURE DATASET/HANDCRAFTED/audio_hc_features.csv')

# Drop unnecessary non-numeric columns
df = df.select_dtypes(include=[np.number])  # Keep only numeric columns

# 2. Define features and labels
# Assume last 5 columns are ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]
label_columns = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

X = df.drop(columns=label_columns)
y = df[label_columns]

# Fill missing values if any
X = X.fillna(X.mean())
y = y.fillna(y.mean())

# Normalize features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Convert to tensors
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)

# 3. Train/Val/Test Split
train_idx, temp_idx = train_test_split(range(len(X)), test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

X_train, y_train = X_tensor[train_idx], y_tensor[train_idx]
X_val, y_val = X_tensor[val_idx], y_tensor[val_idx]
X_test, y_test = X_tensor[test_idx], y_tensor[test_idx]

# 4. Dataset and DataLoader
class AudioDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# 5. Model
class SimpleTransformerRegressor(nn.Module):
    def __init__(self, input_dim, embed_dim=256, num_heads=4, num_layers=2, dropout=0.3):
        super(SimpleTransformerRegressor, self).__init__()
        self.project = nn.Linear(input_dim, embed_dim)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout),
            num_layers=num_layers
        )
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 5)  # 5 labels
        )

    def forward(self, x):
        x = self.project(x)
        x = x.unsqueeze(0)  # Transformer expects (seq_len, batch, feature)
        x = self.encoder(x)
        x = x.squeeze(0)
        return self.classifier(x)

# 6. Evaluation Metrics
def mean_absolute_error(preds, labels):
    return torch.mean(torch.abs(preds - labels)).item()

def concordance_correlation_coefficient(preds, labels):
    preds_mean = torch.mean(preds)
    labels_mean = torch.mean(labels)
    preds_var = torch.var(preds)
    labels_var = torch.var(labels)
    covariance = torch.mean((preds - preds_mean) * (labels - labels_mean))
    ccc = (2 * covariance) / (preds_var + labels_var + (preds_mean - labels_mean)**2)
    return ccc.item()

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, total_mae, total_ccc = 0, 0, 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            total_loss += loss.item()
            total_mae += mean_absolute_error(preds, y_batch)
            total_ccc += concordance_correlation_coefficient(preds, y_batch)
    n_batches = len(loader)
    return total_loss/n_batches, total_mae/n_batches, total_ccc/n_batches

# 7. Training Loop
def train_one_epoch(model, loader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0
    for X_batch, y_batch in loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    scheduler.step()
    return total_loss / len(loader)

# 8. Hyperparameter tuning
def generate_random_configs(search_space, num_configs=10):
    configs = []
    for _ in range(num_configs):
        config = {
            "embed_dim": np.random.choice(search_space["embed_dim"]),
            "num_heads": np.random.choice(search_space["num_heads"]),
            "num_layers": np.random.choice(search_space["num_layers"]),
            "dropout": np.random.choice(search_space["dropout"]),
            "lr": np.random.choice(search_space["lr"]),
            "batch_size": np.random.choice(search_space["batch_size"]),
            "weight_decay": np.random.choice(search_space["weight_decay"])
        }
        configs.append(config)
    return configs

def cross_validate(config, X_tensor, y_tensor, num_folds=3, epochs=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    fold_metrics = {"val_ccc": [], "val_mae": [], "val_loss": []}

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_tensor)):
        print(f"\n=== Fold {fold+1}/{num_folds} ===")
        X_train_fold, y_train_fold = X_tensor[train_idx], y_tensor[train_idx]
        X_val_fold, y_val_fold = X_tensor[val_idx], y_tensor[val_idx]

        train_loader = DataLoader(AudioDataset(X_train_fold, y_train_fold), batch_size=int(config["batch_size"]), shuffle=True)
        val_loader = DataLoader(AudioDataset(X_val_fold, y_val_fold), batch_size=int(config["batch_size"]), shuffle=False)

        model = SimpleTransformerRegressor(
            input_dim=X_tensor.shape[1],
            embed_dim=config["embed_dim"],
            num_heads=config["num_heads"],
            num_layers=config["num_layers"],
            dropout=config["dropout"]
        ).to(device)

        optimizer = optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
        criterion = nn.MSELoss()

        best_ccc = -1
        for epoch in range(epochs):
            train_loss = train_one_epoch(model, train_loader, optimizer, scheduler, criterion, device)
            val_loss, val_mae, val_ccc = evaluate(model, val_loader, criterion, device)
            print(f"Epoch {epoch+1}/{epochs} - Val Loss: {val_loss:.4f} - Val MAE: {val_mae:.4f} - Val CCC: {val_ccc:.4f}")

            if val_ccc > best_ccc:
                best_ccc = val_ccc
                best_metrics = (val_loss, val_mae, val_ccc)

        fold_metrics["val_loss"].append(best_metrics[0])
        fold_metrics["val_mae"].append(best_metrics[1])
        fold_metrics["val_ccc"].append(best_metrics[2])

    return {
        "ccc": np.mean(fold_metrics["val_ccc"]),
        "mae": np.mean(fold_metrics["val_mae"]),
        "loss": np.mean(fold_metrics["val_loss"])
    }

def hyperparameter_tuning(X_tensor, y_tensor, num_configs=10):
    search_space = {
        "embed_dim": [128, 256, 512],
        "num_heads": [2, 4, 8],
        "num_layers": [2, 4],
        "dropout": [0.1, 0.3, 0.5],
        "lr": [1e-4, 3e-4, 1e-3],
        "batch_size": [32, 64],
        "weight_decay": [1e-5, 1e-4]
    }
    configs = generate_random_configs(search_space, num_configs)
    best_config = None
    best_ccc = -1
    results = []

    for i, config in enumerate(configs):
        print(f"\n=== Testing Config {i+1}/{len(configs)} ===")
        print(config)

        metrics = cross_validate(config, X_tensor, y_tensor)
        results.append((config, metrics))

        if metrics["ccc"] > best_ccc:
            best_ccc = metrics["ccc"]
            best_config = config

        print(f"Config {i+1} Metrics - CCC: {metrics['ccc']:.4f}, MAE: {metrics['mae']:.4f}, Loss: {metrics['loss']:.4f}")

    results_df = pd.DataFrame([
        {"config": str(c), **m} for c, m in results
    ])
    results_df.to_csv("hyperparameter_tuning_audio_hc.csv", index=False)

    return best_config

# 9. Start hyperparameter tuning
best_config = hyperparameter_tuning(X_tensor, y_tensor, num_configs=10)
print("\nBest Config Found:", best_config)

# 10. Final training and evaluation
def final_train_and_evaluate(best_config, X_train, y_train, X_val, y_val, X_test, y_test, epochs=30):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    X_final_train = torch.cat([X_train, X_val], dim=0)
    y_final_train = torch.cat([y_train, y_val], dim=0)

    final_train_loader = DataLoader(AudioDataset(X_final_train, y_final_train), batch_size=int(best_config["batch_size"]), shuffle=True)
    test_loader = DataLoader(AudioDataset(X_test, y_test), batch_size=int(best_config["batch_size"]), shuffle=False)

    model = SimpleTransformerRegressor(
        input_dim=X_train.shape[1],
        embed_dim=best_config["embed_dim"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        dropout=best_config["dropout"]
    ).to(device)

    optimizer = optim.AdamW(model.parameters(), lr=best_config["lr"], weight_decay=best_config["weight_decay"])
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        train_loss = train_one_epoch(model, final_train_loader, optimizer, scheduler, criterion, device)
        print(f"Final Train Epoch {epoch+1}/{epochs} - Loss: {train_loss:.4f}")

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            all_preds.append(preds)
            all_labels.append(y_batch)

    all_preds = torch.cat(all_preds, dim=0)
    all_labels = torch.cat(all_labels, dim=0)

    final_mae = mean_absolute_error(all_preds, all_labels)
    final_ccc = concordance_correlation_coefficient(all_preds, all_labels)

    tolerance = 0.1
    correct = torch.abs(all_preds - all_labels) < tolerance
    final_accuracy = correct.float().mean().item()

    print("\n==== Final Evaluation on Test Set ====")
    print(f"Final Test CCC: {final_ccc:.4f}")
    print(f"Final Test MAE: {final_mae:.4f}")
    print(f"Final Test Accuracy (within {tolerance}): {final_accuracy*100:.2f}%")# Save model
    torch.save(model.state_dict(), "best_audio_transformer_model.pth")
    print("\nModel saved to 'best_audio_transformer_model.pth'.")

final_train_and_evaluate(best_config, X_train, y_train, X_val, y_val, X_test, y_test)


In [None]:
# Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split, KFold
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

# 1. Load dataset
df = pd.read_csv('/kaggle/input/fi-v2-test-val-data/FI V2 COMPLETE FEATURE DATASET/HANDCRAFTED/video_hc_features.csv')  # <-- VIDEO HC path

# Drop unnecessary non-numeric columns
df = df.select_dtypes(include=[np.number])

# 2. Define features and labels
label_columns = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

X = df.drop(columns=label_columns)
y = df[label_columns]

# Fill missing values if any
X = X.fillna(X.mean())
y = y.fillna(y.mean())

# Normalize features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Convert to tensors
X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)

# 3. Train/Val/Test Split
train_idx, temp_idx = train_test_split(range(len(X)), test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

X_train, y_train = X_tensor[train_idx], y_tensor[train_idx]
X_val, y_val = X_tensor[val_idx], y_tensor[val_idx]
X_test, y_test = X_tensor[test_idx], y_tensor[test_idx]

# 4. Dataset and DataLoader
class VideoDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# 5. Model (Same as before)
class SimpleTransformerRegressor(nn.Module):
    def __init__(self, input_dim, embed_dim=256, num_heads=4, num_layers=2, dropout=0.3):
        super(SimpleTransformerRegressor, self).__init__()
        self.project = nn.Linear(input_dim, embed_dim)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout),
            num_layers=num_layers
        )
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 5)
        )

    def forward(self, x):
        x = self.project(x)
        x = x.unsqueeze(0)
        x = self.encoder(x)
        x = x.squeeze(0)
        return self.classifier(x)

# 6–8: Metrics, training functions, and hyperparameter tuning — same as before

# Just rename the dataset used for clarity and the CSV/model output
def hyperparameter_tuning(X_tensor, y_tensor, num_configs=10):
    search_space = {
        "embed_dim": [128, 256, 512],
        "num_heads": [2, 4, 8],
        "num_layers": [2, 4],
        "dropout": [0.1, 0.3, 0.5],
        "lr": [1e-4, 3e-4, 1e-3],
        "batch_size": [32, 64],
        "weight_decay": [1e-5, 1e-4]
    }
    configs = generate_random_configs(search_space, num_configs)
    best_config = None
    best_ccc = -1
    results = []

    for i, config in enumerate(configs):
        print(f"\n=== Testing Config {i+1}/{len(configs)} ===")
        print(config)

        metrics = cross_validate(config, X_tensor, y_tensor)
        results.append((config, metrics))

        if metrics["ccc"] > best_ccc:
            best_ccc = metrics["ccc"]
            best_config = config

        print(f"Config {i+1} Metrics - CCC: {metrics['ccc']:.4f}, MAE: {metrics['mae']:.4f}, Loss: {metrics['loss']:.4f}")

    results_df = pd.DataFrame([
        {"config": str(c), **m} for c, m in results
    ])
    results_df.to_csv("hyperparameter_tuning_video_hc.csv", index=False)

    return best_config

# 9. Run tuning
best_config = hyperparameter_tuning(X_tensor, y_tensor, num_configs=10)
print("\nBest Config Found:", best_config)

# 10. Final training
def final_train_and_evaluate(best_config, X_train, y_train, X_val, y_val, X_test, y_test, epochs=30):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    X_final_train = torch.cat([X_train, X_val], dim=0)
    y_final_train = torch.cat([y_train, y_val], dim=0)

    final_train_loader = DataLoader(VideoDataset(X_final_train, y_final_train), batch_size=int(best_config["batch_size"]), shuffle=True)
    test_loader = DataLoader(VideoDataset(X_test, y_test), batch_size=int(best_config["batch_size"]), shuffle=False)

    model = SimpleTransformerRegressor(
        input_dim=X_train.shape[1],
        embed_dim=best_config["embed_dim"],
        num_heads=best_config["num_heads"],
        num_layers=best_config["num_layers"],
        dropout=best_config["dropout"]
    ).to(device)

    optimizer = optim.AdamW(model.parameters(), lr=best_config["lr"], weight_decay=best_config["weight_decay"])
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        train_loss = train_one_epoch(model, final_train_loader, optimizer, scheduler, criterion, device)
        print(f"Final Train Epoch {epoch+1}/{epochs} - Loss: {train_loss:.4f}")

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            all_preds.append(preds)
            all_labels.append(y_batch)

    all_preds = torch.cat(all_preds, dim=0)
    all_labels = torch.cat(all_labels, dim=0)

    final_mae = mean_absolute_error(all_preds, all_labels)
    final_ccc = concordance_correlation_coefficient(all_preds, all_labels)

    tolerance = 0.1
    correct = torch.abs(all_preds - all_labels) < tolerance
    final_accuracy = correct.float().mean().item()

    print("\n==== Final Evaluation on Test Set ====")
    print(f"Final Test CCC: {final_ccc:.4f}")
    print(f"Final Test MAE: {final_mae:.4f}")
    print(f"Final Test Accuracy (within {tolerance}): {final_accuracy*100:.2f}%")
    
    torch.save(model.state_dict(), "best_video_transformer_model.pth")
    print("\nModel saved to 'best_video_transformer_model.pth'.")

final_train_and_evaluate(best_config, X_train, y_train, X_val, y_val, X_test, y_test)


In [2]:
# --- Imports ---
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import shap # Make sure you have shap installed: pip install shap
import os
import gc # Garbage collector
import time # For timing
import traceback # For detailed error printing

# --- Configuration ---
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    # Note: SHAP might not always be deterministic on GPU even with seeds
    # torch.backends.cudnn.deterministic = True # Can sometimes cause issues or slow down
    # torch.backends.cudnn.benchmark = False

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# --- Paths ---
ORIGINAL_DATA_PATH = '/kaggle/input/fi-v2-test-val-data/FI V2 COMPLETE FEATURE DATASET/HANDCRAFTED/text_hc_features.csv' # Or /kaggle/working/text_hc_features.csv if it's there

NEW_DATA_PATH = '/kaggle/input/fi-v2-hc-dataset-for-shapley/renamed_text_hc_features.csv'

MODEL_SAVE_DIR = '/kaggle/working/'

PLOTS_SAVE_DIR = '/kaggle/working/shap/text/'

# --- Constants ---
label_columns = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]
# Columns to drop from BOTH datasets (ensure consistency)
cols_to_drop = ["Filename", "Segment_ID", "interview", "Gender", "Ethnicity", "AgeGroup"]

# --- SHAP Configuration ---
# Samples from ORIGINAL training data split for background summary (KMeans)
SHAP_BACKGROUND_SAMPLES = 100
# Samples from the NEW data to explain. Reduce if kernel dies (RAM issue).
SHAP_EXPLAIN_SAMPLES = 100
# Samples per explanation point for KernelExplainer. More = more accurate but MUCH slower.
SHAP_KERNEL_NSAMPLES = 100

# --- Ensure Plots Directory Exists ---
os.makedirs(PLOTS_SAVE_DIR, exist_ok=True)

class SimpleTransformerRegressor(nn.Module):
    """
    Simple Transformer Regressor using batch_first=True convention.
    Takes tabular features, projects them, passes through a Transformer Encoder,
    and predicts a single regression value.
    """
    def __init__(self, input_dim, embed_dim=256, num_heads=4, num_layers=2, dropout=0.3, ff_dim_multiplier=4):
        """
        Args:
            input_dim (int): Number of input features.
            embed_dim (int): Dimension for projecting features and for the Transformer. Must be divisible by num_heads.
            num_heads (int): Number of attention heads in the Transformer.
            num_layers (int): Number of layers in the Transformer Encoder.
            dropout (float): Dropout rate.
            ff_dim_multiplier (int): Multiplier for the feed-forward layer dimension within the Transformer.
        """
        super(SimpleTransformerRegressor, self).__init__()

        # Ensure embed_dim is divisible by num_heads
        if embed_dim % num_heads != 0:
            # Adjust embed_dim up to the nearest multiple of num_heads
            original_embed_dim = embed_dim
            embed_dim = (embed_dim // num_heads + 1) * num_heads
            print(f"Warning: embed_dim ({original_embed_dim}) not divisible by num_heads ({num_heads}).")
            print(f"Adjusted embed_dim to {embed_dim}.")

        self.input_dim = input_dim
        self.embed_dim = embed_dim

        # Project input features to embedding dimension
        self.project = nn.Linear(input_dim, embed_dim)

        # Define the Transformer Encoder Layer with batch_first=True
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * ff_dim_multiplier, # Standard practice
            dropout=dropout,
            batch_first=True  # <<< Input tensor shape: (batch, seq_len, features)
        )

        # Stack the encoder layers
        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=num_layers
        )

        # Classifier head
        self.classifier = nn.Sequential(
            nn.LayerNorm(embed_dim),      # Add LayerNorm for stability before classifier
            nn.Linear(embed_dim, 128),    # Linear layer 1
            nn.ReLU(),                    # Activation
            nn.Dropout(dropout),          # Dropout
            nn.Linear(128, 1)             # Final output layer (regression target)
        )

    def forward(self, x):
        """
        Forward pass.
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, input_dim).
        Returns:
            torch.Tensor: Output tensor of shape (batch_size).
        """
        # 1. Project features
        # x shape: (batch_size, input_dim)
        x = self.project(x)
        # x shape: (batch_size, embed_dim)

        # 2. Add sequence dimension for Transformer
        # TransformerEncoderLayer with batch_first=True expects (batch, seq_len, features)
        x = x.unsqueeze(1)
        # x shape: (batch_size, seq_len=1, embed_dim)

        # 3. Pass through Transformer Encoder
        x = self.encoder(x)
        # x shape: (batch_size, seq_len=1, embed_dim)

        # 4. Remove sequence dimension
        x = x.squeeze(1)
        # x shape: (batch_size, embed_dim)

        # 5. Pass through classifier
        output = self.classifier(x)
        # output shape: (batch_size, 1)

        # 6. Squeeze final dimension for regression output
        return output.squeeze(-1)
        # final shape: (batch_size)



# --- Step 1 & 2: Get Feature Names ---
print("Loading feature names...")
try:
    df_orig_temp = pd.read_csv(ORIGINAL_DATA_PATH)
    X_orig_temp = df_orig_temp.drop(columns=[col for col in cols_to_drop if col in df_orig_temp.columns], errors='ignore')
    X_orig_temp = X_orig_temp.drop(columns=[col for col in label_columns if col in X_orig_temp.columns], errors='ignore')
    original_numeric_feature_order = list(X_orig_temp.columns)
    num_original_features = len(original_numeric_feature_order)
    print(f"Determined original feature order (numeric headers). Count: {num_original_features}")
    del df_orig_temp, X_orig_temp
    gc.collect()
except FileNotFoundError:
    print(f"Error: Original dataset file not found at {ORIGINAL_DATA_PATH}. Cannot proceed.")
    exit()
except Exception as e:
    print(f"Error reading original data file {ORIGINAL_DATA_PATH}: {e}")
    exit()

try:
    df_new_temp = pd.read_csv(NEW_DATA_PATH)
    X_new_temp = df_new_temp.drop(columns=[col for col in cols_to_drop if col in df_new_temp.columns], errors='ignore')
    X_new_temp = X_new_temp.drop(columns=[col for col in label_columns if col in X_new_temp.columns], errors='ignore')
    descriptive_feature_names = list(X_new_temp.columns)
    num_new_features = len(descriptive_feature_names)
    print(f"Found descriptive feature names in new data. Count: {num_new_features}")
    del df_new_temp, X_new_temp
    gc.collect()
except FileNotFoundError:
    print(f"Error: New dataset file not found at {NEW_DATA_PATH}. Cannot proceed.")
    exit()
except Exception as e:
    print(f"Error reading new data file {NEW_DATA_PATH}: {e}")
    exit()

# --- Step 3: Verification ---
print("Verifying feature count consistency...")
if num_original_features != num_new_features:
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("! ERROR: Mismatch in feature counts between original and new data!")
    print(f"! Original data features: {num_original_features}, New data features: {num_new_features}")
    print("! Cannot proceed. Ensure both CSVs have the same features after dropping metadata/labels.")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    exit()
else:
    print("Feature counts match. Proceeding.")
input_dim = num_original_features


# --- Step 4: Load Model Function ---
def load_trained_model(trait_name, input_dim, device):
    """Loads a trained model state and configuration."""
    model_path = os.path.join(MODEL_SAVE_DIR, f"best_text_transformer_model_{trait_name}.pth")
    if not os.path.exists(model_path):
        print(f"Error: Model file not found for trait '{trait_name}' at {model_path}")
        return None, None, None
    print(f"Loading checkpoint for {trait_name} from {model_path}...")
    try:
        # Load onto CPU first, explicitly set weights_only=False as checkpoint contains non-tensor data
        checkpoint = torch.load(model_path, map_location='cpu', weights_only=False) # <<< CRITICAL FIX
    except Exception as e:
        print(f"Error loading checkpoint file {model_path}: {e}")
        # Print detailed traceback if loading fails
        traceback.print_exc()
        return None, None, None

    required_keys = ['best_config', 'model_state_dict', 'scaler_mean', 'scaler_scale']
    if not all(key in checkpoint for key in required_keys):
        missing = [key for key in required if key not in checkpoint]
        print(f"Error: Checkpoint for {trait_name} is missing required keys: {missing}.")
        return None, None, None

    config = checkpoint['best_config']
    scaler_mean = np.array(checkpoint['scaler_mean']) # Ensure numpy array
    scaler_scale = np.array(checkpoint['scaler_scale']) # Ensure numpy array

    required_config_keys = ['embed_dim', 'num_heads', 'num_layers', 'dropout']
    if not all(key in config for key in required_config_keys):
        print(f"Error: Checkpoint config for {trait_name} is missing required model parameters: {required_config_keys}")
        return None, None, None

    try:
        model = SimpleTransformerRegressor(
            input_dim=input_dim, embed_dim=config["embed_dim"], num_heads=config["num_heads"],
            num_layers=config["num_layers"], dropout=config["dropout"]
            # ff_dim_multiplier might not be in older configs, handle potential KeyError or provide default
            # ff_dim_multiplier=config.get("ff_dim_multiplier", 4) # Example handling
        )
        model.load_state_dict(checkpoint['model_state_dict'])
        model = model.to(device)
        model.eval()
        print(f"Model for {trait_name} loaded successfully.")
    except Exception as e:
        print(f"Error reconstructing or loading model state for {trait_name}: {e}")
        traceback.print_exc() # Print detailed error for state_dict issues
        return None, None, None

    if scaler_mean.shape[0] != input_dim or scaler_scale.shape[0] != input_dim:
         print(f"Error: Loaded scaler params shape mismatch for {trait_name}. Mean: {scaler_mean.shape}, Scale: {scaler_scale.shape}, Expected: {input_dim}")
         return None, None, None

    return model, scaler_mean, scaler_scale


# --- Step 5: Preprocessing Function for New Data ---
def preprocess_explanation_data(df_new, target_feature_order, scaler_mean, scaler_scale):
    """Preprocesses the new dataframe using descriptive feature order and loaded scaler parameters."""
    print(f"Preprocessing explanation data. Initial shape: {df_new.shape}")
    try:
        df_features = df_new[target_feature_order].copy()
    except KeyError as e:
        print(f"Error: Columns mismatch during preprocessing. Missing: {e}. Ensure NEW data has all descriptive columns.")
        return None, None
    if len(scaler_mean) != len(target_feature_order) or len(scaler_scale) != len(target_feature_order):
         print(f"Error: Scaler param length mismatch. Mean: {len(scaler_mean)}, Scale: {len(scaler_scale)}, Features: {len(target_feature_order)}")
         return None, None

    original_means_series = pd.Series(scaler_mean, index=target_feature_order)
    df_features_filled = df_features.fillna(original_means_series)

    if df_features_filled.isnull().sum().sum() > 0:
        print(f"Info: Found {df_features_filled.isnull().sum().sum()} NaNs after filling with means. Filling remaining with 0.")
        df_features_filled = df_features_filled.fillna(0)

    try:
        X_new_scaled_values = (df_features_filled.values - scaler_mean) / scaler_scale
    except ValueError as e:
        print(f"Error during scaling: {e}. Check shapes: df values {df_features_filled.values.shape}, mean {scaler_mean.shape}, scale {scaler_scale.shape}")
        return None, None

    X_new_tensor = torch.tensor(X_new_scaled_values, dtype=torch.float32)
    print(f"Preprocessing complete. Final tensor shape: {X_new_tensor.shape}")
    return X_new_tensor, df_features_filled # Return processed df for SHAP labels


# --- Step 6: Prepare Background Data (Once before loop) ---
print("\nPreparing background data for SHAP...")
background_data = None # Initialize
try:
    # Load scaler params from one representative model (e.g., openness)
    _, bg_scaler_mean, bg_scaler_scale = load_trained_model(label_columns[0], input_dim, DEVICE)
    if bg_scaler_mean is None or bg_scaler_scale is None:
        raise ValueError(f"Failed to load scaler parameters from {label_columns[0]} model checkpoint.")

    df_orig_background = pd.read_csv(ORIGINAL_DATA_PATH)
    X_orig_background = df_orig_background.drop(columns=[col for col in cols_to_drop if col in df_orig_background.columns], errors='ignore')
    X_orig_background = X_orig_background.drop(columns=[col for col in label_columns if col in X_orig_background.columns], errors='ignore')
    X_orig_background = X_orig_background[original_numeric_feature_order] # Use numeric order

    orig_means_series_bg = pd.Series(bg_scaler_mean, index=original_numeric_feature_order)
    X_orig_background_filled = X_orig_background.fillna(orig_means_series_bg)
    if X_orig_background_filled.isnull().sum().sum() > 0:
        print(f"Info: Found {X_orig_background_filled.isnull().sum().sum()} NaNs in background data after filling with means. Filling remaining with 0.")
        X_orig_background_filled = X_orig_background_filled.fillna(0)

    X_orig_background_scaled = (X_orig_background_filled.values - bg_scaler_mean) / bg_scaler_scale
    X_orig_background_tensor = torch.tensor(X_orig_background_scaled, dtype=torch.float32)

    train_indices, _ = train_test_split(range(len(X_orig_background_tensor)), test_size=0.3, random_state=SEED)
    X_train_orig_tensor = X_orig_background_tensor[train_indices]

    num_background_available = X_train_orig_tensor.shape[0]
    actual_background_samples = min(SHAP_BACKGROUND_SAMPLES, num_background_available)
    if actual_background_samples < SHAP_BACKGROUND_SAMPLES:
        print(f"Warning: Requested {SHAP_BACKGROUND_SAMPLES} background samples, but only {num_background_available} available. Using {actual_background_samples}.")

    background_indices = np.random.choice(num_background_available, actual_background_samples, replace=False)
    # Keep background data on CPU for potential numpy operations (like kmeans)
    background_data_cpu = X_train_orig_tensor[background_indices].cpu() # Store on CPU
    print(f"Background data prepared. Shape: {background_data_cpu.shape}")

    del df_orig_background, X_orig_background, X_orig_background_filled, X_orig_background_scaled
    del X_orig_background_tensor, X_train_orig_tensor, bg_scaler_mean, bg_scaler_scale
    gc.collect()

except FileNotFoundError:
    print(f"Error: Original dataset file not found at {ORIGINAL_DATA_PATH}. Cannot prepare background data.")
    exit()
except Exception as e:
    print(f"FATAL ERROR during background data preparation: {e}")
    traceback.print_exc()
    exit()


# --- Step 7: Load Explanation Data (Once before loop) ---
print(f"\n--- Loading Explanation Data from {NEW_DATA_PATH} ---")
try:
    df_explain_full = pd.read_csv(NEW_DATA_PATH)
    print(f"Loaded explanation dataset. Full shape: {df_explain_full.shape}")
except FileNotFoundError:
    print(f"Error: Explanation dataset file not found at {NEW_DATA_PATH}. Cannot proceed.")
    exit()
except Exception as e:
    print(f"Error reading explanation dataset {NEW_DATA_PATH}: {e}")
    exit()

# --- Step 8: SHAP Analysis Loop ---
print(f"\n--- Starting SHAP Analysis Loop (Using KernelExplainer) ---")
if background_data_cpu is None:
     print("FATAL ERROR: Background data was not prepared successfully. Stopping.")
     exit()

for trait in label_columns:
    print(f"\n--- Processing Trait: {trait.upper()} ---")
    trait_start_time = time.time()
    # Initialize variables for cleanup in finally block
    model = None
    scaler_mean = None
    scaler_scale = None
    X_explain_tensor = None
    df_explain_processed = None
    df_explain_sample = None
    explainer = None
    shap_values = None
    X_explain_numpy = None
    background_summary = None

    try:
        # 1. Load Model and Scaler for the current trait
        model, scaler_mean, scaler_scale = load_trained_model(trait, input_dim, DEVICE)
        if model is None:
            print(f"Skipping trait {trait} due to model loading issues.")
            continue

        # 2. Sample and Preprocess Explanation Data for this trait
        num_explain_available = len(df_explain_full)
        actual_explain_samples = min(SHAP_EXPLAIN_SAMPLES, num_explain_available)
        if actual_explain_samples < num_explain_available:
             print(f"Sampling {actual_explain_samples} rows from explanation data for analysis.")
             df_explain_sample = df_explain_full.sample(n=actual_explain_samples, random_state=SEED)
        else:
             print(f"Using all {num_explain_available} rows from explanation data.")
             df_explain_sample = df_explain_full

        X_explain_tensor, df_explain_processed = preprocess_explanation_data(
            df_explain_sample.copy(), descriptive_feature_names, scaler_mean, scaler_scale
        )
        if X_explain_tensor is None:
            print(f"Skipping trait {trait} due to preprocessing errors.")
            continue

        # 3. Initialize SHAP KernelExplainer
        print("Initializing SHAP KernelExplainer...")
        kernel_explainer_start_time = time.time()

        # Define prediction wrapper (takes numpy, returns numpy)
        def predict_wrapper_numpy(x_np):
            x_tensor = torch.tensor(x_np, dtype=torch.float32).to(DEVICE)
            with torch.no_grad():
                predictions = model(x_tensor)
            return predictions.cpu().numpy()

        # Summarize background data
        print(f"Summarizing background data ({background_data_cpu.shape[0]} samples) using k-means...")
        num_clusters = min(25, background_data_cpu.shape[0]) # Adjust cluster count if needed
        background_data_np = background_data_cpu.numpy() # Convert background tensor (CPU) to numpy
        background_summary = shap.kmeans(background_data_np, num_clusters) # Run kmeans

        # --- DEBUG INSPECTION of kmeans output ---
        print(f"DEBUG: Type of background_summary: {type(background_summary)}")
        print(f"DEBUG: Attributes of background_summary: {dir(background_summary)}")
        num_means = 0
        summary_data_valid = False
        if hasattr(background_summary, 'data') and isinstance(background_summary.data, np.ndarray):
            num_means = background_summary.data.shape[0]
            print(f"DEBUG: background_summary.data found. Shape: {background_summary.data.shape}")
            print(f"Background data summarized to {num_means} means.")
            summary_data_valid = True
        else:
            print("DEBUG: background_summary.data attribute not found or not a numpy array.")
        # --- END DEBUG INSPECTION ---

        # Proceed only if summary data seems valid
        if summary_data_valid:
            # Initialize Explainer - Pass the summary object directly
            explainer = shap.KernelExplainer(predict_wrapper_numpy, background_summary)

            # 4. Calculate SHAP values
            X_explain_numpy = X_explain_tensor.cpu().numpy() # Use CPU numpy data for explanation
            print(f"Calculating SHAP values using KernelExplainer for {X_explain_numpy.shape[0]} samples (nsamples={SHAP_KERNEL_NSAMPLES})... BE PATIENT!")
            shap_values = explainer.shap_values(X_explain_numpy, nsamples=SHAP_KERNEL_NSAMPLES)
            print(f"SHAP values calculated. Shape: {np.shape(shap_values)}") # Use np.shape for robustness
            kernel_explainer_time = time.time() - kernel_explainer_start_time
            print(f"KernelExplainer calculation took {kernel_explainer_time:.2f} seconds.")

            # 5. Generate and Save Summary Plot
            print("Generating SHAP summary plot...")
            plt.figure(figsize=(10, 8)) # Adjust figure size if needed
            shap.summary_plot(
                shap_values,
                features=df_explain_processed, # Use processed df for values
                feature_names=descriptive_feature_names, # Use descriptive names for labels
                max_display=20, # Show top N features
                show=False
            )
            plt.title(f'SHAP Summary Plot ({trait.capitalize()}) - Text Features')
            plt.tight_layout(rect=[0, 0.03, 1, 0.95])
            plot_filename = os.path.join(PLOTS_SAVE_DIR, f'shap_summary_kernel_{trait}.png')
            plt.savefig(plot_filename, dpi=150, bbox_inches='tight')
            plt.close() # Close figure to free memory
            print(f"SHAP summary plot saved to {plot_filename}")

        else:
            # Handle case where summary data was invalid
            print("ERROR: Could not validate structure of shap.kmeans output from debug info.")
            print("Cannot proceed with KernelExplainer initialization or SHAP value calculation.")
            # No need to raise error here, loop will continue to finally block

    except Exception as e:
        print(f"ERROR during SHAP processing for trait {trait}: {e}")
        print("--- Traceback ---")
        traceback.print_exc()
        print("--- End Traceback ---")
        print(f"Skipping plot generation for {trait} due to error.")

    finally:
        # 6. Clean up memory for the next trait
        print(f"Cleaning up memory after trait {trait}...")
        # Use 'in locals()' or 'in globals()' to check before deleting
        # Check for None as well, as variables might be assigned None on error
        if 'model' in locals() and model is not None: del model
        if 'scaler_mean' in locals() and scaler_mean is not None: del scaler_mean
        if 'scaler_scale' in locals() and scaler_scale is not None: del scaler_scale
        if 'X_explain_tensor' in locals() and X_explain_tensor is not None: del X_explain_tensor
        if 'df_explain_processed' in locals() and df_explain_processed is not None: del df_explain_processed
        if 'df_explain_sample' in locals() and df_explain_sample is not None: del df_explain_sample
        if 'explainer' in locals() and explainer is not None: del explainer
        if 'shap_values' in locals() and shap_values is not None: del shap_values
        if 'X_explain_numpy' in locals() and X_explain_numpy is not None: del X_explain_numpy
        if 'background_summary' in locals() and background_summary is not None: del background_summary
        # Note: background_data_cpu is kept for the next loop's kmeans

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    trait_time = time.time() - trait_start_time
    print(f"Finished processing {trait}. Total time: {trait_time:.2f}s")


print("\n--- SHAP Analysis Complete ---")
print(f"Plots saved in: {PLOTS_SAVE_DIR}")

Using device: cuda
Loading feature names...
Determined original feature order (numeric headers). Count: 128
Found descriptive feature names in new data. Count: 128
Verifying feature count consistency...
Feature counts match. Proceeding.

Preparing background data for SHAP...
Loading checkpoint for openness from /kaggle/working/best_text_transformer_model_openness.pth...
Model for openness loaded successfully.
Background data prepared. Shape: torch.Size([100, 128])

--- Loading Explanation Data from /kaggle/input/fi-v2-hc-dataset-for-shapley/renamed_text_hc_features.csv ---
Loaded explanation dataset. Full shape: (10000, 138)

--- Starting SHAP Analysis Loop (Using KernelExplainer) ---

--- Processing Trait: OPENNESS ---
Loading checkpoint for openness from /kaggle/working/best_text_transformer_model_openness.pth...
Model for openness loaded successfully.
Sampling 100 rows from explanation data for analysis.
Preprocessing explanation data. Initial shape: (100, 138)
Preprocessing complet

  0%|          | 0/100 [00:00<?, ?it/s]

SHAP values calculated. Shape: (100, 128)
KernelExplainer calculation took 16.19 seconds.
Generating SHAP summary plot...
SHAP summary plot saved to /kaggle/working/shap/text/shap_summary_kernel_openness.png
Cleaning up memory after trait openness...
Finished processing openness. Total time: 17.13s

--- Processing Trait: CONSCIENTIOUSNESS ---
Loading checkpoint for conscientiousness from /kaggle/working/best_text_transformer_model_conscientiousness.pth...
Model for conscientiousness loaded successfully.
Sampling 100 rows from explanation data for analysis.
Preprocessing explanation data. Initial shape: (100, 138)
Preprocessing complete. Final tensor shape: torch.Size([100, 128])
Initializing SHAP KernelExplainer...
Summarizing background data (100 samples) using k-means...
DEBUG: Type of background_summary: <class 'shap.utils._legacy.DenseData'>
DEBUG: Attributes of background_summary: ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__g

  0%|          | 0/100 [00:00<?, ?it/s]

Regressors in active set degenerate. Dropping a regressor, after 2 iterations, i.e. alpha=4.644e-03, with an active set of 2 regressors, and the smallest cholesky pivot element being 2.220e-16. Reduce max_iter or increase eps parameters.
Regressors in active set degenerate. Dropping a regressor, after 21 iterations, i.e. alpha=7.678e-04, with an active set of 15 regressors, and the smallest cholesky pivot element being 2.220e-16. Reduce max_iter or increase eps parameters.
Regressors in active set degenerate. Dropping a regressor, after 36 iterations, i.e. alpha=1.855e-04, with an active set of 28 regressors, and the smallest cholesky pivot element being 2.220e-16. Reduce max_iter or increase eps parameters.
Early stopping the lars path, as the residues are small and the current value of alpha is no longer well controlled. 38 iterations, alpha=1.789e-04, previous alpha=1.751e-04, with an active set of 27 regressors.


SHAP values calculated. Shape: (100, 128)
KernelExplainer calculation took 16.27 seconds.
Generating SHAP summary plot...
SHAP summary plot saved to /kaggle/working/shap/text/shap_summary_kernel_conscientiousness.png
Cleaning up memory after trait conscientiousness...
Finished processing conscientiousness. Total time: 17.29s

--- Processing Trait: EXTRAVERSION ---
Loading checkpoint for extraversion from /kaggle/working/best_text_transformer_model_extraversion.pth...
Model for extraversion loaded successfully.
Sampling 100 rows from explanation data for analysis.
Preprocessing explanation data. Initial shape: (100, 138)
Preprocessing complete. Final tensor shape: torch.Size([100, 128])
Initializing SHAP KernelExplainer...
Summarizing background data (100 samples) using k-means...
DEBUG: Type of background_summary: <class 'shap.utils._legacy.DenseData'>
DEBUG: Attributes of background_summary: ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__

  0%|          | 0/100 [00:00<?, ?it/s]

SHAP values calculated. Shape: (100, 128)
KernelExplainer calculation took 16.07 seconds.
Generating SHAP summary plot...
SHAP summary plot saved to /kaggle/working/shap/text/shap_summary_kernel_extraversion.png
Cleaning up memory after trait extraversion...
Finished processing extraversion. Total time: 17.08s

--- Processing Trait: AGREEABLENESS ---
Loading checkpoint for agreeableness from /kaggle/working/best_text_transformer_model_agreeableness.pth...
Model for agreeableness loaded successfully.
Sampling 100 rows from explanation data for analysis.
Preprocessing explanation data. Initial shape: (100, 138)
Preprocessing complete. Final tensor shape: torch.Size([100, 128])
Initializing SHAP KernelExplainer...
Summarizing background data (100 samples) using k-means...
DEBUG: Type of background_summary: <class 'shap.utils._legacy.DenseData'>
DEBUG: Attributes of background_summary: ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getat

  0%|          | 0/100 [00:00<?, ?it/s]

SHAP values calculated. Shape: (100, 128)
KernelExplainer calculation took 16.19 seconds.
Generating SHAP summary plot...
SHAP summary plot saved to /kaggle/working/shap/text/shap_summary_kernel_agreeableness.png
Cleaning up memory after trait agreeableness...
Finished processing agreeableness. Total time: 17.17s

--- Processing Trait: NEUROTICISM ---
Loading checkpoint for neuroticism from /kaggle/working/best_text_transformer_model_neuroticism.pth...
Model for neuroticism loaded successfully.
Sampling 100 rows from explanation data for analysis.
Preprocessing explanation data. Initial shape: (100, 138)
Preprocessing complete. Final tensor shape: torch.Size([100, 128])
Initializing SHAP KernelExplainer...
Summarizing background data (100 samples) using k-means...
DEBUG: Type of background_summary: <class 'shap.utils._legacy.DenseData'>
DEBUG: Attributes of background_summary: ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribu

  0%|          | 0/100 [00:00<?, ?it/s]

SHAP values calculated. Shape: (100, 128)
KernelExplainer calculation took 15.86 seconds.
Generating SHAP summary plot...
SHAP summary plot saved to /kaggle/working/shap/text/shap_summary_kernel_neuroticism.png
Cleaning up memory after trait neuroticism...
Finished processing neuroticism. Total time: 16.84s

--- SHAP Analysis Complete ---
Plots saved in: /kaggle/working/shap/text/


In [6]:
# --- Imports ---
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import shap # Make sure you have shap installed: pip install shap
import os
import gc # Garbage collector
import time # For timing
import traceback # For detailed error printing

# --- Configuration ---
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# --- Paths ---

ORIGINAL_DATA_PATH = '/kaggle/input/fi-v2-test-val-data/FI V2 COMPLETE FEATURE DATASET/HANDCRAFTED/video_hc_features.csv'

NEW_DATA_PATH = '/kaggle/input/fi-v2-hc-dataset-for-shapley/renamed_video_hc_features.csv' # e.g., /kaggle/input/my-shap-video-data/renamed_video_hc_features.csv

MODEL_SAVE_DIR = '/kaggle/working/'

PLOTS_SAVE_DIR = '/kaggle/working/shap/video/'

# --- Constants ---
label_columns = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

cols_to_drop = ["Filename", "Segment_ID", "interview", "Gender", "Ethnicity", "AgeGroup"]

# --- SHAP Configuration ---
# Samples from ORIGINAL training data split for background summary (KMeans)
SHAP_BACKGROUND_SAMPLES = 100
# Samples from the NEW data to explain. Reduce if kernel dies (RAM issue).
SHAP_EXPLAIN_SAMPLES = 50
# Samples per explanation point for KernelExplainer. More = more accurate but MUCH slower.
SHAP_KERNEL_NSAMPLES = 1000

# --- Ensure Plots Directory Exists ---
os.makedirs(PLOTS_SAVE_DIR, exist_ok=True)

# --- Model Definition (Must be IDENTICAL to your training script) ---
class SimpleTransformerRegressor(nn.Module):
    def __init__(self, input_dim, embed_dim=256, num_heads=4, num_layers=2, dropout=0.3, ff_dim_multiplier=4):
        super(SimpleTransformerRegressor, self).__init__()
        if embed_dim % num_heads != 0:
            original_embed_dim = embed_dim
            embed_dim = (embed_dim // num_heads + 1) * num_heads
            # print(f"Warning: embed_dim ({original_embed_dim}) not divisible by num_heads ({num_heads}). Adjusted to {embed_dim}.")
        self.input_dim = input_dim
        self.embed_dim = embed_dim
        self.project = nn.Linear(input_dim, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads,
            dim_feedforward=embed_dim * ff_dim_multiplier,
            dropout=dropout, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=num_layers)
        self.classifier = nn.Sequential(
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, 128), nn.ReLU(),
            nn.Dropout(dropout), nn.Linear(128, 1)
        )
    def forward(self, x):
        x = self.project(x)
        x = x.unsqueeze(1)
        x = self.encoder(x)
        x = x.squeeze(1)
        output = self.classifier(x)
        return output.squeeze(-1)

# --- Step 1 & 2: Get Feature Names ---
print("Loading feature names...")
try:
    df_orig_temp = pd.read_csv(ORIGINAL_DATA_PATH)
    X_orig_temp = df_orig_temp.drop(columns=[col for col in cols_to_drop if col in df_orig_temp.columns], errors='ignore')
    X_orig_temp = X_orig_temp.drop(columns=[col for col in label_columns if col in X_orig_temp.columns], errors='ignore')
    # Ensure only numeric columns are considered features for order determination
    X_orig_temp = X_orig_temp.select_dtypes(include=np.number)
    original_numeric_feature_order = list(X_orig_temp.columns)
    num_original_features = len(original_numeric_feature_order)
    print(f"Determined original feature order (numeric headers from {os.path.basename(ORIGINAL_DATA_PATH)}). Count: {num_original_features}")
    del df_orig_temp, X_orig_temp
    gc.collect()
except FileNotFoundError:
    print(f"Error: Original dataset file not found at {ORIGINAL_DATA_PATH}. Cannot proceed.")
    exit()
except Exception as e:
    print(f"Error reading original data file {ORIGINAL_DATA_PATH}: {e}")
    traceback.print_exc()
    exit()

try:
    df_new_temp = pd.read_csv(NEW_DATA_PATH)
    X_new_temp = df_new_temp.drop(columns=[col for col in cols_to_drop if col in df_new_temp.columns], errors='ignore')
    X_new_temp = X_new_temp.drop(columns=[col for col in label_columns if col in X_new_temp.columns], errors='ignore')
    # Ensure only numeric columns are considered features for descriptive names
    X_new_temp = X_new_temp.select_dtypes(include=np.number)
    descriptive_feature_names = list(X_new_temp.columns)
    num_new_features = len(descriptive_feature_names)
    print(f"Found descriptive feature names in new data ({os.path.basename(NEW_DATA_PATH)}). Count: {num_new_features}")
    del df_new_temp, X_new_temp
    gc.collect()
except FileNotFoundError:
    print(f"Error: New dataset file not found at {NEW_DATA_PATH}.")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("!!! PLEASE UPDATE THE 'NEW_DATA_PATH' variable in the script to point  !!!")
    print("!!! to your CSV file containing descriptive video feature names.       !!!")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    exit()
except Exception as e:
    print(f"Error reading new data file {NEW_DATA_PATH}: {e}")
    traceback.print_exc()
    exit()

# --- Step 3: Verification ---
print("Verifying feature count consistency...")
if num_original_features != num_new_features:
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("! ERROR: Mismatch in feature counts between original and new data!")
    print(f"! Original data ({os.path.basename(ORIGINAL_DATA_PATH)}) features: {num_original_features}")
    print(f"! New data ({os.path.basename(NEW_DATA_PATH)}) features: {num_new_features}")
    print("! Cannot proceed. Ensure both CSVs have the same number of numeric feature columns")
    print("! after dropping metadata/labels, and that NEW_DATA_PATH is correct.")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    exit()
else:
    print(f"Feature counts match ({num_original_features}). Proceeding.")
input_dim = num_original_features


# --- Step 4: Load Model Function ---
def load_trained_model(trait_name, input_dim_model, device):
    """Loads a trained model state and configuration for VIDEO features."""
    model_path = os.path.join(MODEL_SAVE_DIR, f"best_video_transformer_model_{trait_name}.pth") # VIDEO model name
    if not os.path.exists(model_path):
        print(f"Error: Model file not found for trait '{trait_name}' at {model_path}")
        return None, None, None
    print(f"Loading checkpoint for {trait_name} from {model_path}...")
    try:
        checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
    except Exception as e:
        print(f"Error loading checkpoint file {model_path}: {e}")
        traceback.print_exc()
        return None, None, None

    required_keys = ['best_config', 'model_state_dict', 'scaler_mean', 'scaler_scale']
    if not all(key in checkpoint for key in required_keys):
        missing = [key for key in required_keys if key not in checkpoint]
        print(f"Error: Checkpoint for {trait_name} is missing required keys: {missing}.")
        return None, None, None

    config = checkpoint['best_config']
    # Ensure scaler params are numpy arrays and float type for consistency
    scaler_mean = np.array(checkpoint['scaler_mean'], dtype=np.float64)
    scaler_scale = np.array(checkpoint['scaler_scale'], dtype=np.float64)


    # The ff_dim_multiplier should be handled by the model's default if not in config
    # as SimpleTransformerRegressor has a default for it.
    try:
        model = SimpleTransformerRegressor(
            input_dim=input_dim_model, # Use the input_dim derived from original data
            embed_dim=config["embed_dim"],
            num_heads=config["num_heads"],
            num_layers=config["num_layers"],
            dropout=config["dropout"],
            ff_dim_multiplier=config.get("ff_dim_multiplier", 4) # Use config if available, else default
        )
        model.load_state_dict(checkpoint['model_state_dict'])
        model = model.to(device)
        model.eval()
        print(f"Model for {trait_name} loaded successfully.")
    except Exception as e:
        print(f"Error reconstructing or loading model state for {trait_name}: {e}")
        traceback.print_exc()
        return None, None, None

    if scaler_mean.shape[0] != input_dim_model or scaler_scale.shape[0] != input_dim_model:
         print(f"Error: Loaded scaler params shape mismatch for {trait_name}. Mean: {scaler_mean.shape}, Scale: {scaler_scale.shape}, Expected from data: {input_dim_model}")
         return None, None, None

    return model, scaler_mean, scaler_scale


# --- Step 5: Preprocessing Function for New Data ---
def preprocess_explanation_data(df_new, target_feature_order_desc, scaler_mean_arr, scaler_scale_arr):
    """Preprocesses the new dataframe using descriptive feature order and loaded scaler parameters."""
    print(f"Preprocessing explanation data. Initial shape: {df_new.shape}")
    try:
        # Select only numeric columns that are in target_feature_order_desc
        df_features_numeric = df_new.select_dtypes(include=np.number)
        df_features = df_features_numeric[target_feature_order_desc].copy()
    except KeyError as e:
        missing_cols = [col for col in target_feature_order_desc if col not in df_features_numeric.columns]
        print(f"Error: Columns mismatch during preprocessing. Missing in new data: {missing_cols}. Ensure NEW data has all descriptive columns.")
        return None, None

    if len(scaler_mean_arr) != len(target_feature_order_desc) or len(scaler_scale_arr) != len(target_feature_order_desc):
         print(f"Error: Scaler param length mismatch. Mean: {len(scaler_mean_arr)}, Scale: {len(scaler_scale_arr)}, Features: {len(target_feature_order_desc)}")
         return None, None

    # Fill NaNs: The i-th mean in scaler_mean_arr (from original data) corresponds to the i-th feature in target_feature_order_desc
    # This assumes the order of features in NEW_DATA_PATH columns matches the conceptual order of features in ORIGINAL_DATA_PATH
    means_for_filling = pd.Series(scaler_mean_arr, index=target_feature_order_desc)
    df_features_filled = df_features.fillna(means_for_filling)

    # Handle any remaining NaNs (e.g., if a feature was all NaN in original data, its mean might be NaN)
    if df_features_filled.isnull().sum().sum() > 0:
        print(f"Info: Found {df_features_filled.isnull().sum().sum()} NaNs after filling with original means. Filling remaining with 0 (global column mean might be better if available).")
        df_features_filled = df_features_filled.fillna(0) # Fallback

    try:
        # Scale: Apply scaler_mean_arr and scaler_scale_arr directly.
        # This relies on df_features_filled.values being in the same order as target_feature_order_desc,
        # and scaler_mean_arr/scaler_scale_arr also corresponding to this order.
        X_new_scaled_values = (df_features_filled.values - scaler_mean_arr) / scaler_scale_arr
    except ValueError as e:
        print(f"Error during scaling: {e}. Check shapes: df values {df_features_filled.values.shape}, mean {scaler_mean_arr.shape}, scale {scaler_scale_arr.shape}")
        return None, None

    X_new_tensor = torch.tensor(X_new_scaled_values, dtype=torch.float32)
    print(f"Preprocessing complete. Final tensor shape for explanation: {X_new_tensor.shape}")
    return X_new_tensor, df_features_filled # Return processed df for SHAP labels


# --- Step 6: Prepare Background Data (Once before loop) ---
print("\nPreparing background data for SHAP...")
background_data_cpu_scaled_np = None # Initialize
try:
    # Load scaler params from one representative model (e.g., openness)
    _, bg_scaler_mean, bg_scaler_scale = load_trained_model(label_columns[0], input_dim, DEVICE)
    if bg_scaler_mean is None or bg_scaler_scale is None:
        raise ValueError(f"Failed to load scaler parameters from {label_columns[0]} model checkpoint for background data.")

    df_orig_background = pd.read_csv(ORIGINAL_DATA_PATH)
    X_orig_background_temp = df_orig_background.drop(columns=[col for col in cols_to_drop if col in df_orig_background.columns], errors='ignore')
    X_orig_background_temp = X_orig_background_temp.drop(columns=[col for col in label_columns if col in X_orig_background_temp.columns], errors='ignore')
    
    # Select only numeric types and ensure order matches original_numeric_feature_order
    X_orig_background_numeric = X_orig_background_temp.select_dtypes(include=np.number)
    X_orig_background = X_orig_background_numeric[original_numeric_feature_order].copy() # Enforce original order


    # Fill NaNs using the loaded scaler means (which are in original_numeric_feature_order)
    orig_means_series_bg = pd.Series(bg_scaler_mean, index=original_numeric_feature_order)
    X_orig_background_filled = X_orig_background.fillna(orig_means_series_bg)
    if X_orig_background_filled.isnull().sum().sum() > 0:
        print(f"Info: Found {X_orig_background_filled.isnull().sum().sum()} NaNs in background data after filling with means. Filling remaining with 0.")
        X_orig_background_filled = X_orig_background_filled.fillna(0)

    # Scale using the loaded scaler parameters
    X_orig_background_scaled_np = (X_orig_background_filled.values - bg_scaler_mean) / bg_scaler_scale
    
    # We need a subset of the *training* part of this original data for background
    # Your original training script used a 0.3 test_size, then 0.5 for val from temp.
    # So, train is 0.7 of total.
    # Let's roughly simulate a training split for background.
    train_indices_bg, _ = train_test_split(range(len(X_orig_background_scaled_np)), train_size=0.7, random_state=SEED)
    X_train_orig_scaled_np = X_orig_background_scaled_np[train_indices_bg]

    num_background_available = X_train_orig_scaled_np.shape[0]
    actual_background_samples = min(SHAP_BACKGROUND_SAMPLES, num_background_available)
    if actual_background_samples < SHAP_BACKGROUND_SAMPLES:
        print(f"Warning: Requested {SHAP_BACKGROUND_SAMPLES} background samples, but only {num_background_available} available in train split. Using {actual_background_samples}.")
    if actual_background_samples == 0:
        raise ValueError("No background samples available after splitting. Check data or split logic.")

    background_indices = np.random.choice(num_background_available, actual_background_samples, replace=False)
    background_data_cpu_scaled_np = X_train_orig_scaled_np[background_indices] # This is already scaled and numpy
    print(f"Background data prepared (scaled, numpy). Shape: {background_data_cpu_scaled_np.shape}")

    del df_orig_background, X_orig_background_temp, X_orig_background_numeric, X_orig_background, X_orig_background_filled
    del X_orig_background_scaled_np, X_train_orig_scaled_np, bg_scaler_mean, bg_scaler_scale
    gc.collect()

except FileNotFoundError:
    print(f"Error: Original dataset file not found at {ORIGINAL_DATA_PATH}. Cannot prepare background data.")
    exit()
except Exception as e:
    print(f"FATAL ERROR during background data preparation: {e}")
    traceback.print_exc()
    exit()


# --- Step 7: Load Explanation Data (Once before loop) ---
print(f"\n--- Loading Explanation Data from {NEW_DATA_PATH} ---")
try:
    df_explain_full = pd.read_csv(NEW_DATA_PATH)
    print(f"Loaded explanation dataset. Full shape: {df_explain_full.shape}")
except FileNotFoundError:
    print(f"Error: Explanation dataset file not found at {NEW_DATA_PATH}. Cannot proceed.")
    print("Please ensure 'NEW_DATA_PATH' is correctly set.")
    exit()
except Exception as e:
    print(f"Error reading explanation dataset {NEW_DATA_PATH}: {e}")
    exit()

# --- Step 8: SHAP Analysis Loop ---
print(f"\n--- Starting SHAP Analysis Loop (Using KernelExplainer) ---")
if background_data_cpu_scaled_np is None:
     print("FATAL ERROR: Background data was not prepared successfully. Stopping.")
     exit()

for trait in label_columns:
    print(f"\n--- Processing Trait: {trait.upper()} ---")
    trait_start_time = time.time()
    model, scaler_mean_trait, scaler_scale_trait = None, None, None
    X_explain_tensor, df_explain_processed, df_explain_sample = None, None, None
    explainer, shap_values, background_summary_obj = None, None, None
    X_explain_numpy_scaled = None

    try:
        # 1. Load Model and Scaler for the current trait
        model, scaler_mean_trait, scaler_scale_trait = load_trained_model(trait, input_dim, DEVICE)
        if model is None:
            print(f"Skipping trait {trait} due to model loading issues.")
            continue

        # 2. Sample and Preprocess Explanation Data for this trait
        num_explain_available = len(df_explain_full)
        actual_explain_samples = min(SHAP_EXPLAIN_SAMPLES, num_explain_available)
        if actual_explain_samples < num_explain_available:
             print(f"Sampling {actual_explain_samples} rows from explanation data for analysis.")
             df_explain_sample = df_explain_full.sample(n=actual_explain_samples, random_state=SEED)
        else:
             print(f"Using all {num_explain_available} rows from explanation data.")
             df_explain_sample = df_explain_full.copy() # Use a copy to avoid modifying original df_explain_full

        # Preprocess using descriptive_feature_names and the loaded scaler specific to this trait
        X_explain_tensor, df_explain_processed = preprocess_explanation_data(
            df_explain_sample, descriptive_feature_names, scaler_mean_trait, scaler_scale_trait
        )
        if X_explain_tensor is None:
            print(f"Skipping trait {trait} due to preprocessing errors for explanation data.")
            continue
        X_explain_numpy_scaled = X_explain_tensor.cpu().numpy() # This is the SCALED data for explanation

        # 3. Initialize SHAP KernelExplainer
        print("Initializing SHAP KernelExplainer...")
        kernel_explainer_start_time = time.time()

        def predict_wrapper_numpy(x_np_scaled): # Expects SCALED numpy data
            x_tensor = torch.tensor(x_np_scaled, dtype=torch.float32).to(DEVICE)
            with torch.no_grad():
                predictions = model(x_tensor)
            return predictions.cpu().numpy()

        print(f"Summarizing background data ({background_data_cpu_scaled_np.shape[0]} samples) using k-means...")
        num_clusters = min(25, background_data_cpu_scaled_np.shape[0])
        if num_clusters < 1: # Edge case: very few background samples
            print(f"Warning: Not enough background samples ({background_data_cpu_scaled_np.shape[0]}) for k-means with min clusters. Using raw background.")
            background_summary_obj = background_data_cpu_scaled_np # Use raw if too few for kmeans
        else:
            background_summary_obj = shap.kmeans(background_data_cpu_scaled_np, num_clusters)
        
        print(f"Type of background_summary_obj: {type(background_summary_obj)}")
        # The object returned by shap.kmeans is directly usable. If it's just a numpy array (e.g. raw background), that's also fine.

        explainer = shap.KernelExplainer(predict_wrapper_numpy, background_summary_obj)

        # 4. Calculate SHAP values
        print(f"Calculating SHAP values using KernelExplainer for {X_explain_numpy_scaled.shape[0]} samples (nsamples={SHAP_KERNEL_NSAMPLES})... BE PATIENT!")
        shap_values = explainer.shap_values(X_explain_numpy_scaled, nsamples=SHAP_KERNEL_NSAMPLES)
        # For single output regression, shap_values is (N, M)
        print(f"SHAP values calculated. Shape: {np.shape(shap_values)}")
        kernel_explainer_time = time.time() - kernel_explainer_start_time
        print(f"KernelExplainer calculation took {kernel_explainer_time:.2f} seconds.")

        # 5. Generate and Save Summary Plot
        print("Generating SHAP summary plot...")
        plt.figure() # Create new figure for each plot
        shap.summary_plot(
            shap_values,
            features=df_explain_processed, # This DataFrame has descriptive column names and values corresponding to X_explain_numpy_scaled
            feature_names=descriptive_feature_names, # Explicitly provide descriptive names
            max_display=20,
            show=False
        )
        plt.title(f'SHAP Summary Plot ({trait.capitalize()}) - Video Features')
        plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # Adjust layout
        plot_filename = os.path.join(PLOTS_SAVE_DIR, f'shap_summary_kernel_video_{trait}.png')
        plt.savefig(plot_filename, dpi=150, bbox_inches='tight')
        plt.close()
        print(f"SHAP summary plot saved to {plot_filename}")

    except Exception as e:
        print(f"ERROR during SHAP processing for trait {trait}: {e}")
        print("--- Traceback ---")
        traceback.print_exc()
        print("--- End Traceback ---")
        print(f"Skipping plot generation for {trait} due to error.")

    finally:
        print(f"Cleaning up memory after trait {trait}...")
        del model, scaler_mean_trait, scaler_scale_trait
        del X_explain_tensor, df_explain_processed, df_explain_sample
        del explainer, shap_values, X_explain_numpy_scaled, background_summary_obj
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    trait_time = time.time() - trait_start_time
    print(f"Finished processing {trait}. Total time: {trait_time:.2f}s")

print("\n--- SHAP Analysis Complete for Video Features ---")
print(f"Plots saved in: {PLOTS_SAVE_DIR}")

Using device: cuda
Loading feature names...


Columns (0,944) have mixed types. Specify dtype option on import or set low_memory=False.


Determined original feature order (numeric headers from video_hc_features.csv). Count: 936


Columns (0,944) have mixed types. Specify dtype option on import or set low_memory=False.


Found descriptive feature names in new data (renamed_video_hc_features.csv). Count: 936
Verifying feature count consistency...
Feature counts match (936). Proceeding.

Preparing background data for SHAP...
Loading checkpoint for openness from /kaggle/working/best_video_transformer_model_openness.pth...
Model for openness loaded successfully.


Columns (0,944) have mixed types. Specify dtype option on import or set low_memory=False.


Background data prepared (scaled, numpy). Shape: (100, 936)

--- Loading Explanation Data from /kaggle/input/fi-v2-hc-dataset-for-shapley/renamed_video_hc_features.csv ---


Columns (0,944) have mixed types. Specify dtype option on import or set low_memory=False.


Loaded explanation dataset. Full shape: (51325, 948)

--- Starting SHAP Analysis Loop (Using KernelExplainer) ---

--- Processing Trait: OPENNESS ---
Loading checkpoint for openness from /kaggle/working/best_video_transformer_model_openness.pth...
Model for openness loaded successfully.
Sampling 50 rows from explanation data for analysis.
Preprocessing explanation data. Initial shape: (50, 948)
Preprocessing complete. Final tensor shape for explanation: torch.Size([50, 936])
Initializing SHAP KernelExplainer...
Summarizing background data (100 samples) using k-means...
Type of background_summary_obj: <class 'shap.utils._legacy.DenseData'>
Calculating SHAP values using KernelExplainer for 50 samples (nsamples=1000)... BE PATIENT!


  0%|          | 0/50 [00:00<?, ?it/s]

SHAP values calculated. Shape: (50, 936)
KernelExplainer calculation took 83.73 seconds.
Generating SHAP summary plot...
SHAP summary plot saved to /kaggle/working/shap/video/shap_summary_kernel_video_openness.png
Cleaning up memory after trait openness...
Finished processing openness. Total time: 84.82s

--- Processing Trait: CONSCIENTIOUSNESS ---
Loading checkpoint for conscientiousness from /kaggle/working/best_video_transformer_model_conscientiousness.pth...
Model for conscientiousness loaded successfully.
Sampling 50 rows from explanation data for analysis.
Preprocessing explanation data. Initial shape: (50, 948)
Preprocessing complete. Final tensor shape for explanation: torch.Size([50, 936])
Initializing SHAP KernelExplainer...
Summarizing background data (100 samples) using k-means...
Type of background_summary_obj: <class 'shap.utils._legacy.DenseData'>
Calculating SHAP values using KernelExplainer for 50 samples (nsamples=1000)... BE PATIENT!


  0%|          | 0/50 [00:00<?, ?it/s]

SHAP values calculated. Shape: (50, 936)
KernelExplainer calculation took 83.58 seconds.
Generating SHAP summary plot...
SHAP summary plot saved to /kaggle/working/shap/video/shap_summary_kernel_video_extraversion.png
Cleaning up memory after trait extraversion...
Finished processing extraversion. Total time: 84.72s

--- Processing Trait: AGREEABLENESS ---
Loading checkpoint for agreeableness from /kaggle/working/best_video_transformer_model_agreeableness.pth...
Model for agreeableness loaded successfully.
Sampling 50 rows from explanation data for analysis.
Preprocessing explanation data. Initial shape: (50, 948)
Preprocessing complete. Final tensor shape for explanation: torch.Size([50, 936])
Initializing SHAP KernelExplainer...
Summarizing background data (100 samples) using k-means...
Type of background_summary_obj: <class 'shap.utils._legacy.DenseData'>
Calculating SHAP values using KernelExplainer for 50 samples (nsamples=1000)... BE PATIENT!


  0%|          | 0/50 [00:00<?, ?it/s]

SHAP values calculated. Shape: (50, 936)
KernelExplainer calculation took 83.88 seconds.
Generating SHAP summary plot...
SHAP summary plot saved to /kaggle/working/shap/video/shap_summary_kernel_video_agreeableness.png
Cleaning up memory after trait agreeableness...
Finished processing agreeableness. Total time: 85.03s

--- Processing Trait: NEUROTICISM ---
Loading checkpoint for neuroticism from /kaggle/working/best_video_transformer_model_neuroticism.pth...
Model for neuroticism loaded successfully.
Sampling 50 rows from explanation data for analysis.
Preprocessing explanation data. Initial shape: (50, 948)
Preprocessing complete. Final tensor shape for explanation: torch.Size([50, 936])
Initializing SHAP KernelExplainer...
Summarizing background data (100 samples) using k-means...
Type of background_summary_obj: <class 'shap.utils._legacy.DenseData'>
Calculating SHAP values using KernelExplainer for 50 samples (nsamples=1000)... BE PATIENT!


  0%|          | 0/50 [00:00<?, ?it/s]

SHAP values calculated. Shape: (50, 936)
KernelExplainer calculation took 87.23 seconds.
Generating SHAP summary plot...
SHAP summary plot saved to /kaggle/working/shap/video/shap_summary_kernel_video_neuroticism.png
Cleaning up memory after trait neuroticism...
Finished processing neuroticism. Total time: 88.34s

--- SHAP Analysis Complete for Video Features ---
Plots saved in: /kaggle/working/shap/video/


In [1]:
# --- Imports ---
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import shap # Make sure you have shap installed: pip install shap
import os
import gc # Garbage collector
import time # For timing
import traceback # For detailed error printing

# --- Configuration ---
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# --- Paths ---
ORIGINAL_DATA_PATH = '/kaggle/input/fi-v2-test-val-data/FI V2 COMPLETE FEATURE DATASET/HANDCRAFTED/audio_hc_features.csv'

NEW_DATA_PATH = '/kaggle/input/fi-v2-hc-dataset-for-shapley/renamed_audio_hc_features.csv'

MODEL_SAVE_DIR = '/kaggle/working/'

PLOTS_SAVE_DIR = '/kaggle/working/shap/audio/'

# --- Constants ---
label_columns = ["openness", "conscientiousness", "extraversion", "agreeableness", "neuroticism"]

cols_to_drop = ["Filename", "Segment_ID", "interview", "Gender", "Ethnicity", "AgeGroup"]

# --- SHAP Configuration ---
# Samples from ORIGINAL training data split for background summary (KMeans)
SHAP_BACKGROUND_SAMPLES = 100
# Samples from the NEW data to explain. Reduce if kernel dies (RAM issue).
SHAP_EXPLAIN_SAMPLES = 50
# Samples per explanation point for KernelExplainer. More = more accurate but MUCH slower.
SHAP_KERNEL_NSAMPLES = 50

# --- Ensure Plots Directory Exists ---
os.makedirs(PLOTS_SAVE_DIR, exist_ok=True)

# --- Model Definition (Must be IDENTICAL to your training script) ---
class SimpleTransformerRegressor(nn.Module):
    def __init__(self, input_dim, embed_dim=256, num_heads=4, num_layers=2, dropout=0.3, ff_dim_multiplier=4):
        super(SimpleTransformerRegressor, self).__init__()
        if embed_dim % num_heads != 0:
            original_embed_dim = embed_dim
            embed_dim = (embed_dim // num_heads + 1) * num_heads
            # print(f"Warning: embed_dim ({original_embed_dim}) not divisible by num_heads ({num_heads}). Adjusted to {embed_dim}.")
        self.input_dim = input_dim
        self.embed_dim = embed_dim
        self.project = nn.Linear(input_dim, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads,
            dim_feedforward=embed_dim * ff_dim_multiplier,
            dropout=dropout, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=num_layers)
        self.classifier = nn.Sequential(
            nn.LayerNorm(embed_dim),
            nn.Linear(embed_dim, 128), nn.ReLU(),
            nn.Dropout(dropout), nn.Linear(128, 1)
        )
    def forward(self, x):
        x = self.project(x)
        x = x.unsqueeze(1)
        x = self.encoder(x)
        x = x.squeeze(1)
        output = self.classifier(x)
        return output.squeeze(-1)

# --- Step 1 & 2: Get Feature Names ---
print("Loading feature names...")
try:
    df_orig_temp = pd.read_csv(ORIGINAL_DATA_PATH)
    X_orig_temp = df_orig_temp.drop(columns=[col for col in cols_to_drop if col in df_orig_temp.columns], errors='ignore')
    X_orig_temp = X_orig_temp.drop(columns=[col for col in label_columns if col in X_orig_temp.columns], errors='ignore')
    # Ensure only numeric columns are considered features for order determination
    X_orig_temp = X_orig_temp.select_dtypes(include=np.number)
    original_numeric_feature_order = list(X_orig_temp.columns)
    num_original_features = len(original_numeric_feature_order)
    print(f"Determined original feature order (numeric headers from {os.path.basename(ORIGINAL_DATA_PATH)}). Count: {num_original_features}")
    del df_orig_temp, X_orig_temp
    gc.collect()
except FileNotFoundError:
    print(f"Error: Original dataset file not found at {ORIGINAL_DATA_PATH}. Cannot proceed.")
    exit()
except Exception as e:
    print(f"Error reading original data file {ORIGINAL_DATA_PATH}: {e}")
    traceback.print_exc()
    exit()

try:
    df_new_temp = pd.read_csv(NEW_DATA_PATH)
    X_new_temp = df_new_temp.drop(columns=[col for col in cols_to_drop if col in df_new_temp.columns], errors='ignore')
    X_new_temp = X_new_temp.drop(columns=[col for col in label_columns if col in X_new_temp.columns], errors='ignore')
    # Ensure only numeric columns are considered features for descriptive names
    X_new_temp = X_new_temp.select_dtypes(include=np.number)
    descriptive_feature_names = list(X_new_temp.columns)
    num_new_features = len(descriptive_feature_names)
    print(f"Found descriptive feature names in new data ({os.path.basename(NEW_DATA_PATH)}). Count: {num_new_features}")
    del df_new_temp, X_new_temp
    gc.collect()
except FileNotFoundError:
    print(f"Error: New dataset file not found at {NEW_DATA_PATH}.")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("!!! PLEASE UPDATE THE 'NEW_DATA_PATH' variable in the script to point  !!!")
    print("!!! to your CSV file containing descriptive video feature names.       !!!")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    exit()
except Exception as e:
    print(f"Error reading new data file {NEW_DATA_PATH}: {e}")
    traceback.print_exc()
    exit()

# --- Step 3: Verification ---
print("Verifying feature count consistency...")
if num_original_features != num_new_features:
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("! ERROR: Mismatch in feature counts between original and new data!")
    print(f"! Original data ({os.path.basename(ORIGINAL_DATA_PATH)}) features: {num_original_features}")
    print(f"! New data ({os.path.basename(NEW_DATA_PATH)}) features: {num_new_features}")
    print("! Cannot proceed. Ensure both CSVs have the same number of numeric feature columns")
    print("! after dropping metadata/labels, and that NEW_DATA_PATH is correct.")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    exit()
else:
    print(f"Feature counts match ({num_original_features}). Proceeding.")
input_dim = num_original_features


# --- Step 4: Load Model Function ---
def load_trained_model(trait_name, input_dim_model, device):
    """Loads a trained model state and configuration for VIDEO features."""
    model_path = os.path.join(MODEL_SAVE_DIR, f"best_audio_transformer_model_{trait_name}.pth") # VIDEO model name
    if not os.path.exists(model_path):
        print(f"Error: Model file not found for trait '{trait_name}' at {model_path}")
        return None, None, None
    print(f"Loading checkpoint for {trait_name} from {model_path}...")
    try:
        checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
    except Exception as e:
        print(f"Error loading checkpoint file {model_path}: {e}")
        traceback.print_exc()
        return None, None, None

    required_keys = ['best_config', 'model_state_dict', 'scaler_mean', 'scaler_scale']
    if not all(key in checkpoint for key in required_keys):
        missing = [key for key in required_keys if key not in checkpoint]
        print(f"Error: Checkpoint for {trait_name} is missing required keys: {missing}.")
        return None, None, None

    config = checkpoint['best_config']
    # Ensure scaler params are numpy arrays and float type for consistency
    scaler_mean = np.array(checkpoint['scaler_mean'], dtype=np.float64)
    scaler_scale = np.array(checkpoint['scaler_scale'], dtype=np.float64)


    # The ff_dim_multiplier should be handled by the model's default if not in config
    # as SimpleTransformerRegressor has a default for it.
    try:
        model = SimpleTransformerRegressor(
            input_dim=input_dim_model, # Use the input_dim derived from original data
            embed_dim=config["embed_dim"],
            num_heads=config["num_heads"],
            num_layers=config["num_layers"],
            dropout=config["dropout"],
            ff_dim_multiplier=config.get("ff_dim_multiplier", 4) # Use config if available, else default
        )
        model.load_state_dict(checkpoint['model_state_dict'])
        model = model.to(device)
        model.eval()
        print(f"Model for {trait_name} loaded successfully.")
    except Exception as e:
        print(f"Error reconstructing or loading model state for {trait_name}: {e}")
        traceback.print_exc()
        return None, None, None

    if scaler_mean.shape[0] != input_dim_model or scaler_scale.shape[0] != input_dim_model:
         print(f"Error: Loaded scaler params shape mismatch for {trait_name}. Mean: {scaler_mean.shape}, Scale: {scaler_scale.shape}, Expected from data: {input_dim_model}")
         return None, None, None

    return model, scaler_mean, scaler_scale


# --- Step 5: Preprocessing Function for New Data ---
def preprocess_explanation_data(df_new, target_feature_order_desc, scaler_mean_arr, scaler_scale_arr):
    """Preprocesses the new dataframe using descriptive feature order and loaded scaler parameters."""
    print(f"Preprocessing explanation data. Initial shape: {df_new.shape}")
    try:
        # Select only numeric columns that are in target_feature_order_desc
        df_features_numeric = df_new.select_dtypes(include=np.number)
        df_features = df_features_numeric[target_feature_order_desc].copy()
    except KeyError as e:
        missing_cols = [col for col in target_feature_order_desc if col not in df_features_numeric.columns]
        print(f"Error: Columns mismatch during preprocessing. Missing in new data: {missing_cols}. Ensure NEW data has all descriptive columns.")
        return None, None

    if len(scaler_mean_arr) != len(target_feature_order_desc) or len(scaler_scale_arr) != len(target_feature_order_desc):
         print(f"Error: Scaler param length mismatch. Mean: {len(scaler_mean_arr)}, Scale: {len(scaler_scale_arr)}, Features: {len(target_feature_order_desc)}")
         return None, None

    # Fill NaNs: The i-th mean in scaler_mean_arr (from original data) corresponds to the i-th feature in target_feature_order_desc
    # This assumes the order of features in NEW_DATA_PATH columns matches the conceptual order of features in ORIGINAL_DATA_PATH
    means_for_filling = pd.Series(scaler_mean_arr, index=target_feature_order_desc)
    df_features_filled = df_features.fillna(means_for_filling)

    # Handle any remaining NaNs (e.g., if a feature was all NaN in original data, its mean might be NaN)
    if df_features_filled.isnull().sum().sum() > 0:
        print(f"Info: Found {df_features_filled.isnull().sum().sum()} NaNs after filling with original means. Filling remaining with 0 (global column mean might be better if available).")
        df_features_filled = df_features_filled.fillna(0) # Fallback

    try:
        # Scale: Apply scaler_mean_arr and scaler_scale_arr directly.
        # This relies on df_features_filled.values being in the same order as target_feature_order_desc,
        # and scaler_mean_arr/scaler_scale_arr also corresponding to this order.
        X_new_scaled_values = (df_features_filled.values - scaler_mean_arr) / scaler_scale_arr
    except ValueError as e:
        print(f"Error during scaling: {e}. Check shapes: df values {df_features_filled.values.shape}, mean {scaler_mean_arr.shape}, scale {scaler_scale_arr.shape}")
        return None, None

    X_new_tensor = torch.tensor(X_new_scaled_values, dtype=torch.float32)
    print(f"Preprocessing complete. Final tensor shape for explanation: {X_new_tensor.shape}")
    return X_new_tensor, df_features_filled # Return processed df for SHAP labels


# --- Step 6: Prepare Background Data (Once before loop) ---
print("\nPreparing background data for SHAP...")
background_data_cpu_scaled_np = None # Initialize
try:
    # Load scaler params from one representative model (e.g., openness)
    _, bg_scaler_mean, bg_scaler_scale = load_trained_model(label_columns[0], input_dim, DEVICE)
    if bg_scaler_mean is None or bg_scaler_scale is None:
        raise ValueError(f"Failed to load scaler parameters from {label_columns[0]} model checkpoint for background data.")

    df_orig_background = pd.read_csv(ORIGINAL_DATA_PATH)
    X_orig_background_temp = df_orig_background.drop(columns=[col for col in cols_to_drop if col in df_orig_background.columns], errors='ignore')
    X_orig_background_temp = X_orig_background_temp.drop(columns=[col for col in label_columns if col in X_orig_background_temp.columns], errors='ignore')
    
    # Select only numeric types and ensure order matches original_numeric_feature_order
    X_orig_background_numeric = X_orig_background_temp.select_dtypes(include=np.number)
    X_orig_background = X_orig_background_numeric[original_numeric_feature_order].copy() # Enforce original order


    # Fill NaNs using the loaded scaler means (which are in original_numeric_feature_order)
    orig_means_series_bg = pd.Series(bg_scaler_mean, index=original_numeric_feature_order)
    X_orig_background_filled = X_orig_background.fillna(orig_means_series_bg)
    if X_orig_background_filled.isnull().sum().sum() > 0:
        print(f"Info: Found {X_orig_background_filled.isnull().sum().sum()} NaNs in background data after filling with means. Filling remaining with 0.")
        X_orig_background_filled = X_orig_background_filled.fillna(0)

    # Scale using the loaded scaler parameters
    X_orig_background_scaled_np = (X_orig_background_filled.values - bg_scaler_mean) / bg_scaler_scale
    
    # We need a subset of the *training* part of this original data for background
    # Your original training script used a 0.3 test_size, then 0.5 for val from temp.
    # So, train is 0.7 of total.
    # Let's roughly simulate a training split for background.
    train_indices_bg, _ = train_test_split(range(len(X_orig_background_scaled_np)), train_size=0.7, random_state=SEED)
    X_train_orig_scaled_np = X_orig_background_scaled_np[train_indices_bg]

    num_background_available = X_train_orig_scaled_np.shape[0]
    actual_background_samples = min(SHAP_BACKGROUND_SAMPLES, num_background_available)
    if actual_background_samples < SHAP_BACKGROUND_SAMPLES:
        print(f"Warning: Requested {SHAP_BACKGROUND_SAMPLES} background samples, but only {num_background_available} available in train split. Using {actual_background_samples}.")
    if actual_background_samples == 0:
        raise ValueError("No background samples available after splitting. Check data or split logic.")

    background_indices = np.random.choice(num_background_available, actual_background_samples, replace=False)
    background_data_cpu_scaled_np = X_train_orig_scaled_np[background_indices] # This is already scaled and numpy
    print(f"Background data prepared (scaled, numpy). Shape: {background_data_cpu_scaled_np.shape}")

    del df_orig_background, X_orig_background_temp, X_orig_background_numeric, X_orig_background, X_orig_background_filled
    del X_orig_background_scaled_np, X_train_orig_scaled_np, bg_scaler_mean, bg_scaler_scale
    gc.collect()

except FileNotFoundError:
    print(f"Error: Original dataset file not found at {ORIGINAL_DATA_PATH}. Cannot prepare background data.")
    exit()
except Exception as e:
    print(f"FATAL ERROR during background data preparation: {e}")
    traceback.print_exc()
    exit()


# --- Step 7: Load Explanation Data (Once before loop) ---
print(f"\n--- Loading Explanation Data from {NEW_DATA_PATH} ---")
try:
    df_explain_full = pd.read_csv(NEW_DATA_PATH)
    print(f"Loaded explanation dataset. Full shape: {df_explain_full.shape}")
except FileNotFoundError:
    print(f"Error: Explanation dataset file not found at {NEW_DATA_PATH}. Cannot proceed.")
    print("Please ensure 'NEW_DATA_PATH' is correctly set.")
    exit()
except Exception as e:
    print(f"Error reading explanation dataset {NEW_DATA_PATH}: {e}")
    exit()

# --- Step 8: SHAP Analysis Loop ---
print(f"\n--- Starting SHAP Analysis Loop (Using KernelExplainer) ---")
if background_data_cpu_scaled_np is None:
     print("FATAL ERROR: Background data was not prepared successfully. Stopping.")
     exit()

for trait in label_columns:
    print(f"\n--- Processing Trait: {trait.upper()} ---")
    trait_start_time = time.time()
    model, scaler_mean_trait, scaler_scale_trait = None, None, None
    X_explain_tensor, df_explain_processed, df_explain_sample = None, None, None
    explainer, shap_values, background_summary_obj = None, None, None
    X_explain_numpy_scaled = None

    try:
        # 1. Load Model and Scaler for the current trait
        model, scaler_mean_trait, scaler_scale_trait = load_trained_model(trait, input_dim, DEVICE)
        if model is None:
            print(f"Skipping trait {trait} due to model loading issues.")
            continue

        # 2. Sample and Preprocess Explanation Data for this trait
        num_explain_available = len(df_explain_full)
        actual_explain_samples = min(SHAP_EXPLAIN_SAMPLES, num_explain_available)
        if actual_explain_samples < num_explain_available:
             print(f"Sampling {actual_explain_samples} rows from explanation data for analysis.")
             df_explain_sample = df_explain_full.sample(n=actual_explain_samples, random_state=SEED)
        else:
             print(f"Using all {num_explain_available} rows from explanation data.")
             df_explain_sample = df_explain_full.copy() # Use a copy to avoid modifying original df_explain_full

        # Preprocess using descriptive_feature_names and the loaded scaler specific to this trait
        X_explain_tensor, df_explain_processed = preprocess_explanation_data(
            df_explain_sample, descriptive_feature_names, scaler_mean_trait, scaler_scale_trait
        )
        if X_explain_tensor is None:
            print(f"Skipping trait {trait} due to preprocessing errors for explanation data.")
            continue
        X_explain_numpy_scaled = X_explain_tensor.cpu().numpy() # This is the SCALED data for explanation

        # 3. Initialize SHAP KernelExplainer
        print("Initializing SHAP KernelExplainer...")
        kernel_explainer_start_time = time.time()

        def predict_wrapper_numpy(x_np_scaled): # Expects SCALED numpy data
            x_tensor = torch.tensor(x_np_scaled, dtype=torch.float32).to(DEVICE)
            with torch.no_grad():
                predictions = model(x_tensor)
            return predictions.cpu().numpy()

        print(f"Summarizing background data ({background_data_cpu_scaled_np.shape[0]} samples) using k-means...")
        num_clusters = min(25, background_data_cpu_scaled_np.shape[0])
        if num_clusters < 1: # Edge case: very few background samples
            print(f"Warning: Not enough background samples ({background_data_cpu_scaled_np.shape[0]}) for k-means with min clusters. Using raw background.")
            background_summary_obj = background_data_cpu_scaled_np # Use raw if too few for kmeans
        else:
            background_summary_obj = shap.kmeans(background_data_cpu_scaled_np, num_clusters)
        
        print(f"Type of background_summary_obj: {type(background_summary_obj)}")
        # The object returned by shap.kmeans is directly usable. If it's just a numpy array (e.g. raw background), that's also fine.

        explainer = shap.KernelExplainer(predict_wrapper_numpy, background_summary_obj)

        # 4. Calculate SHAP values
        print(f"Calculating SHAP values using KernelExplainer for {X_explain_numpy_scaled.shape[0]} samples (nsamples={SHAP_KERNEL_NSAMPLES})... BE PATIENT!")
        shap_values = explainer.shap_values(X_explain_numpy_scaled, nsamples=SHAP_KERNEL_NSAMPLES)
        # For single output regression, shap_values is (N, M)
        print(f"SHAP values calculated. Shape: {np.shape(shap_values)}")
        kernel_explainer_time = time.time() - kernel_explainer_start_time
        print(f"KernelExplainer calculation took {kernel_explainer_time:.2f} seconds.")

        # 5. Generate and Save Summary Plot
        print("Generating SHAP summary plot...")
        plt.figure() # Create new figure for each plot
        shap.summary_plot(
            shap_values,
            features=df_explain_processed, # This DataFrame has descriptive column names and values corresponding to X_explain_numpy_scaled
            feature_names=descriptive_feature_names, # Explicitly provide descriptive names
            max_display=20,
            show=False
        )
        plt.title(f'SHAP Summary Plot ({trait.capitalize()}) - Audio Features')
        plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # Adjust layout
        plot_filename = os.path.join(PLOTS_SAVE_DIR, f'shap_summary_kernel_audio_{trait}.png')
        plt.savefig(plot_filename, dpi=150, bbox_inches='tight')
        plt.close()
        print(f"SHAP summary plot saved to {plot_filename}")

    except Exception as e:
        print(f"ERROR during SHAP processing for trait {trait}: {e}")
        print("--- Traceback ---")
        traceback.print_exc()
        print("--- End Traceback ---")
        print(f"Skipping plot generation for {trait} due to error.")

    finally:
        print(f"Cleaning up memory after trait {trait}...")
        del model, scaler_mean_trait, scaler_scale_trait
        del X_explain_tensor, df_explain_processed, df_explain_sample
        del explainer, shap_values, X_explain_numpy_scaled, background_summary_obj
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    trait_time = time.time() - trait_start_time
    print(f"Finished processing {trait}. Total time: {trait_time:.2f}s")

print("\n--- SHAP Analysis Complete for Video Features ---")
print(f"Plots saved in: {PLOTS_SAVE_DIR}")

Using device: cuda
Loading feature names...


Columns (33) have mixed types. Specify dtype option on import or set low_memory=False.


Determined original feature order (numeric headers from audio_hc_features.csv). Count: 25


Columns (33) have mixed types. Specify dtype option on import or set low_memory=False.


Found descriptive feature names in new data (renamed_audio_hc_features.csv). Count: 25
Verifying feature count consistency...
Feature counts match (25). Proceeding.

Preparing background data for SHAP...
Loading checkpoint for openness from /kaggle/working/best_audio_transformer_model_openness.pth...
Model for openness loaded successfully.


Columns (33) have mixed types. Specify dtype option on import or set low_memory=False.


Background data prepared (scaled, numpy). Shape: (100, 25)

--- Loading Explanation Data from /kaggle/input/fi-v2-hc-dataset-for-shapley/renamed_audio_hc_features.csv ---


Columns (33) have mixed types. Specify dtype option on import or set low_memory=False.


Loaded explanation dataset. Full shape: (57949, 37)

--- Starting SHAP Analysis Loop (Using KernelExplainer) ---

--- Processing Trait: OPENNESS ---
Loading checkpoint for openness from /kaggle/working/best_audio_transformer_model_openness.pth...
Model for openness loaded successfully.
Sampling 50 rows from explanation data for analysis.
Preprocessing explanation data. Initial shape: (50, 37)
Preprocessing complete. Final tensor shape for explanation: torch.Size([50, 25])
Initializing SHAP KernelExplainer...
Summarizing background data (100 samples) using k-means...
Type of background_summary_obj: <class 'shap.utils._legacy.DenseData'>
Calculating SHAP values using KernelExplainer for 50 samples (nsamples=50)... BE PATIENT!


  0%|          | 0/50 [00:00<?, ?it/s]

SHAP values calculated. Shape: (50, 25)
KernelExplainer calculation took 1.81 seconds.
Generating SHAP summary plot...
SHAP summary plot saved to /kaggle/working/shap/audio/shap_summary_kernel_audio_openness.png
Cleaning up memory after trait openness...
Finished processing openness. Total time: 2.83s

--- Processing Trait: CONSCIENTIOUSNESS ---
Loading checkpoint for conscientiousness from /kaggle/working/best_audio_transformer_model_conscientiousness.pth...
Model for conscientiousness loaded successfully.
Sampling 50 rows from explanation data for analysis.
Preprocessing explanation data. Initial shape: (50, 37)
Preprocessing complete. Final tensor shape for explanation: torch.Size([50, 25])
Initializing SHAP KernelExplainer...
Summarizing background data (100 samples) using k-means...
Type of background_summary_obj: <class 'shap.utils._legacy.DenseData'>
Calculating SHAP values using KernelExplainer for 50 samples (nsamples=50)... BE PATIENT!


  0%|          | 0/50 [00:00<?, ?it/s]

SHAP values calculated. Shape: (50, 25)
KernelExplainer calculation took 1.71 seconds.
Generating SHAP summary plot...
SHAP summary plot saved to /kaggle/working/shap/audio/shap_summary_kernel_audio_conscientiousness.png
Cleaning up memory after trait conscientiousness...
Finished processing conscientiousness. Total time: 2.69s

--- Processing Trait: EXTRAVERSION ---
Loading checkpoint for extraversion from /kaggle/working/best_audio_transformer_model_extraversion.pth...
Model for extraversion loaded successfully.
Sampling 50 rows from explanation data for analysis.
Preprocessing explanation data. Initial shape: (50, 37)
Preprocessing complete. Final tensor shape for explanation: torch.Size([50, 25])
Initializing SHAP KernelExplainer...
Summarizing background data (100 samples) using k-means...
Type of background_summary_obj: <class 'shap.utils._legacy.DenseData'>
Calculating SHAP values using KernelExplainer for 50 samples (nsamples=50)... BE PATIENT!


  0%|          | 0/50 [00:00<?, ?it/s]

SHAP values calculated. Shape: (50, 25)
KernelExplainer calculation took 1.56 seconds.
Generating SHAP summary plot...
SHAP summary plot saved to /kaggle/working/shap/audio/shap_summary_kernel_audio_extraversion.png
Cleaning up memory after trait extraversion...
Finished processing extraversion. Total time: 2.46s

--- Processing Trait: AGREEABLENESS ---
Loading checkpoint for agreeableness from /kaggle/working/best_audio_transformer_model_agreeableness.pth...
Model for agreeableness loaded successfully.
Sampling 50 rows from explanation data for analysis.
Preprocessing explanation data. Initial shape: (50, 37)
Preprocessing complete. Final tensor shape for explanation: torch.Size([50, 25])
Initializing SHAP KernelExplainer...
Summarizing background data (100 samples) using k-means...
Type of background_summary_obj: <class 'shap.utils._legacy.DenseData'>
Calculating SHAP values using KernelExplainer for 50 samples (nsamples=50)... BE PATIENT!


  0%|          | 0/50 [00:00<?, ?it/s]

Regressors in active set degenerate. Dropping a regressor, after 2 iterations, i.e. alpha=3.562e-04, with an active set of 2 regressors, and the smallest cholesky pivot element being 2.220e-16. Reduce max_iter or increase eps parameters.
Regressors in active set degenerate. Dropping a regressor, after 4 iterations, i.e. alpha=1.145e-04, with an active set of 4 regressors, and the smallest cholesky pivot element being 2.220e-16. Reduce max_iter or increase eps parameters.
Regressors in active set degenerate. Dropping a regressor, after 8 iterations, i.e. alpha=3.636e-05, with an active set of 8 regressors, and the smallest cholesky pivot element being 2.220e-16. Reduce max_iter or increase eps parameters.
Regressors in active set degenerate. Dropping a regressor, after 8 iterations, i.e. alpha=2.481e-05, with an active set of 8 regressors, and the smallest cholesky pivot element being 2.220e-16. Reduce max_iter or increase eps parameters.
Regressors in active set degenerate. Dropping a 

SHAP values calculated. Shape: (50, 25)
KernelExplainer calculation took 1.53 seconds.
Generating SHAP summary plot...
SHAP summary plot saved to /kaggle/working/shap/audio/shap_summary_kernel_audio_agreeableness.png
Cleaning up memory after trait agreeableness...
Finished processing agreeableness. Total time: 2.40s

--- Processing Trait: NEUROTICISM ---
Loading checkpoint for neuroticism from /kaggle/working/best_audio_transformer_model_neuroticism.pth...
Model for neuroticism loaded successfully.
Sampling 50 rows from explanation data for analysis.
Preprocessing explanation data. Initial shape: (50, 37)
Preprocessing complete. Final tensor shape for explanation: torch.Size([50, 25])
Initializing SHAP KernelExplainer...
Summarizing background data (100 samples) using k-means...
Type of background_summary_obj: <class 'shap.utils._legacy.DenseData'>
Calculating SHAP values using KernelExplainer for 50 samples (nsamples=50)... BE PATIENT!


  0%|          | 0/50 [00:00<?, ?it/s]

SHAP values calculated. Shape: (50, 25)
KernelExplainer calculation took 1.61 seconds.
Generating SHAP summary plot...
SHAP summary plot saved to /kaggle/working/shap/audio/shap_summary_kernel_audio_neuroticism.png
Cleaning up memory after trait neuroticism...
Finished processing neuroticism. Total time: 2.62s

--- SHAP Analysis Complete for Video Features ---
Plots saved in: /kaggle/working/shap/audio/


In [4]:
!zip -r /kaggle/working/working_dir.zip /kaggle/working/ -x "/kaggle/working/working_dir.zip"


updating: kaggle/working/ (stored 0%)
updating: kaggle/working/best_audio_transformer_model_neuroticism.pth (deflated 7%)
updating: kaggle/working/best_video_transformer_model_openness.pth (deflated 8%)
updating: kaggle/working/best_audio_transformer_model_agreeableness.pth (deflated 8%)
updating: kaggle/working/best_text_transformer_model_neuroticism.pth (deflated 9%)
updating: kaggle/working/best_audio_transformer_model_openness.pth (deflated 8%)
updating: kaggle/working/best_text_transformer_model_conscientiousness.pth (deflated 9%)
updating: kaggle/working/best_text_transformer_model_extraversion.pth (deflated 9%)
updating: kaggle/working/best_text_transformer_model_openness.pth (deflated 9%)
updating: kaggle/working/best_video_transformer_model_extraversion.pth (deflated 8%)
updating: kaggle/working/best_video_transformer_model_agreeableness.pth (deflated 8%)
updating: kaggle/working/best_audio_transformer_model_conscientiousness.pth (deflated 8%)
updating: kaggle/working/.virtual