<div style="display: flex; justify-content: space-between; align-items: flex-start;">
    <div style="text-align: left;">
        <p style="color:#FFD700; font-size: 15px; font-weight: bold; margin-bottom: 1px; text-align: left;">Published on  January 12, 2025</p>
        <h4 style="color:#4B0082; font-weight: bold; text-align: left; margin-top: 6px;">Author: Jocelyn C. Dumlao</h4>
        <p style="font-size: 17px; line-height: 1.7; color: #333; text-align: center; margin-top: 20px;"></p>
        <a href="https://www.linkedin.com/in/jocelyn-dumlao-168921a8/" target="_blank" style="display: inline-block; background-color: #003f88; color: #fff; text-decoration: none; padding: 5px 10px; border-radius: 10px; margin: 15px;">LinkedIn</a>
        <a href="https://github.com/jcdumlao14" target="_blank" style="display: inline-block; background-color: transparent; color: #059c99; text-decoration: none; padding: 5px 10px; border-radius: 10px; margin: 15px; border: 2px solid #007bff;">GitHub</a>
        <a href="https://www.youtube.com/@CogniCraftedMinds" target="_blank" style="display: inline-block; background-color: #ff0054; color: #fff; text-decoration: none; padding: 5px 10px; border-radius: 10px; margin: 15px;">YouTube</a>
        <a href="https://www.kaggle.com/jocelyndumlao" target="_blank" style="display: inline-block; background-color: #3a86ff; color: #fff; text-decoration: none; padding: 5px 10px; border-radius: 10px; margin: 15px;">Kaggle</a>
    </div>
</div>

# <p style="padding:10px;background-color:#31ddf7;margin:0;color:#102d02;font-family:newtimeroman;font-size:100%;text-align:center;border-radius:15px 50px;overflow:hidden;font-weight:500;border: 3px solid #16e51e;">WiDS Datathon 2025 - Baseline Models & Insights</p>

# <p style="padding:10px;background-color:#f8df01;margin:0;color:#102d02;font-family:newtimeroman;font-size:100%;text-align:center;border-radius:15px 50px;overflow:hidden;font-weight:500;border: 3px solid #16e51e;">Import Libraries</p>


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier

import warnings
warnings.simplefilter(action='ignore', category=Warning)

# <p style="padding:10px;background-color:#f8df01;margin:0;color:#102d02;font-family:newtimeroman;font-size:100%;text-align:center;border-radius:15px 50px;overflow:hidden;font-weight:500;border: 3px solid #16e51e;">Load and Prepare the Dataset</p>


In [None]:
# Function to load and prepare the dataset
def get_feats(mode='TRAIN', data_dir='/kaggle/input/widsdatathon2025'):
    """Load and merge every feature table for one split of the dataset.

    The quantitative metadata, the categorical metadata and the functional
    connectome matrices of the requested split are read and left-joined on
    ``participant_id`` (in that order). When ``mode == 'TRAIN'`` the training
    solutions are joined on as well.

    Parameters
    ----------
    mode : str, default 'TRAIN'
        Split name. It is used both as the sub-directory and as the file-name
        prefix. For 'TRAIN' the categorical file is
        ``{mode}_CATEGORICAL_METADATA.xlsx``; for any other value it is
        ``{mode}_CATEGORICAL.xlsx``.
    data_dir : str or path-like, default '/kaggle/input/widsdatathon2025'
        Root directory of the dataset. The default keeps the original Kaggle
        location; pass another directory to run outside Kaggle.

    Returns
    -------
    pandas.DataFrame
        One merged frame; rows follow the quantitative metadata because every
        join is a left join.
    """
    # A trailing slash on data_dir must not produce a double slash in paths.
    split_dir = f"{str(data_dir).rstrip('/')}/{mode}"

    # quantitative metadata
    feats = pd.read_excel(f"{split_dir}/{mode}_QUANTITATIVE_METADATA.xlsx")

    # categorical metadata (the file name differs between the two splits)
    cate_name = f"{mode}_CATEGORICAL_METADATA.xlsx" if mode == 'TRAIN' else f"{mode}_CATEGORICAL.xlsx"
    cate = pd.read_excel(f"{split_dir}/{cate_name}")

    # merging quantitative and categorical metadata
    feats = feats.merge(cate, on='participant_id', how='left')

    # adding functional connectome matrices
    func = pd.read_csv(f"{split_dir}/{mode}_FUNCTIONAL_CONNECTOME_MATRICES.csv")
    feats = feats.merge(func, on='participant_id', how='left')

    # adding training solutions (only for training data)
    if mode == 'TRAIN':
        solution = pd.read_excel(f"{split_dir}/TRAINING_SOLUTIONS.xlsx")
        feats = feats.merge(solution, on='participant_id', how='left')

    # return the final dataframe
    return feats


# <p style="padding:10px;background-color:#f8df01;margin:0;color:#102d02;font-family:newtimeroman;font-size:100%;text-align:center;border-radius:15px 50px;overflow:hidden;font-weight:500;border: 3px solid #16e51e;">Load Training and Testing Data</p>


In [None]:
# Build the merged feature table for each split:
# `train` carries the label columns, `test` does not.
train = get_feats('TRAIN')
test = get_feats('TEST')

In [None]:
# Preview the first training rows, shaded with the 'plasma' colour map
train_preview = train.head()
train_preview.style.background_gradient(cmap='plasma')

In [None]:
# Preview the first test rows, shaded with the 'plasma' colour map
test_preview = test.head()
test_preview.style.background_gradient(cmap='plasma')

In [None]:
# List the training columns so the target column names can be confirmed
train_columns = train.columns
print("Columns in training data:", train_columns)

In [None]:
# Label columns carried by the training table
target_cols = ['ADHD_Outcome', 'Sex_F']

# One Series per target ...
y_adhd = train[target_cols[0]]    # ADHD label
y_female = train[target_cols[1]]  # female label

# ... and the two-column label frame used for the multi-target split
y = train[target_cols]

# Feature matrix: everything except the labels and the participant ID
X = train.drop(columns=target_cols + ['participant_id'])


# <p style="padding:10px;background-color:#f8df01;margin:0;color:#102d02;font-family:newtimeroman;font-size:100%;text-align:center;border-radius:15px 50px;overflow:hidden;font-weight:500;border: 3px solid #16e51e;">Split into Training and Validation sets</p>


In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts


# <p style="padding:10px;background-color:#f8df01;margin:0;color:#102d02;font-family:newtimeroman;font-size:100%;text-align:center;border-radius:15px 50px;overflow:hidden;font-weight:500;border: 3px solid #16e51e;">Building a Multi-Outcome Model</p>

**Tree-Based Methods Handle Multiple Outputs:**
- Tree-based methods such as LightGBM work well for multi-outcome prediction tasks. LightGBM does not natively predict several targets with a single model; instead, a separate model is trained for each target variable. This is a common and effective way to handle multiple outputs, here the ADHD diagnosis (`ADHD_Outcome`) and sex (`Sex_F`) classification targets.


In [None]:
# Fit one independent LightGBM classifier per target and keep them by target name
models = {}
for target in ('ADHD_Outcome', 'Sex_F'):
    print(f"Training model for target: {target}")
    # fit() returns the fitted estimator itself, so it can be stored directly
    models[target] = LGBMClassifier(random_state=42).fit(X_train, y_train[target])


# <p style="padding:10px;background-color:#f8df01;margin:0;color:#102d02;font-family:newtimeroman;font-size:100%;text-align:center;border-radius:15px 50px;overflow:hidden;font-weight:500;border: 3px solid #16e51e;">Evaluation Metric</p>



<div style="text-align: left;">
  <h2></h2>
  <p>The F1 Score is used for evaluation. It is computed as:</p>
  <p>
    $$F_1 = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}$$
  </p>
  <p>Where:</p>
  <ul>
    <li><strong>Precision</strong>: the proportion of true positives among all positive predictions.</li>
    <li><strong>Recall</strong>: the proportion of true positives among all actual positives.</li>
  </ul>
</div>

The F1 Score ranges from 0 to 1, where:
- 1.0: Perfect precision and recall.
- 0.0: No precision or recall.

In [None]:
# Score each fitted model on the validation split with the F1 metric
f1_scores = {}
for name in ('ADHD_Outcome', 'Sex_F'):
    val_pred = models[name].predict(X_val)
    score = f1_score(y_val[name], val_pred)
    f1_scores[name] = score
    print(f"F1 Score for {name}: {score}")

# <p style="padding:10px;background-color:#f8df01;margin:0;color:#102d02;font-family:newtimeroman;font-size:100%;text-align:center;border-radius:15px 50px;overflow:hidden;font-weight:500;border: 3px solid #16e51e;">f1_scores and Simulated Feature Importances</p>

In [None]:
# Plot the validation F1 scores and the feature importances of the fitted
# ADHD_Outcome model.
#
# The previous version of this cell overwrote `f1_scores` with hard-coded
# example numbers and plotted random values as "feature importances", so the
# figure did not describe the trained models. It now reads both from the
# objects produced by the training and evaluation cells.

# Real importances from the fitted model, paired with the training column names.
# With LightGBM's default importance_type ('split') these are split counts.
adhd_model = models['ADHD_Outcome']
importances = np.asarray(adhd_model.feature_importances_)
feature_names = [str(col) for col in X_train.columns]

# Top features, most important first (capped in case there are fewer than 15)
top_n = min(15, importances.size)
top_indices = np.argsort(importances)[::-1][:top_n]
top_features = [feature_names[i] for i in top_indices]
top_importances = importances[top_indices]

# Create a figure with a grid layout
fig, axs = plt.subplots(2, 1, figsize=(10, 12), gridspec_kw={'height_ratios': [1, 2]})

# Plot 1: validation F1 score for each target (values from the evaluation cell)
targets = list(f1_scores.keys())
scores = [float(f1_scores[name]) for name in targets]
sns.barplot(x=targets, y=scores, palette="viridis", ax=axs[0])
axs[0].set_title("F1 Scores for Targets", fontsize=14)
axs[0].set_ylabel("F1 Score")
axs[0].set_ylim(0, 1)
for index, value in enumerate(scores):
    # Annotate each bar with its score, slightly above the bar top
    axs[0].text(index, value + 0.02, f"{value:.2f}", ha='center', fontsize=12)

# Plot 2: feature importance for the ADHD_Outcome model
sns.barplot(x=top_importances, y=top_features, palette="coolwarm", ax=axs[1])
axs[1].set_title(f"Top {top_n} Feature Importances for ADHD_Outcome Model", fontsize=14)
axs[1].set_xlabel("Feature Importance")

# Adjust layout and display the plot
plt.tight_layout()
plt.show()


## This Visualization comprises:
1. **F1 Scores Chart**: A simple bar plot showing model performance for ADHD_Outcome and Sex_F targets using F1 scores, with values annotated for clarity.
2. **Feature Importance Chart**: Highlights the 15 most influential features for predicting ADHD_Outcome, sorted by importance, giving insight into what drives the model's predictions.

In [None]:
# Score the test split with every fitted model; the ID column is not a feature
X_test = test.drop(columns=['participant_id'])
predictions = {name: fitted.predict(X_test) for name, fitted in models.items()}


# <p style="padding:10px;background-color:#f8df01;margin:0;color:#102d02;font-family:newtimeroman;font-size:100%;text-align:center;border-radius:15px 50px;overflow:hidden;font-weight:500;border: 3px solid #16e51e;">Submission</p>


In [None]:
# Prepare the submission file.
#
# The previous version added 'ADHD' and 'female' columns and then dropped them
# again, so the saved file never contained the model predictions. Predictions
# were also assigned by position, with no check that the sample file lists
# participants in the same order as `test`.
sample = pd.read_excel('/kaggle/input/widsdatathon2025/SAMPLE_SUBMISSION.xlsx')

# Predictions are in `test` row order, so pair them with the test IDs first.
# Column names come from the `predictions` keys ('ADHD_Outcome', 'Sex_F').
pred_frame = pd.DataFrame({
    'participant_id': test['participant_id'].to_numpy(),
    **predictions,
})

# NOTE(review): assumes the sample submission has a 'participant_id' column and
# that the expected prediction columns are named like the training targets
# ('ADHD_Outcome', 'Sex_F') — confirm against the competition's submission format.
# Joining on the ID keeps the sample file's row order and guards against
# positional misalignment.
sub = sample[['participant_id']].merge(pred_frame, on='participant_id', how='left')

# Every participant requested by the sample file must have a prediction
missing = sub[list(predictions)].isna().any(axis=1)
if missing.any():
    raise ValueError(
        f"{int(missing.sum())} participant(s) in the sample submission have no prediction"
    )

# Save the submission file
sub.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")


In [None]:
# Show the first rows of the submission table as a final sanity check
sub.head(n=5)