<section id="back"> </section>

<nav style="margin-top: 20px; padding: 10px; background-color: #f0f0f0; border: 10px solid #ccc;">
    <div style="background-color:#2C41FF">
  <h2 style="margin: 0;text-align: center;">Table of Contents</h2>
    </div>
  <ul style="list-style: none; padding: 0;">
    <li style="margin: 5px 0;"><a href="#preSec" style="text-decoration: none; color: #333;">Dataset Explanation </a></li>
    <li style="margin: 5px 0;"><a href="#sec1" style="text-decoration: none; color: #333;">Imports</a></li>
    <li style="margin: 5px 0;"><a href="#sec2" style="text-decoration: none; color: #333;">EDA</a></li>
    <li style="margin: 5px 0;"><a href="#sec3" style="text-decoration: none; color: #333;">Preprocessing</a></li>
    <li style="margin: 5px 0;"><a href="#sec4" style="text-decoration: none; color: #333;">Modeling</a></li>
    <li style="margin: 5px 0;"><a href="#sec5" style="text-decoration: none; color: #333;">Predictions</a></li>
  </ul>
</nav>

<section id="preSec"> </section>

<!DOCTYPE html>
<html>
<head>
</head>
<body>
    <h1 align="center">Dataset Features</h1>
<table>
  <tr>
    <th>Feature Name</th>
    <th>Explanation</th>
  </tr>
  <tr>
    <td>Demographics</td>
    <td>Information about age and sex of participants.</td>
  </tr>
  <tr>
    <td>Internet Use</td>
    <td>Number of hours per day spent using a computer or the internet.</td>
  </tr>
  <tr>
    <td>Children's Global Assessment Scale</td>
    <td>Numeric scale used by mental health clinicians to rate the general functioning of youths under the age of 18.</td>
  </tr>
  <tr>
    <td>Physical Measures</td>
    <td>Collection of blood pressure, heart rate, height, weight, waist, and hip measurements.</td>
  </tr>
  <tr>
    <td>FitnessGram Vitals and Treadmill</td>
    <td>Measurements of cardiovascular fitness assessed using the NHANES treadmill protocol.</td>
  </tr>
  <tr>
    <td>FitnessGram Child</td>
    <td>Health-related physical fitness assessment measuring five parameters: aerobic capacity, muscular strength, muscular endurance, flexibility, and body composition.</td>
  </tr>
  <tr>
    <td>Bio-electric Impedance Analysis</td>
    <td>Measure of key body composition elements, including BMI, fat, muscle, and water content.</td>
  </tr>
  <tr>
    <td>Physical Activity Questionnaire</td>
    <td>Information about children's participation in vigorous activities over the last 7 days.</td>
  </tr>
  <tr>
    <td>Sleep Disturbance Scale</td>
    <td>Scale to categorize sleep disorders in children.</td>
  </tr>
  <tr>
    <td>Actigraphy</td>
    <td>Objective measure of ecological physical activity through a research-grade biotracker.</td>
  </tr>
  <tr>
    <td>Parent-Child Internet Addiction Test </td>
    <td>20-item scale that measures characteristics and behaviors associated with compulsive use of the Internet including compulsivity, escapism, and dependency.</td>
  </tr>
</table>
    <br>
    <p><b>Special Note</b><br>
    Note in particular the field <code>PCIAT-PCIAT_Total</code>. The target <code>sii</code> for this competition is derived from this field as described in the data dictionary: 0 for None, 1 for Mild, 2 for Moderate, and 3 for Severe. Additionally, each participant has been assigned a unique identifier, <code>id</code>.</p>
</body>
</html>

<section id="sec1"> </section>
<h1> Imports </h1>

<a href="#back" style="text-decoration: none; color: #333;">Back to table of contents</a>

In [None]:
# Basic imports
import numpy as np
import pandas as pd
import time
import random
import datetime
import warnings
from tqdm import tqdm
from colorama import Fore, Style
# Silence every FutureWarning for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)
# Also hide warnings whose text starts with this "unused 'verbose' parameter" message
# (presumably raised by one of the boosting libraries imported below — TODO confirm which).
warnings.filterwarnings("ignore", message="Parameters: { 'verbose' } are not used.")

##################### Preprocessing imports 
from sklearn.metrics import make_scorer
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedStratifiedKFold
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingRegressor
from sklearn.inspection import permutation_importance
from sklearn.base import clone
from scipy.optimize import minimize

##################### Models
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.preprocessing import SplineTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

##################### optuna library import
import optuna
import shap
# Seed handed to the shuffled StratifiedKFold splitter built inside TrainML.
random_state = 42
# Number of cross-validation folds TrainML runs; also the column count of its
# per-fold test-prediction matrix.
n_splits = 5

  from pandas.core import (


In [None]:
# Load the preprocessed feature tables and the sample submission file.
train = pd.read_csv("../data/processed/train_processed.csv")
test = pd.read_csv("../data/processed/test_processed.csv")
sample = pd.read_csv('../data/raw/sample_submission.csv')

# Keep only the rows that carry a target label and renumber them from 0.
# reset_index(drop=True) discards the old index directly; the previous
# reset_index().drop('index', axis=1) round trip would remove a genuine
# 'index' column (and leave a stray 'level_0') if the frame ever had one.
train = train.dropna(subset=['sii']).reset_index(drop=True)
# Restrict the training frame to numeric columns.
train = train.select_dtypes(include='number')

In [None]:
def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` using quadratic weights.

    Thin wrapper around ``cohen_kappa_score`` that pins ``weights='quadratic'``.
    """
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_rounder(oof_non_rounded, thresholds):
    """Map continuous predictions to the labels 0-3 using three cut points.

    A value gets label ``k`` for the first ``k`` in 0, 1, 2 with
    ``value < thresholds[k]``; values below none of the cut points get 3.
    The thresholds are checked in that order, so they do not have to be sorted.
    """
    # Build the cascade inside-out: start from the fallback label and wrap it
    # with one np.where per threshold, finishing with the highest-priority one.
    labels = 3
    for label in (2, 1, 0):
        labels = np.where(oof_non_rounded < thresholds[label], label, labels)
    return labels

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold tuning: the negated QWK of thresholded predictions.

    ``oof_non_rounded`` is converted to labels with ``threshold_rounder`` and
    scored against ``y_true``. The score is negated so that a minimizer
    maximizes the kappa.
    """
    labels = threshold_rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, labels)
    return -kappa

In [None]:
# Function to evaluate multiple models and find the best one based on optimized QWK
def evaluate_models(models, X, y, test, n_splits=5):
    """
    Cross-validate several models and return the one with the best optimized QWK.

    For every model, out-of-fold predictions are collected over a shuffled
    StratifiedKFold split (random_state=42). Three cut points are then tuned
    with Nelder-Mead, starting from [0.5, 1.5, 2.5], and the quadratic weighted
    kappa of the thresholded out-of-fold predictions against ``y`` is the
    model's score.

    Parameters:
        models: iterable of estimators exposing ``fit`` and ``predict``. Each
            one is fitted in place fold after fold, so after the call it holds
            the fit from the last fold.
        X: training features; rows are selected with ``.iloc``.
        y: training labels; rows are selected with ``.iloc``.
        test: accepted for backward compatibility; not used by the evaluation.
        n_splits: number of splits for cross-validation

    Returns:
        best_model: the model with the highest QWK score (None if ``models``
            is empty)
        model_scores: dictionary mapping each model object to its QWK score
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    model_scores = {}
    best_model = None
    best_qwk = -np.inf  # Any real score beats this sentinel

    for model in models:
        print(f"Evaluating model: {model}")

        oof_non_rounded = np.zeros(len(y))

        for train_idx, val_idx in skf.split(X, y):
            # Fit on the training part of the fold, predict the held-out part.
            model.fit(X.iloc[train_idx], y.iloc[train_idx])
            oof_non_rounded[val_idx] = model.predict(X.iloc[val_idx])

        # Optimize thresholds on the out-of-fold predictions
        initial_thresholds = [0.5, 1.5, 2.5]
        result = minimize(evaluate_predictions, x0=initial_thresholds,
                          args=(y, oof_non_rounded), method='Nelder-Mead')
        if not result.success:
            # Keep going with the last iterate, but do not hide the failure.
            warnings.warn(
                f"Threshold optimization did not converge for {model}: {result.message}"
            )

        # Apply the optimized thresholds
        optimized_thresholds = result.x
        final_predictions = threshold_rounder(oof_non_rounded, optimized_thresholds)

        # Calculate QWK for the model
        qwk_score = quadratic_weighted_kappa(y, final_predictions)
        model_scores[model] = qwk_score

        print(f"Model: {model}, QWK Score: {qwk_score:.4f}, Optimized Thresholds: {optimized_thresholds}")

        # Update the best model if this one is better
        if qwk_score > best_qwk:
            best_qwk = qwk_score
            best_model = model

    print(f"\nBest Model: {best_model}, Best QWK Score: {best_qwk:.4f}")
    return best_model, model_scores

In [None]:
from IPython.display import clear_output
def TrainML(model_class, test_data):
    """Cross-validate an estimator on the global ``train`` frame and build a submission.

    A fresh ``clone`` of ``model_class`` is fitted on every fold of a shuffled
    StratifiedKFold (``n_splits`` folds, seeded with ``random_state``). Per-fold
    train and validation QWK are computed on predictions passed through
    ``.round(0)``. Three thresholds are then tuned with Nelder-Mead on the
    out-of-fold predictions and applied to the fold-averaged predictions for
    ``test_data``.

    Parameters:
        model_class: estimator instance (despite the name) that ``clone`` can copy.
        test_data: feature table to predict on.

    Returns:
        DataFrame holding the ``id`` column of the global ``sample`` frame and
        the thresholded ``sii`` predictions.

    Raises:
        AssertionError: if the threshold optimizer reports that it did not converge.
    """
    features = train.drop(['sii'], axis=1)
    target = train['sii']
    n_rows = len(target)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_train_qwk = []
    fold_val_qwk = []

    oof_raw = np.zeros(n_rows, dtype=float)
    # Rounded out-of-fold predictions; filled per fold but not read afterwards.
    oof_int = np.zeros(n_rows, dtype=int)
    # One column of test predictions per fold; averaged after the loop.
    fold_test_preds = np.zeros((len(test_data), n_splits))

    fold_iter = tqdm(splitter.split(features, target), desc="Training Folds", total=n_splits)
    for fold_no, (fit_rows, holdout_rows) in enumerate(fold_iter):
        X_fit, y_fit = features.iloc[fit_rows], target.iloc[fit_rows]
        X_holdout, y_holdout = features.iloc[holdout_rows], target.iloc[holdout_rows]

        # Clone so that every fold starts from an unfitted copy.
        estimator = clone(model_class)
        estimator.fit(X_fit, y_fit)

        fit_pred = estimator.predict(X_fit)
        holdout_pred = estimator.predict(X_holdout)

        holdout_pred_int = holdout_pred.round(0).astype(int)
        oof_raw[holdout_rows] = holdout_pred
        oof_int[holdout_rows] = holdout_pred_int

        fit_qwk = quadratic_weighted_kappa(y_fit, fit_pred.round(0).astype(int))
        holdout_qwk = quadratic_weighted_kappa(y_holdout, holdout_pred_int)
        fold_train_qwk.append(fit_qwk)
        fold_val_qwk.append(holdout_qwk)

        fold_test_preds[:, fold_no] = estimator.predict(test_data)

        print(f"Fold {fold_no + 1} - Train QWK: {fit_qwk:.4f}, Validation QWK: {holdout_qwk:.4f}")
        # wait=True defers the clearing until the next output arrives.
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(fold_train_qwk):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(fold_val_qwk):.4f}")

    opt_result = minimize(
        evaluate_predictions,
        x0=[0.5, 1.5, 2.5],
        args=(target, oof_raw),
        method='Nelder-Mead',
    )
    assert opt_result.success, "Optimization did not converge."
    tuned_thresholds = opt_result.x

    oof_labels = threshold_rounder(oof_raw, tuned_thresholds)
    tuned_qwk = quadratic_weighted_kappa(target, oof_labels)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tuned_qwk:.3f}{Style.RESET_ALL}")

    # Average the per-fold test predictions, then cut them with the tuned thresholds.
    test_mean = fold_test_preds.mean(axis=1)
    test_labels = threshold_rounder(test_mean, tuned_thresholds)

    return pd.DataFrame({
        'id': sample['id'],
        'sii': test_labels,
    })

In [None]:
# Base regressors. The only arguments passed are the verbose flags for
# LightGBM and CatBoost; everything else stays at library defaults.
Light = LGBMRegressor(verbose = -1)
XGB_Model = XGBRegressor()
CatBoost_Model = CatBoostRegressor(verbose = False)


# Combine the three regressors into one VotingRegressor ensemble.
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model)
])

# Cross-validate the ensemble and build the submission frame for `test`.
Submission = TrainML(voting_model, test)

# Bare expression: shows the submission table as the cell output.
Submission

Training Folds: 100%|██████████| 5/5 [00:52<00:00, 10.45s/it]

Mean Train QWK --> 0.9384
Mean Validation QWK ---> 0.3946





----> || Optimized QWK SCORE :: [36m[1m 0.434[0m


Unnamed: 0,id,sii
0,00008ff9,2
1,000fd460,0
2,00105258,0
3,00115b9f,1
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,1


In [None]:
# index=False keeps the DataFrame's row index out of the file, so the CSV
# contains exactly the `id` and `sii` columns instead of an extra unnamed
# leading column.
Submission.to_csv("submission.csv", index=False)