In [1]:
import numpy as np
import pandas as pd
import os
import re
import copy
import pickle
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline

import plotly.express as px

warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
SEED = 42
n_splits = 5

In [2]:
import random
import torch
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
seed_everything(100)

# Define function

In [3]:
class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*3),
            nn.ReLU(),
            nn.Linear(encoding_dim*3, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim*3),
            nn.ReLU(),
            nn.Linear(encoding_dim*3, input_dim),
            nn.Sigmoid()
        )

        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    
    data_tensor = torch.FloatTensor(df_scaled)
    
    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())
    
    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()
            
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}]')
                 
    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()
        
    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])
    
    return df_encoded

In [4]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]


def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    stats, indexes = zip(*results)
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

def update(df):
    global cat_c
    for c in cat_c: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df

def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def threshold_Rounder(oof_non_rounded, thresholds):
    return np.where(oof_non_rounded < thresholds[0], 0,
                    np.where(oof_non_rounded < thresholds[1], 1,
                             np.where(oof_non_rounded < thresholds[2], 2, 3)))

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_Rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

def TrainML(model_class, X, y, test_data):
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_data)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)
    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    print('OPTIMIZED THRESHOLDS', KappaOPtimizer.x)
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })
    optimized_thresholds = KappaOPtimizer.x
    return submission, oof_tuned, oof_non_rounded, y, optimized_thresholds



# Define features

## Normal features

In [5]:
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

total_features = list(test.columns)
total_features.remove('id')

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

In [6]:
noseason_features = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'BMI_Age','Internet_Hours_Age','BMI_Internet_Hours',
                'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight',
                'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW','BMI_PHR']

## Loading timeseries

In [7]:
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

100%|██████████| 996/996 [01:12<00:00, 13.76it/s]
100%|██████████| 2/2 [00:00<00:00, 11.43it/s]


In [8]:
df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

In [9]:
df_total = pd.concat([df_train, df_test], axis=0, ignore_index=True)
df_total

Unnamed: 0,stat_0,stat_1,stat_2,stat_3,stat_4,stat_5,stat_6,stat_7,stat_8,stat_9,stat_10,stat_11,stat_12,stat_13,stat_14,stat_15,stat_16,stat_17,stat_18,stat_19,stat_20,stat_21,stat_22,stat_23,stat_24,stat_25,stat_26,stat_27,stat_28,stat_29,stat_30,stat_31,stat_32,stat_33,stat_34,stat_35,stat_36,stat_37,stat_38,stat_39,stat_40,stat_41,stat_42,stat_43,stat_44,stat_45,stat_46,stat_47,stat_48,stat_49,stat_50,stat_51,stat_52,stat_53,stat_54,stat_55,stat_56,stat_57,stat_58,stat_59,stat_60,stat_61,stat_62,stat_63,stat_64,stat_65,stat_66,stat_67,stat_68,stat_69,stat_70,stat_71,stat_72,stat_73,stat_74,stat_75,stat_76,stat_77,stat_78,stat_79,stat_80,stat_81,stat_82,stat_83,stat_84,stat_85,stat_86,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95
0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,-0.054638,-0.163923,-0.114302,0.045252,-7.805897,0.000000,46.009533,4027.514893,5.415475e+13,4.438860,2.000000,30.202068,0.633126,0.513286,0.500372,0.132576,34.917873,0.000000,205.862213,108.451317,1.876976e+13,1.825557,0.000000,11.773107,-1.812031,-2.631380,-1.798073,0.0,-89.987045,0.0,0.0,3829.000000,0.000000e+00,1.0,2.0,15.0,-0.701660,-0.619076,-0.536432,0.007953,-32.948602,0.000000,2.520257,3958.000000,4.325125e+13,3.0,2.0,17.0,0.015846,-0.141810,-0.104193,0.019257,-6.358004,0.0,8.230733,4029.0,5.630500e+13,5.0,2.0,28.0,0.437897,0.148919,0.223770,0.036048,13.095750,0.0,24.750000,4146.000000,6.978000e+13,6.0,2.0,38.0,1.850391,3.580182,1.738203,5.314874,89.422226,0.0,2626.199951,4187.0,8.639500e+13,7.0,2.0,57.0
1,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,0.113277,0.093139,-0.106038,0.028960,-6.065619,0.046508,56.437958,3829.466064,4.331149e+13,3.840885,2.000000,232.909103,0.507897,0.541129,0.603787,0.096825,44.034721,0.208482,206.625092,167.600983,2.509136e+13,1.957999,0.000000,5.701968,-1.807955,-2.887664,-1.004992,0.0,-89.654587,0.0,0.0,3098.166748,0.000000e+00,1.0,2.0,223.0,-0.231743,-0.257600,-0.595426,0.000367,-37.326844,0.000000,4.000000,3724.000000,2.128500e+13,2.0,2.0,228.0,0.094074,0.068143,-0.228500,0.005257,-13.454103,0.0,10.050480,3812.0,4.360500e+13,4.0,2.0,233.0,0.517859,0.542323,0.312333,0.020598,18.462269,0.0,27.490936,3958.000000,6.511000e+13,5.0,2.0,238.0,1.928769,3.234613,2.475326,3.966906,89.080330,1.0,2628.199951,4146.0,8.639500e+13,7.0,2.0,243.0
2,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,-0.499738,0.046381,-0.181152,0.056544,-11.934993,0.000000,77.305130,4106.425781,4.481677e+13,3.148264,3.000000,100.144516,0.454021,0.510668,0.412588,0.140594,27.367514,0.000000,274.848145,50.734318,2.038156e+13,1.169176,0.000000,5.653936,-1.903281,-3.150104,-1.020313,0.0,-89.540176,0.0,0.0,3853.000000,4.500000e+10,1.0,3.0,97.0,-0.873151,-0.255299,-0.485521,0.005643,-30.154542,0.000000,2.918126,4089.625000,2.888500e+13,3.0,3.0,98.0,-0.644505,0.088542,-0.191693,0.018467,-11.570901,0.0,7.863636,4111.0,4.727000e+13,3.0,3.0,99.0,-0.242422,0.381953,0.088555,0.048282,5.009753,0.0,21.022933,4140.000000,6.094500e+13,4.0,3.0,100.0,1.021510,1.016589,1.746797,5.066334,86.987267,0.0,2618.199951,4183.0,8.636500e+13,7.0,3.0,134.0
3,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,0.007430,0.007583,-0.196510,0.053544,-12.847143,0.000000,9.369678,3958.604492,4.836642e+13,4.273992,2.303057,60.025017,0.586100,0.542189,0.474437,0.103401,32.552841,0.000000,54.104408,122.706802,1.868773e+13,2.023705,1.487018,7.396456,-1.684624,-2.405738,-1.023798,0.0,-89.968369,0.0,0.0,3468.000000,0.000000e+00,1.0,1.0,48.0,-0.530198,-0.412805,-0.556091,0.009947,-34.965618,0.000000,0.893617,3841.000000,3.526000e+13,3.0,1.0,53.0,0.022344,0.009674,-0.245181,0.027653,-15.000056,0.0,2.340206,3947.0,4.881000e+13,4.0,1.0,60.0,0.536801,0.443383,0.084469,0.057278,4.816339,0.0,6.200000,4064.000000,6.330000e+13,6.0,4.0,67.0,5.908000,2.083693,1.269051,6.134459,89.976074,0.0,2502.000000,6000.0,8.639500e+13,7.0,4.0,72.0
4,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,0.086653,-0.115162,-0.138969,0.040399,-11.009835,0.000000,5.049157,3992.347656,5.833895e+13,4.541829,4.000000,46.192024,0.509845,0.494897,0.639449,0.090201,47.933723,0.000000,15.590773,126.121590,2.146206e+13,2.081796,0.000000,18.615358,-1.675859,-1.071042,-1.012266,0.0,-89.770241,0.0,0.0,3815.083252,3.500000e+10,1.0,4.0,20.0,-0.224805,-0.444297,-0.685736,0.005364,-46.348264,0.000000,1.438378,3837.333252,5.161375e+13,3.0,4.0,32.0,0.053034,-0.087422,-0.225430,0.024135,-13.665493,0.0,2.897436,4000.0,6.427000e+13,4.0,4.0,42.0,0.544297,0.153125,0.347474,0.043690,20.726226,0.0,4.942201,4087.000000,7.393625e+13,7.0,4.0,69.0,3.231563,1.033620,1.071875,2.774382,89.300034,0.0,1046.800049,4199.0,8.601500e+13,7.0,4.0,76.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,-0.147508,-0.047232,-0.242875,0.027135,-18.903458,0.222337,10.387013,3841.772705,4.316802e+13,4.002807,1.000000,67.532288,0.478085,0.499994,0.622155,0.109624,48.017563,0.410910,75.709877,164.142853,2.506494e+13,1.929882,0.000000,6.580971,-1.508058,-2.958281,-1.013423,0.0,-89.887924,0.0,0.0,3098.166748,0.000000e+00,1.0,1.0,56.0,-0.552659,-0.354082,-0.850300,0.000000,-58.557291,0.000000,0.555556,3741.000000,2.137000e+13,2.0,1.0,62.0,-0.112749,0.003331,-0.333463,0.002575,-20.125556,0.0,2.107143,3812.0,4.307000e+13,4.0,1.0,68.0,0.140716,0.280936,0.231454,0.012770,13.528161,0.0,5.281850,3958.000000,6.502500e+13,6.0,1.0,73.0,0.999923,1.043029,1.547813,3.692727,89.333710,1.0,2592.199951,4178.0,8.639500e+13,7.0,1.0,79.0
994,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,-0.441574,-0.080691,-0.270330,0.037183,-17.535593,0.000000,11.325677,4123.798828,4.597792e+13,2.487963,1.000000,154.201294,0.502446,0.457471,0.470241,0.064660,32.590225,0.000000,35.017689,28.219002,2.436134e+13,2.188225,0.000000,0.726917,-1.073320,-1.455156,-1.016536,0.0,-87.998444,0.0,0.0,4073.000000,0.000000e+00,1.0,1.0,153.0,-0.831641,-0.369779,-0.664401,0.009702,-41.512409,0.000000,2.748235,4099.000000,2.505000e+13,1.0,1.0,154.0,-0.599089,-0.068216,-0.282813,0.020775,-16.773024,0.0,5.729136,4123.0,4.992000e+13,2.0,1.0,154.0,-0.214362,0.210247,0.034375,0.039810,1.885406,0.0,10.699164,4146.000000,6.662500e+13,2.0,1.0,155.0,1.004674,0.981576,0.999219,1.673958,88.629547,0.0,1875.199951,4183.0,8.639500e+13,7.0,1.0,155.0
995,342324.0,342324.0,342324.0,342324.0,342324.0,342324.0,342324.0,342324.0,342324.0,342324.0,342324.0,342324.0,-0.181627,-0.300301,0.240738,0.002993,14.728157,0.749290,5.465665,3891.058105,4.310022e+13,4.144179,2.860763,10.118834,0.260311,0.324098,0.799344,0.009499,60.556572,0.421324,29.646894,124.940826,2.503642e+13,1.964386,1.455972,5.731455,-1.019361,-1.177506,-1.011560,0.0,-89.530304,0.0,0.0,3718.000000,0.000000e+00,1.0,1.0,0.0,-0.267668,-0.609891,-0.947463,0.000000,-72.318169,0.488889,0.872747,3788.000000,2.139500e+13,3.0,1.0,5.0,-0.144425,-0.288146,0.719540,0.000011,46.380806,1.0,2.205128,3848.5,4.279000e+13,4.0,4.0,10.0,-0.061189,-0.132583,0.863528,0.004280,57.963976,1.0,5.808605,3982.000000,6.500000e+13,6.0,4.0,15.0,1.015231,1.051578,1.006835,1.009104,88.652969,1.0,1196.599976,4176.0,8.639500e+13,7.0,4.0,20.0
996,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,-0.316384,0.016009,-0.167890,0.047388,-10.580416,0.000000,42.296310,4053.579102,5.046215e+13,4.470182,3.000000,53.201683,0.453665,0.502702,0.585710,0.106351,42.947170,0.000000,208.168976,112.404045,1.942842e+13,1.931421,0.000000,14.244914,-1.746094,-2.905339,-1.048372,0.0,-89.833092,0.0,0.0,3824.000000,5.500000e+10,1.0,3.0,41.0,-0.684180,-0.309863,-0.649974,0.006432,-41.541863,0.000000,2.392969,4028.666748,3.689000e+13,3.0,3.0,42.0,-0.366849,0.024974,-0.245378,0.023637,-15.086617,0.0,6.926828,4070.0,5.347750e+13,5.0,3.0,50.0,-0.010677,0.400677,0.204727,0.041420,12.220764,0.0,15.000000,4147.000000,6.640875e+13,6.0,3.0,53.0,1.507865,1.666354,1.546979,4.004276,89.751656,0.0,2633.250000,4188.5,8.611000e+13,7.0,3.0,85.0


In [10]:
# train_ts_encoded = perform_autoencoder(df_train, encoding_dim=60, epochs=100, batch_size=32)
# test_ts_encoded = perform_autoencoder(df_test, encoding_dim=60, epochs=100, batch_size=32)
total_ts_encoded = perform_autoencoder(df_total, encoding_dim=60, epochs=100, batch_size=32)

Epoch [10/100], Loss: 1.3438]
Epoch [20/100], Loss: 1.2988]
Epoch [30/100], Loss: 1.2560]
Epoch [40/100], Loss: 1.2517]
Epoch [50/100], Loss: 1.2493]
Epoch [60/100], Loss: 1.2415]
Epoch [70/100], Loss: 1.2394]
Epoch [80/100], Loss: 1.2374]
Epoch [90/100], Loss: 1.2351]
Epoch [100/100], Loss: 1.2149]


In [11]:
df_train.shape[1]

96

In [12]:
train_ts_encoded = total_ts_encoded.head(df_train.shape[0])
test_ts_encoded = total_ts_encoded.drop(train_ts_encoded.index, axis=0)
test_ts_encoded.reset_index(inplace=True, drop=True)

In [13]:
test_ts_encoded

Unnamed: 0,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_30,Enc_31,Enc_32,Enc_33,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_44,Enc_45,Enc_46,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_58,Enc_59,Enc_60
0,0.0,1.09423,2.256393,5.124166,0.0,0.0,0.0,0.107032,0.0,0.786884,0.459363,0.652831,0.0,0.0,1.166632,3.056435,0.0,2.928739,4.444079,2.896902,3.543743,5.913033,4.553603,1.358083,2.386111,3.930094,0.0,0.0,6.151674,3.435974,1.083235,1.190311,0.0,0.815278,0.0,5.340208,0.793149,2.689621,1.04798,0.0,9.533504,4.65321,5.165917,1.981167,2.671114,2.794984,0.0,0.0,0.0,0.643714,0.0,0.0,0.0,0.0,0.107868,0.872594,5.484861,0.0,0.0,0.0
1,2.69017,1.914039,0.0,3.089341,0.133772,0.0,0.0,4.69789,6.488383,3.675978,0.0,4.902103,0.0,0.0,4.069793,2.508291,0.0,2.378166,4.974399,0.0,4.726307,4.137166,0.0,0.0,4.070021,0.556466,0.0,0.465911,0.0,1.244815,3.309381,1.847435,0.0,1.690895,0.0,4.0228,0.0,1.299309,5.148935,0.0,3.391294,0.0,1.849268,0.0,3.098395,0.796231,0.0,0.0,0.0,0.0,0.0,4.092525,0.0,0.0,6.390243,4.60259,6.508249,3.933432,8.316557,0.0


In [14]:
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

## Features timeseries

In [15]:
time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")
time_encoded_cols = train_ts_encoded.columns.tolist()
time_encoded_cols.remove("id")

# Submission 1

In [16]:
# features_sub1 = total_features + time_series_cols

# train_sub1 = pd.merge(train, train_ts, how="left", on='id')
# test_sub1 = pd.merge(test, test_ts, how="left", on='id')

# train_sub1 = train_sub1.drop('id', axis=1)
# test_sub1 = test_sub1.drop('id', axis=1)   

# train_sub1 = train_sub1.dropna(subset='sii')


In [17]:
# train_sub1 = update(train_sub1)
# test_sub1 = update(test_sub1)
# for col in cat_c:
#     mapping = create_mapping(col, train_sub1)
#     mappingTe = create_mapping(col, test_sub1)
    
#     train_sub1[col] = train_sub1[col].replace(mapping).astype(int)
#     test_sub1[col] = test_sub1[col].replace(mappingTe).astype(int)

In [18]:
# X_sub1 = train_sub1[features_sub1]
# y_sub1 = train_sub1['sii']
# test_sub1 = test_sub1[features_sub1]

In [19]:
# SEED = 42
# n_splits = 5

# model = XGBRegressor(
#     learning_rate=0.05,
#     max_depth=6,
#     n_estimators=200,
#     subsample=0.8,
#     colsample_bytree = 0.8,
#     reg_alpha=1,
#     reg_lambda=5,
#     random_state=SEED
# )

# # we get out of fold predictions for further exploration
# submission, y_pred, y_pred_non_rounded, y_true, optimized_thresholds = TrainML(model, X_sub1, y_sub1, test_sub1)

# Submission 2

In [20]:
!pip -q install /kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl

In [21]:
from pytorch_tabnet.tab_model import TabNetRegressor


In [22]:
def feature_engineering(df):
    season_cols = [col for col in df.columns if 'Season' in col]
    df = df.drop(season_cols, axis=1) 
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']
    df['BMI_PHR'] = df['Physical-BMI'] * df['Physical-HeartRate']
    
    return df

In [23]:
# notna_df = train.dropna(subset='sii', ignore_index=True)
# na_df = train.drop(notna_df.index, axis=0)
# na_df

In [24]:
train_sub2 = pd.merge(train, train_ts_encoded, how="left", on='id')
test_sub2 = pd.merge(test, test_ts_encoded, how="left", on='id')

# imputer = KNNImputer(n_neighbors=5)
# numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
# imputed_data = imputer.fit_transform(train_sub2[numeric_cols])
# train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
# train_imputed['sii'] = train_imputed['sii'].round().astype(int)

# for col in train_sub2.columns:
#     if col not in numeric_cols:
#         train_imputed[col] = train_sub2[col]
        
# train_sub2 = train_imputed

train_sub2 = feature_engineering(train_sub2)
train_sub2 = train_sub2.dropna(subset='sii', ignore_index=True)
test_sub2 = feature_engineering(test_sub2)

train_sub2 = train_sub2.drop('id', axis=1)
test_sub2  = test_sub2.drop('id', axis=1)   

In [25]:
# train_sub2 = train_sub2.drop(na_df.index, axis=0)
# train_sub2.reset_index(inplace=True, drop=True)
train_sub2

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,sii,Enc_1,Enc_2,Enc_3,Enc_4,Enc_5,Enc_6,Enc_7,Enc_8,Enc_9,Enc_10,Enc_11,Enc_12,Enc_13,Enc_14,Enc_15,Enc_16,Enc_17,Enc_18,Enc_19,Enc_20,Enc_21,Enc_22,Enc_23,Enc_24,Enc_25,Enc_26,Enc_27,Enc_28,Enc_29,Enc_30,Enc_31,Enc_32,Enc_33,Enc_34,Enc_35,Enc_36,Enc_37,Enc_38,Enc_39,Enc_40,Enc_41,Enc_42,Enc_43,Enc_44,Enc_45,Enc_46,Enc_47,Enc_48,Enc_49,Enc_50,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_58,Enc_59,Enc_60,BMI_Age,Internet_Hours_Age,BMI_Internet_Hours,BFP_BMI,FFMI_BFP,FMI_BFP,LST_TBW,BFP_BMR,BFP_DEE,BMR_Weight,DEE_Weight,SMM_Height,Muscle_to_Fat,Hydration_Status,ICW_TBW,BMI_PHR
0,5,0,51.0,16.877316,46.0,50.8,,,,,,,,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,2.0,2.66855,16.8792,932.498,1492.00,8.25598,41.5862,13.8177,3.061430,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,,,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,84.386578,15.0,50.631947,0.545865,1.499679,0.332267,1.190475,8591.822097,13746.944840,18.356260,29.370079,0.424811,6.383063,0.643522,0.747453,
1,9,0,,14.035590,48.0,46.0,22.0,75.0,70.0,122.0,,,,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.211720,3.97085,1.0,21.0352,14.97400,39.4497,15.4107,27.0552,,2.340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,64.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,126.320313,0.0,0.000000,0.282883,3.229888,0.305154,1.458119,3719.320478,5950.914352,20.362087,32.579348,0.321056,12.718037,0.588157,0.777492,982.491320
2,10,1,71.0,16.648696,56.5,75.6,,65.0,94.0,117.0,5.0,7.0,33.0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,2.170,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,38.0,54.0,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,166.486961,20.0,33.297392,,,,,,,,,,,,,1564.977430
3,9,0,71.0,18.292347,56.0,81.6,,60.0,97.0,117.0,6.0,9.0,37.0,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3.0,3.84191,18.2943,1131.430,1923.44,15.59250,62.7757,14.0740,4.220330,18.82430,2.0,30.4041,16.77900,58.9338,26.4798,45.9966,,2.451,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,31.0,45.0,0.0,1.0,0.000000,1.094230,2.256393,5.124166,0.000000,0.000000,0.0,0.107032,0.000000,0.786884,0.459363,0.652831,0.0,0.0,1.166632,3.056435,0.0,2.928739,4.444079,2.896902,3.543743,5.913033,4.553603,1.358083,2.386111,3.930094,0.0,0.000000,6.151674,3.435974,1.083235,1.190311,0.0,0.815278,0.0,5.340208,0.793149,2.689621,1.047980,0.0,9.533504,4.653210,5.165917,1.981167,2.671114,2.794984,0.0,0.000000,0.000000,0.643714,0.000000,0.000000,0.000000,0.0,0.107868,0.872594,5.484861,0.000000,0.000000,0.0,164.631122,0.0,0.000000,1.028971,0.747651,0.224196,1.281264,21298.377749,36207.411592,13.865564,23.571569,0.472854,6.274343,0.563684,0.661008,1774.357653
4,13,1,50.0,22.279952,59.5,112.2,,60.0,73.0,102.0,,,,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,2.0,4.33036,30.1865,1330.970,1996.45,30.21240,84.0285,16.6877,13.498800,67.97150,2.0,32.9141,20.90200,79.6982,35.3804,63.1265,,4.110,3.0,3.0,3.0,0.0,2.0,1.0,0.0,2.0,2.0,1.0,0.0,1.0,3.0,3.0,2.0,1.0,3.0,1.0,2.0,1.0,34.0,40.0,56.0,0.0,1.0,2.690170,1.914039,0.000000,3.089341,0.133772,0.000000,0.0,4.697890,6.488383,3.675978,0.000000,4.902103,0.0,0.0,4.069793,2.508291,0.0,2.378166,4.974399,0.000000,4.726307,4.137166,0.000000,0.000000,4.070021,0.556466,0.0,0.465911,0.000000,1.244815,3.309381,1.847435,0.0,1.690895,0.0,4.022800,0.000000,1.299309,5.148935,0.0,3.391294,0.000000,1.849268,0.000000,3.098395,0.796231,0.0,0.000000,0.000000,0.000000,0.000000,4.092525,0.000000,0.0,6.390243,4.602590,6.508249,3.933432,8.316557,0.0,289.639376,0.0,0.000000,2.251718,0.245510,0.198595,1.262516,90468.027355,135701.701175,11.862478,17.793672,0.594629,2.621003,0.562625,0.521399,1626.436495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2731,8,0,,17.139810,52.5,67.2,25.0,60.0,65.0,112.0,,,,0.0,0.0,,,,,0.0,0.0,8.0,1.0,10.0,1.0,12.0,1.0,3.0,3.20303,17.1417,1035.270,1759.96,11.00630,52.5331,13.4004,3.741300,14.66690,1.0,25.7118,15.81500,49.3301,20.2645,36.7181,,3.440,3.0,3.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,3.0,0.0,2.0,2.0,1.0,22.0,41.0,58.0,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,137.118476,16.0,34.279619,0.855627,0.913649,0.255085,1.343482,15184.201563,25813.157324,15.405804,26.189881,0.385990,5.416433,0.546400,0.700249,1114.087619
2732,7,1,,13.927006,48.5,46.6,23.0,65.0,75.0,105.0,,,,0.0,0.0,,,,,0.0,0.0,9.0,0.0,8.5,0.0,4.5,0.0,1.0,2.36680,13.6457,966.287,1256.17,9.98802,45.1853,13.2315,0.414263,1.41470,1.0,20.0572,15.14000,42.8185,18.0937,30.0453,,,1.0,3.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,5.0,1.0,0.0,5.0,3.0,3.0,3.0,0.0,33.0,48.0,67.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,97.489042,0.0,0.000000,0.103674,9.352866,0.292827,1.425131,1367.006219,1777.103699,20.735773,26.956438,0.373066,43.676843,0.644749,0.667565,1044.525455
2733,13,0,60.0,16.362460,59.5,82.4,,71.0,70.0,104.0,,,,16.0,0.0,18.0,1.0,19.9,2.0,10.0,1.0,8.0,1.0,9.0,1.0,12.0,1.0,3.0,4.52277,16.3642,1206.880,2051.70,19.46110,70.8117,14.0629,2.301380,11.58830,1.0,33.3709,17.97970,66.2889,29.7790,52.8320,,3.260,3.0,3.0,3.0,2.0,3.0,2.0,2.0,2.0,2.0,1.0,2.0,0.0,2.0,0.0,1.0,0.0,2.0,1.0,1.0,0.0,32.0,35.0,50.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,212.711984,13.0,16.362460,0.708149,1.213543,0.198595,1.254711,13985.687504,23775.715110,14.646602,24.899272,0.500487,12.939628,0.641165,0.631642,1145.372220
2734,11,0,68.0,21.441500,60.0,109.8,,79.0,99.0,116.0,,,,15.0,1.0,18.5,2.0,15.8,2.0,0.0,0.0,10.0,1.0,10.0,1.0,14.0,1.0,2.0,4.41305,21.4438,1253.740,2005.99,20.48250,75.8033,14.8043,6.639520,33.99670,2.0,33.9805,21.34030,71.3903,28.7792,54.4630,,2.729,5.0,5.0,3.0,0.0,5.0,1.0,0.0,2.0,0.0,2.0,1.0,0.0,1.0,3.0,0.0,0.0,1.0,1.0,0.0,1.0,31.0,56.0,77.0,0.0,1.0,2.751797,3.372016,0.182782,1.759222,1.144387,8.067786,0.0,4.327469,6.318560,2.578479,3.539616,4.657750,0.0,0.0,5.000157,5.730145,0.0,0.473726,4.354885,1.680674,0.000000,0.000000,0.000000,2.286090,0.000000,0.000000,0.0,4.753076,1.169155,0.000000,2.983762,2.141799,0.0,2.262909,0.0,1.271944,2.008945,1.697994,4.977267,0.0,0.557994,0.643278,1.318359,0.521990,2.812748,0.010672,0.0,0.000000,0.000000,4.151740,5.311646,15.858729,2.678568,0.0,7.260725,2.009896,0.234238,4.710312,0.000000,0.0,235.856500,0.0,0.000000,1.585386,0.435463,0.195299,1.310804,42623.022658,68197.040233,11.418397,18.269490,0.479653,4.334530,0.496020,0.623919,2122.708500


In [26]:
features_sub2 = noseason_features + time_encoded_cols

# train_sub2 = pd.merge(train, train_ts, how="left", on='id')
# test_sub2 = pd.merge(test, test_ts, how="left", on='id')

train_sub2 = train_sub2.dropna(subset='sii')


In [27]:
if np.any(np.isinf(train_sub2)):
    train_sub2 = train_sub2.replace([np.inf, -np.inf], np.nan)

In [28]:
X_sub2 = train_sub2[features_sub2]
y_sub2 = train_sub2['sii']
test_sub2 = test_sub2[features_sub2]

In [29]:
y_sub2

0       2.0
1       0.0
2       0.0
3       1.0
4       1.0
       ... 
2731    0.0
2732    1.0
2733    1.0
2734    1.0
2735    0.0
Name: sii, Length: 2736, dtype: float64

In [30]:
# Model parameters for LightGBM
Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01,  # Increased from 2.68e-06
    'device': 'cpu'

}


# XGBoost parameters
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': SEED,
    'tree_method': 'gpu_hist',

}


CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 10,  # Increase this value
    'task_type': 'GPU'

}

In [31]:
# New: TabNet

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from pytorch_tabnet.callbacks import Callback
import os
import torch
from pytorch_tabnet.callbacks import Callback

class TabNetWrapper(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        self.model = TabNetRegressor(**kwargs)
        self.kwargs = kwargs
        self.imputer = SimpleImputer(strategy='median')
        self.best_model_path = 'best_tabnet_model.pt'
        
    def fit(self, X, y):
        # Handle missing values
        X_imputed = self.imputer.fit_transform(X)
        
        if hasattr(y, 'values'):
            y = y.values
            
        # Create internal validation set
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_imputed, 
            y, 
            test_size=0.2,
            random_state=42
        )
        
        # Train TabNet model
        history = self.model.fit(
            X_train=X_train,
            y_train=y_train.reshape(-1, 1),
            eval_set=[(X_valid, y_valid.reshape(-1, 1))],
            eval_name=['valid'],
            eval_metric=['mse'],
            max_epochs=200,
            patience=20,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False,
            callbacks=[
                TabNetPretrainedModelCheckpoint(
                    filepath=self.best_model_path,
                    monitor='valid_mse',
                    mode='min',
                    save_best_only=True,
                    verbose=True
                )
            ]
        )
        
        # Load the best model
        if os.path.exists(self.best_model_path):
            self.model.load_model(self.best_model_path)
            os.remove(self.best_model_path)  # Remove temporary file
        
        return self
    
    def predict(self, X):
        X_imputed = self.imputer.transform(X)
        return self.model.predict(X_imputed).flatten()
    
    def __deepcopy__(self, memo):
        # Add deepcopy support for scikit-learn
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

# TabNet hyperparameters
TabNet_Params = {
    'n_d': 64,              # Width of the decision prediction layer
    'n_a': 64,              # Width of the attention embedding for each step
    'n_steps': 5,           # Number of steps in the architecture
    'gamma': 1.5,           # Coefficient for feature selection regularization
    'n_independent': 2,     # Number of independent GLU layer in each GLU block
    'n_shared': 2,          # Number of shared GLU layer in each GLU block
    'lambda_sparse': 1e-4,  # Sparsity regularization
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2, weight_decay=1e-5),
    'mask_type': 'entmax',
    'scheduler_params': dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'verbose': 1,
    'device_name': 'cuda' if torch.cuda.is_available() else 'cpu'
}

class TabNetPretrainedModelCheckpoint(Callback):
    def __init__(self, filepath, monitor='val_loss', mode='min', 
                 save_best_only=True, verbose=1):
        super().__init__()  # Initialize parent class
        self.filepath = filepath
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.verbose = verbose
        self.best = float('inf') if mode == 'min' else -float('inf')
        
    def on_train_begin(self, logs=None):
        self.model = self.trainer  # Use trainer itself as model
        
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            return
        
        # Check if current metric is better than best
        if (self.mode == 'min' and current < self.best) or \
           (self.mode == 'max' and current > self.best):
            if self.verbose:
                print(f'\nEpoch {epoch}: {self.monitor} improved from {self.best:.4f} to {current:.4f}')
            self.best = current
            if self.save_best_only:
                self.model.save_model(self.filepath)  # Save the entire model

In [32]:
def TrainML_sub2(model_class, X, y, test_data):
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test_data), n_splits))
    
    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
        model = clone(model_class)
        
        imputer = KNNImputer(n_neighbors=5)
        imputer.fit(X_train)
        train_imputed = imputer.fit_transform(X_train)
        X_train = pd.DataFrame(train_imputed, columns=X_train.columns)
        val_imputed = imputer.fit_transform(X_val)
        X_val = pd.DataFrame(val_imputed, columns=X_val.columns)
        test_cpy = test_data.copy()
        test_imputed = imputer.fit_transform(test_cpy)
        test_cpy = pd.DataFrame(test_imputed, columns=test_cpy.columns)
        
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test_cpy)
        
        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)
    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    print('OPTIMIZED THRESHOLDS', KappaOPtimizer.x)
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })
    optimized_thresholds = KappaOPtimizer.x
    return submission, oof_tuned, oof_non_rounded, y, optimized_thresholds

In [33]:
# Create model instances
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)
TabNet_Model = TabNetWrapper(**TabNet_Params) 
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    ('tabnet', TabNet_Model)
],weights=[4.0,4.0,5.0,4.0])


In [34]:
submission2, _, _, _, _= TrainML_sub2(voting_model, X_sub2, y_sub2, test_sub2)

Training Folds: 100%|██████████| 5/5 [01:11<00:00, 14.31s/it]

Mean Train QWK --> 0.7153
Mean Validation QWK ---> 0.3265





OPTIMIZED THRESHOLDS [0.51913628 0.87590931 2.76547224]
----> || Optimized QWK SCORE :: [36m[1m 0.415[0m


# Submission 3

In [35]:
train_sub3 = pd.merge(train, train_ts, how="left", on='id')
test_sub3 = pd.merge(test, test_ts, how="left", on='id')

train_sub3 = train_sub3.drop('id', axis=1)
test_sub3 = test_sub3.drop('id', axis=1) 

In [36]:
features_sub3 = total_features + time_series_cols

In [37]:
train_sub3 = train_sub3.dropna(subset='sii')

In [38]:
train_sub3 = update(train_sub3)
test_sub3 = update(test_sub3)

for col in cat_c:
    mapping = create_mapping(col, train_sub3)
    mappingTe = create_mapping(col, test_sub3)
    
    train_sub3[col] = train_sub3[col].replace(mapping).astype(int)
    test_sub3[col] = test_sub3[col].replace(mappingTe).astype(int)


In [39]:
Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01  # Increased from 2.68e-06
}


# XGBoost parameters
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': SEED
}


CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'cat_features': cat_c,
    'verbose': 0,
    'l2_leaf_reg': 10  # Increase this value
}


In [40]:
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Combine models using Voting Regressor
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model)
])


In [41]:
X_sub3 = train_sub3[features_sub3]
y_sub3 = train_sub3['sii']
test_sub3 = test_sub3[features_sub3]

In [42]:
submission3, _, _, _, _= TrainML(voting_model, X_sub3, y_sub3, test_sub3)

Training Folds: 100%|██████████| 5/5 [00:50<00:00, 10.09s/it]

Mean Train QWK --> 0.7595
Mean Validation QWK ---> 0.3926





OPTIMIZED THRESHOLDS [0.5882359  0.96172902 2.67307225]
----> || Optimized QWK SCORE :: [36m[1m 0.457[0m


# Submission 4

In [43]:
imputer = SimpleImputer(strategy='median')

ensemble = VotingRegressor(estimators=[
    ('lgb', Pipeline(steps=[('imputer', imputer), ('regressor', LGBMRegressor(random_state=SEED))])),
    ('xgb', Pipeline(steps=[('imputer', imputer), ('regressor', XGBRegressor(random_state=SEED))])),
    ('cat', Pipeline(steps=[('imputer', imputer), ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),
    ('rf', Pipeline(steps=[('imputer', imputer), ('regressor', RandomForestRegressor(random_state=SEED))])),
    ('gb', Pipeline(steps=[('imputer', imputer), ('regressor', GradientBoostingRegressor(random_state=SEED))]))
])


submission4, _, _, _, _= TrainML(ensemble, X_sub3, y_sub3, test_sub3)

Training Folds: 100%|██████████| 5/5 [02:01<00:00, 24.24s/it]

Mean Train QWK --> 0.9175
Mean Validation QWK ---> 0.3803





OPTIMIZED THRESHOLDS [0.61742371 0.93213353 2.71038242]
----> || Optimized QWK SCORE :: [36m[1m 0.450[0m


# Final

In [44]:
sub1 = submission2
sub2 = submission3
sub3 = submission4

sub1 = sub1.sort_values(by='id').reset_index(drop=True)
sub2 = sub2.sort_values(by='id').reset_index(drop=True)
sub3 = sub3.sort_values(by='id').reset_index(drop=True)

combined = pd.DataFrame({
    'id': sub1['id'],
    'sii_1': sub1['sii'],
    'sii_2': sub2['sii'],
    'sii_3': sub3['sii']
})

def majority_vote(row):
    return row.mode()[0]

combined['final_sii'] = combined[['sii_1', 'sii_2', 'sii_3']].apply(majority_vote, axis=1)

final_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})

final_submission.to_csv('submission.csv', index=False)

print("Majority voting completed and saved to 'Final_Submission.csv'")

Majority voting completed and saved to 'Final_Submission.csv'


In [45]:
final_submission

Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,0
