In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics import cohen_kappa_score

# As a simple approach:
# - Only use labeled tabular data for now.
# - Ignore parquet data.
# - Ignore PCIAT columns which don't exist in test set (these are directly used to calculate sii, so we could consider them as intermediate targets to predict)
# - One-hot encode strings.
# - Impute missing numbers as mean of that feature. This includes string one-hot encodings for now.
# - Avoid further prep by using XGBoost as model
# - Use simple set-aside test set for local evaluation. Do not tune hyperparameters yet.

# Load the tabular competition files. `sample` is kept because its 'id' column
# is reused when the submission frame is assembled.
df_train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
df_test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# Drop, from both frames, every column whose name contains one of these substrings.
studies_to_drop = ['PCIAT', 'Season', 'BIA']
for study in studies_to_drop:
    df_train = df_train.drop(columns=[name for name in df_train.columns if study in name])
    df_test = df_test.drop(columns=[name for name in df_test.columns if study in name])

# Keep only the rows that have a target value. The 'id' column is left in place
# on both frames because the time-series features are merged on it later.
df_train_labeled = df_train[df_train['sii'].notna()]


In [None]:
# Notebook run switches.
# NOTE(review): neither flag is read by any code visible in this review — confirm
# whether they are still needed.
run_cv = False          # set this to True if you want to run k-fold CV, set to False if not
run_output = False      # set this to True if you want to run the model on the test data to generate an output

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
import keras

def apply_triang_nn_model(trainX, trainY, testX, learning_rate=0.007, n_epochs=5, verbose=False):
    """Train a small dense network and return rounded predictions for testX.

    The training data is split 80/20 (random_state=0); the 20% part is passed to
    ``fit`` as validation data. Targets are divided by 5 before training so they
    fit the sigmoid output, and predictions are multiplied by 5 and rounded.

    Args:
        trainX, trainY: training features and targets.
        testX: features to predict on.
        learning_rate: learning rate given to the Adam optimizer.
        n_epochs: number of training epochs.
        verbose: when true, print the model summary and let ``fit`` use its
            default progress output; otherwise ``fit`` runs with verbose=0.

    Returns:
        (Y_pred, hist): the rounded, rescaled predictions and the object
        returned by ``fit``.
    """
    fit_X, val_X, fit_Y, val_Y = train_test_split(trainX, trainY, test_size=0.2, random_state=0)

    Init = keras.initializers.RandomNormal(seed=0)

    # Input layer followed by hidden layers of 16, 8 and 4 units and a single
    # sigmoid output unit.
    net = keras.models.Sequential()
    net.add(keras.layers.Flatten(input_shape=[fit_X.shape[1]], name="input"))
    net.add(keras.layers.Dense(16, activation="relu", kernel_initializer=Init, name='hidden_1'))
    net.add(keras.layers.Dense(8, activation="relu", kernel_initializer=Init, name='hidden_2'))
    net.add(keras.layers.Dense(4, activation="relu", kernel_initializer=Init, name='hidden_3'))
    net.add(keras.layers.Dense(1, activation='sigmoid', kernel_initializer=Init, name='output'))
    if verbose:
        net.summary()   # display all model layers
    net.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))

    # Only override fit's verbosity when output should be silenced.
    fit_options = {} if verbose else {'verbose': 0}
    hist = net.fit(fit_X, fit_Y / 5, epochs=n_epochs, validation_data=(val_X, val_Y / 5), **fit_options)

    Y_pred = np.round(net.predict(testX) * 5)

    return Y_pred, hist

In [None]:
def calculate_FGC_PU_Zone(input_X):
    """Fill 'FGC-FGC_PU_Zone' for every row from push-up count, sex and age.

    A row gets 1 when 'FGC-FGC_PU' reaches the minimum for its sex and age and
    0 when it does not. The zone is NaN when 'Basic_Demos-Sex' is neither 0
    (male) nor 1 (female) or when the push-up count itself is missing.
    An age that is not listed in the table for its sex (including a missing
    age) uses that sex's fallback minimum.

    Args:
        input_X: DataFrame with 'FGC-FGC_PU', 'Basic_Demos-Sex' and
            'Basic_Demos-Age' columns. It is modified in place.

    Returns:
        The same DataFrame object, with 'FGC-FGC_PU_Zone' written.
    """
    # Minimum push-ups per age; ages missing from a table use the fallback.
    female_min, female_fallback = {5: 3, 6: 3, 7: 4, 8: 5, 9: 6}, 7
    male_min, male_fallback = {5: 3, 6: 3, 7: 4, 8: 5, 9: 6, 10: 7,
                               11: 8, 12: 10, 13: 12, 14: 14, 15: 16}, 18

    sex = input_X['Basic_Demos-Sex'].to_numpy(dtype=float)
    ages = input_X['Basic_Demos-Age']
    push_ups = input_X['FGC-FGC_PU'].to_numpy(dtype=float)

    female_req = np.array([female_min.get(a, female_fallback) for a in ages], dtype=float)
    male_req = np.array([male_min.get(a, male_fallback) for a in ages], dtype=float)
    required = np.where(sex == 1, female_req, np.where(sex == 0, male_req, np.nan))

    # A zone can only be decided when both the threshold and the count are known.
    undetermined = np.isnan(required) | np.isnan(push_ups)
    with np.errstate(invalid='ignore'):
        passed = push_ups >= required

    input_X['FGC-FGC_PU_Zone'] = np.where(undetermined, np.nan, passed.astype(float))
    return input_X

def calculate_FGC_SR_Zone(input_X, side='L'):
    """Fill the sit-and-reach zone column for one side from reach, sex and age.

    A row gets 1 when its reach meets the minimum and 0 when it does not.
    Males (sex == 0) use a minimum of 8 at every age. Females (sex == 1) use
    12 from age 15 upward, 10 for ages 11-14 and 9 otherwise (including a
    missing age). Ages above 18 are covered by the 15-and-over band rather
    than falling back to the youngest band's minimum. The zone is NaN when sex
    is neither 0 nor 1 or when the reach value is missing.

    Args:
        input_X: DataFrame with 'Basic_Demos-Sex', 'Basic_Demos-Age' and the
            reach column for the chosen side. It is modified in place.
        side: 'L' uses 'FGC-FGC_SRL' / 'FGC-FGC_SRL_Zone', 'R' uses
            'FGC-FGC_SRR' / 'FGC-FGC_SRR_Zone'. Any other value leaves the
            frame untouched.

    Returns:
        The same DataFrame object.
    """
    columns_by_side = {
        'L': ('FGC-FGC_SRL', 'FGC-FGC_SRL_Zone'),
        'R': ('FGC-FGC_SRR', 'FGC-FGC_SRR_Zone'),
    }
    if side not in columns_by_side:
        return input_X
    col, zone = columns_by_side[side]

    sex = input_X['Basic_Demos-Sex'].to_numpy(dtype=float)
    age = input_X['Basic_Demos-Age'].to_numpy(dtype=float)
    reach = input_X[col].to_numpy(dtype=float)

    with np.errstate(invalid='ignore'):
        female_req = np.where(age >= 15, 12.0, np.where(age >= 11, 10.0, 9.0))
        required = np.where(sex == 1, female_req, np.where(sex == 0, 8.0, np.nan))
        passed = reach >= required

    # A zone can only be decided when both the threshold and the reach are known.
    undetermined = np.isnan(required) | np.isnan(reach)
    input_X[zone] = np.where(undetermined, np.nan, passed.astype(float))
    return input_X

def calculate_FGC_TL_Zone(input_X):
    """Fill 'FGC-FGC_TL_Zone' for every row from trunk-lift value and age.

    A row gets 1 when 'FGC-FGC_TL' meets the minimum and 0 when it does not.
    The minimum is 6 for ages up to 9 and 9 otherwise (a missing age also uses
    9). The zone is NaN when the trunk-lift value itself is missing.

    Args:
        input_X: DataFrame with 'FGC-FGC_TL' and 'Basic_Demos-Age' columns.
            It is modified in place.

    Returns:
        The same DataFrame object, with 'FGC-FGC_TL_Zone' written.
    """
    age = input_X['Basic_Demos-Age'].to_numpy(dtype=float)
    lift = input_X['FGC-FGC_TL'].to_numpy(dtype=float)

    with np.errstate(invalid='ignore'):
        required = np.where(age <= 9, 6.0, 9.0)
        passed = lift >= required

    # Without a measurement there is nothing to classify.
    input_X['FGC-FGC_TL_Zone'] = np.where(np.isnan(lift), np.nan, passed.astype(float))
    return input_X

def _fill_missing_zone(X, value_col, zone_col, compute_zone):
    """Compute zone_col (in place) for rows where it is NaN but value_col is present."""
    needs_zone = X[zone_col].isna() & X[value_col].notna()
    if not needs_zone.any():
        return
    # Hand the helper an independent copy so it never writes into a slice of X.
    computed = compute_zone(X.loc[needs_zone].copy())
    X.loc[needs_zone, zone_col] = computed[zone_col].to_numpy()


def correct_errors(input_X):
    """Return a cleaned copy of input_X; the input frame is not modified.

    Corrections applied:
      * A weight of 0 is treated as missing, together with the BMI of that row.
      * A grip-strength reading of 0 ('FGC-FGC_GSND', 'FGC-FGC_GSD') is treated
        as missing, together with its '_Zone' column.
      * A sit-and-reach or trunk-lift value of 0 forces the matching zone to 0.
      * Push-up, sit-and-reach (left/right) and trunk-lift zones that are
        missing while the measurement is present are computed with the
        calculate_FGC_*_Zone helpers.

    Args:
        input_X: DataFrame containing the Physical-*, FGC-* and Basic_Demos-*
            columns referenced above.

    Returns:
        A new DataFrame with the corrections applied.
    """
    X = input_X.copy()

    # physical exam: a recorded weight of 0 invalidates both weight and BMI
    zero_weight = X['Physical-Weight'] == 0
    X.loc[zero_weight, ['Physical-BMI', 'Physical-Weight']] = np.nan

    # fitnessgram child: any grip strength reading of 0 lbs is likely an error.
    # The mask is taken once, before the reading is blanked, so the zone is
    # cleared for the same rows.
    for strength_col in ('FGC-FGC_GSND', 'FGC-FGC_GSD'):
        zero_grip = X[strength_col] == 0
        X.loc[zero_grip, [strength_col, strength_col + '_Zone']] = np.nan

    # if PU_Zone is blank but PU is not blank, compute PU_Zone
    _fill_missing_zone(X, 'FGC-FGC_PU', 'FGC-FGC_PU_Zone',
                       lambda rows: calculate_FGC_PU_Zone(rows))

    # sit and reach, left and right: a reach of zero means the zone is zero;
    # remaining blank zones with a reach value are computed
    for side in ('L', 'R'):
        value_col = 'FGC-FGC_SR' + side
        zone_col = value_col + '_Zone'
        X.loc[X[value_col] == 0, zone_col] = 0
        _fill_missing_zone(X, value_col, zone_col,
                           lambda rows, side=side: calculate_FGC_SR_Zone(rows, side=side))

    # trunk lift: same treatment as sit and reach
    X.loc[X['FGC-FGC_TL'] == 0, 'FGC-FGC_TL_Zone'] = 0
    _fill_missing_zone(X, 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                       lambda rows: calculate_FGC_TL_Zone(rows))

    return X

In [None]:
from sklearn.impute import KNNImputer

def remove_outliers(input_X, features, std_threshold=3, impute=False, thresholds=None):
    """Blank out values that lie outside per-feature bounds.

    When ``thresholds`` is None the bounds are fitted on ``input_X`` as
    mean +/- ``std_threshold`` * population standard deviation (missing values
    ignored). When ``thresholds`` is given, those bounds are applied as-is,
    which allows bounds fitted on one frame to be reused on another.

    Args:
        input_X: source DataFrame; it is not modified.
        features: column names to process.
        std_threshold: number of standard deviations allowed around the mean.
        impute: when true, fill the NaN values of ``features`` with a
            5-neighbour KNNImputer fitted on those columns. Columns without any
            observed value are left untouched.
        thresholds: optional mapping feature -> (lower, upper).

    Returns:
        (X, thresholds): the cleaned copy and the bounds that were applied
        (a new dict when fitted here, otherwise the mapping passed in).
    """
    X = input_X.copy()

    fit_thresholds = thresholds is None
    if fit_thresholds:
        thresholds = {}

    for feature in features:
        if fit_thresholds:
            center = X[feature].mean()
            spread = X[feature].std(ddof=0)
            thresholds[feature] = (center - std_threshold * spread,
                                   center + std_threshold * spread)
        lower, upper = thresholds[feature][0], thresholds[feature][1]
        is_outlier = (X[feature] < lower) | (X[feature] > upper)
        X.loc[is_outlier, feature] = np.nan

    if impute:
        # Columns with no observed value are skipped so the imputer's output
        # has one column per requested column.
        imputable = [feature for feature in features if X[feature].notna().any()]
        if imputable:
            imputer = KNNImputer(n_neighbors=5)
            # Writing the array back keeps X's own index and row order.
            X[imputable] = imputer.fit_transform(X[imputable])
    return X, thresholds

def impute_numeric_cols(input_X):
    """Return a copy of input_X whose float64/int64 columns have NaNs filled by KNN.

    A 5-neighbour KNNImputer is fitted on the numeric columns and the imputed
    values are written back into the copy, so the index, column order and
    non-numeric columns are preserved. Numeric columns without any observed
    value are left untouched. When there is nothing to impute, the copy is
    returned unchanged.

    Args:
        input_X: source DataFrame; it is not modified.

    Returns:
        The imputed copy.
    """
    X = input_X.copy()

    numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
    # Columns with no observed value are skipped so the imputer's output has
    # one column per column passed in.
    fillable = [col for col in numeric_cols if X[col].notna().any()]
    if not fillable or not X[fillable].isna().any().any():
        return X

    imputer = KNNImputer(n_neighbors=5)
    X[fillable] = imputer.fit_transform(X[fillable])
    return X

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from concurrent.futures import ThreadPoolExecutor

def process_file(filename, dirname):
    """Summarise one parquet partition directory.

    Reads ``<dirname>/<filename>/part-0.parquet``, discards the 'step' column
    and flattens the ``describe()`` table of the remaining columns.

    Returns:
        (stats, identifier): the flattened describe values and the part of
        ``filename`` that follows its first '='.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    readings = pd.read_parquet(parquet_path).drop(columns='step')
    flat_stats = readings.describe().values.reshape(-1)
    identifier = filename.split('=')[1]
    return flat_stats, identifier

def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per entry of ``dirname``.

    Every directory entry is handed to ``process_file`` on a thread pool, with
    a tqdm progress bar over the results.

    Returns:
        DataFrame with columns 'stat_0'..'stat_<n-1>' (n taken from the first
        result) plus an 'id' column holding the identifier of each entry.
    """
    entries = os.listdir(dirname)

    def summarise(entry):
        return process_file(entry, dirname)

    with ThreadPoolExecutor() as pool:
        results = list(tqdm(pool.map(summarise, entries), total=len(entries)))

    stats, indexes = zip(*results)

    stat_names = [f"stat_{i}" for i in range(len(stats[0]))]
    df = pd.DataFrame(stats, columns=stat_names)
    df['id'] = indexes
    return df


class AutoEncoder(nn.Module):
    """Fully connected autoencoder.

    encoder: input_dim -> 3*encoding_dim -> 2*encoding_dim -> encoding_dim,
             with a ReLU after every linear layer.
    decoder: encoding_dim -> 2*input_dim -> 3*input_dim -> input_dim,
             with ReLU after the first two linear layers and a Sigmoid after
             the last one.
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        encoder_widths = [input_dim, encoding_dim * 3, encoding_dim * 2, encoding_dim]
        decoder_widths = [encoding_dim, input_dim * 2, input_dim * 3, input_dim]
        # The encoder is created before the decoder so parameters are
        # initialised in the same order as the layers are listed.
        self.encoder = self._dense_stack(encoder_widths, nn.ReLU)
        self.decoder = self._dense_stack(decoder_widths, nn.Sigmoid)

    @staticmethod
    def _dense_stack(widths, final_activation):
        """Chain Linear layers over consecutive widths; ReLU between, final_activation last."""
        modules = []
        last = len(widths) - 2
        for position in range(len(widths) - 1):
            modules.append(nn.Linear(widths[position], widths[position + 1]))
            modules.append(final_activation() if position == last else nn.ReLU())
        return nn.Sequential(*modules)

    def forward(self, x):
        """Encode x and return its reconstruction."""
        return self.decoder(self.encoder(x))

# TODO: write own autoencoder
def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    """Compress the columns of df with an AutoEncoder trained on df itself.

    df is standardised with a freshly fitted StandardScaler, then an
    AutoEncoder is trained with Adam and MSE reconstruction loss for ``epochs``
    passes over consecutive ``batch_size``-row slices (rows are not shuffled).
    Every tenth epoch the loss of the last batch of that epoch is printed.

    Args:
        df: numeric table to encode.
        encoding_dim: width of the encoded representation.
        epochs: number of passes over the data.
        batch_size: rows per optimisation step.

    Returns:
        DataFrame of encoder outputs for every row, with columns
        'Enc_1'..'Enc_<k>' and a fresh default index.

    NOTE(review): the decoder ends in a Sigmoid while the reconstruction target
    is standardised data, which can lie outside (0, 1) — confirm this is
    intended.
    NOTE(review): no random seed is set here, so results presumably differ
    between runs — confirm.
    """
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    data_tensor = torch.tensor(df_scaled, dtype=torch.float32)

    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())

    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')

    # Encoding only: gradients are not needed.
    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()

    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])

    return df_encoded

In [None]:
def encode_ts(input_shape, encoding_shape):
    """Build the encoder half of a dense Keras autoencoder graph.

    Creates an input of shape (input_shape,) and stacks three ReLU Dense layers
    of widths 3*encoding_shape, 2*encoding_shape and encoding_shape.

    Args:
        input_shape: number of input features.
        encoding_shape: width of the final encoded layer.

    Returns:
        The symbolic output tensor of the last encoder layer.

    NOTE(review): the input tensor is not returned, so a caller that wants to
    build a Model has no direct handle on it — confirm how this is meant to
    be used.
    """
    input_img = keras.Input(shape=(input_shape,))
    encoded = keras.layers.Dense(encoding_shape*3, activation='relu')(input_img)
    encoded = keras.layers.Dense(encoding_shape*2, activation='relu')(encoded)
    encoded = keras.layers.Dense(encoding_shape, activation='relu')(encoded)

    return encoded

def decode_ts(input_shape, encoding_shape, encoded=None):
    """Build the decoder half of a dense Keras autoencoder graph.

    Stacks three ReLU Dense layers of widths encoding_shape, 2*encoding_shape
    and 3*encoding_shape, followed by a sigmoid Dense layer of width
    input_shape.

    Args:
        input_shape: number of features to reconstruct.
        encoding_shape: width of the encoded representation.
        encoded: symbolic tensor to decode. When omitted, a new
            ``keras.Input`` of shape (encoding_shape,) is created so the
            decoder can be built on its own.

    Returns:
        The symbolic output tensor of the last decoder layer.
    """
    if encoded is None:
        encoded = keras.Input(shape=(encoding_shape,))
    decoded = keras.layers.Dense(encoding_shape, activation='relu')(encoded)
    decoded = keras.layers.Dense(encoding_shape*2, activation='relu')(decoded)
    decoded = keras.layers.Dense(encoding_shape*3, activation='relu')(decoded)
    decoded = keras.layers.Dense(input_shape, activation='sigmoid')(decoded)

    return decoded


In [None]:
# starting code for ensemble model and params from https://www.kaggle.com/code/honganzhu/cmi-piu-competition
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from scipy.optimize import minimize
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from sklearn.ensemble import VotingRegressor
from IPython.display import clear_output
from colorama import Fore, Style

# LightGBM parameters: unpacked into LGBMRegressor inside apply_voting_model,
# which additionally passes random_state, verbose and n_estimators.
# NOTE(review): all three parameter sets below request GPU training ('device',
# 'tree_method', 'task_type'); presumably the notebook has to run with a GPU
# accelerator enabled — confirm.
Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01,  # Increased from 2.68e-06
    'device': 'gpu'

}


# XGBoost parameters: unpacked into XGBRegressor inside apply_voting_model.
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': 0,
    'tree_method': 'gpu_hist',

}


# CatBoost parameters: unpacked into CatBoostRegressor inside apply_voting_model.
CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': 0,
    'verbose': 0,
    'l2_leaf_reg': 10,  # Increase this value
    'task_type': 'GPU'

}

def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between y_true and y_pred using quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_rounder(oof_non_rounded, thresholds):
    """Map continuous predictions to the classes 0-3 using three cut points.

    A value is assigned the index of the first threshold it is strictly below
    (checked in the order thresholds[0], thresholds[1], thresholds[2]); a value
    below none of them is assigned 3.
    """
    first, second, third = thresholds[0], thresholds[1], thresholds[2]
    # Applied from the last cut point to the first so an earlier cut point
    # takes precedence over a later one.
    classes = np.where(oof_non_rounded < third, 2, 3)
    classes = np.where(oof_non_rounded < second, 1, classes)
    classes = np.where(oof_non_rounded < first, 0, classes)
    return classes

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold search: negative QWK of the thresholded predictions.

    The sign is flipped so that minimising this value maximises the kappa.
    """
    predicted_classes = threshold_rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, predicted_classes)
    return -kappa

def apply_voting_model(trainX, trainY, testX, verbose=False, seed=0):
    """Train a LightGBM/XGBoost/CatBoost voting ensemble with 5-fold CV and build a submission.

    For every stratified fold a fresh VotingRegressor is fitted; its
    out-of-fold predictions are collected and its test predictions stored.
    After all folds, three class cut points are tuned with Nelder-Mead to
    maximise quadratic weighted kappa on the out-of-fold predictions, and the
    fold-averaged test predictions are thresholded with those cut points.

    Args:
        trainX, trainY: training features and targets (indexed positionally).
        testX: test features.
        verbose: accepted for interface compatibility; not used.
        seed: seed for numpy's global RNG, the fold splitter and LightGBM.

    Returns:
        DataFrame with columns 'id' (taken from the module-level ``sample``
        frame) and 'sii' (the thresholded test predictions).

    Raises:
        AssertionError: if the threshold optimisation reports failure.
    """
    np.random.seed(seed)

    features = trainX.copy()
    target = trainY.copy()
    n_folds = 5

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    train_scores, test_scores = [], []

    oof_non_rounded = np.zeros(len(target), dtype=float)
    test_preds = np.zeros((len(testX), n_folds))

    fold_splits = tqdm(splitter.split(features, target), desc="Training Folds", total=n_folds)
    for fold, (train_idx, test_idx) in enumerate(fold_splits):
        fold_X, held_X = features.iloc[train_idx], features.iloc[test_idx]
        fold_Y, held_Y = target.iloc[train_idx], target.iloc[test_idx]

        # A new, unfitted ensemble per fold.
        model = VotingRegressor(estimators=[
            ('lightgbm', LGBMRegressor(**Params, random_state=seed, verbose=-1, n_estimators=300)),
            ('xgboost', XGBRegressor(**XGB_Params)),
            ('catboost', CatBoostRegressor(**CatBoost_Params)),
        ])
        model.fit(fold_X, fold_Y)

        fold_pred = model.predict(fold_X)
        held_pred = model.predict(held_X)

        oof_non_rounded[test_idx] = held_pred

        train_kappa = quadratic_weighted_kappa(fold_Y, fold_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(held_Y, held_pred.round(0).astype(int))
        train_scores.append(train_kappa)
        test_scores.append(val_kappa)

        test_preds[:, fold] = model.predict(testX)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_scores):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_scores):.4f}")

    # Tune the three cut points on the out-of-fold predictions.
    kappa_optimizer = minimize(
        evaluate_predictions,
        x0=[0.5, 1.5, 2.5],
        args=(target, oof_non_rounded),
        method='Nelder-Mead',
    )
    assert kappa_optimizer.success, "Optimization did not converge."

    tuned_cuts = kappa_optimizer.x
    t_kappa = quadratic_weighted_kappa(target, threshold_rounder(oof_non_rounded, tuned_cuts))

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {t_kappa:.3f}{Style.RESET_ALL}")

    # Average the per-fold test predictions, then apply the tuned cut points.
    tp_tuned = threshold_rounder(test_preds.mean(axis=1), tuned_cuts)

    submission = pd.DataFrame({'id': sample['id'],
                              'sii': tp_tuned})
    return submission

# Split the labeled rows into features and target; the test frame has no target.
X_train, Y_train = df_train_labeled.drop('sii', axis=1), df_train_labeled['sii']
X_test = df_test.copy()

# Outlier removal (remove_outliers) and KNN imputation (impute_numeric_cols)
# are not applied in this run.

X_train = correct_errors(X_train)
X_test = correct_errors(X_test)

# NOTE(review): this writes the cleaned *training* features to a file named
# "test.csv" — looks like a debugging dump; confirm whether it is still wanted.
X_train.to_csv("test.csv")

train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

# Separate names are used for the time-series statistics so the tabular
# df_train / df_test frames are not overwritten and this cell can be re-run.
train_ts_stats = train_ts.drop('id', axis=1)
test_ts_stats = test_ts.drop('id', axis=1)

# NOTE(review): each call trains its own autoencoder, so the 'Enc_*' columns of
# train and test come from different models — confirm that this is acceptable.
train_ts_encoded = perform_autoencoder(train_ts_stats, encoding_dim=60, epochs=100, batch_size=32)
test_ts_encoded = perform_autoencoder(test_ts_stats, encoding_dim=60, epochs=100, batch_size=32)

time_series_cols = train_ts_encoded.columns.tolist()
train_ts_encoded["id"] = train_ts["id"]
test_ts_encoded['id'] = test_ts["id"]

# Left merges keep every tabular row; rows without a time series get NaN encodings.
train = pd.merge(X_train, train_ts_encoded, how="left", on='id')
test = pd.merge(X_test, test_ts_encoded, how="left", on='id')

# The model pairs features and targets by position, so the merges must not
# add or drop rows (which would happen if an id appeared more than once).
assert len(train) == len(Y_train), "merge changed the number of training rows"
assert len(test) == len(X_test), "merge changed the number of test rows"

X_train = train.drop('id', axis=1)
X_test = test.drop('id', axis=1)
submission = apply_voting_model(X_train, Y_train, X_test, verbose=False, seed=0)
submission.to_csv('submission.csv', index=False)

In [None]:
# Show the labeled training rows as this cell's output.
df_train_labeled