In [1]:
!pip -q install /kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl

In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.impute import SimpleImputer
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBRegressor
from scipy.optimize import minimize
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from catboost import CatBoostRegressor
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.base import BaseEstimator, RegressorMixin

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation notices and
# pandas copy/assignment warnings raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

# Raw competition tables; later cells derive the modelling frames from these.
train_df = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/train.csv")
test_df = pd.read_csv("/kaggle/input/child-mind-institute-problematic-internet-use/test.csv")

Extract features

In [3]:
# Columns selected from both train.csv and test.csv as model inputs.
# The order of this list is the column order of the feature matrices.
FEATURE_COLS = [
    'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-Season',
    'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI', 'Physical-Height',
    'Physical-Weight', 'Physical-Waist_Circumference', 'Physical-Diastolic_BP',
    'Physical-HeartRate', 'Physical-Systolic_BP', 'Fitness_Endurance-Season',
    'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins',
    'Fitness_Endurance-Time_Sec', 'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone',
    'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
    'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone',
    'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
    'BIA-Season', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
    'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM', 'BIA-BIA_FFMI',
    'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num', 'BIA-BIA_ICW',
    'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW', 'PAQ_A-Season',
    'PAQ_A-PAQ_A_Total', 'PAQ_C-Season', 'PAQ_C-PAQ_C_Total', 'SDS-Season',
    'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
    'PreInt_EduHx-computerinternet_hoursday'
]

# The "*-Season" members of FEATURE_COLS. These are the columns that get a
# 'Missing' fill value and an integer code before modelling
# (presumably they hold season names as strings -- confirm against the CSV).
CATEGORY_COLS = [
    'Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season',
    'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PAQ_A-Season',
    'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season'
]

# Name of the label column; training rows without a value here are dropped.
TARGET_COL = 'sii'

In [4]:
# Only labelled rows can be used for supervised training, so rows without a
# target value are removed from the training table.
train_data = train_df.dropna(subset=[TARGET_COL]).copy()
test_data = test_df.copy()

train_ids = train_data['id']
train_y = train_data[TARGET_COL].values
test_ids = test_data['id']

# Take explicit copies of the feature columns. Later cells assign into
# train_X / test_X column by column; without .copy() those frames are
# selections of train_data / test_data and every assignment is a
# "setting on a copy of a slice" operation (a SettingWithCopyWarning that the
# global warning filter would otherwise hide).
train_X = train_data[FEATURE_COLS].copy()
test_X = test_data[FEATURE_COLS].copy()

In [5]:
# Integer-encode each season column with one code table shared by train and test.
for col in CATEGORY_COLS:
    # Missing entries become their own explicit 'Missing' category.
    train_filled = train_X[col].fillna('Missing')
    test_filled = test_X[col].fillna('Missing')

    # Codes follow first-appearance order over train followed by test, so a
    # category that only occurs in test still receives a code.
    categories = pd.concat([train_filled, test_filled]).unique()
    mapping = dict(zip(categories, range(len(categories))))

    train_X[col] = train_filled.map(mapping)
    test_X[col] = test_filled.map(mapping)

In [6]:
# Split the features by cardinality in the training frame: more than five
# distinct non-null values counts as numeric, anything else as a
# low-cardinality (category-like) column. FEATURE_COLS order is preserved.
NUM_COLS, CATEGORY_NUM_COLS = [], []
for feature_name in FEATURE_COLS:
    if train_X[feature_name].nunique() > 5:
        NUM_COLS.append(feature_name)
    else:
        CATEGORY_NUM_COLS.append(feature_name)


In [7]:
def impute_by_group(df, group_col, cols_to_impute):
    """Fill missing values with the mean of the row's ``group_col`` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    group_col : label or list of labels
        Column(s) defining the groups.
    cols_to_impute : iterable of labels
        Columns whose missing values are replaced by their group mean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after imputation.

    Notes
    -----
    * Only missing cells are touched. Rows whose group key is missing keep
      their existing values (a ``groupby(...).transform(lambda ...)`` would
      blank them out, because such rows belong to no group).
    * A value stays missing when its whole group is missing for that column,
      or when the row has no group key.
    * Grouping columns listed in ``cols_to_impute`` are skipped: within a
      group the key is constant, so there is nothing to fill.
    """
    if isinstance(group_col, (list, tuple)):
        key_cols = set(group_col)
    else:
        key_cols = {group_col}

    # Build the grouping once; the key columns are never modified below.
    grouped = df.groupby(group_col)
    for col in cols_to_impute:
        if col in key_cols:
            continue
        # Per-row mean of the row's group, aligned on df's index.
        group_means = grouped[col].transform('mean')
        df[col] = df[col].fillna(group_means)
    return df

# Stage 1: fill numeric gaps with the mean of the same age group.
# Each frame is imputed from its own group means (test uses test statistics).
train_X = impute_by_group(train_X, 'Basic_Demos-Age', NUM_COLS)
test_X = impute_by_group(test_X, 'Basic_Demos-Age', NUM_COLS)

# Stage 2: whatever is still missing is filled with the training-set column
# mean, first for the numeric columns and then for the low-cardinality ones.
# A fresh imputer is fitted on train and applied to test for each column set.
for column_group in (NUM_COLS, CATEGORY_NUM_COLS):
    imputer = SimpleImputer(strategy='mean')
    train_X[column_group] = imputer.fit_transform(train_X[column_group])
    test_X[column_group] = imputer.transform(test_X[column_group])

# Show every row when a frame or series is displayed.
pd.set_option('display.max_rows', None)


# for col in NUM_COLS:
#     nan_groups = train_X.groupby('Basic_Demos-Age')[col].apply(lambda x: x.isna().all())
#     for group in nan_groups[nan_groups].index:
#         print(f"Age group: {group} has all NaN in column: {col}")



In [8]:
# print(train_X.isnull().sum())
# print(test_X.isnull().sum())


Train model

In [9]:
# Keyword arguments for XGBRegressor.
xgb_params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,
    'reg_lambda': 5,
    'random_state': 42,
    'tree_method': 'hist'
}
# Keyword arguments for LGBMRegressor.
# No number of boosting rounds is given here, so the library default applies.
lgb_params = {
    'learning_rate': 0.05,
    'max_depth': 14,
    'num_leaves': 500,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,
    'lambda_l2': 0.01,
    'random_state': 42,
    'verbosity': -1,
}
# Keyword arguments for CatBoostRegressor.
cat_params = {
    'learning_rate': 0.04,
    'depth': 6,
    'iterations': 200,
    'random_seed': 42,
    'verbose': 0,
    'l2_leaf_reg': 10,
}
# Keyword arguments forwarded to TabNetRegressor through TabNetRegressorWrapper.
# The wrapper is instantiated but currently left out of the voting ensemble.
tabnet_params = {
    'n_d': 128,
    'n_a': 128,
    'n_steps': 5,
    'gamma': 1.5,
    'n_independent': 2,
    'n_shared': 2,
    'lambda_sparse': 1e-4,
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2, weight_decay=1e-5),
    'mask_type': 'entmax',
    'verbose': 0,
    'seed': 42,
    # Use the GPU when one is visible to torch, otherwise fall back to CPU.
    'device_name': 'cuda' if torch.cuda.is_available() else 'cpu'
}

In [10]:
class TabNetRegressorWrapper(BaseEstimator, RegressorMixin):
    """scikit-learn compatible adapter around ``TabNetRegressor``.

    All keyword arguments given to the constructor are stored in
    ``tabnet_params`` and forwarded unchanged to ``TabNetRegressor`` when
    ``fit`` is called.

    Because the constructor takes ``**params``, scikit-learn cannot discover
    the hyper-parameters by inspecting ``__init__``. ``get_params`` and
    ``set_params`` are therefore overridden so that ``sklearn.base.clone``
    (used by meta-estimators such as ``VotingRegressor``) reproduces the
    TabNet configuration instead of building an unconfigured wrapper.
    """

    def __init__(self, **params):
        self.tabnet_params = params
        self.model = None

    def get_params(self, deep=True):
        """Return the TabNet keyword arguments as this estimator's parameters.

        A shallow copy is returned so callers (for example ``clone``) can
        modify the mapping without altering this instance.
        """
        return dict(self.tabnet_params)

    def set_params(self, **params):
        """Update the stored TabNet keyword arguments and return ``self``."""
        self.tabnet_params.update(params)
        return self

    def fit(self, X, y):
        """Train a fresh ``TabNetRegressor`` on ``X`` and ``y``.

        ``X`` and ``y`` may be NumPy arrays or pandas objects; they are
        converted to arrays before being handed to TabNet, and ``y`` is
        reshaped to a single-column 2-D target.

        Returns ``self``.
        """
        features = np.asarray(X)
        targets = np.asarray(y).reshape(-1, 1)

        self.model = TabNetRegressor(**self.tabnet_params)
        # NOTE(review): no eval_set is supplied, so `patience` presumably has
        # nothing to monitor and training runs for all epochs -- confirm.
        self.model.fit(
            X_train=features,
            y_train=targets,
            max_epochs=200,
            patience=20,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False
        )
        return self

    def predict(self, X):
        """Return predictions for ``X`` as a 1-D array, one value per row."""
        # reshape(-1) keeps a length-1 array for a single row, where
        # squeeze() would collapse it to a 0-d array.
        return self.model.predict(np.asarray(X)).reshape(-1)

In [11]:
# Base learners, each configured from its parameter dictionary.
xgb_model = XGBRegressor(**xgb_params)
lgb_model = LGBMRegressor(**lgb_params)
cat_model = CatBoostRegressor(**cat_params)
tabnet_model = TabNetRegressorWrapper(**tabnet_params)

# Equal-weight average of the three boosted-tree models.
# The TabNet model is built above but deliberately kept out of the ensemble.
voting_model = VotingRegressor(
    estimators=[
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('cat', cat_model),
        # ('tabnet', tabnet_model),
    ],
    weights=[1, 1, 1],
)

In [12]:
# Stratified K-fold cross-validation of the voting ensemble.
n_splits = 5

# Bin the target into 5 equal-width intervals; the bin index is used only as
# the stratification label for the splitter.
y_bins = pd.cut(train_y, bins=5, labels=False)
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# oof_pred: out-of-fold prediction for every training row.
# test_pred: test prediction averaged over the folds.
oof_pred = np.zeros(len(train_y))
test_pred = np.zeros(len(test_X))
val_scores = []

# Use a single float32 dtype for features and target.
train_X = train_X.astype(np.float32)
test_X = test_X.astype(np.float32)
train_y = train_y.astype(np.float32)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_X, y_bins)):
    X_tr, X_val = train_X.iloc[train_idx], train_X.iloc[val_idx]
    y_tr, y_val = train_y[train_idx], train_y[val_idx]

    # The ensemble is refitted on this fold's training part.
    voting_model.fit(X_tr, y_tr)

    y_val_pred = voting_model.predict(X_val)
    oof_pred[val_idx] = y_val_pred
    # Each fold contributes 1/n_splits of the final test prediction.
    test_pred += voting_model.predict(test_X) / n_splits

    # Per-fold score uses plain rounding to the nearest integer class;
    # tuned thresholds are applied to oof_pred in a later cell.
    y_val_pred_rounded = np.round(y_val_pred).astype(int)
    val_kappa = cohen_kappa_score(y_val, y_val_pred_rounded, weights='quadratic')
    val_scores.append(val_kappa)

    print(f"Fold {fold + 1} - Validation QWK: {val_kappa:.4f}")

print(f"Mean Validation QWK: {np.mean(val_scores):.4f}")

Fold 1 - Validation QWK: 0.3857
Fold 2 - Validation QWK: 0.3989
Fold 3 - Validation QWK: 0.3968
Fold 4 - Validation QWK: 0.3398
Fold 5 - Validation QWK: 0.3441
Mean Validation QWK: 0.3731


Predict

In [13]:
# Starting cut points for the threshold search: the midpoints between the
# integer classes 0, 1, 2 and 3 (equivalent to plain rounding).
initial_thresholds = [0.5, 1.5, 2.5]

def threshold_rounder(y_pred, thresholds):
    """Turn continuous predictions into integer classes 0-3.

    ``thresholds`` supplies three cut points ``(t0, t1, t2)``:

    * ``y < t0``        -> 0
    * ``t0 <= y < t1``  -> 1
    * ``t1 <= y < t2``  -> 2
    * ``y >= t2``       -> 3

    The rules are applied in that order against the original predictions, so
    if the cut points are not increasing a later rule overrides an earlier one.
    Returns an integer array.
    """
    t0, t1, t2 = thresholds[0], thresholds[1], thresholds[2]

    labels = np.where(y_pred < t0, 0, y_pred)
    labels = np.where((y_pred >= t0) & (y_pred < t1), 1, labels)
    labels = np.where((y_pred >= t1) & (y_pred < t2), 2, labels)
    labels = np.where(y_pred >= t2, 3, labels)
    return labels.astype(int)

def evaluate_thresholds(thresholds, y_true, y_pred):
    """Objective for the threshold search.

    Discretises ``y_pred`` with ``thresholds`` and returns the *negative*
    quadratic-weighted kappa against ``y_true``, so that minimising this
    function maximises the kappa.
    """
    return -cohen_kappa_score(
        y_true,
        threshold_rounder(y_pred, thresholds),
        weights='quadratic',
    )

# Search for the cut points that maximise QWK on the out-of-fold predictions.
opt_result = minimize(
    evaluate_thresholds,
    initial_thresholds,
    args=(train_y, oof_pred),
    method='Nelder-Mead',
)
optimal_thresholds = opt_result.x

# Score the tuned thresholds on the same out-of-fold predictions they were
# fitted on, then apply them to the fold-averaged test predictions.
optimized_oof_pred = threshold_rounder(oof_pred, optimal_thresholds)
optimized_qwk = cohen_kappa_score(train_y, optimized_oof_pred, weights='quadratic')
print("Optimized QWK Score:", optimized_qwk)

test_pred_adjusted = threshold_rounder(test_pred, optimal_thresholds)

Optimized QWK Score: 0.4630030018145104


In [14]:
# Assemble the submission table: one predicted class per test id.
prediction = pd.DataFrame({'id': test_ids}).assign(sii=test_pred_adjusted)

prediction.to_csv('submission.csv', index=False)
print(prediction)

          id  sii
0   00008ff9    1
1   000fd460    0
2   00105258    0
3   00115b9f    0
4   0016bb22    0
5   001f3379    1
6   0038ba98    0
7   0068a485    0
8   0069fbed    2
9   0083e397    0
10  0087dd65    1
11  00abe655    0
12  00ae59c9    1
13  00af6387    1
14  00bd4359    1
15  00c0cd71    1
16  00d56d4b    0
17  00d9913d    0
18  00e6167c    0
19  00ebc35d    0
