In [130]:
%%capture
%load_ext autoreload
%autoreload 2

# We can also put these magic commands in `.vscode/settings.json` like this:
# "jupyter.runStartupCommands": [
#     "%load_ext autoreload",
#     "%autoreload 2"
# ]

In [143]:
import optuna
import numpy as np
import polars as pl
import lightgbm as lgb
import altair as alt
from scipy.optimize import minimize
from sklearn.impute import KNNImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score, accuracy_score

print("All imports loaded successfully")

All imports loaded successfully


  from .autonotebook import tqdm as notebook_tqdm


In [132]:
df_train = pl.read_csv('../input/processed/train_combined_v2.csv')

print(df_train.head())

shape: (5, 107)
┌──────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ id       ┆ Basic_Dem ┆ Basic_Dem ┆ Basic_Dem ┆ … ┆ first7_av ┆ first7_av ┆ first7_av ┆ first7_av │
│ ---      ┆ os-Enroll ┆ os-Age    ┆ os-Sex    ┆   ┆ g_light_m ┆ g_light_m ┆ g_light_s ┆ g_light_m │
│ str      ┆ _Season   ┆ ---       ┆ ---       ┆   ┆ in        ┆ ean       ┆ td        ┆ ax        │
│          ┆ ---       ┆ i64       ┆ i64       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│          ┆ str       ┆           ┆           ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64       │
╞══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 00008ff9 ┆ Fall      ┆ 5         ┆ 0         ┆ … ┆ null      ┆ null      ┆ null      ┆ null      │
│ 000fd460 ┆ Summer    ┆ 9         ┆ 0         ┆ … ┆ null      ┆ null      ┆ null      ┆ null      │
│ 00105258 ┆ Summer    ┆ 10        ┆ 1         ┆ … ┆ null      ┆ null      

In [133]:
print(f'Number of rows before dropping nulls: {df_train.shape[0]}')
df_train = df_train.drop_nulls(subset=['sii'])
print(f'Number of rows after dropping nulls: {df_train.shape[0]}')

Number of rows before dropping nulls: 3960
Number of rows after dropping nulls: 2736


In [134]:
X = df_train.drop('sii')
y = df_train.select('sii')

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: (2736, 106)
y shape: (2736, 1)


In [135]:
def feature_engineering(df, is_training=False, imputer=None):
    df = df.with_columns(
        PAQ_Total = pl.when(
            (pl.col('PAQ_C-PAQ_C_Total').is_null()) | (pl.col('PAQ_A-PAQ_A_Total').is_null())
        )
        .then((pl.col('PAQ_C-PAQ_C_Total').fill_null(0) + pl.col('PAQ_A-PAQ_A_Total').fill_null(0))/2)
        .otherwise(pl.lit(None)),
        Fitness_Endurance_Duration = pl.col('Fitness_Endurance-Time_Mins') * 60 + pl.col('Fitness_Endurance-Time_Sec')
    )
    
    # Remove all season and pciat cols
    season_cols = [col for col in df.columns if col.endswith('Season')]
    pciat_cols = [col for col in df.columns if col.startswith('PCIAT')]
    df = df.drop(pciat_cols + season_cols + ['id',
                  'PAQ_C-PAQ_C_Total', 'PAQ_A-PAQ_A_Total', 'Fitness_Endurance-Time_Mins', 
                  'Fitness_Endurance-Time_Sec'])

    imputing_cols = [
        'Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-CGAS_Score', 
        'Physical-BMI', 'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference', 
        'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP', 
        'Fitness_Endurance-Max_Stage', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND', 
        'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 
        'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 
        'FGC-FGC_TL_Zone', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI', 
        'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM', 'BIA-BIA_FFMI', 
        'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num', 'BIA-BIA_ICW', 'BIA-BIA_LDM', 
        'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T', 
        'PreInt_EduHx-computerinternet_hoursday'
    ]
    imputer = KNNImputer(n_neighbors=10, add_indicator=True)
    res = imputer.fit_transform(df[imputing_cols])
    df = df.drop(imputing_cols)

    imputed_df = pl.DataFrame(res, schema=list(imputer.get_feature_names_out()), orient="row")
    df = pl.concat([df, imputed_df], how="horizontal")
    missing_indicator_cols = [col for col in imputed_df.columns if col.startswith('missingindicator') and col not in ['missingindicator_Physical-Waist_Circumference', 'missingindicator_CGAS-CGAS_Score']]
    df = df.drop(missing_indicator_cols)

    return df
    # if is_training:
    #     imputer = KNNImputer(n_neighbors=10, add_indicator=True)
    #     res = imputer.fit_transform(df)
    #     df = pd.DataFrame(res, columns=imputer.get_feature_names_out())
    
    # else:
    #     assert imputer is not None
    #     df = pd.DataFrame(imputer.transform(df), columns=imputer.get_feature_names_out())
    
    # if is_training:
    #     return df, imputer
    # else:
    #     return df, None

In [136]:
X = feature_engineering(X)

In [137]:
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def roundoff(arr, thresholds=[0.5, 1.5, 2.5]):
    return np.where(arr < thresholds[0], 0, 
                np.where(arr < thresholds[1], 1, 
                    np.where(arr < thresholds[2], 2, 3)))

def evaluate(thresholds, y_true, y_pred):
    score = quadratic_weighted_kappa(y_true, roundoff(y_pred, thresholds))
    return -score

In [154]:
skf = StratifiedKFold()

best_params_lgbm = {
    'n_estimators': 2000,
    'learning_rate': 0.023742598951513243,
    'num_leaves': 2580,
    'max_depth': 11,
    'min_data_in_leaf': 80,
    'lambda_l1': 5,
    'lambda_l2': 5,
    'min_gain_to_split': 12.594654152003114,
    'bagging_fraction': 0.9000000000000001,
    'bagging_freq': 1,
    'feature_fraction': 0.8,
    'class_weight': 'balanced'
}

scores = []
oof_raw = np.zeros(len(y), dtype=float) # oof predictions, before rounding
oof = np.zeros(len(y), dtype=int) # oof predictions, rounded
models = []
for fold, (tridx, validx) in enumerate(skf.split(X, y)):
    # model = XGBRegressor()
    # model = CatBoostRegressor(silent=True, allow_writing_files=False)
    model = lgb.LGBMRegressor(**best_params_lgbm, verbose=-1)
    model.fit(X[tridx], y[tridx].to_numpy().ravel())
    models.append(model)
    
    y_pred = model.predict(X[validx])
    oof_raw[validx] = y_pred
    y_pred = roundoff(y_pred, thresholds=[0.49120744, 1.35878992, 2.65786462])
    oof[validx] = y_pred

    score = quadratic_weighted_kappa(y[validx].to_numpy().ravel(), y_pred)
    scores.append(score)

    accuracy = accuracy_score(y[validx].to_numpy().ravel(), y_pred)
    print(f"Fold: {fold}, Score: {score:.6f}, Accuracy: {accuracy:.6f}")
    print("-"*40)

print(f"Mean score: {np.mean(scores)}")
score = quadratic_weighted_kappa(y, oof)
print(f"OOF score: {score}")

thresholds = minimize(evaluate, [0.5, 1.5, 2.5], args=(y, oof_raw), method='Nelder-Mead').x
print('Thresholds', thresholds)

y_pred_tuned = roundoff(oof_raw, thresholds=thresholds)
print("Tuned OOF Score:", quadratic_weighted_kappa(y, y_pred_tuned))

Fold: 0, Score: 0.305256, Accuracy: 0.383212
----------------------------------------
Fold: 1, Score: 0.324388, Accuracy: 0.351005
----------------------------------------
Fold: 2, Score: 0.352618, Accuracy: 0.367459
----------------------------------------
Fold: 3, Score: 0.333162, Accuracy: 0.371115
----------------------------------------
Fold: 4, Score: 0.277520, Accuracy: 0.382084
----------------------------------------
Mean score: 0.318588783066711
OOF score: 0.3187647042731302
Thresholds [0.79146906 1.47205755 2.53974656]
Tuned OOF Score: 0.4545727565988984


In [139]:
feature_importance = np.mean([model.feature_importances_ for model in models], axis=0)
cols = list(X.columns)

chart = alt.Chart(pl.DataFrame([{'label': label, 'height': height} for label, height in zip(cols, feature_importance)])).mark_bar().encode(
    x=alt.X('label:N', sort='-y'),
    y=alt.Y('height:Q'),
    tooltip=['label', 'height']
).properties(
    title='Bar Chart from Lists'
).interactive()

chart.display()

In [151]:
def objective(trial):
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "n_estimators": trial.suggest_categorical("n_estimators", [100, 500, 1000, 2000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 500, step=10),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 1.0, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 1.0, step=0.1
        ),
    }

    skf = StratifiedKFold()
    oof_raw = np.zeros(len(y), dtype=float) # oof predictions, before rounding
    for fold, (tridx, validx) in enumerate(skf.split(X, y)):
        model = lgb.LGBMRegressor(**param_grid, class_weight='balanced', verbose=-1)
        model.fit(X[tridx], y[tridx].to_numpy().ravel())
        oof_raw[validx] = model.predict(X[validx])

    thresholds = minimize(evaluate, [0.5, 1.5, 2.5], args=(y, oof_raw), method='Nelder-Mead').x
    y_pred_tuned = roundoff(oof_raw, thresholds=thresholds)
    return quadratic_weighted_kappa(y, y_pred_tuned)

study = optuna.create_study(direction="maximize", study_name="LGBM Regressor")
study.optimize(objective, n_trials=100)

[I 2024-10-04 19:37:09,235] A new study created in memory with name: LGBM Regressor
[I 2024-10-04 19:37:10,385] Trial 0 finished with value: 0.18647355385811504 and parameters: {'n_estimators': 1000, 'learning_rate': 0.1423592124857139, 'num_leaves': 440, 'max_depth': 6, 'min_data_in_leaf': 10, 'lambda_l1': 100, 'lambda_l2': 80, 'min_gain_to_split': 13.691596678808954, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 0 with value: 0.18647355385811504.
[I 2024-10-04 19:37:12,552] Trial 1 finished with value: 0.36846270526940583 and parameters: {'n_estimators': 2000, 'learning_rate': 0.06111367779125572, 'num_leaves': 480, 'max_depth': 4, 'min_data_in_leaf': 270, 'lambda_l1': 65, 'lambda_l2': 15, 'min_gain_to_split': 7.460375169320944, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.7}. Best is trial 1 with value: 0.36846270526940583.
[I 2024-10-04 19:37:13,317] Trial 2 finished with value: 0.19462737245695771 and param

In [152]:
study.best_params

{'n_estimators': 2000,
 'learning_rate': 0.023742598951513243,
 'num_leaves': 2580,
 'max_depth': 11,
 'min_data_in_leaf': 80,
 'lambda_l1': 5,
 'lambda_l2': 5,
 'min_gain_to_split': 12.594654152003114,
 'bagging_fraction': 0.9000000000000001,
 'bagging_freq': 1,
 'feature_fraction': 0.8}

In [153]:
study.best_value

0.4545727565988984