In [1]:
%%capture
%load_ext autoreload
%autoreload 2

# We can also put these magic commands in `.vscode/settings.json` like this:
# "jupyter.runStartupCommands": [
#     "%load_ext autoreload",
#     "%autoreload 2"
# ]

In [2]:
import numpy as np
import polars as pl
import lightgbm as lgb
from scipy.optimize import minimize
from sklearn.impute import KNNImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score, accuracy_score

print("All imports loaded successfully")

All imports loaded successfully


In [3]:
df_train = pl.read_csv('../input/processed/train_combined.csv')

print(df_train.head())

shape: (5, 106)
┌──────────┬────────────┬───────────┬───────────┬───┬───────────┬──────────┬───────────┬───────────┐
│ id       ┆ Basic_Demo ┆ Basic_Dem ┆ Basic_Dem ┆ … ┆ Z_std     ┆ enmo_std ┆ light_std ┆ battery_v │
│ ---      ┆ s-Enroll_S ┆ os-Age    ┆ os-Sex    ┆   ┆ ---       ┆ ---      ┆ ---       ┆ oltage_st │
│ str      ┆ eason      ┆ ---       ┆ ---       ┆   ┆ f64       ┆ f64      ┆ f64       ┆ d         │
│          ┆ ---        ┆ i64       ┆ i64       ┆   ┆           ┆          ┆           ┆ ---       │
│          ┆ str        ┆           ┆           ┆   ┆           ┆          ┆           ┆ f64       │
╞══════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪══════════╪═══════════╪═══════════╡
│ 00008ff9 ┆ Fall       ┆ 5         ┆ 0         ┆ … ┆ null      ┆ null     ┆ null      ┆ null      │
│ 000fd460 ┆ Summer     ┆ 9         ┆ 0         ┆ … ┆ null      ┆ null     ┆ null      ┆ null      │
│ 00105258 ┆ Summer     ┆ 10        ┆ 1         ┆ … ┆ null      ┆ null     

In [4]:
print(f'Number of rows before dropping nulls: {df_train.shape[0]}')
df_train = df_train.drop_nulls(subset=['sii'])
print(f'Number of rows after dropping nulls: {df_train.shape[0]}')

Number of rows before dropping nulls: 3960
Number of rows after dropping nulls: 2736


In [5]:
X = df_train.drop('sii')
y = df_train.select('sii')

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: (2736, 105)
y shape: (2736, 1)


In [6]:
def feature_engineering(df, is_training=False, imputer=None):
    df = df.with_columns(
        PAQ_Total = pl.when(
            (pl.col('PAQ_C-PAQ_C_Total').is_null()) | (pl.col('PAQ_A-PAQ_A_Total').is_null())
        )
        .then((pl.col('PAQ_C-PAQ_C_Total').fill_null(0) + pl.col('PAQ_A-PAQ_A_Total').fill_null(0))/2)
        .otherwise(pl.lit(None)),
        Fitness_Endurance_Duration = pl.col('Fitness_Endurance-Time_Mins') * 60 + pl.col('Fitness_Endurance-Time_Sec')
    )
    
    # Remove all season and pciat cols
    season_cols = [col for col in df.columns if col.endswith('Season')]
    pciat_cols = [col for col in df.columns if col.startswith('PCIAT')]
    df = df.drop(pciat_cols + season_cols + ['id',
                  'PAQ_C-PAQ_C_Total', 'PAQ_A-PAQ_A_Total', 'Fitness_Endurance-Time_Mins', 
                  'Fitness_Endurance-Time_Sec'])
    
    imputer = KNNImputer(n_neighbors=10, add_indicator=True)
    res = imputer.fit_transform(df)
    df = pl.DataFrame(res, schema=list(imputer.get_feature_names_out()), orient="row")

    return df
    # if is_training:
    #     imputer = KNNImputer(n_neighbors=10, add_indicator=True)
    #     res = imputer.fit_transform(df)
    #     df = pd.DataFrame(res, columns=imputer.get_feature_names_out())
    
    # else:
    #     assert imputer is not None
    #     df = pd.DataFrame(imputer.transform(df), columns=imputer.get_feature_names_out())
    
    # if is_training:
    #     return df, imputer
    # else:
    #     return df, None

In [7]:
X = feature_engineering(X)

In [8]:
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def roundoff(arr, thresholds=[0.5, 1.5, 2.5]):
    return np.where(arr < thresholds[0], 0, 
                np.where(arr < thresholds[1], 1, 
                    np.where(arr < thresholds[2], 2, 3)))

def evaluate(thresholds, y_true, y_pred):
    score = quadratic_weighted_kappa(y_true, roundoff(y_pred, thresholds))
    return -score

In [9]:
skf = StratifiedKFold()

scores = []
oof_raw = np.zeros(len(y), dtype=float) # oof predictions, before rounding
oof = np.zeros(len(y), dtype=int) # oof predictions, rounded
for fold, (tridx, validx) in enumerate(skf.split(X, y)):
    # model = XGBRegressor()
    # model = CatBoostRegressor(silent=True, allow_writing_files=False)
    model = lgb.LGBMRegressor(class_weight='balanced', verbose=-1)
    model.fit(X[tridx], y[tridx].to_numpy().ravel())
    
    y_pred = model.predict(X[validx])
    oof_raw[validx] = y_pred
    y_pred = roundoff(y_pred, thresholds=[0.49120744, 1.35878992, 2.65786462])
    oof[validx] = y_pred

    score = quadratic_weighted_kappa(y[validx].to_numpy().ravel(), y_pred)
    scores.append(score)

    accuracy = accuracy_score(y[validx].to_numpy().ravel(), y_pred)
    print(f"Fold: {fold}, Score: {score:.6f}, Accuracy: {accuracy:.6f}")
    print("-"*40)

print(f"Mean score: {np.mean(scores)}")
score = quadratic_weighted_kappa(y, oof)
print(f"OOF score: {score}")

thresholds = minimize(evaluate, [0.5, 1.5, 2.5], args=(y, oof_raw), method='Nelder-Mead').x
print('Thresholds', thresholds)

y_pred_tuned = roundoff(oof_raw, thresholds=thresholds)
print("Tuned OOF Score:", quadratic_weighted_kappa(y, y_pred_tuned))

Fold: 0, Score: 0.358588, Accuracy: 0.490876
----------------------------------------
Fold: 1, Score: 0.416525, Accuracy: 0.541133
----------------------------------------
Fold: 2, Score: 0.406810, Accuracy: 0.504570
----------------------------------------
Fold: 3, Score: 0.407394, Accuracy: 0.522852
----------------------------------------
Fold: 4, Score: 0.276250, Accuracy: 0.458867
----------------------------------------
Mean score: 0.3731133619306023
OOF score: 0.3733536067613198
Thresholds [0.60670341 1.01666854 2.7669665 ]
Tuned OOF Score: 0.4298478843169621
