<a href="https://colab.research.google.com/github/HieuSerend/ML_HHK/blob/main/ML_HHK_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# kagglehub.login() authenticates this session with Kaggle so that the
# competition files can be downloaded by the next cell.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the competition files and keep the returned location of the download.
child_mind_institute_problematic_internet_use_path = kagglehub.competition_download('child-mind-institute-problematic-internet-use')

# Confirmation message shown once the download call has returned.
print('Data source import complete.')


# Child Mind Institute — Problematic Internet Use
* In this challenge, the value of `sii` (the target) is unknown for 1224 rows of the `train.csv` file.
* A `KNNImputer` is used to fill in the missing values of the numeric features.
* An `LGBMRegressor` is used to predict the value of the `sii` column (the target variable) from the features provided in the dataset.

In [None]:
import numpy as np
import pandas as pd

#....................................................
import warnings
# Silence every warning so that cell outputs stay readable.
warnings.filterwarnings('ignore')
# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None
# Shell escape: list everything found under ../input (relative to the working directory).
!ls ../input/*

In [None]:
from pathlib import Path

# Inside a Kaggle notebook the competition files are mounted under /kaggle/input.
# Anywhere else (e.g. Colab) that directory does not exist, so fall back to the
# folder returned by kagglehub.competition_download() in the data-import cell.
kaggle_input_dir = Path('/kaggle/input/child-mind-institute-problematic-internet-use')
if kaggle_input_dir.is_dir():
    data_dir = kaggle_input_dir
else:
    data_dir = Path(child_mind_institute_problematic_internet_use_path)

# Both tables are indexed by the participant 'id' column.
train_data = pd.read_csv(data_dir / 'train.csv', index_col='id')
test_data = pd.read_csv(data_dir / 'test.csv', index_col='id')

train_data.shape, test_data.shape

**Impute missing values in the 'sii' column using a LightGBM regression model.**

In [None]:
from sklearn.impute import KNNImputer
from lightgbm import LGBMRegressor

def filling_sii(data):
    """Fill the missing values of the ``sii`` column with model predictions.

    Steps:
      1. Every float column except ``sii`` (plus ``Basic_Demos-Age``) is
         imputed with a 2-nearest-neighbour ``KNNImputer``.
      2. All remaining non-``sii`` columns get ``'unknown'`` for missing
         entries and are cast to the ``category`` dtype.
      3. An ``LGBMRegressor`` is fitted on the rows where ``sii`` is known and
         used to predict the rows where it is missing. Predictions are rounded
         and clipped to the range of the known ``sii`` values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``sii`` column. It is modified in place (feature
        imputation), so pass a copy if the original must stay untouched.

    Returns
    -------
    pandas.Series
        The ``sii`` column with no missing values, sorted by index.

    Raises
    ------
    ValueError
        If ``sii`` is missing in every row, so there is nothing to train on.
    """
    features = data.columns.tolist()

    # Numeric columns: float64 columns plus the age column.
    num_features = [f for f in features if data[f].dtype == 'float' or f == 'Basic_Demos-Age']
    impute_features = [f for f in num_features if f != 'sii']

    # Filling missing data
    num_imputer = KNNImputer(n_neighbors=2, weights="uniform")
    data[impute_features] = num_imputer.fit_transform(data[impute_features])

    cat_features = [f for f in features if f not in num_features and f != 'sii']
    for cat in cat_features:
        data[cat] = data[cat].fillna('unknown').astype('category')

    # Data preparation
    known = data['sii'].notna()
    if known.all():
        # Nothing to impute; predicting on an empty frame would fail.
        return data['sii'].sort_index()
    if not known.any():
        raise ValueError("'sii' is missing in every row; there are no labels to learn from")

    X_sii = data.loc[known].drop('sii', axis=1)
    y_sii = data.loc[known, 'sii']
    test_X_sii = data.loc[~known].drop('sii', axis=1)

    # Modeling
    lgbm_params = {
        'learning_rate': 0.046,
        'max_depth': 12,
        'num_leaves': 478,
        'min_data_in_leaf': 13,
        'feature_fraction': 0.893,
        'bagging_fraction': 0.784,
        'bagging_freq': 4,
        'lambda_l1': 10,
        'lambda_l2': 0.01
    }

    model = LGBMRegressor(**lgbm_params, verbose=-1)
    model.fit(X_sii, y_sii)

    # The regressor output is continuous: round it, then keep it inside the
    # range of the observed labels so that no unseen label value is created.
    pred = np.clip(np.round(model.predict(test_X_sii)), y_sii.min(), y_sii.max())

    # Data setting
    sii_impute = data['sii'].copy()
    sii_impute.loc[~known] = pred

    return sii_impute.sort_index()

# Work on a copy: filling_sii imputes the feature columns of the frame it receives.
train_data = train_data.assign(sii=filling_sii(train_data.copy()))

In [None]:
train_col = list(train_data.columns)
test_col = list(test_data.columns)

# Only the columns that exist in the test set are used as model features.
features = list(test_col)

In [None]:
# Stack the train rows (restricted to the shared feature columns) on top of the test rows.
main_df = pd.concat([train_data.loc[:, features], test_data])

In [None]:
# Float columns (and the age column) are numeric; everything else is categorical.
num_features = []
cat_features = []
for name in features:
    is_numeric = train_data[name].dtype == 'float' or name == 'Basic_Demos-Age'
    if is_numeric:
        num_features.append(name)
    else:
        cat_features.append(name)

len(cat_features), len(num_features)

**Handle missing values in both numeric and categorical features.**

In [None]:
def handling_nulls(df):
    """Fill the missing values of ``df`` in place and return the same frame.

    Columns listed in the notebook-level ``num_features`` are imputed with a
    2-nearest-neighbour ``KNNImputer``. Columns listed in ``cat_features`` get
    the placeholder ``'unknown'`` for missing entries and are converted to the
    ``category`` dtype.
    """
    knn = KNNImputer(n_neighbors=2, weights="uniform")
    df[num_features] = knn.fit(df[num_features]).transform(df[num_features])

    for column in cat_features:
        filled = df[column].fillna('unknown')
        df[column] = filled.astype('category')

    return df

# Impute the combined train + test frame.
main_df = main_df.pipe(handling_nulls)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# One-hot encode the categorical columns.
df_code = pd.get_dummies(main_df, columns=cat_features)

# Min-max scale the numeric columns, fitting on the combined train + test rows.
scaler = MinMaxScaler().fit(df_code[num_features])
df_code[num_features] = scaler.transform(df_code[num_features])

# Time series

In [None]:
from tqdm import tqdm
from IPython.display import clear_output
from concurrent.futures import ThreadPoolExecutor
import os

In [None]:
# Read one time-series folder and condense it into a flat vector of summary statistics.
def process_file(filename, dirname):
    """Summarise the parquet file stored at ``dirname/filename/part-0.parquet``.

    The ``step`` column is discarded and ``DataFrame.describe()`` is computed on
    the remaining columns; the resulting table is flattened row by row.

    Returns a tuple ``(stats, identifier)`` where ``stats`` is the 1-D array of
    statistics and ``identifier`` is the text that follows the first ``'='`` in
    ``filename``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    series = pd.read_parquet(parquet_path).drop(columns='step')
    summary = series.describe().to_numpy().ravel()
    identifier = filename.split('=')[1]
    return summary, identifier
# Summarise every entry of a time-series directory and collect the results in one table.
def load_time_series(dirname) -> pd.DataFrame:
    """Build a table of summary statistics for the entries of ``dirname``.

    Each directory entry is handed to ``process_file`` on a thread pool (with a
    tqdm progress bar). The result has one row per entry, columns ``stat_0``,
    ``stat_1``, ... holding the flattened statistics, and an ``id`` column
    holding the identifier returned by ``process_file``.
    """
    entries = os.listdir(dirname)

    def _summarise(entry):
        return process_file(entry, dirname)

    with ThreadPoolExecutor() as pool:
        summaries = list(tqdm(pool.map(_summarise, entries), total=len(entries)))

    stats, indexes = zip(*summaries)

    # The width of the first statistics vector defines the column names.
    column_names = [f"stat_{i}" for i in range(len(stats[0]))]
    frame = pd.DataFrame(stats, columns=column_names)
    frame['id'] = indexes
    return frame

from pathlib import Path

# Prefer the Kaggle mount point. Outside Kaggle (e.g. Colab) it does not exist,
# so use the folder returned by kagglehub.competition_download() instead.
series_root = Path('/kaggle/input/child-mind-institute-problematic-internet-use')
if not series_root.is_dir():
    series_root = Path(child_mind_institute_problematic_internet_use_path)

train_ts = load_time_series(str(series_root / 'series_train.parquet'))
test_ts = load_time_series(str(series_root / 'series_test.parquet'))

# Names of the statistic columns, i.e. every column except the 'id' key.
time_series_cols = train_ts.columns.tolist()
time_series_cols.remove('id')

In [None]:
# Stack the train and test summaries so both are scaled with the same min/max.
df_main_ts = pd.concat([train_ts, test_ts])

scaler = MinMaxScaler().fit(df_main_ts[time_series_cols])
df_main_ts[time_series_cols] = scaler.transform(df_main_ts[time_series_cols])

In [None]:
# df_code was built from train rows followed by test rows, so the first
# len(train_data) rows are the training part (no hard-coded row count).
n_train_rows = len(train_data)

# Turn the 'id' index into a regular column once; the guard keeps the cell
# idempotent, because a second reset_index() would add a stray 'index' column.
if 'id' not in df_code.columns:
    df_code = df_code.reset_index()

train_df = df_code.iloc[:n_train_rows].copy()
test_df = df_code.iloc[n_train_rows:].copy()

train_df.shape, test_df.shape

In [None]:
# df_main_ts is train_ts stacked on top of test_ts, so the split point is the
# number of training series (no hard-coded row count). Re-running the cell is
# safe: the re-assigned train_ts keeps the same length.
n_train_series = len(train_ts)

train_ts = df_main_ts.iloc[:n_train_series].copy()
test_ts = df_main_ts.iloc[n_train_series:].copy()

train_ts.shape, test_ts.shape

In [None]:
# Attach the time-series statistics to the tabular rows; rows without a series keep NaN.
main_train_data = train_df.merge(train_ts, how='left', on='id')
main_test_data = test_df.merge(test_ts, how='left', on='id')

# Replace the missing statistics with the median of the same column in the same split.
for merged in (main_train_data, main_test_data):
    for col in time_series_cols:
        column_median = merged[col].median()
        merged[col] = merged[col].fillna(column_median)

main_train_data.shape, main_test_data.shape

# Train LightGBM model

In [None]:
y = train_data['sii'].copy()
X = main_train_data.drop('id', axis=1)
XX = main_test_data.drop('id', axis=1)

# X carries a positional index while y is indexed by id, so the model pairs
# them purely by row order. Fail loudly if that order ever gets out of sync.
assert len(X) == len(y), "feature rows and sii labels differ in length"
assert (main_train_data['id'].to_numpy() == y.index.to_numpy()).all(), \
    "feature rows and sii labels are not in the same id order"

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the hold-out split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# LightGBM hyper-parameters for the final regressor.
lgbm_params = dict(
    learning_rate=0.046,
    max_depth=12,
    num_leaves=478,
    min_data_in_leaf=13,
    feature_fraction=0.893,
    bagging_fraction=0.784,
    bagging_freq=4,
    lambda_l1=10,
    lambda_l2=0.01,
)

model = LGBMRegressor(verbose=-1, **lgbm_params)

# The model is trained on the complete training set (X, y).
model.fit(X, y)

In [None]:
# Predict sii for the test rows with the model fitted on the full training set.
pred = model.predict(XX)

# Submission

In [None]:
# Pair each test id with its rounded prediction and write the submission file.
sub = main_test_data[['id']].assign(sii=np.round(pred))
sub.to_csv('submission.csv', index=False)