In [1]:
from tqdm import tqdm
from itertools import product

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GroupKFold
from sklearn.impute import SimpleImputer

import warnings
# Suppress any warning whose message begins with "not allowed": the `message`
# argument is a regular expression matched (case-insensitively) against the
# start of the warning text.
# NOTE(review): the library that emits this warning is not visible in this
# notebook — confirm the filter is still needed.
warnings.filterwarnings('ignore', message='not allowed')

In [2]:
# Lag grid used to build the feature column names: hour offsets 0..5 and
# minute offsets 0..55 in 5-minute steps.
hours = range(0, 6, 1)
minutes = range(0, 60, 5)

target_col = "bg+1-00"  # column to predict
group_col = "p_num"     # grouping column (participant number)
date_col = "time"       # timestamp column

# Only the first 12 (hour, minute) pairs of the grid are kept, i.e. the
# suffixes "0-00" .. "0-55": one hour of 5-minute intervals.
N_LAGS = 12
lag_suffixes = [f"{h}-{m:02d}" for h, m in product(hours, minutes)][:N_LAGS]

# One list of column names per signal, all sharing the same lag suffixes.
bg_cols = [f"bg-{suffix}" for suffix in lag_suffixes]
insu_cols = [f"insulin-{suffix}" for suffix in lag_suffixes]
carb_cols = [f"carbs-{suffix}" for suffix in lag_suffixes]
hr_cols = [f"hr-{suffix}" for suffix in lag_suffixes]
step_cols = [f"steps-{suffix}" for suffix in lag_suffixes]
cals_cols = [f"cals-{suffix}" for suffix in lag_suffixes]

# Flat feature list, signal by signal, in the order above.
feature_cols = [
    *bg_cols,
    *insu_cols,
    *carb_cols,
    *hr_cols,
    *step_cols,
    *cals_cols,
]

In [3]:
# ---- Load the competition files --------------------------------------------
df_train = pd.read_csv(
    '/kaggle/input/brist1d/train.csv',
    index_col='id',
    parse_dates=['time'],
)

df_test = pd.read_csv(
    '/kaggle/input/brist1d/test.csv',
    index_col='id',
    parse_dates=['time'],
)

df_subm = pd.read_csv(
    "/kaggle/input/brist1d/sample_submission.csv",
    index_col='id',
)

# Some frameworks may not handle column names with special characters like
# colons properly, so ':' is replaced by '-' in every column name.
df_train.columns = df_train.columns.str.replace(':', '-')
df_test.columns = df_test.columns.str.replace(':', '-')


seed = 43  # NOTE(review): not referenced anywhere in this cell — confirm where it is meant to be used.
Thr_NAN = 49  # rows with more than this many missing feature values are dropped


def fill_gaps_rowwise(frame, colsets):
    """Fill missing readings inside each signal, row by row.

    For every list of columns in ``colsets`` the values are interpolated
    along the row (``axis=1``) and any remaining gaps are back-filled and
    then forward-filled. A row whose signal is entirely missing stays NaN.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame holding the columns; modified in place.
    colsets : list[list[str]]
        One list of column names per signal.

    Returns
    -------
    pd.DataFrame
        The same ``frame`` object, for convenience.
    """
    for cols in colsets:
        # .bfill()/.ffill() replace the deprecated fillna(method=...) calls.
        frame[cols] = (
            frame[cols]
            .interpolate(axis=1)
            .bfill(axis=1)
            .ffill(axis=1)
        )
    return frame


def add_hour_features(frame, time_col):
    """Add cyclic ``sin_hour`` / ``cos_hour`` columns (24-hour period).

    Parameters
    ----------
    frame : pd.DataFrame
        Frame with a datetime column ``time_col``; modified in place.
    time_col : str
        Name of the datetime column whose hour is encoded.

    Returns
    -------
    pd.DataFrame
        The same ``frame`` object, for convenience.
    """
    hour_angle = np.pi * frame[time_col].dt.hour / 12
    frame["sin_hour"] = np.sin(hour_angle)
    frame["cos_hour"] = np.cos(hour_angle)
    return frame


signal_colsets = [bg_cols, insu_cols, carb_cols, hr_cols, step_cols, cals_cols]
# Built from the column sets (not from `feature_cols`) so that re-running this
# cell after the time features were appended still works.
sensor_cols = [col for cols in signal_colsets for col in cols]

# ---- Fill gaps within each signal ------------------------------------------
fill_gaps_rowwise(df_train, signal_colsets)
fill_gaps_rowwise(df_test, signal_colsets)

# ---- Drop training rows that are still mostly empty ------------------------
mask = df_train[sensor_cols].isna().sum(axis=1) <= Thr_NAN
# .copy() so the assignments below write to an independent frame rather than
# to a filtered view of the original one.
df_train = df_train[mask].copy()

# ---- Impute what is left (SimpleImputer defaults; fitted on train only) ----
imputer = SimpleImputer()

df_train[sensor_cols] = imputer.fit_transform(df_train[sensor_cols])
df_test[sensor_cols] = imputer.transform(df_test[sensor_cols])

# ---- Time-of-day features ---------------------------------------------------
add_hour_features(df_train, date_col)
add_hour_features(df_test, date_col)

# Rebuilt (instead of extended in place) so repeated runs never duplicate the
# two time features.
feature_cols = sensor_cols + ["sin_hour", "cos_hour"]

# ---- Positions of each signal's columns inside feature_cols ----------------
grouped_features = []
for cols in signal_colsets:
    members = set(cols)
    grouped_features.append(
        [idx for idx, col in enumerate(feature_cols) if col in members]
    )

# ---- Final modelling frames -------------------------------------------------
df_train_final = df_train[feature_cols]
groups = df_train[group_col]

y_target = df_train[[target_col]]

df_test_final = df_test[feature_cols]

X = df_train_final
y = y_target

  df_train = pd.read_csv(
  df_train = pd.read_csv(
  df_test = pd.read_csv(
  df_train[colset]
  df_test[colset]
  df_train[colset]
  df_test[colset]
  df_train[colset]
  df_test[colset]
  df_train[colset]
  df_test[colset]
  df_train[colset]
  df_test[colset]
  df_train[colset]
  df_test[colset]


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the training rows for validation.
# NOTE(review): the split is row-wise at random and does not use `groups`
# (participant ids), so rows of one participant can land on both sides —
# confirm this is intended.
# NOTE(review): random_state is hard-coded to 42 while `seed = 43` is defined
# earlier in the notebook — confirm which value is intended.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling: statistics are fitted on the training split only.
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
test_X = scaler.transform(test_X)
# Transform from the unscaled test features rather than from `df_test_final`
# itself, so re-running this cell does not standardise the test set twice.
df_test_final = scaler.transform(df_test[feature_cols])

In [5]:
%%time
import lightgbm as lgb
from lightgbm import early_stopping

train_y = np.array(train_y)
test_y = np.array(test_y)
# Create the LightGBM dataset
train_data = lgb.Dataset(train_X, label=train_y)
test_data = lgb.Dataset(test_X, label=test_y)

# Set initial parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    "device": "gpu",  # Use GPU
    'feature_fraction': 0.9
}


# Train the model
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, test_data],
    callbacks=[early_stopping(stopping_rounds=10)]
)



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 17392
[LightGBM] [Info] Number of data points in the train set: 140392, number of used features: 74
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 62 dense feature groups (8.57 MB) transferred to GPU in 0.009131 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 8.263568
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 1.60227	valid_1's rmse: 1.8127
CPU times: user 45.1 s, sys: 1.1 s, total: 46.2 s
Wall time: 17.8 s


In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Score the current model on the hold-out split.
pred = gbm.predict(test_X)

mae = mean_absolute_error(test_y, pred)
mse = mean_squared_error(test_y, pred)
rmse = np.sqrt(mse)  # root of the mean squared error

print(f"MAE:{mae}")
print(f"RMSE: {rmse:.4f}")

MAE:1.3389883925348074
RMSE: 1.8127


In [7]:
%%time
from sklearn.model_selection import GridSearchCV

param_grid = {
    'num_leaves': [70, 80],
    'learning_rate': [0.01, 0.001],
    'n_estimators': [1000, 3500, 4500]
}

# Create the LightGBM estimator
estimator = lgb.LGBMRegressor(device="gpu")

train_y 
# Perform grid search
gbm_cv = GridSearchCV(estimator, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
gbm_cv.fit(train_X, train_y)

# Best parameters
print('Best cross-validation score:', -gbm_cv.best_score_)
print('Best parameters found by grid search are:', gbm_cv.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 17401
[LightGBM] [Info] Number of data points in the train set: 112313, number of used features: 74
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 62 dense feature groups (6.86 MB) transferred to GPU in 0.048609 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 8.272823
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 17373
[LightGBM] [Info] Number of data points in the train set: 112313, number of used features: 74
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGB

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 17377
[LightGBM] [Info] Number of data points in the train set: 112314, number of used features: 74
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 62 dense feature groups (6.86 MB) transferred to GPU in 0.020885 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 8.262814
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 17370
[LightGBM] [Info] Number of data points in the train set: 112314, number of used features: 74
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGB

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 17373
[LightGBM] [Info] Number of data points in the train set: 112313, number of used features: 74
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 62 dense feature groups (6.86 MB) transferred to GPU in 0.099052 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 8.255092
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 17364
[LightGBM] [Info] Number of data points in the train set: 112314, number of used features: 74
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGB

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 17392
[LightGBM] [Info] Number of data points in the train set: 140392, number of used features: 74
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 62 dense feature groups (8.57 MB) transferred to GPU in 0.007954 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 8.263568
Best cross-validation score: 3.0782059577970977
Best parameters found by grid search are: {'learning_rate': 0.01, 'n_estimators': 4500, 'num_leaves': 80}
CPU times: user 2min 34s, sys: 4.87 s, total: 2min 39s
Wall time: 1h 8min 58s


In [8]:
best_params = gbm_cv.best_params_

# Update the parameters
params.update(best_params)

# `n_estimators` is an alias of the number of boosting iterations. If it is
# left inside the params dict, lgb.train uses it in place of
# `num_boost_round` (with a warning), so a literal `num_boost_round=1000`
# would be silently ignored. Pass the tuned value explicitly and keep the
# alias out of the dict handed to lgb.train.
num_boost_round = params.get('n_estimators', 1000)
train_params = {key: value for key, value in params.items() if key != 'n_estimators'}

# Train the final model
gbm = lgb.train(
    train_params,
    train_data,
    num_boost_round=num_boost_round,
    valid_sets=[train_data, test_data],
    callbacks=[early_stopping(stopping_rounds=10)]
)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 17392
[LightGBM] [Info] Number of data points in the train set: 140392, number of used features: 74
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 62 dense feature groups (8.57 MB) transferred to GPU in 0.008911 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 8.263568
Training until validation scores don't improve for 10 rounds




Did not meet early stopping. Best iteration is:
[4500]	training's rmse: 1.3431	valid_1's rmse: 1.73708


In [9]:
# Re-score on the hold-out split with the model currently bound to `gbm`.
pred = gbm.predict(test_X)

mae = mean_absolute_error(test_y, pred)
squared_error_mean = mean_squared_error(test_y, pred)
rmse = np.sqrt(squared_error_mean)

print(f"MAE:{mae}")
print(f"RMSE: {rmse:.4f}")

MAE:1.2802394442916867
RMSE: 1.7371


In [10]:
Results = gbm.predict(df_test_final)

# Attach every prediction to the id of the test row it was computed from and
# align on that id, instead of assuming that the sample submission lists the
# rows in exactly the same order as the test file.
missing_ids = df_subm.index.difference(df_test.index)
if len(missing_ids) > 0:
    raise ValueError(
        f"{len(missing_ids)} submission ids have no matching row in the test set"
    )

test_predictions = pd.Series(Results, index=df_test.index)
df_subm['bg+1:00'] = test_predictions.reindex(df_subm.index)

df_subm.to_csv('Submission_LGB_Fine-tuning.csv')

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 17370
[LightGBM] [Info] Number of data points in the train set: 112314, number of used features: 74
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 62 dense feature groups (6.86 MB) transferred to GPU in 0.027740 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 8.257620
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 17401
[LightGBM] [Info] Number of data points in the train set: 112313, number of used features: 74
[LightGBM] [Info] Using GPU Device: Tesla P100-PCIE-16GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGB