## Overview
This code (1) generates rolling features with the polars package (faster than pandas), (2) tunes the hyperparameters of a LightGBM classifier on a sample of the training data (the full data set is too large) using the optuna framework and cross-validation, (3) reports the feature importances of every cross-validation model, (4) trains a final model on the whole sample of the training data, (5) preprocesses the test data and predicts on it, and (6) smooths the predictions with a rolling mode.

In [None]:
import glob
import random
import warnings
import lightgbm
import optuna

import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import accuracy_score

# Seed Python's built-in `random` module; other seeds (polars shuffle, LightGBM, optuna,
# scikit-learn) are passed explicitly where those libraries are used.
random.seed(1)

In [None]:
# Folder holding the training files -- adjust to your environment.
# The trailing '**' turns the folder path into the glob pattern used to list its entries.
PATH_TO_TRAINING_DATA_FOLDER = '/kaggle/input/zindi-network-traffic/Train_data/Train_data/'
PATH_TO_TRAINING_DATA_FOLDER += '**'

# Folder holding the test files -- adjust to your environment (same glob suffix as above).
PATH_TO_TEST_DATA_FOLDER = '/kaggle/input/zindi-network-traffic/Test_data/Test_data/'
PATH_TO_TEST_DATA_FOLDER += '**'

# Sample submission file -- adjust to your environment.
PATH_TO_SAMPLE_SUBMISSION_FILE = '/kaggle/input/zindi-network-traffic/SampleSubmission.csv'

In [None]:
def reduce_memory_usage_pl(df, name):
    """Reduce memory usage by polars dataframe {df} with name {name} by changing its data types.

    Signed integer columns are cast to the narrowest of Int8/Int16/Int32 that holds their
    observed [min, max] range, float columns are cast to Float32 when their range fits,
    and Utf8 columns are cast to Categorical. All other columns are left untouched.
    The estimated size is printed before and after the conversion.

    Original pandas version of this function:
    https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65

    Returns the dataframe with the converted columns.
    """
    print(f"Memory usage of dataframe {name} is {round(df.estimated_size('mb'), 2)} MB")
    int_types = [pl.Int8, pl.Int16, pl.Int32, pl.Int64]
    float_types = [pl.Float32, pl.Float64]
    # Candidate integer targets, narrowest first; a column that fits none of them stays as is.
    int_candidates = [(pl.Int8, np.int8), (pl.Int16, np.int16), (pl.Int32, np.int32)]

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == pl.Utf8:
            df = df.with_columns(df[col].cast(pl.Categorical))
            continue
        if col_type not in int_types and col_type not in float_types:
            continue

        # The range is only needed (and only computed) for numeric columns.
        c_min = df[col].min()
        c_max = df[col].max()
        if c_min is None or c_max is None:
            # Empty or all-null column: there is no range to base a decision on.
            continue

        if col_type in int_types:
            for pl_type, np_type in int_candidates:
                bounds = np.iinfo(np_type)
                # Inclusive bounds: a value equal to the dtype's limit still fits, so e.g. an
                # Int8 column containing 127 is not widened to Int16.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df = df.with_columns(df[col].cast(pl_type))
                    break
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            # NaN/inf ranges fail this comparison, so such columns keep their dtype.
            df = df.with_columns(df[col].cast(pl.Float32))

    print(f"Memory usage of dataframe {name} became {round(df.estimated_size('mb'), 2)} MB")
    return df

In [None]:
def read(path):
    """Read the CSV file at `path` and add the identifier from its file name to the data.

    The identifier is the last '/'-separated path component up to its first '.'; it is
    stored as a constant 'ID' column.

    Returns the polars dataframe, or None (after printing a message) if the file does not
    exist.
    """
    # Keep the try body limited to the call that can raise FileNotFoundError.
    try:
        df = pl.read_csv(path)
    except FileNotFoundError:
        print(path + " not found.")
        return None

    file_id = path.split('/')[-1].split('.')[0]
    return df.with_columns(pl.lit(file_id).alias('ID'))

In [None]:
def _rolling_feature_exprs():
    """Return the polars expressions for every rolling feature, in output-column order."""
    base_columns = ["portPktIn", "portPktOut", "portPktDiff", "qSize"]
    windows = [100, 250, 500, 1000, 2000]
    centered = {"center": True, "min_periods": 1}
    # portPktDiff has no 100-row rolling mean / skewness feature.
    excluded = {("portPktDiff", 100)}
    exprs = []

    # Mean Rows Since Zero: running count within each group of rows that share the same
    # cumulative sum, smoothed with a centered rolling mean.
    # NOTE(review): unlike every feature below, these expressions are not partitioned by
    # "ID", so they run across file boundaries -- confirm this is intended.
    for column in base_columns:
        for window in [125, 250, 500, 1000, 2000]:
            exprs.append(
                pl.col(column).cumcount().over(pl.col(column).cumsum())
                .rolling_mean(window_size=window, **centered)
                .alias(f"{column}RowsSinceZeroRollingMeanCenter{window}")
            )

    # Mean
    for column in base_columns:
        for window in windows:
            if (column, window) in excluded:
                continue
            exprs.append(
                pl.col(column).rolling_mean(window_size=window, **centered)
                .over("ID").alias(f"{column}RollingMeanCenter{window}")
            )

    # Maximum
    for column in base_columns:
        for window in windows:
            exprs.append(
                pl.col(column).rolling_max(window_size=window, **centered)
                .over("ID").alias(f"{column}RollingMaxCenter{window}")
            )

    # Skewness
    for column in base_columns:
        for window in windows:
            if (column, window) in excluded:
                continue
            exprs.append(
                pl.col(column).rolling_skew(window_size=window)
                .over("ID").alias(f"{column}RollingSkew{window}")
            )

    # Correlation
    for left, right in [("qSize", "portPktIn"), ("portPktIn", "portPktOut")]:
        for window in windows:
            exprs.append(
                pl.rolling_corr(a=left, b=right, window_size=window, min_periods=1)
                .over("ID").alias(f"{left}{right}{window}Corr")
            )

    # Standard Deviation
    for column in ["portPktIn", "portPktOut", "qSize"]:
        for window in [100, 250, 500, 1000]:
            exprs.append(
                pl.col(column).rolling_std(window_size=window, **centered)
                .over("ID").alias(f"{column}RollingStd{window}")
            )

    # Rolling maximum of the cumulative count of non-zero values
    for column in ["qSize", "portPktIn"]:
        for window in [1000, 500]:
            exprs.append(
                pl.col(column).ne(0).cumsum().rolling_max(window_size=window, **centered)
                .over("ID").alias(f"{column}RollingMaxFlowSize{window}")
            )

    return exprs


def preprocess(df):
    """Creates different (rolling) features.

    Expects a polars dataframe with the columns 'portPktIn', 'portPktOut', 'qSize' and 'ID'.
    Adds the difference 'portPktDiff' = portPktIn - portPktOut, derives all rolling features
    from these four value columns and finally drops the four value columns themselves.

    Returns the dataframe containing the remaining input columns plus the rolling features.
    """
    df = df.with_columns(
        (pl.col('portPktIn') - pl.col('portPktOut')).alias("portPktDiff"),
    )

    # All feature expressions are evaluated in one pass against the same input frame.
    df = df.with_columns(*_rolling_feature_exprs())

    df = df.drop(['qSize', 'portPktIn', 'portPktOut', "portPktDiff"])

    return df

In [None]:
%%time

# preprocess training data as polars dataframe
# glob returns paths in arbitrary order; sorting makes the concatenated row order (and thus
# every feature that depends on it) reproducible across runs and machines
train_path = sorted(glob.glob(PATH_TO_TRAINING_DATA_FOLDER))
train = pl.concat([read(file) for file in train_path])

train = preprocess(train)
train = reduce_memory_usage_pl(train, "train")

In [None]:
# subsample training data: within every ID the row positions are shuffled (fixed seed) and
# only rows whose shuffled position is below 10000 are kept, then converted to pandas
train_sample = (
    train
    .filter(pl.int_range(0, pl.count()).shuffle(seed=1).lt(10000).over("ID"))
    .to_pandas()
)

# features: everything except the group identifier and the target
x = train_sample.drop(columns=['ID', 'label'])
# target
y = train_sample['label']

In [None]:
# five grouped shuffle splits; the groups (IDs) are supplied when splitting
gss = GroupShuffleSplit(n_splits=5, random_state=1)


def objective(trial):
    """Optuna objective: mean validation accuracy of LightGBM over the grouped splits.

    Samples the LightGBM parameters and the number of boosting rounds from `trial`, trains
    one booster per split of `gss` (grouped by the 'ID' column of `train_sample`) and scores
    it on the split's validation rows. The trained boosters are attached to the trial under
    the user attribute "best_booster".

    Returns the mean accuracy across all splits.
    """
    # Fixed values are also registered via suggest_* (with min == max) so that they show up
    # in the trial's parameters.
    params = {
        'objective': trial.suggest_categorical('objective', ['multiclass', 'multiclassova']),
        'metric': trial.suggest_categorical('metric', ['multi_logloss', 'multi_error']),
        'num_class': trial.suggest_int('num_class', 12, 12),
        'boosting': trial.suggest_categorical('boosting', ['goss']),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.5, 10.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.5, 10.0),
        'max_depth': trial.suggest_int('max_depth', 2, 64),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        #'min_child_samples': trial.suggest_int('min_child_samples', 5, 125),
        'seed': trial.suggest_int('seed', 1, 1),
        'verbosity': trial.suggest_int('verbosity', -1, -1),
        'n_jobs': trial.suggest_int('n_jobs', -1, -1)
    }
    n_estimators = trial.suggest_int('n_estimators', 50, 500)

    fold_boosters = []
    fold_scores = []

    for train_idx, val_idx in gss.split(x, y, groups=train_sample['ID']):
        x_train, y_train = x.iloc[train_idx], y.iloc[train_idx]
        x_val, y_val = x.iloc[val_idx], y.iloc[val_idx]

        booster = lightgbm.train(
            params=params,
            train_set=lightgbm.Dataset(x_train, label=y_train),
            valid_sets=[lightgbm.Dataset(x_val, label=y_val)],
            valid_names=['val'],
            num_boost_round=n_estimators,
            callbacks=[lightgbm.log_evaluation(250)],
        )

        # per-class scores -> index of the highest-scoring class
        class_scores = booster.predict(data=x_val)
        predicted_labels = np.argmax(class_scores, axis=1)

        fold_scores.append(accuracy_score(y_val, predicted_labels))
        fold_boosters.append(booster)

    trial.set_user_attr(key="best_booster", value=fold_boosters)
    return np.mean(fold_scores)

In [None]:
def callback(study, trial):
    """Keep the cross-validation models of the best trial on the study.

    When the trial that just finished is the study's current best trial, its
    'best_booster' user attribute is copied to the study's user attributes;
    otherwise nothing happens.
    """
    if trial.number != study.best_trial.number:
        return
    study.set_user_attr(key='best_booster', value=trial.user_attrs['best_booster'])

In [None]:
sampler = optuna.samplers.TPESampler(multivariate=True, n_startup_trials=10, seed=12)
study = optuna.create_study(direction="maximize", sampler=sampler)

# best found hyperparameters: queued so that the single trial below evaluates exactly them
study.enqueue_trial(
    {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'boosting': 'goss',
        'learning_rate': 0.013955815625779796,
        'lambda_l1': 0.7375095630884171,
        'lambda_l2': 0.9728996509396731,
        'max_depth': 48,
        'feature_fraction': 0.2718853622369295,
        'bagging_fraction': 0.22179429225221098,
        'n_estimators': 476,
        'n_jobs': -1,
        'seed': 1,
    }
)
study.optimize(objective, n_trials=1, callbacks=[callback])

In [None]:
# summary of the tuning run: number of trials, best score and the parameters that produced it
print(f"Number of finished trials: {len(study.trials)}")
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# collect the gain-based feature importances of every cross-validation model from the best
# tuning iteration, one column per model
models = study.user_attrs["best_booster"]
all_importances = {}

for model_idx, model in enumerate(models):
    feature_names = model.feature_name()
    importances = model.feature_importance(importance_type='gain')
    all_importances[model_idx] = dict(zip(feature_names, importances))

# rows = features, columns = models; average across the models
importance_df = pd.DataFrame(all_importances)
combined_importances = importance_df.mean(axis=1)

# bar chart, most important feature first
plt.figure(figsize=(15, 6))
combined_importances.sort_values(ascending=False).plot(kind='bar')
plt.title('Combined Feature Importances')
plt.xlabel('Feature')
plt.ylabel('Importance')
plt.show()

In [None]:
# subsample training data for the final fit (same per-ID shuffle, seed and row limit as the
# tuning sample)
train_fit_sample = (
    train
    .filter(pl.int_range(0, pl.count()).shuffle(seed=1).lt(10000).over("ID"))
    .to_pandas()
)

# final classifier, configured with the parameters of the best tuning trial
model = lightgbm.LGBMClassifier(**study.best_params)
model.fit(train_fit_sample.drop(columns=['ID', 'label']), train_fit_sample['label'])

In [None]:
%%time

# preprocess test data as polars dataframe
# glob returns paths in arbitrary order; sorting makes the concatenated row order (and thus
# every feature that depends on it) reproducible across runs and machines
test_path = sorted(glob.glob(PATH_TO_TEST_DATA_FOLDER))
test = pl.concat([read(file) for file in test_path])
# keep the per-row file identifiers from the concatenated frame for building the submission
test_id = test['ID']

test = preprocess(test)
test = reduce_memory_usage_pl(test, "test")

In [None]:
# the model API works on pandas data: convert the polars frame and remove the identifier
# column, which is not passed to the model
test_df = test.to_pandas().drop(columns=["ID"])
pred = model.predict(test_df)

In [None]:
# one row per test row: the file identifier and the raw model prediction
submission = pd.DataFrame({"ID": test_id, "Target": pred})

# smooth the predictions by the rolling mode
# (centered window of 300 rows, evaluated separately for every ID; min_periods=1 means a
# result is produced as soon as the window holds a single value)
windowed_submission = pl.from_pandas(submission)
windowed_submission = windowed_submission.with_columns(
    # NOTE(review): s.mode() returns a Series that can hold several values when there is a
    # tie -- confirm which of them rolling_apply keeps.
    pl.col("Target").rolling_apply(lambda s: s.mode(), window_size=300, center=True, min_periods=1).over("ID").alias("windowed_submission")
)

In [None]:
# create the submission file
# row identifier = "<file ID>_<time>"
testtest = test_id + '_' + test['time']
submission = pd.DataFrame({"ID": testtest, "Target": windowed_submission['windowed_submission']})

# every 'T' in the identifier becomes 't' (presumably to match the IDs of the sample
# submission -- verify against that file)
submission['ID'] = submission['ID'].str.replace('T', 't')

# keep exactly the rows (and row order) of the sample submission; IDs without a prediction
# get Target 0
submission_sample = pd.read_csv(PATH_TO_SAMPLE_SUBMISSION_FILE)
submission = pd.merge(submission_sample[['ID']], submission, how='left', on='ID')
submission['Target'] = submission['Target'].fillna(0).astype('int8')

submission.to_csv('submission.csv', index=False, lineterminator='\n')