In [None]:
import warnings

import numpy as np
from scipy.stats import skew, kurtosis

def safe_skew(series):
    """Return the sample skewness of ``series``, or 0.0 when it is not meaningful.

    Parameters
    ----------
    series : pandas.Series
        Numeric samples (the code relies on ``Series.nunique``).

    Returns
    -------
    float
        ``scipy.stats.skew(series)`` when the series has more than one distinct
        value and a standard deviation above 1e-6. Otherwise 0.0, which is also
        returned when the computation emits a ``RuntimeWarning`` or produces a
        non-finite value (e.g. NaN samples).
    """
    try:
        with warnings.catch_warnings():
            # Warnings are not exceptions by default, so a plain
            # ``except RuntimeWarning`` never fires. Promote them to errors
            # for the duration of this computation only.
            warnings.simplefilter("error", category=RuntimeWarning)
            if series.nunique() > 1 and np.std(series) > 1e-6:
                value = float(skew(series))
                # NaN/inf would otherwise leak into the feature table.
                return value if np.isfinite(value) else 0.0
        return 0.0  # Default if too little variation
    except RuntimeWarning:
        return 0.0  # Numerically unreliable result: fall back to 0

def safe_kurtosis(series):
    """Return the excess kurtosis of ``series``, or 0.0 when it is not meaningful.

    Parameters
    ----------
    series : pandas.Series
        Numeric samples (the code relies on ``Series.nunique``).

    Returns
    -------
    float
        ``scipy.stats.kurtosis(series)`` (SciPy's default Fisher definition)
        when the series has more than one distinct value and a standard
        deviation above 1e-6. Otherwise 0.0, which is also returned when the
        computation emits a ``RuntimeWarning`` or produces a non-finite value.
    """
    try:
        with warnings.catch_warnings():
            # Without this filter the ``except RuntimeWarning`` below is dead
            # code: warnings are only printed, never raised.
            warnings.simplefilter("error", category=RuntimeWarning)
            if series.nunique() > 1 and np.std(series) > 1e-6:
                value = float(kurtosis(series))
                # Keep NaN/inf out of the feature table.
                return value if np.isfinite(value) else 0.0
        return 0.0  # Too little variation to estimate a fourth moment
    except RuntimeWarning:
        return 0.0  # Numerically unreliable result: fall back to 0

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import glob  # For handling file paths efficiently

# Build the training table: every CSV under train/ is cut into consecutive,
# non-overlapping windows and each window becomes one row of features + label.
#
# Row layout (the order matters to whoever names the columns):
#   std, var, skew, kurtosis of ACC_X/ACC_Y/ACC_Z (12 values),
#   BVP std/mean/var, TEMP mean/std, HR mean/std, EDA mean/std (9 values),
#   then the first N_FFT_BINS FFT magnitudes of ACC_X, ACC_Y, ACC_Z and BVP,
#   one signal after the other, and finally the label.
WINDOW_ROWS = 480  # rows per window; a trailing partial window is dropped
N_FFT_BINS = 10    # leading FFT magnitude bins kept per signal (bin 0 is the DC term)

# Sorted so the row order of prepare_data is reproducible: glob returns files
# in arbitrary, filesystem-dependent order.
paths_train = sorted(glob.glob('train/*.csv'))
prepare_data = []

for path in paths_train:
    df_train = pd.read_csv(path)

    for i in range(len(df_train) // WINDOW_ROWS):
        df_chunk = df_train.iloc[WINDOW_ROWS * i : WINDOW_ROWS * (i + 1)]

        acc_axes = [df_chunk['ACC_X'], df_chunk['ACC_Y'], df_chunk['ACC_Z']]
        bvp = df_chunk['BVP']

        # Time-domain statistics, grouped by statistic and then by axis.
        feature_row = (
            [axis.std() for axis in acc_axes]
            + [axis.var() for axis in acc_axes]
            + [safe_skew(axis) for axis in acc_axes]
            + [safe_kurtosis(axis) for axis in acc_axes]
            + [bvp.std(), bvp.mean(), bvp.var()]
            + [df_chunk['TEMP'].mean(), df_chunk['TEMP'].std()]
            + [df_chunk['HR'].mean(), df_chunk['HR'].std()]
            + [df_chunk['EDA'].mean(), df_chunk['EDA'].std()]
        )

        # Frequency domain: leading FFT magnitudes, one signal after another.
        for signal in acc_axes + [bvp]:
            feature_row.extend(np.abs(np.fft.fft(signal))[:N_FFT_BINS])

        # The window is labelled with the sleep stage of its first row.
        label = df_chunk['Sleep_Stage'].iloc[0]
        prepare_data.append(feature_row + [label])

print("✅ Feature extraction complete!")


✅ Feature extraction complete!


In [None]:
column_names = [
    'std_x', 'std_y', 'std_z', 'var_x', 'var_y', 'var_z',
    'skew_x', 'skew_y', 'skew_z', 'kurt_x', 'kurt_y', 'kurt_z',
    'std_bvp', 'mean_bvp', 'var_bvp', 'mean_temp', 'std_temp', 'mean_hr', 'std_hr',
    'mean_eda', 'std_eda'
]

for i in range(10):
    column_names.append(f'fft_acc_x_{i}')
    column_names.append(f'fft_acc_y_{i}')
    column_names.append(f'fft_acc_z_{i}')
    column_names.append(f'fft_bvp_{i}')

column_names.append('label')

df_train_ = pd.DataFrame(prepare_data, columns=column_names)

In [None]:
import os

paths_test = sorted(glob.glob('test_segment/*'))

prepare_data_test = []
for paths in paths_test:
    sub_paths = os.path.join(paths, "*")
    sub_paths = sorted(glob.glob(sub_paths))
    for sub in sub_paths: # for loop to get each csv
        sub_df_test = pd.read_csv(sub)
        # Acceleration features
        std_x = sub_df_test['ACC_X'].std()
        std_y = sub_df_test['ACC_Y'].std()
        std_z = sub_df_test['ACC_Z'].std()
        var_x = sub_df_test['ACC_X'].var()
        var_y = sub_df_test['ACC_Y'].var()
        var_z = sub_df_test['ACC_Z'].var()
        skew_x = safe_skew(sub_df_test['ACC_X'])
        skew_y = safe_skew(sub_df_test['ACC_Y'])
        skew_z = safe_skew(sub_df_test['ACC_Z'])
        kurt_x = safe_kurtosis(sub_df_test['ACC_X'])
        kurt_y = safe_kurtosis(sub_df_test['ACC_Y'])
        kurt_z = safe_kurtosis(sub_df_test['ACC_Z'])

        # BVP features
        std_bvp = sub_df_test['BVP'].std()
        mean_bvp = sub_df_test['BVP'].mean()
        var_bvp = sub_df_test['BVP'].var()

        # Temperature
        mean_temp = sub_df_test['TEMP'].mean()
        std_temp = sub_df_test['TEMP'].std()

        # Heart rate
        mean_hr = sub_df_test['HR'].mean()
        std_hr = sub_df_test['HR'].std()

        # EDA
        mean_eda = sub_df_test['EDA'].mean()
        std_eda = sub_df_test['EDA'].std()

        # Frequency domain (FFT)
        fft_acc_x = np.abs(np.fft.fft(sub_df_test['ACC_X']))[:10]  # Take first 10 FFT components
        fft_acc_y = np.abs(np.fft.fft(sub_df_test['ACC_Y']))[:10]
        fft_acc_z = np.abs(np.fft.fft(sub_df_test['ACC_Z']))[:10]
        fft_bvp = np.abs(np.fft.fft(sub_df_test['BVP']))[:10]

        # Combine all features
        feature_row = [
            std_x, std_y, std_z, var_x, var_y, var_z, skew_x, skew_y, skew_z, kurt_x, kurt_y, kurt_z,
            std_bvp, mean_bvp, var_bvp, mean_temp, std_temp, mean_hr, std_hr, mean_eda, std_eda
        ] + list(fft_acc_x) + list(fft_acc_y) + list(fft_acc_z) + list(fft_bvp)

        prepare_data_test.append(feature_row)

In [None]:
column_names = [
    'std_x', 'std_y', 'std_z', 'var_x', 'var_y', 'var_z',
    'skew_x', 'skew_y', 'skew_z', 'kurt_x', 'kurt_y', 'kurt_z',
    'std_bvp', 'mean_bvp', 'var_bvp', 'mean_temp', 'std_temp', 'mean_hr', 'std_hr',
    'mean_eda', 'std_eda'
]

for i in range(10):
    column_names.append(f'fft_acc_x_{i}')
    column_names.append(f'fft_acc_y_{i}')
    column_names.append(f'fft_acc_z_{i}')
    column_names.append(f'fft_bvp_{i}')

df_test_ = pd.DataFrame(prepare_data_test, columns=column_names)

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# Training budget passed to fit() as time_limit.
time_limit = 600

# Directory the predictor is created with (its `path` argument).
save_path = 'best_model'

# Model families handed to AutoGluon, keyed by its model short names.
# NOTE(review): GBM is requested twice, once with num_gpus=0 and once with
# num_gpus=1. The leaderboard output further down lists a single LightGBM L1
# model, so confirm the GPU variant actually trains on this machine.
hyperparameters = dict(
    GBM=[
        dict(ag_args_fit=dict(num_gpus=gpu_count))  # 0 -> CPU run, 1 -> GPU run
        for gpu_count in (0, 1)
    ],
    RF=dict(),                     # Random Forest, default settings
    XGB=dict(),                    # XGBoost, default settings
    NN_TORCH=dict(num_epochs=50),  # Torch neural network, 50 epochs
)

In [None]:
# Create a multiclass predictor for the 'label' column, stored under save_path.
predictor = TabularPredictor(
    label='label',
    problem_type='multiclass',
    path=save_path,
)

# Train on the engineered training frame with the configured model families
# and time budget; the value returned by fit() is kept as the predictor.
predictor = predictor.fit(
    df_train_,
    presets='best_quality',
    hyperparameters=hyperparameters,
    time_limit=time_limit,
)
print("Finish!")

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.3.0: Thu Jan  2 20:23:36 PST 2025; root:xnu-11215.81.4~3/RELEASE_ARM64_T8112
CPU Count:          8
Memory Avail:       4.00 GB / 16.00 GB (25.0%)
Disk Space Avail:   198.59 GB / 460.43 GB (43.1%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked ov

Finish!


In [None]:
# Show the per-model summary table (validation score, fit/predict times,
# stack level) for every model trained above.
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,0.777212,accuracy,209.609222,317.301092,0.003641,1.000319,3,True,9
1,XGBoost_BAG_L2,0.777152,accuracy,208.329748,294.701894,0.886821,35.151566,2,True,7
2,LightGBM_BAG_L2,0.776672,accuracy,208.71876,281.149207,1.275833,21.598879,2,True,5
3,NeuralNetTorch_BAG_L2,0.776148,accuracy,209.327217,288.262141,1.88429,28.711813,2,True,8
4,RandomForest_BAG_L2,0.775144,accuracy,209.55592,303.036646,2.112993,43.486319,2,True,6
5,LightGBM_BAG_L1,0.767413,accuracy,206.170805,237.893251,206.170805,237.893251,1,True,1
6,WeightedEnsemble_L2,0.767413,accuracy,206.176936,238.695412,0.006131,0.802161,2,True,4
7,RandomForest_BAG_L1,0.654761,accuracy,0.894863,18.237078,0.894863,18.237078,1,True,2
8,XGBoost_BAG_L1,0.599536,accuracy,0.377259,3.419998,0.377259,3.419998,1,True,3


In [None]:
# Load the submission template; a 'labels' column is assigned to it further down.
sample_submission = pd.read_csv("sample_submission.csv")

In [None]:
# Predict one label per row of the test feature frame.
y_pred = predictor.predict(df_test_)

In [None]:
# Assigning a Series aligns on index: a row-count mismatch would not raise but
# silently leave NaN labels (or drop predictions), so fail loudly instead.
assert len(y_pred) == len(sample_submission), (
    f"prediction count {len(y_pred)} != submission rows {len(sample_submission)}"
)
sample_submission['labels'] = y_pred

# Distribution of predicted labels, as a quick sanity check.
sample_submission['labels'].value_counts()

labels
N2    5241
W     2273
N1     219
R       96
N3       3
Name: count, dtype: int64

In [None]:
# Write the submission file without the DataFrame index, then display the
# predicted label counts again.
sample_submission.to_csv("submission_6.csv", index=False)
sample_submission['labels'].value_counts()

labels
N2    5241
W     2273
N1     219
R       96
N3       3
Name: count, dtype: int64