In [1]:
import numpy as np
import pandas as pd
import os
import joblib
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

from sklearn.ensemble import (
    RandomForestClassifier, 
    ExtraTreesClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier

In [6]:
df = pd.read_csv("../data/train/bigboy_hella_features.csv")
# x = df[['cosine_feature', 'count_feature', 'hr_std', 'hr_mean', 'time_feature', 'hr_mean_diff']]
x = df[['cosine_feature', 'count_feature', 'hr_std', 'hr_mean', 'time_feature', 
        'hr_mean_diff', 'count_feature_lag_1', 'count_feature_lag_2', 'hr_std_lag_1', 'hr_std_lag_2', 
        'hr_mean_lag_1', 'hr_mean_lag_2', 'delta'
]]
y = df['psg_label']
y.value_counts()

psg_label
2    11203
5     5030
3     3252
0     1851
1     1475
Name: count, dtype: int64

In [7]:
y.replace({5: 4}, inplace=True)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(18248, 13) (4563, 13) (18248,) (4563,)


In [9]:
models_dir = "../saved_models/hella_features"

class_weights = {
    0: 1,  # Wake
    1: 5,  # N1
    2: 1,  # N2
    3: 1,  # N3 (Deep Sleep) - Make this "heavier" so the model is scared to misclassify it
    4: 1   # REM
}

algorithms = {
    'Random Forest': RandomForestClassifier(class_weight=class_weights, n_estimators=200, min_samples_leaf=5),
    'Extra Trees': ExtraTreesClassifier(class_weight=class_weights, n_estimators=200, min_samples_leaf=5),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Logistic Regression': LogisticRegression(class_weight=class_weights),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    # 'Neural Network': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000),
    'XGBoost': XGBClassifier(device='cuda')
}

for name, clf in algorithms.items():
    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    exact_score = clf.score(x_test, y_test)
    print(f"{name:<20} | {exact_score:.4f}")    
    # print(f"{name}: {clf.features_}")
    # print(f"{name}: {clf.feature_importances_}")

    model_filename = os.path.join(models_dir, f"{name.replace(' ', '_')}.joblib")
    joblib.dump(clf, model_filename)
    print(f"Saved {name} to {model_filename}")

Random Forest        | 0.7583
Saved Random Forest to ../saved_models/hella_features\Random_Forest.joblib
Extra Trees          | 0.7188
Saved Extra Trees to ../saved_models/hella_features\Extra_Trees.joblib
Gradient Boosting    | 0.6728
Saved Gradient Boosting to ../saved_models/hella_features\Gradient_Boosting.joblib


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression  | 0.5698
Saved Logistic Regression to ../saved_models/hella_features\Logistic_Regression.joblib
K-Nearest Neighbors  | 0.7539
Saved K-Nearest Neighbors to ../saved_models/hella_features\K-Nearest_Neighbors.joblib
XGBoost              | 0.7791
Saved XGBoost to ../saved_models/hella_features\XGBoost.joblib
