In [1]:
import numpy as np
import pandas as pd
import os
import joblib
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

from sklearn.ensemble import (
    RandomForestClassifier, 
    ExtraTreesClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier

In [2]:
df = pd.read_csv("../data/train/bigboy_with_hr_mean.csv")
x = df[['cosine_feature', 'count_feature', 'hr_std', 'hr_mean', 'time_feature']]
y = df['psg_label']
y.value_counts()

psg_label
2    11203
5     5030
3     3252
0     1907
1     1475
Name: count, dtype: int64

In [3]:
y.replace({5: 4}, inplace=True)

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(18293, 5) (4574, 5) (18293,) (4574,)


In [5]:
models_dir = "../saved_models/optimized_weights"

class_weights = {
    0: 1,  # Wake
    1: 5,  # N1
    2: 1,  # N2
    3: 1,  # N3 (Deep Sleep) - Make this "heavier" so the model is scared to misclassify it
    4: 1   # REM
}

algorithms = {
    'Random Forest': RandomForestClassifier(class_weight=class_weights, n_estimators=200, min_samples_leaf=5),
    'Extra Trees': ExtraTreesClassifier(class_weight=class_weights, n_estimators=200, min_samples_leaf=5),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Logistic Regression': LogisticRegression(class_weight=class_weights),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000),
    'XGBoost': XGBClassifier(device='cuda')
}

for name, clf in algorithms.items():
    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    exact_score = clf.score(x_test, y_test)
    print(f"{name:<20} | {exact_score:.4f}")    
    # print(f"{name}: {clf.features_}")
    # print(f"{name}: {clf.feature_importances_}")

    model_filename = os.path.join(models_dir, f"{name.replace(' ', '_')}.joblib")
    joblib.dump(clf, model_filename)
    print(f"Saved {name} to {model_filename}")

Random Forest        | 0.7608
Saved Random Forest to ../saved_models/optimized_weights\Random_Forest.joblib
Extra Trees          | 0.7178
Saved Extra Trees to ../saved_models/optimized_weights\Extra_Trees.joblib
Gradient Boosting    | 0.6530
Saved Gradient Boosting to ../saved_models/optimized_weights\Gradient_Boosting.joblib


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression  | 0.5564
Saved Logistic Regression to ../saved_models/optimized_weights\Logistic_Regression.joblib
K-Nearest Neighbors  | 0.7595
Saved K-Nearest Neighbors to ../saved_models/optimized_weights\K-Nearest_Neighbors.joblib
Neural Network       | 0.6544
Saved Neural Network to ../saved_models/optimized_weights\Neural_Network.joblib
XGBoost              | 0.7713
Saved XGBoost to ../saved_models/optimized_weights\XGBoost.joblib


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
