In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedGroupKFold
# Import train_test_split function
from sklearn.model_selection import train_test_split

In [2]:
# Load dataset into a pandas DataFrame
data = pd.read_csv('C:/Users/mathi/OneDrive/Skrivebord/2_Sem_Kand/Projekt_Kandidat2/Projekt_Kandidat2/DataPreb/DataPSGandDetektion.csv')

X = data[['AVGHR',
   'AVGRR',
   'SLEEPSCORE',
   'DURATIONINBED',
   'DURATIONAWAKE',
   'DURATIONINSLEEP',
   'DURATIONINLIGHT',
   'DURATIONSLEEPONSET',
   'BEDEXITDURATION',
   'NYHABL',
   'AGEBL',
   'WEIGHTBL',
   'HEIGHTBL',
   'BPSYSBL']]
y = data['SLEEPAPNEA']

patient_ids = data['patient_ID']

# ## Alle nætter = 4426 
test = [571, 584, 612] # Nætter søvnapnø = 95
train = [510, 459, 431] # Nætter søvnapnø = 627

# Find unikke patient-ID'er
unique_patients = np.unique(patient_ids)

# Fjern train_patients fra unique_patients og test_patients fra unique_patients
unique_patients = unique_patients[~np.isin(unique_patients, train)]
unique_patients = unique_patients[~np.isin(unique_patients, test)]

# Opdel patient-ID'er i træning og test
train_patients, test_patients = train_test_split(unique_patients, test_size=0.2, random_state=42)

# Tilføj train til train_patients og test til test_patients
train_patients = np.append(train_patients, train)
test_patients = np.append(test_patients, test)

# Opdel data efter patient-ID
X_train = X[patient_ids.isin(train_patients)]
y_train = y[patient_ids.isin(train_patients)]
X_test = X[patient_ids.isin(test_patients)]
y_test = y[patient_ids.isin(test_patients)]


# Definér hyperparametre, der skal testes
param_grid = {
    # # # 'n_estimators': [100, 150, 200, 250, 300, 400, 500],
    'n_estimators': [150],
    # # # 'max_depth' : [4,5,6,7,8,9,10,11,12,13,14],
    'max_depth' : [8],
    'criterion' :['gini'], # , 'entropy'
    'min_samples_split': [10], # 2, 5, 
    'min_samples_leaf': [1], # , 2, 4
    'max_samples': [0.5], # , 0.75, 1.0
    'min_weight_fraction_leaf': [0.0], # , 0.1, 0.2
    'min_impurity_decrease': [0.0], # , 0.1, 0.2
    'class_weight': ['balanced_subsample'], # 'balanced', None
    'max_leaf_nodes': [20], # None, 10, , 30, 40, 50   
}

# Opret en Random Forest-klassifikator
rf = RandomForestClassifier()

# Opret en GridSearchCV-objekt
grid_search = GridSearchCV(rf, param_grid, cv=5)

# Udfør grid search på træningsdataene
grid_search.fit(X_train, y_train)

# Find de bedste hyperparametre
best_params = grid_search.best_params_

print("Bedste hyperparametre:")
print(best_params)

# Evaluér den bedste model på testdataene
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Beregn og udskriv forvirringsmatrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
# print roc_auc 
print("ROC AUC score:")
print(roc_auc_score(y_test, y_pred))





Bedste hyperparametre:
{'class_weight': 'balanced_subsample', 'criterion': 'gini', 'max_depth': 8, 'max_leaf_nodes': 20, 'max_samples': 0.5, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 150}
Confusion Matrix:
[[758   0]
 [ 95   0]]
ROC AUC score:
0.5
