# In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from mlxtend.feature_selection import SequentialFeatureSelector
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.metrics import roc_auc_score
from scipy.special import softmax
from sklearn.model_selection import GridSearchCV
import seaborn as sns
# Import logistic regression classifier (NOTE: no such import is present; RandomForestClassifier is used below)

# Read the combined PSG/detection table into a pandas DataFrame.
data = pd.read_csv('C:/Users/mathi/OneDrive/Skrivebord/2_Sem_Kand/Projekt_Kandidat2/Projekt_Kandidat2/DataPreb/DataPSGandDetektion.csv')

# Columns used as model inputs; SLEEPAPNEA is the label column.
feature_columns = [
    'AVGHR', 'AVGRR', 'AVGACT', 'SLEEPSCORE', 'DURATIONINBED',
    'DURATIONAWAKE', 'DURATIONINSLEEP', 'DURATIONINREM', 'DURATIONINLIGHT',
    'DURATIONINDEEP', 'DURATIONSLEEPONSET', 'BEDEXITDURATION',
    'BEDEXITCOUNT', 'TOSSNTURNCOUNT', 'FROMGMTOFFSET', 'MINHR', 'MAXHR',
    'MINRR', 'MAXRR', 'FMCOUNT', 'AWAKENINGS', 'HRVSCORE', 'HRVLF', 'HRVHF',
    'HRVRMSSDEVENING', 'HRVRMSSDMORNING', 'BPSYS', 'BPDIA', 'HR', 'WEIGHT',
    'STEPS', 'NYHABL', 'AGEBL', 'WEIGHTBL', 'HEIGHTBL', 'BPSYSBL',
    'BPDIABL', 'HRB', 'EFTITBL', 'GENDER',
]
X = data[feature_columns]
y = data['SLEEPAPNEA']

patient_ids = data['patient_ID']

# All nights = 4426 (count noted by the original author).
# Patient IDs that are pinned to one side of the split instead of being
# assigned at random.
test = [571, 584, 612]   # sleep-apnea nights = 95
train = [510, 459, 431]  # sleep-apnea nights = 627

# Unique patient IDs, minus the pinned ones, form the pool that is split
# at random.
unique_patients = np.unique(patient_ids)
unique_patients = unique_patients[~np.isin(unique_patients, train + test)]

# Split the remaining patient IDs 80/20 into training and test with a fixed
# seed.
train_patients, test_patients = train_test_split(
    unique_patients, test_size=0.2, random_state=42
)

# Add the pinned IDs back onto their respective sides.
train_patients = np.concatenate((train_patients, train))
test_patients = np.concatenate((test_patients, test))

# Select rows by patient ID so that all rows of one patient end up on the
# same side of the split.
train_mask = patient_ids.isin(train_patients)
test_mask = patient_ids.isin(test_patients)
X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

# Random forest classifier with a fixed seed (random_state=42).
model = RandomForestClassifier(random_state=42)

# Fit on the training rows using every feature column.
model.fit(X_train, y_train)

# Sequential forward selection of 10 features, scored by cross-validated
# ROC-AUC with cv=5.
# NOTE(review): cv is a plain integer and no patient grouping is passed, so
# rows from one patient may fall into both the fitting and validation folds
# -- confirm whether a grouped CV scheme is wanted here.
sfs_settings = dict(
    k_features=10,
    forward=True,
    scoring='roc_auc',
    cv=5,
)
sfs = SequentialFeatureSelector(estimator=model, **sfs_settings)

# Run the selection on the training data and keep the reduced matrix plus
# the names of the chosen features.
X_train_selected = sfs.fit_transform(X_train, y_train)
selected_features = list(sfs.k_feature_names_)

# sfs.subsets_
# Build a confusion matrix for the test data.
# Column 1 of predict_proba is taken as the score -- presumably the
# SLEEPAPNEA == 1 class; verify against model.classes_.
# NOTE(review): `model` is the estimator fitted on all columns of X_train
# and is applied to all columns of X_test; the feature selection result does
# not appear to be used here -- confirm this is the intended baseline.
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Turn scores into 0/1 predictions with a 0.5 cut-off.
above_threshold = y_pred_proba > 0.5
y_pred = above_threshold.astype(int)

# Create the confusion matrix (true labels vs. thresholded predictions).
cm = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix as an annotated heatmap.
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('True')
plt.show()






# # Iterate through the feature subsets in sfs.subsets_, evaluate the model on the test data with each subset, and collect the results in lists for plotting
# results = []
# # Create lists to store the true positive and false positive rates for each number of features
# tp_rates = []
# fp_rates = []
# for i in range(1, len(sfs.subsets_)+1):
#     X_test_selected = sfs.transform(X_test)
#     model.fit(X_train_selected[:, 0:i], y_train)
#     y_pred_proba = model.predict_proba(X_test_selected[:, 0:i])[:,1]
#     # Compute the true positive and false positive rates at the 0.5 threshold
#     tp_rates.append(confusion_matrix(y_test, y_pred_proba > 0.5)[1,1] / (confusion_matrix(y_test, y_pred_proba > 0.5)[1,1] + confusion_matrix(y_test, y_pred_proba > 0.5)[1,0]))
#     fp_rates.append(confusion_matrix(y_test, y_pred_proba > 0.5)[0,1] / (confusion_matrix(y_test, y_pred_proba > 0.5)[0,1] + confusion_matrix(y_test, y_pred_proba > 0.5)[0,0]))
#     results.append(roc_auc_score(y_test, y_pred_proba))

# # List the following names in this order: 'AGEBL', 'HEIGHTBL', 'AVGHR', 'AVGRR', 'NYHABL', 'BEDEXITDURATION', 'DURATIONSLEEPONSET', 'WEIGHTBL', 'DURATIONINLIGHT', 'SLEEPSCORE', 'DURATIONINSLEEP', 'DURATIONINBED', 'BPSYSBL', 'DURATIONAWAKE', 'AVGACT'. All of this is taken from subsets_ in sfs
# names = ['AGEBL', 'HEIGHTBL', 'AVGHR', 'AVGRR', 'NYHABL', 'BEDEXITDURATION', 'DURATIONSLEEPONSET', 'WEIGHTBL', 'DURATIONINLIGHT', 'SLEEPSCORE', 'DURATIONINSLEEP', 'DURATIONINBED', 'BPSYSBL', 'DURATIONAWAKE', 'AVGACT']

# fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
# ax = plt.gca()
# plt.title('Sequential Forward Selection (SFS) (w. StdDev)')
# ax.set_xticklabels(names, rotation=90, fontsize=8)

# # Plot the results
# plt.plot(range(1, len(sfs.subsets_)+1), results, color='red', label='ROC-AUC on test set')
# plt.ylabel('ROC-AUC')
# plt.xlabel('Features')
# plt.legend()

# # Create a secondary y-axis
# ax2 = ax.twinx()
# plt.plot(range(1, len(sfs.subsets_)+1), tp_rates, color='green', label='True Positive Rate')
# plt.plot(range(1, len(sfs.subsets_)+1), fp_rates, color='orange', label='False Positive Rate')
# plt.ylabel('True/False Positive Rate')
# plt.legend()

# plt.show()