In [68]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix

In [69]:
data_directory = '/Users/lamahasbini/Library/CloudStorage/OneDrive-AmericanUniversityofBeirut/FYP 2024-25 ECE/FYP_Fall_2024-2025/ML Implementation/EDA Analysis Input Files'
combined_df = pd.DataFrame()

for file in os.listdir(data_directory):
    if file.endswith('.xlsx'):
        file_path = os.path.join(data_directory, file)
        temp = pd.read_excel(file_path, sheet_name='Sheet3')
        parts = file.replace('.xlsx', '').split('_')
        if len(parts) >= 3 and parts[-1] == "new":
            patient_id = f"{parts[-3]}_{parts[-2]}"
        else:
            patient_id = f"{parts[-2]}_{parts[-1]}"
        
        temp['Patient_ID'] = patient_id  
        combined_df = pd.concat([combined_df, temp], ignore_index=True)

In [70]:
print(combined_df.head())  
print(f"Total records combined: {len(combined_df)}")

   Onset_EEG Sleep_stage            Date_EEG  EDR_count      Date_E4_synced  \
0   26759.25           1 2017-07-14 22:47:39          0 2017-07-14 22:46:10   
1   26788.25           W 2017-07-14 22:48:08          0 2017-07-14 22:46:39   
2   26818.25           W 2017-07-14 22:48:38          0 2017-07-14 22:47:09   
3   26848.25           W 2017-07-14 22:49:08          0 2017-07-14 22:47:39   
4   26878.25           W 2017-07-14 22:49:38          0 2017-07-14 22:48:09   

   Storm_mark  Epoch duration Patient_ID  Epoch duration (rounded)  \
0           0             NaN     P31_LW                       NaN   
1           0             NaN     P31_LW                       NaN   
2           0             NaN     P31_LW                       NaN   
3           0             NaN     P31_LW                       NaN   
4           0             NaN     P31_LW                       NaN   

   Average EDA per epoch  
0                    NaN  
1                    NaN  
2                    Na

In [71]:
combined_df = combined_df.dropna()

In [72]:
X = combined_df.drop(['Sleep_stage', 'Onset_EEG', 'Date_EEG', 'Patient_ID'], axis=1)
X['Date_E4_synced_hour'] = X['Date_E4_synced'].dt.hour
X = X.drop(['Date_E4_synced'], axis=1)
print(X)

y = combined_df['Sleep_stage']
y = y.astype(str)

       EDR_count  Storm_mark  Epoch duration  Epoch duration (rounded)  \
1068           0           0       29.999998                      30.0   
1069           0           0       30.000008                      30.0   
1070           0           0       28.999994                      29.0   
1071           0           0       29.999998                      30.0   
1072           0           0       30.000008                      30.0   
...          ...         ...             ...                       ...   
46122          0           0       29.000004                      29.0   
46123          0           0       29.999998                      30.0   
46124          0           0       29.999998                      30.0   
46125          0           0       29.999998                      30.0   
46126          0           0       29.000004                      29.0   

       Average EDA per epoch  Date_E4_synced_hour  
1068                0.259464                   23  
1069   

In [73]:
X.isnull().sum()

EDR_count                   0
Storm_mark                  0
Epoch duration              0
Epoch duration (rounded)    0
Average EDA per epoch       0
Date_E4_synced_hour         0
dtype: int64

In [74]:
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(X, y)



In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(confusion_matrix(y_test, y_pred))

[[2371  644  172  448  565  350]
 [ 682 2012  271  501  742  430]
 [ 182  273 3606  192  170  129]
 [ 420  481  227 2635  414  325]
 [ 554  646  194  436 2477  345]
 [ 369  406  145  345  352 2975]]


In [76]:
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.52      0.52      0.52      4550
           2       0.45      0.43      0.44      4638
           3       0.78      0.79      0.79      4552
           R       0.58      0.59      0.58      4502
           W       0.52      0.53      0.53      4652
           r       0.65      0.65      0.65      4592

    accuracy                           0.58     27486
   macro avg       0.58      0.59      0.58     27486
weighted avg       0.58      0.58      0.58     27486

[[2371  644  172  448  565  350]
 [ 682 2012  271  501  742  430]
 [ 182  273 3606  192  170  129]
 [ 420  481  227 2635  414  325]
 [ 554  646  194  436 2477  345]
 [ 369  406  145  345  352 2975]]
