# SLEEP DATA ANALYSIS

In [198]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
from sklearn.preprocessing import LabelEncoder
sb.set() # apply Seaborn's default theme to every figure drawn after this point

## Importing the sleep data set

In [199]:
# Load the sleep data set; the path is relative to the current working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests the
# CSV was written with its index. Consider index_col=0 - confirm that no later
# cell relies on that column first.
sleepData = pd.read_csv("sleepData.csv")
# Preview the first rows to check column names and value formats
sleepData.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


## Cleaning & encoding columns in the data set

In [200]:
# Clean and encode the raw columns of sleepData in place.
# Every step is written so that re-running this cell on the already-cleaned
# frame gives the same result.

# Merge the 'Normal Weight' label into 'Normal' in the 'BMI Category' column
sleepData["BMI Category"] = sleepData["BMI Category"].replace("Normal Weight", "Normal")

# Keep only the value before the '/' symbol (e.g. "126/83" -> 126) and make it
# numeric so the column can be modelled and plotted as a number.
# astype(str) keeps the .str accessor valid if the cell is re-run on numbers.
sleepData["Blood Pressure"] = pd.to_numeric(
    sleepData["Blood Pressure"].astype(str).str.split("/").str[0]
)

# Label-encode the BMI categories; LabelEncoder numbers the sorted class labels
label_encoder = LabelEncoder()
sleepData["BMI Category"] = label_encoder.fit_transform(sleepData["BMI Category"])
# 'Normal Weight' was merged above, so the expected mapping is
# Normal: 0, Obese: 1, Overweight: 2 (assuming no other labels are present)

# Build a binary target: NaN (no disorder recorded) -> 0,
# 'Sleep Apnea' and 'Insomnia' -> 1, then convert the column to integer type.
# The result is assigned back to "Sleep Disorder" itself; a misspelled key here
# would silently create a new column and leave the target unconverted.
sleepData["Sleep Disorder"] = (
    sleepData["Sleep Disorder"]
    .fillna(0)
    .replace(['Sleep Apnea', 'Insomnia'], 1)
    .astype(int)
)



In [201]:
# Pull each variable of interest out of sleepData as its own one-column DataFrame.

# Sleep measurements
sleepDuration = sleepData["Sleep Duration"].to_frame()
sleepQuality = sleepData["Quality of Sleep"].to_frame()

# Lifestyle and health indicators
physicalActivity = sleepData["Physical Activity Level"].to_frame()
stressLevel = sleepData["Stress Level"].to_frame()
bmiCategory = sleepData["BMI Category"].to_frame()
bloodPressure = sleepData["Blood Pressure"].to_frame()
heartRate = sleepData["Heart Rate"].to_frame()
dailySteps = sleepData["Daily Steps"].to_frame()

# Prediction target
sleepDisorder = sleepData["Sleep Disorder"].to_frame()

In [202]:
# Sanity check of the encoded target: a single 'Sleep Disorder' column of 0/1 labels
print(sleepDisorder)

     Sleep Disorder
0                 0
1                 0
2                 0
3                 1
4                 1
..              ...
368               1
369               1
370               1
371               1
372               1

[373 rows x 1 columns]


In [203]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Compare a random forest trained on all eight variables with one trained on only
# the most important ones, using 5-fold cross-validation on the training split
# and a final evaluation on the held-out test split.

SEED = 42   # shared by the split and both forests so a re-run reproduces the numbers
TOP_N = 3   # how many of the highest-importance features the reduced model keeps

scaler = StandardScaler()

# Combine variables into a single DataFrame
X = pd.concat([sleepDuration, sleepQuality, physicalActivity, stressLevel, bmiCategory,
               bloodPressure, heartRate, dailySteps], axis=1)
y = sleepDisorder.values.ravel()  # Flatten sleepDisorder into a 1D array

# Split the data into training and testing sets
# NOTE(review): the data preview shows rows that are identical apart from
# 'Person ID'; if such rows end up on both sides of the split, test accuracy
# may be optimistic - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Fit the scaler on the training split only, then apply it to both splits
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the model on all (scaled) features.
# A fixed random_state makes the importances and scores below repeatable.
random_forest = RandomForestClassifier(random_state=SEED)
random_forest.fit(X_train_scaled, y_train)

# Get feature importances (same order as the columns of X)
feature_importances = random_forest.feature_importances_

# Create a DataFrame of feature importances, sorted in descending order
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)

# Select the top N features
top_n_features = feature_importance_df.head(TOP_N)['Feature'].tolist()

# Filter X_train and X_test to include only the top N features.
# These are the unscaled columns; tree splits are threshold-based, so this
# should not change the comparison with the model trained on scaled data.
X_train_selected = X_train[top_n_features]
X_test_selected = X_test[top_n_features]

# Train the model with selected features
random_forest_selected = RandomForestClassifier(random_state=SEED)
random_forest_selected.fit(X_train_selected, y_train)

# Evaluate the models using cross-validation on the training split
random_forest_scores = cross_val_score(random_forest, X_train_scaled, y_train, cv=5)
random_forest_selected_scores = cross_val_score(random_forest_selected, X_train_selected, y_train, cv=5)

# Compare model performance
print("Random Forest Cross-Validation Scores:", random_forest_scores)
print("Random Forest Selected Cross-Validation Scores:", random_forest_selected_scores)

# Make predictions on the testing data with both models
y_pred = random_forest.predict(X_test_scaled)
y_pred_selected = random_forest_selected.predict(X_test_selected)

# Evaluate the models on the test data
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
accuracy_selected = accuracy_score(y_test, y_pred_selected)
report_selected = classification_report(y_test, y_pred_selected)

print("Using all variables")
print("Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Optimal Model")
print("Accuracy:", accuracy_selected)
print("Classification Report:\n", report_selected)


                   Feature  Importance
4             BMI Category    0.345409
5           Blood Pressure    0.250300
0           Sleep Duration    0.177146
3             Stress Level    0.059223
6               Heart Rate    0.058773
2  Physical Activity Level    0.042271
1         Quality of Sleep    0.039795
7              Daily Steps    0.027082
Random Forest Cross-Validation Scores: [0.91666667 0.86666667 0.98333333 0.94915254 0.94915254]
Random Forest Selected Cross-Validation Scores: [0.91666667 0.85       0.98333333 0.94915254 0.93220339]
Using all variables
Accuracy: 0.96
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96        41
           1       0.97      0.94      0.96        34

    accuracy                           0.96        75
   macro avg       0.96      0.96      0.96        75
weighted avg       0.96      0.96      0.96        75

Optimal Model
Accuracy: 0.96
Classification Report:
          