In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, mean_squared_error

sleep_data = "Sleep_health_and_lifestyle_dataset.csv"
df = pd.read_csv(sleep_data)

In [2]:
df

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,370,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,371,Female,59,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,372,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,373,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [3]:
# 'Blood Pressure'를 수축기/이완기 혈압으로 분리
df[['Systolic_BP', 'Diastolic_BP']] = df['Blood Pressure'].str.split('/', expand=True)
df['Systolic_BP'] = pd.to_numeric(df['Systolic_BP'], errors='coerce')
df['Diastolic_BP'] = pd.to_numeric(df['Diastolic_BP'], errors='coerce')

# 'Blood Pressure' 컬럼 제거
df.drop(columns=['Blood Pressure'], inplace=True)

# 스케일링할 수치형 변수 선택
numeric_cols = ['Sleep Duration', 'Quality of Sleep', 'Physical Activity Level',
                'Stress Level', 'Systolic_BP', 'Diastolic_BP', 'Heart Rate', 'Daily Steps']

# 숫자형 데이터 추출 및 결측치 처리
df_numeric = df[numeric_cols].copy()
df_numeric = df_numeric.fillna(df_numeric.mean())

# 표준화 (StandardScaler)
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numeric)

# 스케일링된 데이터를 DataFrame으로 변환
df_scaled = pd.DataFrame(df_scaled, columns=numeric_cols, dtype=float)

# 기존 df의 해당 컬럼을 스케일링된 값으로 업데이트
df[numeric_cols] = df_scaled


In [20]:
iso_forest = IsolationForest(contamination=0.05, random_state=42)
df["Anomaly"] = iso_forest.fit_predict(df_scaled)

# 정상(1) / 이상치(-1) 확인
print("Anomaly detection results:\n", df["Anomaly"].value_counts())

Anomaly detection results:
 Anomaly
 1    356
-1     18
Name: count, dtype: int64


In [9]:
df['Sleep Disorder'] = df['Sleep Disorder'].fillna('No Disorder')

encoder = LabelEncoder()
df['Sleep Disorder Encoded'] = encoder.fit_transform(df['Sleep Disorder'])

# no disorder : 1 / sleep apnea : 2 / insomnia : 0
print(df[['Sleep Disorder', 'Sleep Disorder Encoded']].drop_duplicates())

  Sleep Disorder  Sleep Disorder Encoded
0    No Disorder                       1
3    Sleep Apnea                       2
5       Insomnia                       0


In [12]:
# KNN 모델 학습 (수면 장애 기준 유사 사용자 그룹 분석)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(df_scaled, df['Sleep Disorder Encoded'])

# 유사 사용자 그룹 예측
df["Similar_User_Group"] = knn.predict(df_scaled)

In [None]:
#df.to_csv("processed_sleep_data.csv", index=False)

In [19]:
X = df_scaled  # 입력 변수
y = df["Quality of Sleep"] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

mlp = MLPRegressor(hidden_layer_sizes=(64, 32), activation='relu', max_iter=500, random_state=42)
mlp.fit(X_train, y_train)

# 모델 평가
y_pred = mlp.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Predicting sleep improvement MSE: {mse:.4f}")

Predicting sleep improvement MSE: 0.0158
