# In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from joblib import dump, load

# Load every raw export from the shared input folder in one pass.
# The paths on the right are listed in the same order as the names on the
# left; keep the two sequences aligned when adding or removing a dataset.
(
    hrv_data,
    resting_hr_data,
    stress_data,
    sleep_data,
    sleep_stage_data,
    sleep_pattern_data,
    steps_data,
    spo2_data,
    breathing_data,
) = map(
    pd.read_csv,
    (
        '../input/hrv.csv',
        '../input/resting_hr.csv',
        '../input/stress.csv',
        '../input/sleep.csv',
        '../input/sleep_stage.csv',
        '../input/sleep_pattern.csv',
        '../input/steps.csv',
        '../input/spo2.csv',
        '../input/breathing.csv',
    ),
)

# Give every dataset a datetime64 'date' column that can be used as the
# common merge key further down.

# Stress and steps keep their own calendar date.
stress_data['date'] = pd.to_datetime(stress_data['date'])
steps_data['date'] = pd.to_datetime(steps_data['date'])

# All remaining datasets are keyed one day earlier than their own timestamp.
# NOTE(review): presumably this attributes a night's readings to the
# preceding day's stress/steps records -- confirm with the data owner.
one_day = pd.DateOffset(days=1)

# Sleep tables: parse the sleep date once, then derive the shifted key.
sleep_data['dateOfSleep'] = pd.to_datetime(sleep_data['dateOfSleep'])
sleep_data['date'] = sleep_data['dateOfSleep'] - one_day
sleep_stage_data['dateOfSleep'] = pd.to_datetime(sleep_stage_data['dateOfSleep'])
sleep_stage_data['date'] = sleep_stage_data['dateOfSleep'] - one_day
sleep_pattern_data['dateOfSleep'] = pd.to_datetime(sleep_pattern_data['dateOfSleep'])
sleep_pattern_data['date'] = sleep_pattern_data['dateOfSleep'] - one_day

# Resting heart rate and breathing rate: one timestamp column each.
resting_hr_data['dateTime'] = pd.to_datetime(resting_hr_data['dateTime'])
resting_hr_data['date'] = resting_hr_data['dateTime'] - one_day
breathing_data['date'] = pd.to_datetime(breathing_data['dateTime']) - one_day

# SpO2 is recorded per 'minute' timestamp: average it per calendar day,
# then apply the same one-day shift.
spo2_data['date'] = pd.to_datetime(spo2_data['minute'])
spo2_data = spo2_data.groupby(spo2_data['date'].dt.date)['spo2'].mean().reset_index()
spo2_data['date'] = pd.to_datetime(spo2_data['date']) - one_day

# HRV is also recorded per 'minute' timestamp: average all numeric columns
# per calendar day. numeric_only=True is required because the frame still
# contains the raw 'minute' column and the datetime 'date' column; on
# pandas >= 2.0 a plain .mean() raises on the non-numeric column, and a
# surviving 'date' column would collide with the index in reset_index().
hrv_data['date'] = pd.to_datetime(hrv_data['minute'])
hrv_data = (
    hrv_data.groupby(hrv_data['date'].dt.date)
    .mean(numeric_only=True)
    .reset_index()
)
hrv_data['date'] = pd.to_datetime(hrv_data['date']) - one_day

# Calendar spine: one row per day of the study window. Every dataset is
# aligned to it before the datasets are combined.
date_range = pd.date_range(start='2023-11-01', end='2024-02-16', freq='D')
date_df = pd.DataFrame({'date': date_range})

# Left-join each dataset onto the spine so that days without a reading show
# up as NaN rows. No forward-fill is applied; gaps stay missing.
# NOTE(review): sleep_pattern_data is not joined here or in the merge below.
(
    hrv_data,
    resting_hr_data,
    stress_data,
    sleep_data,
    sleep_stage_data,
    steps_data,
    spo2_data,
    breathing_data,
) = [
    date_df.merge(daily_frame, on='date', how='left')
    for daily_frame in (
        hrv_data,
        resting_hr_data,
        stress_data,
        sleep_data,
        sleep_stage_data,
        steps_data,
        spo2_data,
        breathing_data,
    )
]

# Fold the aligned datasets into one wide table with outer joins on 'date'.
# merged_df holds everything except breathing; final_df adds breathing last.
outer_on_date = {'left_on': 'date', 'right_on': 'date', 'how': 'outer'}
merged_df = hrv_data
for daily_frame in (
    stress_data,
    resting_hr_data,
    sleep_data,
    sleep_stage_data,
    steps_data,
    spo2_data,
):
    merged_df = merged_df.merge(daily_frame, **outer_on_date)
final_df = merged_df.merge(breathing_data, **outer_on_date)

# Remove columns that are not carried forward. The existing *_count columns
# are dropped so their names can be reused by the renames below.
final_df.drop(
    columns=[
        'coverage',
        'minutesToFallAsleep',
        'timeInBed',
        'rem_count',
        'wake_count',
        'light_count',
        'deep_count',
    ],
    inplace=True,
)

# Give the per-stage columns explicit suffixes: *_breath for the
# deepSleep/remSleep/fullSleep/lightSleep columns, *_count for the bare
# deep/rem/wake/light columns.
final_df.rename(
    columns={
        'deepSleep': 'deep_breath',
        'remSleep': 'rem_breath',
        'fullSleep': 'full_breath',
        'lightSleep': 'light_breath',
        'deep': 'deep_count',
        'rem': 'rem_count',
        'wake': 'wake_count',
        'light': 'light_count',
    },
    inplace=True,
)

# Flag Saturdays (5) and Sundays (6) with 1, all other days with 0.
weekday_index = pd.to_datetime(final_df['date']).dt.dayofweek
final_df['is_weekend'] = weekday_index.isin([5, 6]).astype(int)

# REM reference values, expressed on the same x100 scale as the
# proportion columns after the scaling below.
suggested_rem_proportion = 22
rem_range_min = 20
rem_range_max = 25

# Scale the stage proportions by 100.
# NOTE(review): assumes the inputs are fractions in [0, 1] so the result is
# a percentage -- confirm against the source export.
for proportion_column in (
    'rem_proportion',
    'deep_proportion',
    'wake_proportion',
    'light_proportion',
):
    final_df[proportion_column] = final_df[proportion_column] * 100

# Bucket the REM proportion:
#   0 -> below rem_range_min (or missing)
#   1 -> between rem_range_min and rem_range_max, inclusive
#   2 -> above rem_range_max
# Each comparison contributes one step, so the two flags add up to the bucket.
rem_share = final_df['rem_proportion']
final_df['rem_sleep_range'] = (
    (rem_share >= rem_range_min).astype('int64')
    + (rem_share > rem_range_max).astype('int64')
)

# Keep only days for which every remaining column has a value.
final_df.dropna(axis=0, how='any', inplace=True)

# Prepare the data for model training.
# NOTE: despite its name, hrv_features holds every model input (HRV, SpO2,
# steps, sleep-stage, breathing and resting-heart-rate columns).
feature_columns = [
    'rmssd', 'hf', 'lf', 'spo2', 'steps',
    'rem_breath', 'rem_minutes', 'rem_count', 'rem_proportion',
    'deep_breath', 'deep_minutes', 'deep_count', 'deep_proportion',
    'light_breath', 'light_minutes', 'light_count', 'light_proportion',
    'wake_minutes', 'wake_count', 'wake_proportion',
    'restingHeartRate', 'duration',
]
hrv_features = final_df[feature_columns]

# Shift the target down by one.
# NOTE(review): presumably 'level' is 1-based and this makes it 0-based --
# confirm against the stress export.
stress_labels = final_df['level'] - 1

# Hold out 20% of the days for evaluation; resampling below touches the
# training portion only, so the test set keeps its natural class balance.
X_train, X_test, y_train, y_test = train_test_split(
    hrv_features, stress_labels, test_size=0.2, random_state=42
)

# Step 1: oversample the training data.
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Step 2: undersample the *oversampled* data. Previously this step was fitted
# on the raw X_train/y_train, which silently threw away the result of step 1.
# With both samplers on their default sampling_strategy the classes are
# already balanced after step 1, so this step only changes the data once a
# non-default sampling_strategy is configured on either sampler.
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(
    X_train_resampled, y_train_resampled
)

# Seed the forest like every other random step so reruns give the same model.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# Persist the fitted model for later use.
dump(rf_model, '../models/model.pkl')

# In [None]:
# Model evaluation: predict on the held-out split and print the
# classification report comparing predictions with the true labels.
y_pred = rf_model.predict(X_test)

report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)