# Data Preprocessing


In [1]:
import pandas as pd

In [2]:
# Load the three training feature tables (non-linear heart-rate, time-domain and
# frequency-domain features). The directory is named once so the notebook can be
# pointed at a different location by editing a single line.
DATA_DIR = '/content/drive/MyDrive/Updated_Dataset'

heart_rate_non_linear_train = pd.read_csv(f'{DATA_DIR}/heart_rate_non_linear_features_train.csv')
time_domain_train = pd.read_csv(f'{DATA_DIR}/time_domain_features_train.csv')
frequency_domain_train = pd.read_csv(f'{DATA_DIR}/frequency_domain_features_train.csv')


In [3]:
# Give the 'uuid' join key the same dtype (str) in every table so a later merge on
# 'uuid' compares like with like. The non-linear table's own 'uuid' column is cast
# too: it is the left-hand key of the merge and was previously left untouched.
# 'datasetId' is also kept as str, as before, so downstream dtypes do not change.
# astype() returns a new frame, so re-running this cell is harmless.
heart_rate_non_linear_train = heart_rate_non_linear_train.astype({'uuid': str, 'datasetId': str})
time_domain_train = time_domain_train.astype({'uuid': str})
frequency_domain_train = frequency_domain_train.astype({'uuid': str})

In [4]:
# Show the column index of each table, in load order, to see which keys they share
for frame in (heart_rate_non_linear_train, time_domain_train, frequency_domain_train):
    print(frame.columns)


Index(['uuid', 'SD1', 'SD2', 'sampen', 'higuci', 'datasetId', 'condition'], dtype='object')
Index(['MEAN_RR', 'MEDIAN_RR', 'SDRR', 'RMSSD', 'SDSD', 'SDRR_RMSSD', 'HR',
       'pNN25', 'pNN50', 'KURT', 'SKEW', 'MEAN_REL_RR', 'MEDIAN_REL_RR',
       'SDRR_REL_RR', 'RMSSD_REL_RR', 'SDSD_REL_RR', 'SDRR_RMSSD_REL_RR',
       'KURT_REL_RR', 'SKEW_REL_RR', 'uuid'],
      dtype='object')
Index(['uuid', 'VLF', 'VLF_PCT', 'LF', 'LF_PCT', 'LF_NU', 'HF', 'HF_PCT',
       'HF_NU', 'TP', 'LF_HF', 'HF_LF'],
      dtype='object')


In [41]:
# Join the non-linear and frequency-domain feature tables on their shared 'uuid' key.
# 'uuid' only serves as the join key, so it is dropped straight after the merge;
# chaining the drop keeps the cell a single expression that can be re-run safely.
# NOTE(review): the column listing printed above shows time_domain_train also has a
# 'uuid' column, so it could be joined the same way. It is not merged here —
# confirm whether leaving the time-domain features out is intended.
train_data = pd.merge(
    heart_rate_non_linear_train,
    frequency_domain_train,
    on='uuid',
).drop(columns=['uuid'])

# Summarise the merged frame: dtypes / non-null counts, then descriptive statistics
train_data.info()
train_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SD1        19999 non-null  float64
 1   SD2        19999 non-null  float64
 2   sampen     19999 non-null  float64
 3   higuci     19999 non-null  float64
 4   datasetId  19999 non-null  object 
 5   condition  19999 non-null  object 
 6   VLF        19999 non-null  float64
 7   VLF_PCT    19999 non-null  float64
 8   LF         19999 non-null  float64
 9   LF_PCT     19999 non-null  float64
 10  LF_NU      19999 non-null  float64
 11  HF         19999 non-null  float64
 12  HF_PCT     19999 non-null  float64
 13  HF_NU      19999 non-null  float64
 14  TP         19999 non-null  float64
 15  LF_HF      19999 non-null  float64
 16  HF_LF      19999 non-null  float64
dtypes: float64(15), object(2)
memory usage: 2.6+ MB


Unnamed: 0,SD1,SD2,sampen,higuci,VLF,VLF_PCT,LF,LF_PCT,LF_NU,HF,HF_PCT,HF_NU,TP,LF_HF,HF_LF
count,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0,19999.0
mean,10.606856,154.321079,2.062377,1.182252,2206.594533,64.303917,946.726438,34.085692,95.593261,39.092854,1.610391,4.406739,3192.413825,116.431084,0.048214
std,2.9136,109.219042,0.207395,0.062339,1827.754378,16.877325,574.035566,16.127098,4.116735,45.485751,1.761178,4.116735,1931.837492,361.260823,0.049305
min,3.921014,38.30819,0.438945,1.034418,162.489666,19.538255,91.365407,2.181166,70.519949,0.066006,0.002187,0.013684,379.094555,2.392124,0.000137
25%,8.389207,90.70795,2.034806,1.139758,1000.083081,52.765352,543.335791,22.210154,93.73761,10.708597,0.348563,1.238909,1836.0329,14.968346,0.012545
50%,10.215244,116.390882,2.133989,1.174449,1669.251875,66.528373,783.538667,31.98907,96.658633,25.010189,1.035163,3.341367,2803.682001,28.927868,0.034569
75%,12.691705,166.447914,2.18185,1.223618,2662.50866,76.931153,1205.658783,44.711588,98.761091,44.890856,2.222202,6.26239,4049.478368,79.716192,0.066808
max,18.833375,796.821019,2.23455,1.360941,12346.34432,97.720925,3275.104875,77.419013,99.986316,363.740953,12.774102,29.480051,13134.19763,7306.611088,0.418038


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Derive features and target from the merged frame inside this cell, so it runs on a
# fresh kernel and gives the same result every time it is re-run.
# Every column except 'condition' is used as a feature.
# NOTE(review): this includes 'datasetId', which was cast to str earlier; the scaler
# has to be able to read it as a number — verify it is numeric-like and informative.
X = train_data.drop(columns=['condition'])

# 'condition' is the target column, assumed to hold string labels; LabelEncoder maps
# them to integer codes, which are then cast to float32 for the network.
# NOTE(review): the model treats these codes as a regression target, which imposes an
# ordering on the labels — confirm this is intended rather than a classification setup.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_data['condition']).astype('float32')

# Split BEFORE scaling so that validation rows never influence the scaler statistics
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features: fit on the training split only, then apply the same transform
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Full feature matrix under the same scaling, for any cell that needs every row
X_scaled = scaler.transform(X)

# Reshape the data for LSTM input (samples, timesteps, features) with a single timestep
X_train_reshaped = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_val_reshaped = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))

# Check shapes
print(X_train_reshaped.shape)
print(y_train.shape)
print(X_val_reshaped.shape)
print(y_val.shape)


(15999, 1, 16)
(15999,)
(4000, 1, 16)
(4000,)


# Model Architecture

In [12]:
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and the Keras backend so weight initialisation and dropout
# are repeatable from one run of the notebook to the next.
set_random_seed(42)

# Number of feature columns in the scaled training matrix
n_features = X_train.shape[1]

# Build the RNN model. The input shape (1 timestep, n_features) is declared with an
# explicit Input layer rather than an `input_shape=` argument on the first layer,
# which Keras warns about in Sequential models.
model = Sequential([
    Input(shape=(1, n_features)),
    LSTM(50, activation='relu'),
    Dropout(0.2),
    Dense(1),  # Final layer for regression (single output)
])

# Compile the model: Adam optimizer, mean-squared-error loss
model.compile(optimizer='adam', loss='mse')

# View model summary
model.summary()


  super().__init__(**kwargs)


# Model Training

In [15]:
# Fit for 100 epochs with mini-batches of 16 samples; the validation split is scored
# after every epoch and the per-epoch losses are kept in `history`.
history = model.fit(
    X_train_reshaped,
    y_train,
    validation_data=(X_val_reshaped, y_val),
    epochs=100,
    batch_size=16,
    verbose=1,
)


Epoch 1/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1651 - val_loss: 0.1322
Epoch 2/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1622 - val_loss: 0.1364
Epoch 3/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.1652 - val_loss: 0.1346
Epoch 4/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.1574 - val_loss: 0.1318
Epoch 5/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.1642 - val_loss: 0.1317
Epoch 6/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1613 - val_loss: 0.1274
Epoch 7/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1634 - val_loss: 0.1291
Epoch 8/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.1624 - val_loss: 0.1262
Epoch 9/100
[1m

In [39]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Run the trained model on the held-out validation inputs
y_pred = model.predict(X_val_reshaped)

# Score the predictions against the validation targets:
# MSE, its square root (RMSE), and the coefficient of determination (R²)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
r2 = r2_score(y_true=y_val, y_pred=y_pred)
rmse = np.sqrt(mse)

# Report the three metrics
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R² Score: {r2}')


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Mean Squared Error: 0.08928684890270233
Root Mean Squared Error: 0.2988090515136719
R² Score: 0.8083561658859253
