In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [47]:
# Load data
df = pd.read_csv('../data/HR_data.csv')

In [48]:
# Order the data frame so it is more clear to read
df_new = df.copy()
df_new = df_new.sort_values(by=['Cohort', 'Round', 'Phase'])
df_new.reset_index(drop=True, inplace=True)

# Define the columns to move to the front
cols_to_front = ['Unnamed: 0','Cohort', 'Round', 'Phase','Individual','Puzzler']

# Get all other columns, excluding the ones moved to the front
other_cols = [col for col in df_new.columns if col not in cols_to_front]

# Create the new column order
new_column_order = cols_to_front + other_cols

# Reindex the DataFrame with the new column order
df_new = df_new[new_column_order]

In [49]:
nan_counts = df_new.isnull().sum()

# Display columns with NaN values and their counts
print("NaN values in each column:")
print(nan_counts[nan_counts > 0])


rows_with_nan = df_new[df_new.isnull().any(axis=1)]

# Display the rows with NaN values
print("Rows with NaN values:")
print(rows_with_nan.index)

NaN values in each column:
EDA_TD_P_RT     1
EDA_TD_P_ReT    1
inspired        2
attentive       1
afraid          1
active          1
determined      2
dtype: int64
Rows with NaN values:
Index([28, 52, 68, 74, 206], dtype='int64')


In [None]:
# Identify columns with nan values
nan_counts = df_new.isnull().sum()
cols_with_nan = nan_counts[nan_counts > 0].index

# Identify rows with nan values
rows_with_nan_index = df_new[df_new.isnull().any(axis=1)].index

# See the specific rows and columns with nan values
print("Subset of DataFrame showing only rows and columns with missing values:")
df_new.loc[rows_with_nan_index, cols_with_nan]

Subset of DataFrame showing only rows and columns with missing values:


Unnamed: 0,EDA_TD_P_RT,EDA_TD_P_ReT,inspired,attentive,afraid,active,determined
28,2.108696,1.911765,,3.0,4.0,2.0,2.0
52,3.3875,2.367647,,3.0,1.0,2.0,3.0
68,1.96875,2.703125,2.0,2.0,1.0,2.0,
74,2.321429,1.892857,2.0,,,,
206,,,2.0,3.0,1.0,3.0,3.0


In [51]:
# Input the EDA values first KNN
from sklearn.impute import KNNImputer

EDA_cols = ['EDA_TD_P_RT', 'EDA_TD_P_ReT']

# KNN
knn_imputer = KNNImputer(n_neighbors=5)
df_new[EDA_cols] = knn_imputer.fit_transform(df_new[EDA_cols])

# Mode Imputation
# for col in EDA_cols:
#     mode_value = df_new[col].mode()[0] # Calculate the mode (take the first if multiple modes exist)
#     df_new[col].fillna(mode_value, inplace=True)

print("Subset of DataFrame showing only rows and columns with missing values:")

# Re-check the specific rows/columns to see the imputed values
df_new.loc[rows_with_nan_index, cols_with_nan]

Subset of DataFrame showing only rows and columns with missing values:


Unnamed: 0,EDA_TD_P_RT,EDA_TD_P_ReT,inspired,attentive,afraid,active,determined
28,2.108696,1.911765,,3.0,4.0,2.0,2.0
52,3.3875,2.367647,,3.0,1.0,2.0,3.0
68,1.96875,2.703125,2.0,2.0,1.0,2.0,
74,2.321429,1.892857,2.0,,,,
206,1.998292,2.013928,2.0,3.0,1.0,3.0,3.0


In [52]:
# Input the emotion values KNN

emotion_cols = ['inspired', 'attentive', 'afraid', 'active', 'determined']

# # KNN and round
# knn_imputer = KNNImputer(n_neighbors=5)
# imputed_values = knn_imputer.fit_transform(df_new[emotion_cols])
# df_new[emotion_cols] = np.round(imputed_values)

# Mode: more used for ordinal variables
for col in emotion_cols:
    mode_value = df_new[col].mode()[0] # Calculate the mode
    df_new.loc[df_new[col].isnull(), col] = mode_value

print("Subset of DataFrame showing only rows and columns with missing values:")

# Re-check the specific rows/columns to see the imputed values
df_new.loc[rows_with_nan_index, cols_with_nan]

Subset of DataFrame showing only rows and columns with missing values:


Unnamed: 0,EDA_TD_P_RT,EDA_TD_P_ReT,inspired,attentive,afraid,active,determined
28,2.108696,1.911765,2.0,3.0,4.0,2.0,2.0
52,3.3875,2.367647,2.0,3.0,1.0,2.0,3.0
68,1.96875,2.703125,2.0,2.0,1.0,2.0,3.0
74,2.321429,1.892857,2.0,3.0,1.0,2.0,3.0
206,1.998292,2.013928,2.0,3.0,1.0,3.0,3.0


In [None]:
# Dowload the data
df_new.to_csv('../data/HR_data_preprocessed.csv', index=False)