# Required Libraries

In [167]:
import os
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as py 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler

# Combining all the data into the one folder

As the data hierarchy is like that there is main folder named as **Stress_dataset** and inside that folder there is a data of 15 nurses with diffrent id and in each there are some psychological  signals which are divided in csv files and same like that all 15 nurses have the same structure . But what the code acctually doing is  

**=>** combining all the psychological signals into one csv files like it is combining the signals into the single file and 5          combined **csv** files in the folder

In [327]:
# Script to combine all signals into one
DATA_PATH = "C:/Users/LENOVO/Downloads/Data/Stress_dataset"
SAVE_PATH = "C:/Users/LENOVO/Downloads/a2"
os.mkdir(SAVE_PATH)

final_columns = {
    'ACC': ['id', 'X', 'Y', 'Z', 'datetime'],
    'BVP': ['id','BVP','datetime'],
    'EDA': ['id', 'EDA', 'datetime'],
    'HR': ['id', 'HR', 'datetime'],
    'TEMP': ['id', 'TEMP', 'datetime'],
    

}

names = {
    'ACC.csv': ['X', 'Y', 'Z'],
    'BVP.csv': ['BVP'],
    'EDA.csv': ['EDA'],
    'HR.csv': ['HR'],
    'TEMP.csv': ['TEMP'],
    
}

desired_signals = ['ACC.csv', 'BVP.csv', 'EDA.csv', 'HR.csv', 'TEMP.csv']
desired_signals = ['ACC.csv', 'BVP.csv','EDA.csv', 'HR.csv','TEMP.csv']

acc = pd.DataFrame(columns=final_columns['ACC'])
bvp = pd.DataFrame(columns=final_columns['BVP'])
eda = pd.DataFrame(columns=final_columns['EDA'])
hr = pd.DataFrame(columns=final_columns['HR'])
temp = pd.DataFrame(columns=final_columns['TEMP'])


def process_df(df, file):
    start_timestamp = df.iloc[0,0]
    sample_rate = df.iloc[1,0]
    new_df = pd.DataFrame(df.iloc[2:].values, columns=df.columns)
    new_df['id'] =  file[-2:]
    new_df['datetime'] = [(start_timestamp + i/sample_rate) for i in range(len(new_df))]
    return new_df

for file in os.listdir(DATA_PATH):
    print(f'Processing {file}')
    for sub_file in os.listdir(os.path.join(DATA_PATH, file)):
        if not sub_file.endswith(".zip"):
            for signal in os.listdir(os.path.join(DATA_PATH, file, sub_file)):
                if signal in desired_signals:
                    df = pd.read_csv(os.path.join(DATA_PATH, file, sub_file, signal), names=names[signal], header=None)
                    if not df.empty:
                        if signal == 'ACC.csv':
                            acc = pd.concat([acc, process_df(df, file)])
                        if signal == 'BVP.csv':
                            bvp = pd.concat([bvp, process_df(df, file)]) 
                        if signal == 'EDA.csv':
                            eda = pd.concat([eda, process_df(df, file)])
                        if signal == 'HR.csv':
                            hr = pd.concat([hr, process_df(df, file)])
                        if signal == 'TEMP.csv':
                            temp = pd.concat([temp, process_df(df, file)])
                        

print('Saving Data ...')
acc.to_csv(os.path.join(SAVE_PATH, 'combined_acc.csv'), index=False)
bvp.to_csv(os.path.join(SAVE_PATH, 'combined_bvp.csv'), index=False)
eda.to_csv(os.path.join(SAVE_PATH, 'combined_eda.csv'), index=False)
hr.to_csv(os.path.join(SAVE_PATH, 'combined_hr.csv'), index=False)
temp.to_csv(os.path.join(SAVE_PATH, 'combined_temp.csv'), index=False)


Processing 15
Processing 5C
Processing 6B
Processing 6D
Processing 7A
Processing 7E
Processing 83
Processing 8B
Processing 94
Processing BG
Processing CE
Processing DF
Processing E4
Processing EG
Processing F5
Saving Data ...


# Merging the data into one csv file

After that we have 5 combined csv files together but there is a need to combined these 5 files into one . so this code is combining all these 5 files into the one csv file named as **mergedcsv.csv** 

In [3]:
folder_path = "C:/Users/LENOVO/Downloads/a2"

merged_df = pd.DataFrame()

# this lopp will merge all the csv files 
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        filepath = os.path.join(folder_path, filename)
        df = pd.read_csv(filepath)
        merged_df = pd.concat([merged_df, df], axis=1)

# mergeed_df has all the columns of the dataset
merged_df.to_csv('mergedcsv.csv', index=False)


After that this code is reading that csv file which i have saved previously 

In [253]:
df = pd.read_csv("C:/Users/LENOVO/mergedcsv.csv", low_memory=False)

Here is the dataset containing **514674 rows x 17 colums** with the indexing of the id and date and time because it was repeating in the dataset that is why indexing is done here 

In [254]:
df

Unnamed: 0,userid,X,Y,Z,accdatetime,userid.1,BVP,bvpdatetime,userid.2,EDA,edadatetime,userid.3,HR,hrdatetime,userid.4,TEMP,tempdatetime
0,15,-13,-3,61,1594231189,15,0.00,1594231189,15,0.000000,1594231189,15,55.00,1.594231e+09,15,30.59,1594231189
1,15,-13,-3,60,1594231189,15,0.00,1594231189,15,0.042276,1594231189,15,55.00,1.594231e+09,15,30.59,1594231189
2,15,-12,-2,60,1594231189,15,0.00,1594231189,15,0.064055,1594231190,15,57.00,1.594231e+09,15,30.59,1594231190
3,15,-13,-3,60,1594231189,15,0.00,1594231189,15,0.064055,1594231190,15,67.25,1.594231e+09,15,30.59,1594231190
4,15,-13,-3,61,1594231189,15,0.00,1594231189,15,0.065336,1594231190,15,65.40,1.594231e+09,15,30.59,1594231190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514670,F5,-71,-5,0,1594301555,F5,-15.01,1594239231,F5,0.233441,1596824115,F5,,,F5,31.81,1596824121
514671,F5,-69,-8,3,1594301555,F5,-21.60,1594239231,F5,0.236003,1596824115,F5,,,F5,31.81,1596824121
514672,F5,-59,-4,9,1594301555,F5,-27.14,1594239231,F5,0.237285,1596824115,F5,,,F5,31.79,1596824121
514673,F5,-56,0,9,1594301555,F5,-30.64,1594239231,F5,0.232160,1596824115,F5,,,F5,31.79,1596824121


here is the null count of the dataset that how much NaN values are there in the dataset previously in the assignment 1 we are already done with the data exploration and the data cleaning so the data is already explored and clean . but on the other hand we have some NaN values in the **HR** , **hrdatetime** .

In [255]:
## in this function we are obtaining the the number of null columns in the dataset
null_counts = df.isnull().sum()
print(null_counts)

userid              0
X                   0
Y                   0
Z                   0
accdatetime         0
userid.1            0
BVP                 0
bvpdatetime         0
userid.2            0
EDA                 0
edadatetime         0
userid.3            0
HR              35155
hrdatetime      35155
userid.4            0
TEMP                0
tempdatetime        0
dtype: int64


In this code iam getting all the relevant information regarding the dataset like the **datatypes , Non-Null Count , Dtype** 

In [256]:
#info function provides the information of the data frame that how many null columns and the data type of the variable 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 514675 entries, 0 to 514674
Data columns (total 17 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   userid        514675 non-null  object 
 1   X             514675 non-null  int64  
 2   Y             514675 non-null  int64  
 3   Z             514675 non-null  int64  
 4   accdatetime   514675 non-null  int64  
 5   userid.1      514675 non-null  object 
 6   BVP           514675 non-null  float64
 7   bvpdatetime   514675 non-null  int64  
 8   userid.2      514675 non-null  object 
 9   EDA           514675 non-null  float64
 10  edadatetime   514675 non-null  int64  
 11  userid.3      514675 non-null  object 
 12  HR            479520 non-null  float64
 13  hrdatetime    479520 non-null  float64
 14  userid.4      514675 non-null  object 
 15  TEMP          514675 non-null  float64
 16  tempdatetime  514675 non-null  int64  
dtypes: float64(5), int64(7), object(5)
memory usage:

In [257]:
##datatype of all the column
df.dtypes

userid           object
X                 int64
Y                 int64
Z                 int64
accdatetime       int64
userid.1         object
BVP             float64
bvpdatetime       int64
userid.2         object
EDA             float64
edadatetime       int64
userid.3         object
HR              float64
hrdatetime      float64
userid.4         object
TEMP            float64
tempdatetime      int64
dtype: object

# Compressing the dataset by using median 

As there is alot of data which is very difficult to read and to work on that so that is why i am compressing the csv file where i have all the columns just because there is too much data and its really difficult to work with the large data so i am compressing the file with a technique. Like iam taking median of every column inshort that i have the data of 15 nurses so this technique will provide the median data of 15 nurses and their psychological signals so it will be easy to create and implement a model. Here i in the compression i didnt used the datatime just because when we are implementing the model datetime is not required that is whty iam not using data and time 

In the below code iam compressing the data for the **acc**

In [258]:
import pandas as pd

# compute median values of columns 'X', 'Y', and 'Z' for each unique 'userid'
df_median = df.groupby('userid')['X', 'Y', 'Z'].median().reset_index()

# set column names
df_median.columns = ['userid', 'median_X', 'median_Y', 'median_Z']



After compressing the acc with respect to id it look like this 

In [262]:
df_median = df_median.rename(columns={'userid': 'id'})
df_median

Unnamed: 0,id,median_X,median_Y,median_Z
0,15,-21.0,-3.0,58.0
1,5C,-32.0,-4.0,54.0
2,6B,-37.0,-1.0,51.0
3,6D,-37.0,-2.0,50.0
4,7A,-40.0,-4.0,34.0
5,7E,-48.0,9.0,32.0
6,83,-37.0,0.0,44.0
7,8B,-43.0,-4.0,44.0
8,94,-54.0,8.0,22.0
9,BG,-48.0,9.0,12.0


Now iam compression for **bvp** signal

In [263]:
df_median1 = df.groupby('userid.1')['BVP'].median().reset_index()

# set column names
df_median1.columns = ['userid.1', 'median_BVP']



In [264]:
df_median1 = df_median1.rename(columns={'userid.1': 'id'})

After compressing bvp look like this 

In [266]:
df_median1

Unnamed: 0,id,median_BVP
0,15,2.44
1,5C,4.02
2,6B,1.26
3,6D,1.58
4,7A,3.03
5,7E,0.9
6,83,1.2
7,8B,0.91
8,94,0.47
9,BG,1.15


Now iam compressing for **eda**

In [233]:
df_median2 = df.groupby('userid.2')['EDA'].median().reset_index()

# set column names
df_median2.columns = ['userid.2', 'median_EDA']



In [234]:
df_median2 = df_median2.rename(columns={'userid.2': 'id'})

After compression of eda it look like this 

In [235]:
df_median2

Unnamed: 0,id,median_EDA
0,15,0.075585
1,5C,0.085834
2,6B,0.078147
3,6D,0.057653
4,7A,0.040998
5,7E,0.115306
6,83,0.086106
7,8B,0.125822
8,94,0.163991
9,BG,0.140929


Now iam compressing for **HR**

In [236]:
df_median3 = df.groupby('userid.3')['HR'].median().reset_index()

# set column names
df_median3.columns = ['userid.3', 'median_HR']


In [237]:
df_median3 = df_median3.rename(columns={'userid.3': 'id'})

After compression of hr it look like this

In [268]:
df_median3

Unnamed: 0,id,median_HR
0,15,75.83
1,5C,84.57
2,6B,66.93
3,6D,75.73
4,7A,78.15
5,7E,84.97
6,83,90.62
7,8B,85.83
8,94,84.52
9,BG,82.25


Now iam compressing for **temp**

In [270]:
df_median4 = df.groupby('userid.4')['TEMP'].median().reset_index()

# set column names
df_median4.columns = ['userid.4', 'median_TEMP']



In [271]:
df_median4 = df_median4.rename(columns={'userid.4': 'id'})

After compression of temp it look like this

In [273]:
df_median4

Unnamed: 0,id,median_TEMP
0,15,31.31
1,5C,31.33
2,6B,30.21
3,6D,29.57
4,7A,27.21
5,7E,32.16
6,83,31.35
7,8B,30.95
8,94,30.43
9,BG,29.71


### Merging the multiple data frame into single one 

After that iam combining all the multiple dataframe into the single one and to work on them collectively and to implement a model into it

In [278]:

merged_df = pd.merge(df_median, df_median1, on='id', how='outer')
merged_df1 = pd.merge(merged_df, df_median2, on='id', how='outer')
merged_df2 = pd.merge(merged_df1, df_median3, on='id', how='outer')
merged_df3 = pd.merge(merged_df2, df_median4, on='id', how='inner')





merged_df = pd.merge(df_median, df_median1, on='id', how='outer')
# print the merged DataFrame
print(merged_df3)


    id  median_X  median_Y  median_Z  median_BVP  median_EDA  median_HR  \
0   15     -21.0      -3.0      58.0        2.44    0.075585      75.83   
1   5C     -32.0      -4.0      54.0        4.02    0.085834      84.57   
2   6B     -37.0      -1.0      51.0        1.26    0.078147      66.93   
3   6D     -37.0      -2.0      50.0        1.58    0.057653      75.73   
4   7A     -40.0      -4.0      34.0        3.03    0.040998      78.15   
5   7E     -48.0       9.0      32.0        0.90    0.115306      84.97   
6   83     -37.0       0.0      44.0        1.20    0.086106      90.62   
7   8B     -43.0      -4.0      44.0        0.91    0.125822      85.83   
8   94     -54.0       8.0      22.0        0.47    0.163991      84.52   
9   BG     -48.0       9.0      12.0        1.15    0.140929      82.25   
10  CE     -58.0      21.0      13.0        1.76    0.447878      94.73   
11  DF     -47.0     -33.0       9.0        0.95    0.131953      92.51   
12  E4     -15.0     -62.

Here we are filling the NaN values by using interpolate method is linear to fillout the NaN vaalues in the dataframe 

In [279]:
merged_df3 = merged_df3.interpolate(method='linear')


Here is the final look of the dataframe where the every signal is present with each nurses it was the median of the data we compressed the data jjust becuase there are many risk associated while working with the large dataset that is why we have compressed the dataset like this now it will be easy and efficient to implement a model on this dataframe 










In [280]:
merged_df3

Unnamed: 0,id,median_X,median_Y,median_Z,median_BVP,median_EDA,median_HR,median_TEMP
0,15,-21.0,-3.0,58.0,2.44,0.075585,75.83,31.31
1,5C,-32.0,-4.0,54.0,4.02,0.085834,84.57,31.33
2,6B,-37.0,-1.0,51.0,1.26,0.078147,66.93,30.21
3,6D,-37.0,-2.0,50.0,1.58,0.057653,75.73,29.57
4,7A,-40.0,-4.0,34.0,3.03,0.040998,78.15,27.21
5,7E,-48.0,9.0,32.0,0.9,0.115306,84.97,32.16
6,83,-37.0,0.0,44.0,1.2,0.086106,90.62,31.35
7,8B,-43.0,-4.0,44.0,0.91,0.125822,85.83,30.95
8,94,-54.0,8.0,22.0,0.47,0.163991,84.52,30.43
9,BG,-48.0,9.0,12.0,1.15,0.140929,82.25,29.71


Now iam working with the **Surevy Results.xlsx** Where we have the **Stress level** so we want the stress level from the file with respect to ID

In [282]:
## now we will load the survey result file
df = pd.read_excel("C:/Users/LENOVO/assignment2/SurveyResults.xlsx")

In [283]:
df

Unnamed: 0,id,Start time,End time,duration,date,Stress level,COVID related,Treating a covid patient,Patient in Crisis,Patient or patient's family,Doctors or colleagues,"Administration, lab, pharmacy, radiology, or other ancilliary services\n",Increased Workload,Technology related stress,Lack of supplies,Documentation,Competency related stress,Saftey (physical or physiological threats),Work Environment - Physical or others: work processes or procedures,Description
0,5C,08:00:00,09:00:00,01:00:00,2020-04-15,1,0,1,0,1,0,0,0,0,0,0,0,0,0,na
1,5C,17:31:00,17:58:00,00:27:00,2020-04-14,1,0,1,0,1,0,0,1,0,0,0,0,0,0,na
2,E4,15:32:00,15:37:00,00:05:00,2020-04-18,2,0,1,0,1,0,0,0,0,0,0,0,0,0,Spoke with family regarding patient's decline ...
3,E4,14:05:00,14:11:00,00:06:00,2020-04-18,2,0,0,0,1,0,0,0,0,0,0,0,0,0,Was placing another FaceTime call to a patient...
4,7A,13:52:00,14:03:00,00:11:00,2020-04-18,2,0,1,0,0,0,0,1,0,0,0,0,0,1,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,83,23:05:00,23:50:00,00:45:00,2020-12-12,2,0,0,0,0,0,0,0,0,0,0,0,0,0,na
354,83,00:12:00,02:01:00,01:49:00,2020-12-13,2,0,1,0,0,0,0,0,0,0,0,0,0,0,na
355,83,20:34:00,20:48:00,00:14:00,2020-12-11,2,0,1,0,0,0,0,0,0,0,0,0,0,0,na
356,83,20:54:00,21:13:00,00:19:00,2020-12-11,2,0,1,0,0,0,0,0,0,0,0,0,0,0,na


as i did previously now we will do the same thing with this file we just want the stress level of the nurses we are only selecting that column with the code below. furthermore we are labelling the stress level like 0.0 as Low 1.0 is moderate and 2.0 as High

In [287]:
import pandas as pd
import numpy as np

# Load the XLSX file into a pandas dataframe
df = pd.read_excel("C:/Users/LENOVO/assignment2/SurveyResults.xlsx")

# Replace non-numeric values in the 'Stress level' column with NaN
df['Stress level'] = pd.to_numeric(df['Stress level'], errors='coerce')

# Compute the mode stress level for each 'id'
df_mode = df.groupby('id')['Stress level'].apply(lambda x: x.mode()[0]).reset_index()

# Map numerical mode values to categorical values
reverse_mapping = {0.0: "Low", 1.0: "Moderate", 2.0: "High"}
df_mode["Stress level"] = df_mode["Stress level"].map(reverse_mapping)

# Set column names
df_mode.columns = ['id', 'Stress level']

print(df_mode)


    id Stress level
0   15         High
1   83         High
2   94          Low
3   5C     Moderate
4   6B         High
5   6D         High
6   7A         High
7   7E          Low
8   8B         High
9   BG         High
10  CE         High
11  DF         High
12  E4         High
13  EG         High
14  F5         High


This is the final look that what we are acheiving from the survey results file 

In [288]:
df_mode

Unnamed: 0,id,Stress level
0,15,High
1,83,High
2,94,Low
3,5C,Moderate
4,6B,High
5,6D,High
6,7A,High
7,7E,Low
8,8B,High
9,BG,High


Now here iam megeing the previous file and the stress level file together to implement a model into the single file and to implement into the simpler data 


In [295]:
df = pd.merge(merged_df3, df_mode, on='id', how='inner')

This is the final look of the dtatfram after combining both the survey result file and the psycologica signal file now we have only dataframe inwhich we have all the important attributes with respect to the nurses id id is the unique identifier

In [297]:
df

Unnamed: 0,id,median_X,median_Y,median_Z,median_BVP,median_EDA,median_HR,median_TEMP,Stress level
0,5C,-32.0,-4.0,54.0,4.02,0.085834,84.57,31.33,Moderate
1,6B,-37.0,-1.0,51.0,1.26,0.078147,66.93,30.21,High
2,6D,-37.0,-2.0,50.0,1.58,0.057653,75.73,29.57,High
3,7A,-40.0,-4.0,34.0,3.03,0.040998,78.15,27.21,High
4,7E,-48.0,9.0,32.0,0.9,0.115306,84.97,32.16,Low
5,8B,-43.0,-4.0,44.0,0.91,0.125822,85.83,30.95,High
6,BG,-48.0,9.0,12.0,1.15,0.140929,82.25,29.71,High
7,CE,-58.0,21.0,13.0,1.76,0.447878,94.73,33.0,High
8,DF,-47.0,-33.0,9.0,0.95,0.131953,92.51,30.59,High
9,E4,-15.0,-62.0,-4.0,-0.44,0.151179,92.51,33.65,High


just analyzing the attributes of the dataframe

In [299]:
## in this function we are obtaining the the number of null columns in the dataset
null_counts = df.isnull().sum()
print(null_counts)

id              0
median_X        0
median_Y        0
median_Z        0
median_BVP      0
median_EDA      0
median_HR       0
median_TEMP     0
Stress level    0
dtype: int64


In [300]:
#info function rovides the information of the data frame that how many null columns and the data type of the variable 
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12 entries, 0 to 11
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            12 non-null     object 
 1   median_X      12 non-null     float64
 2   median_Y      12 non-null     float64
 3   median_Z      12 non-null     float64
 4   median_BVP    12 non-null     float64
 5   median_EDA    12 non-null     float64
 6   median_HR     12 non-null     float64
 7   median_TEMP   12 non-null     float64
 8   Stress level  12 non-null     object 
dtypes: float64(7), object(2)
memory usage: 960.0+ bytes


# Feature Selection

Now in the code below there is the corelation matrix for the combined file and after analyzing the co relation matrix we will select the best feature for the model implementation 

In [302]:
# Load the data into a pandas DataFrame
df = df

# Compute the correlation matrix
corr_matrix = df.corr()

# Display the correlation matrix
print(corr_matrix)

             median_X  median_Y  median_Z  median_BVP  median_EDA  median_HR  \
median_X     1.000000 -0.760826 -0.008324   -0.113347   -0.482060  -0.143088   
median_Y    -0.760826  1.000000  0.473003    0.387464    0.259640  -0.235403   
median_Z    -0.008324  0.473003  1.000000    0.435614   -0.477681  -0.537820   
median_BVP  -0.113347  0.387464  0.435614    1.000000   -0.126359  -0.362988   
median_EDA  -0.482060  0.259640 -0.477681   -0.126359    1.000000   0.559678   
median_HR   -0.143088 -0.235403 -0.537820   -0.362988    0.559678   1.000000   
median_TEMP  0.113478 -0.226512 -0.335760   -0.516089    0.552408   0.666776   

             median_TEMP  
median_X        0.113478  
median_Y       -0.226512  
median_Z       -0.335760  
median_BVP     -0.516089  
median_EDA      0.552408  
median_HR       0.666776  
median_TEMP     1.000000  


**[median_EDA]** , **[median_HR]** , **[median_TEMP]** have the good decent co relation so we are selecting these three as a feature . furthermore in the research paper the researchers all used these three same feature for the model prediction 

# Creating A Model

In [304]:
# Load data
df = df


Here in the code where we are setting the selected features in the variable X and the target variable in the variable Y

In [311]:
# Selected the features 
X = df[['median_EDA', 'median_HR', 'median_TEMP']]
y = df['Stress level']


# Train And Test Spliting

Now we are splitting the data in the training and the testing phase for the model creation 

In [312]:
# Oversample the minority class
oversample = RandomOverSampler(random_state=42)
X, y = oversample.fit_resample(X, y)

In [313]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [315]:
# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

iam implementing two models the first one is the Logistic regression and the other one is K-nearest and i will select the model which will giving me the best accuracy score 

In [322]:
# Fit models
classifiers = {

    'Logistic Regression': LogisticRegression(),    
    'K-Nearest Neighbors': KNeighborsClassifier()
}

# Results And Prediction

In the code below the loop will perform the cross validation and after that predicting the score for stress and also providing the accuracy for both the model . And the Score of **Cross Validation** is also mentioned here

In [325]:
for name, clf in classifiers.items():
    print(f"{'='*30}\n{name}\n{'-'*30}")
    # Performing cross-validation
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    print(f"Cross-validation Scores: {scores}\nMean score: {scores.mean()}\n")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
        # Print accuracy and confusion matrix
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)
    print(f"Accuracy: {acc}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Classification Report:\n{cr}\n")

Logistic Regression
------------------------------
Cross-validation Scores: [1.  0.8 0.6 0.6 1. ]
Mean score: 0.8

Accuracy: 0.8333333333333334
Confusion Matrix:
[[1 1 0]
 [0 2 0]
 [0 0 2]]
Classification Report:
              precision    recall  f1-score   support

        High       1.00      0.50      0.67         2
         Low       0.67      1.00      0.80         2
    Moderate       1.00      1.00      1.00         2

    accuracy                           0.83         6
   macro avg       0.89      0.83      0.82         6
weighted avg       0.89      0.83      0.82         6


K-Nearest Neighbors
------------------------------
Cross-validation Scores: [0.8  0.8  0.8  0.6  0.75]
Mean score: 0.7500000000000001

Accuracy: 0.6666666666666666
Confusion Matrix:
[[0 1 1]
 [0 2 0]
 [0 0 2]]
Classification Report:
              precision    recall  f1-score   support

        High       0.00      0.00      0.00         2
         Low       0.67      1.00      0.80         2
    Moder

# RESULTS

After implementing both the models the results i got is given below 

1) Logistic Regression  
      Accuracy : 0.833
      Mean SCORE : 0.8
        
2) K-Nearest Neighbors
      Accuracy : 0.66
      MEAN SCORE : 0.75

# MODEL SELECTION 

I selected the logistic regression just because logistic regression is providing the better result then k-nearest neighbours and the mean score is also better then the other one overall the statstically logistic regression is providing the better output. then the k nearest . so my final choice of the model is logistic regresiion which is completing the company requirements to predict the stress with respect to the psycological signals