In [42]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

In [6]:
# Read the raw dataset from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="novagen_dataset.csv")

# Print column names, non-null counts and dtypes as a quick schema check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9549 entries, 0 to 9548
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    9549 non-null   float64
 1   BMI                    9549 non-null   float64
 2   Blood_Pressure         9549 non-null   float64
 3   Cholesterol            9549 non-null   float64
 4   Glucose_Level          9549 non-null   float64
 5   Heart_Rate             9549 non-null   float64
 6   Sleep_Hours            9549 non-null   float64
 7   Exercise_Hours         9549 non-null   float64
 8   Water_Intake           9549 non-null   float64
 9   Stress_Level           9549 non-null   float64
 10  Target                 9549 non-null   int64  
 11  Smoking                9549 non-null   int64  
 12  Alcohol                9549 non-null   int64  
 13  Diet                   9549 non-null   int64  
 14  MentalHealth           9549 non-null   int64  
 15  Phys

In [26]:
# Separate the label column from the predictor columns.
y = df["Target"]
X = df.drop(columns="Target")

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both partitions; the seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [27]:
# Learn the scaling statistics from the training partition only, so nothing
# about the held-out rows leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both partitions.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [28]:
# Baseline linear classifier trained on the standardized features.
# - L2 regularization is the library default, so it is not passed explicitly.
# - random_state is pinned to the same seed used by the split and the tree
#   models in this notebook: scikit-learn documents it as used by the
#   liblinear solver, so fixing it keeps the fit repeatable.
log_reg = LogisticRegression(
    solver="liblinear",
    max_iter=1000,
    random_state=42,
)

# Kept as the last expression so the cell still displays the fitted estimator.
log_reg.fit(X_train_scaled, y_train)

In [29]:
# Class predictions of the logistic-regression model on the held-out rows.
y_pred = log_reg.predict(X=X_test_scaled)

In [69]:
# Report held-out performance of the current `y_pred`.
# Accuracy and recall are both printed as fractions here (not percentages).
print("acc score :", accuracy_score(y_true=y_test, y_pred=y_pred))
print("Recall:", recall_score(y_true=y_test, y_pred=y_pred))
print(
    "classification Report : \n",
    classification_report(y_true=y_test, y_pred=y_pred),
)


acc score : 0.9492146596858638
Recall: 0.9618473895582329
classification Report : 
               precision    recall  f1-score   support

           0       0.96      0.94      0.95       914
           1       0.94      0.96      0.95       996

    accuracy                           0.95      1910
   macro avg       0.95      0.95      0.95      1910
weighted avg       0.95      0.95      0.95      1910



# KNN model

In [70]:
# k-nearest-neighbours classifier: 5 neighbours, Euclidean distance.
# It is trained on the standardized features so every column contributes on
# the same scale to the distance computation.
kn = KNeighborsClassifier(metric="euclidean", n_neighbors=5)

# Kept as the last expression so the cell still displays the fitted estimator.
kn.fit(X=X_train_scaled, y=y_train)

In [71]:
# Class predictions of the KNN model on the held-out rows.
y_pred = kn.predict(X=X_test_scaled)

# Note the units: accuracy is printed as a percentage, recall as a fraction.
print("acc score :", accuracy_score(y_true=y_test, y_pred=y_pred) * 100)
print("Recall:", recall_score(y_true=y_test, y_pred=y_pred))
print(
    "classification Report : \n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

acc score : 88.32460732984293
Recall: 0.8835341365461847
classification Report : 
               precision    recall  f1-score   support

           0       0.87      0.88      0.88       914
           1       0.89      0.88      0.89       996

    accuracy                           0.88      1910
   macro avg       0.88      0.88      0.88      1910
weighted avg       0.88      0.88      0.88      1910



# Decision Tree model

In [72]:
# Single decision tree limited to depth 15, seeded so the fit is repeatable.
# `fit` returns the estimator itself, so construction and training are chained.
dt = DecisionTreeClassifier(random_state=42, max_depth=15).fit(
    X_train_scaled, y_train
)

# Class predictions of the tree on the held-out rows.
y_pred = dt.predict(X=X_test_scaled)

# Note the units: accuracy is printed as a percentage, recall as a fraction.
print("acc score :", accuracy_score(y_true=y_test, y_pred=y_pred) * 100)
print("Recall:", recall_score(y_true=y_test, y_pred=y_pred))
print(
    "classification Report : \n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

acc score : 89.31937172774869
Recall: 0.9096385542168675
classification Report : 
               precision    recall  f1-score   support

           0       0.90      0.88      0.89       914
           1       0.89      0.91      0.90       996

    accuracy                           0.89      1910
   macro avg       0.89      0.89      0.89      1910
weighted avg       0.89      0.89      0.89      1910



In [74]:
# Random forest of 200 trees with no explicit depth limit, seeded so the fit
# is repeatable. `fit` returns the estimator itself, so the calls are chained.
rf = RandomForestClassifier(random_state=42, n_estimators=200).fit(
    X_train_scaled, y_train
)

# Class predictions of the forest on the held-out rows.
y_pred = rf.predict(X=X_test_scaled)

# Accuracy and recall are both printed as percentages in this cell.
print("acc score :", accuracy_score(y_true=y_test, y_pred=y_pred) * 100)
print("Recall:", recall_score(y_true=y_test, y_pred=y_pred) * 100)
print(
    "classification Report : \n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

acc score : 93.97905759162303
Recall: 96.18473895582329
classification Report : 
               precision    recall  f1-score   support

           0       0.96      0.92      0.94       914
           1       0.93      0.96      0.94       996

    accuracy                           0.94      1910
   macro avg       0.94      0.94      0.94      1910
weighted avg       0.94      0.94      0.94      1910



In [75]:
# Gradient-boosted ensemble: 300 boosting stages of depth-4 trees with a
# learning rate of 0.1, seeded so the fit is repeatable.
# `fit` returns the estimator itself, so construction and training are chained.
gdc = GradientBoostingClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.1,
    random_state=42,
).fit(X_train_scaled, y_train)

# Class predictions of the boosted model on the held-out rows.
y_pred = gdc.predict(X=X_test_scaled)

# Accuracy and recall are both printed as percentages in this cell.
print("acc score :", accuracy_score(y_true=y_test, y_pred=y_pred) * 100)
print("Recall:", recall_score(y_true=y_test, y_pred=y_pred) * 100)
print(
    "classification Report : \n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

acc score : 94.92146596858639
Recall: 96.18473895582329
classification Report : 
               precision    recall  f1-score   support

           0       0.96      0.94      0.95       914
           1       0.94      0.96      0.95       996

    accuracy                           0.95      1910
   macro avg       0.95      0.95      0.95      1910
weighted avg       0.95      0.95      0.95      1910



In [80]:
# Soft-voting ensemble: each base model outputs class probabilities and the
# ensemble predicts the class with the highest combined probability.
# The base models are fresh, unfitted instances; VotingClassifier trains its
# own copies when `fit` is called.
vtc = VotingClassifier(
    estimators=[
        # random_state is pinned (same seed as the forest below) because
        # scikit-learn documents it as used by the liblinear solver; without
        # it this member was the only unseeded one in the ensemble.
        (
            "lr",
            LogisticRegression(max_iter=1000, solver="liblinear", random_state=42),
        ),
        ("knn", KNeighborsClassifier(n_neighbors=5)),
        ("rf", RandomForestClassifier(n_estimators=200, random_state=42)),
    ],
    voting="soft",
)

# Kept as the last expression so the cell still displays the fitted estimator.
vtc.fit(X_train_scaled, y_train)

In [81]:
# Class predictions of the voting ensemble on the held-out rows.
y_pred = vtc.predict(X=X_test_scaled)

# Accuracy and recall are both printed as percentages in this cell.
print("acc score :", accuracy_score(y_true=y_test, y_pred=y_pred) * 100)
print("Recall:", recall_score(y_true=y_test, y_pred=y_pred) * 100)
print(
    "classification Report : \n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

acc score : 91.57068062827224
Recall: 92.9718875502008
classification Report : 
               precision    recall  f1-score   support

           0       0.92      0.90      0.91       914
           1       0.91      0.93      0.92       996

    accuracy                           0.92      1910
   macro avg       0.92      0.92      0.92      1910
weighted avg       0.92      0.92      0.92      1910

