In [10]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

In [11]:
df = pd.read_csv("CVD_cleaned.csv")

In [12]:
df.shape

(308854, 19)

In [13]:
df.dtypes

General_Health                   object
Checkup                          object
Exercise                         object
Heart_Disease                    object
Skin_Cancer                      object
Other_Cancer                     object
Depression                       object
Diabetes                         object
Arthritis                        object
Sex                              object
Age_Category                     object
Height_(cm)                     float64
Weight_(kg)                     float64
BMI                             float64
Smoking_History                  object
Alcohol_Consumption             float64
Fruit_Consumption               float64
Green_Vegetables_Consumption    float64
FriedPotato_Consumption         float64
dtype: object

In [14]:
df.isnull().sum()

General_Health                  0
Checkup                         0
Exercise                        0
Heart_Disease                   0
Skin_Cancer                     0
Other_Cancer                    0
Depression                      0
Diabetes                        0
Arthritis                       0
Sex                             0
Age_Category                    0
Height_(cm)                     0
Weight_(kg)                     0
BMI                             0
Smoking_History                 0
Alcohol_Consumption             0
Fruit_Consumption               0
Green_Vegetables_Consumption    0
FriedPotato_Consumption         0
dtype: int64

In [16]:
X = df.drop(["Height_(cm)", "Weight_(kg)", "Heart_Disease"], axis=1)
y = df["Heart_Disease"]

In [17]:
numerical_coulmns = ["BMI", "Alcohol_Consumption", "Fruit_Consumption",
                     "Green_Vegetables_Consumption", "FriedPotato_Consumption"]
categorical_columns = X.columns.difference(numerical_coulmns)

In [25]:
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', 'passthrough', numerical_coulmns),
    ('cat', categorical_transformer, categorical_columns)
])

pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=1000))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9)

In [26]:
pred = pipe.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=pred)

In [27]:
print(f"Accuracy: {accuracy:.6f}")

Accuracy: 0.917989


In [None]:
import pickle

with open('model.pkl', 'wb') as files:
    pickle.dump(pipe, files)