In [10]:
import pandas as pd

# Load the dataset
df = pd.read_csv("D_Set/CVD_cleaned.csv")

# Check the first few rows of the dataset to understand its structure
print(df.head())

# Check the info of the dataset to see datatypes and non-null values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()

# Basic statistics of the dataset (describe() summarises numeric columns by default)
print(df.describe())


  General_Health                  Checkup Exercise Heart_Disease Skin_Cancer  \
0           Poor  Within the past 2 years       No            No          No   
1      Very Good     Within the past year       No           Yes          No   
2      Very Good     Within the past year      Yes            No          No   
3           Poor     Within the past year      Yes           Yes          No   
4           Good     Within the past year       No            No          No   

  Other_Cancer Depression Diabetes Arthritis     Sex Age_Category  \
0           No         No       No       Yes  Female        70-74   
1           No         No      Yes        No  Female        70-74   
2           No         No      Yes        No  Female        60-64   
3           No         No      Yes        No    Male        75-79   
4           No         No       No        No    Male          80+   

   Height_(cm)  Weight_(kg)    BMI Smoking_History  Alcohol_Consumption  \
0        150.0        32.66  

In [11]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Load the dataset (re-read here so this cell starts from the raw file and can
# be re-run on its own even though it overwrites the target column below)
df = pd.read_csv("D_Set/CVD_cleaned.csv")

# Label encode target variable (Heart Disease).
# LabelEncoder numbers classes in sorted order, so 'No' -> 0 and 'Yes' -> 1
# (assumes the column holds only those two values -- TODO confirm).
label_encoder = LabelEncoder()
df['Heart_Disease'] = label_encoder.fit_transform(df['Heart_Disease'])

# Separate features (X) and target variable (y)
X = df.drop('Heart_Disease', axis=1)
y = df['Heart_Disease']

# List of categorical columns that require one-hot encoding
categorical_cols = ['General_Health', 'Checkup', 'Exercise', 'Skin_Cancer',
                    'Other_Cancer', 'Depression', 'Diabetes', 'Arthritis',
                    'Sex', 'Age_Category', 'Smoking_History']

# Numerical columns to scale: every numeric dtype, not just float64.
# ColumnTransformer drops columns it is not told about (remainder='drop'), so
# filtering on float64 alone would silently discard any integer-typed feature.
numerical_cols = [
    col for col in X.select_dtypes(include='number').columns
    if col not in categorical_cols
]

# Preprocessing template: OneHotEncode categorical variables and scale numerical ones.
# handle_unknown='ignore' encodes a category unseen during fit as all zeros
# instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ])

# Train-Test Split (80% training, 20% testing)
# NOTE(review): the target is heavily imbalanced, so accuracy alone is
# misleading; consider stratify=y here and class_weight='balanced' on the
# classifiers. Left unchanged so results stay comparable with earlier runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline fits its steps in place, so each model gets its own unfitted copy of
# the preprocessor; sharing one instance would let one model's fit overwrite
# the state the other model relies on. `preprocessor` itself stays unfitted.

# Logistic Regression Model
log_reg_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Decision Tree Classifier Model
dtc_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Fit the models
log_reg_model.fit(X_train, y_train)
dtc_model.fit(X_train, y_train)

# Predict on the test set
y_pred_log_reg = log_reg_model.predict(X_test)
y_pred_dtc = dtc_model.predict(X_test)

# Evaluate the models
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_log_reg))

print("\nDecision Tree Classifier Classification Report:")
print(classification_report(y_test, y_pred_dtc))

# Confusion Matrices (rows = true class, columns = predicted class)
print("\nLogistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_log_reg))

print("\nDecision Tree Classifier Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dtc))

# Accuracy Scores
print(f"\nLogistic Regression Accuracy: {accuracy_score(y_test, y_pred_log_reg):.4f}")
print(f"Decision Tree Classifier Accuracy: {accuracy_score(y_test, y_pred_dtc):.4f}")


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96     56774
           1       0.51      0.06      0.11      4997

    accuracy                           0.92     61771
   macro avg       0.71      0.53      0.53     61771
weighted avg       0.89      0.92      0.89     61771


Decision Tree Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.92      0.93     56774
           1       0.21      0.24      0.22      4997

    accuracy                           0.86     61771
   macro avg       0.57      0.58      0.57     61771
weighted avg       0.87      0.86      0.87     61771


Logistic Regression Confusion Matrix:
[[56466   308]
 [ 4682   315]]

Decision Tree Classifier Confusion Matrix:
[[52170  4604]
 [ 3809  1188]]

Logistic Regression Accuracy: 0.9192
Decision Tree Classifier Accuracy: 0.8638
