In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report


# Sample data
data = {
    'Age': [63, 37, 41, 56, 57, 57, 63, 44, 52, 57],
    'Gender': ['Male', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Male', 'Male'],
    'Cholesterol': [233, 250, 204, 236, 192, 294, 256, 263, 209, 354],
    'Blood Pressure': ['145/90', '130/70', '130/80', '175/105', '150/90', '140/90', '140/80', '120/80', '140/85', '160/95'],
    'Heart Disease': ['Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No', 'No', 'Yes']
}

# Create pandas DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('heart_disease.csv', index=False)

print("Dataset 'heart_disease.csv' created successfully.")

df.dropna(inplace=True)  # Remove rows with missing values (replace with more sophisticated imputation if necessary)
df.drop_duplicates(inplace=True)

# ----> Split Blood Pressure into Systolic and Diastolic <----
df[['Systolic', 'Diastolic']] = df['Blood Pressure'].str.split('/', expand=True).astype(float)
df.drop('Blood Pressure', axis=1, inplace=True)

# Feature Engineering (Normalize numerical features)
# ----> Update numerical_features to include new columns <----
numerical_features = ['Age', 'Cholesterol', 'Systolic', 'Diastolic']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Prepare data for modeling
X = df.drop('Heart Disease', axis=1)  # Features
y = df['Heart Disease']  # Target variable

# One-hot encode categorical features (if any)
X = pd.get_dummies(X, drop_first=True) # Example, adapt to your data

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
model = LogisticRegression()
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Dataset 'heart_disease.csv' created successfully.
Confusion Matrix:
 [[2]]

Classification Report:
               precision    recall  f1-score   support

          No       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



