In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np


# Load the dataset
data = pd.read_csv("heart_disease.csv")
# Only the final expression of a cell is rendered, so a bare `data.head()`
# here would be evaluated and silently discarded; print the preview instead.
print("Raw Data:\n", data.head())


# Count the null entries in each column and report the totals
missing_values = data.isna().sum()
print("Missing values:\n", missing_values)


# Fill missing values with the median value of the respective columns.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained form works on an intermediate object, raises
# a FutureWarning on recent pandas, and leaves `data` unchanged once
# Copy-on-Write is enabled.
column_medians = data[['ca', 'thal']].median()
# fillna with a Series fills each column named in its index and no others.
data = data.fillna(column_medians)


# One-hot encode the categorical features; drop_first=True omits the first
# level of every encoded column
categorical_columns = ['cp', 'restecg', 'slope', 'ca', 'thal']
data = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
)


# Preview the leading rows of the encoded frame
print("Transformed Data:\n", data.head(n=5))


# Separate the predictor columns from the label column
X = data.drop('heart_diagnosis', axis=1)
# Binarise the label: any value above 0 becomes 1, everything else becomes 0
y = data['heart_diagnosis'].gt(0).astype('int64')


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit logistic regression on the training portion (fit() returns the estimator)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Produce class predictions for the held-out rows
y_pred = model.predict(X_test)

# Score the held-out predictions against the true labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
class_report = classification_report(y_true=y_test, y_pred=y_pred)

# Report every metric in turn
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

# Get feature importance.
# A raw coefficient is expressed per unit of its feature, so its magnitude is
# not comparable between columns on very different scales (e.g. chol or
# trestbps versus the True/False dummy columns). Multiplying each coefficient
# by the feature's standard deviation on the training set expresses every
# effect "per one standard deviation", which makes the ranking comparable.
coefficients = model.coef_[0]
# Cast to float first so the boolean dummy columns are handled like the rest.
feature_std = X_train.astype(float).std(axis=0).to_numpy()
feature_importance = np.abs(coefficients) * feature_std
feature_names = X.columns
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance,
    # Signed raw coefficient, kept so the direction of each effect is visible.
    'Coefficient': coefficients,
}).sort_values(by='Importance', ascending=False)

print("Feature Importance:\n", importance_df)



Missing values:
 age                0
gender             0
cp                 0
trestbps           0
chol               0
fbs                0
restecg            0
thalach            0
exang              0
oldpeak            0
slope              0
ca                 4
thal               2
heart_diagnosis    0
dtype: int64
Transformed Data:
     age  gender  trestbps   chol  fbs  thalach  exang  oldpeak  \
0  63.0     1.0     145.0  233.0  1.0    150.0    0.0      2.3   
1  67.0     1.0     160.0  286.0  0.0    108.0    1.0      1.5   
2  67.0     1.0     120.0  229.0  0.0    129.0    1.0      2.6   
3  37.0     1.0     130.0  250.0  0.0    187.0    0.0      3.5   
4  41.0     0.0     130.0  204.0  0.0    172.0    0.0      1.4   

   heart_diagnosis  cp_2.0  ...  cp_4.0  restecg_1.0  restecg_2.0  slope_2.0  \
0                0   False  ...   False        False         True      False   
1                2   False  ...    True        False         True       True   
2                1  