In [None]:
# Attach Google Drive to the Colab runtime so files under MyDrive can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

Data collection and processing

In [None]:
# Read the heart-disease CSV from the mounted Drive into a DataFrame.
HEART_DATA_CSV = '/content/drive/MyDrive/heart_disease_uci.csv'
heart_data = pd.read_csv(HEART_DATA_CSV)

In [None]:
# Dropping the columns with too many missing values
heart_data = heart_data.drop(columns=['slope', 'ca', 'thal'])

# Also drop the row identifier: it carries no clinical information, but if it
# stays in the frame it is picked up as a numeric feature by the model.
# errors='ignore' keeps this cell working if the CSV has no 'id' column.
heart_data = heart_data.drop(columns=['id'], errors='ignore')

In [None]:
# Discard every row that lacks a value in any of the categorical fields.
required_categoricals = ['sex', 'cp', 'fbs', 'restecg', 'exang']
heart_data = heart_data.dropna(subset=required_categoricals)

In [None]:
# Handle missing values in numeric feature columns (mean imputation).
# The target 'num' is excluded on purpose: a missing label must never be
# replaced by the column mean, which would fabricate a class for that row.
# NOTE(review): the imputer is fitted on the full dataset before the
# train/test split, so test-set statistics leak into the imputed values.
numeric_columns = (
    heart_data.select_dtypes(include=[np.number])
    .columns
    .drop('num', errors='ignore')
)
imputer = SimpleImputer(strategy='mean')
heart_data[numeric_columns] = imputer.fit_transform(heart_data[numeric_columns])


In [None]:
# OneHotEncode categorical columns
categorical_columns = heart_data.select_dtypes(exclude=[np.number]).columns
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_data = pd.DataFrame(
    encoder.fit_transform(heart_data[categorical_columns]),
    columns=encoder.get_feature_names_out(categorical_columns),
    # Reuse heart_data's index. Rows were removed by dropna earlier, so its
    # index has gaps; a default 0..n-1 index here would make a later
    # column-wise concat pair encoded rows with the wrong patients and
    # introduce NaN-filled rows.
    index=heart_data.index,
)

In [None]:
# Combine encoded data with numeric data.
# pd.concat(axis=1) aligns on index labels, not on row position. The encoded
# frame is produced row-for-row from heart_data, so give it heart_data's index
# before concatenating; otherwise differing indexes would shift rows and pad
# the result with NaNs. set_axis raises ValueError if the row counts differ.
numeric_part = heart_data.drop(columns=categorical_columns)
encoded_part = encoded_data.set_axis(numeric_part.index, axis=0)
heart_data = pd.concat([numeric_part, encoded_part], axis=1)

In [None]:
# Handle missing values in the numeric feature columns.
# This works on heart_data rather than X: X is only created further down the
# notebook, so referencing it here fails with NameError on a fresh kernel.
# The target 'num' is left out so labels are never mean-imputed.
feature_columns = heart_data.columns.drop('num', errors='ignore')
numeric_columns = heart_data[feature_columns].select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')  # Impute using the mean
heart_data[numeric_columns] = imputer.fit_transform(heart_data[numeric_columns])

# Ensure no NaN values remain in the features after imputation
if heart_data[feature_columns].isnull().sum().sum() > 0:
    print("Dropping rows with persistent NaN values...")
    heart_data = heart_data.dropna(subset=feature_columns)

# Final check: per-feature count of remaining NaNs (expected to be all zeros)
print(heart_data[feature_columns].isnull().sum())


id                          0
age                         0
trestbps                    0
chol                        0
thalch                      0
oldpeak                     0
sex_Male                    0
dataset_Hungary             0
dataset_Switzerland         0
dataset_VA Long Beach       0
cp_atypical angina          0
cp_non-anginal              0
cp_typical angina           0
fbs_True                    0
restecg_normal              0
restecg_st-t abnormality    0
exang_True                  0
dtype: int64


In [None]:
# Collapse the severity grades in 'num' into a binary label:
# 0 stays 0 (no disease); any value above 0 becomes 1 (heart disease).
def _to_binary_label(severity):
    if severity > 0:
        return 1
    return 0


heart_data['num'] = heart_data['num'].apply(_to_binary_label)


In [None]:
# Separate the label column from the predictors.
Y = heart_data['num']
X = heart_data.drop(columns=['num'])

In [None]:
# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class balance, and fix the seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)


In [None]:
# Mean-impute both splits using statistics learned from the training split only.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)                  # learn column means from training data
X_train = imputer.transform(X_train)  # apply them to the training data
X_test = imputer.transform(X_test)    # reuse the same means for the test data

# Count whatever NaNs survived in each split and report them
train_nan_count = pd.DataFrame(X_train).isnull().sum().sum()
test_nan_count = pd.DataFrame(X_test).isnull().sum().sum()
print("Missing values in X_train after imputation:", train_nan_count)
print("Missing values in X_test after imputation:", test_nan_count)


Missing values in X_train after imputation: 0
Missing values in X_test after imputation: 0


In [None]:
# Model training using Logistic Regression.
# The features are not scaled, and with max_iter=1000 the lbfgs solver stopped
# at the iteration limit before converging, so the fitted coefficients were
# not final. Give the solver a much larger budget; the dataset is small, so
# this stays cheap.
# NOTE(review): standardising the features before fitting would be the more
# robust fix and would also make coefficient magnitudes comparable.
model = LogisticRegression(max_iter=10_000, solver='lbfgs')
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Model evaluation, part 1: score the classifier on the rows it was fitted on.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_prediction,
)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.8390804597701149


In [None]:
# Model evaluation, part 2: score the classifier on the held-out rows.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_prediction,
)
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.8114285714285714


In [None]:
# Persist the fitted classifier to disk, then read it back to confirm the
# saved file round-trips. joblib is imported in the notebook's import cell.
MODEL_PATH = 'logistic_regression_model.joblib'

# Save the model
joblib.dump(model, MODEL_PATH)
print(f"Model saved as {MODEL_PATH}")

# Load the model
# NOTE: joblib files are pickle-based, so loading one can execute arbitrary
# code. Only load model files that you created or otherwise trust.
loaded_model = joblib.load(MODEL_PATH)
print("Model loaded successfully")


Model saved as logistic_regression_model.joblib
Model loaded successfully


In [None]:
# Rank the features by the magnitude of their fitted coefficient.
# pandas and numpy are already imported in the notebook's import cell.

# Feature names: every column of heart_data except the target 'num'. This is
# the same frame (and column order) that X was built from.
feature_names = heart_data.drop(columns=['num']).columns

# Get model coefficients
coefficients = model.coef_[0]  # For binary classification, model.coef_ is a 2D array with one row

# Fail with a clear message if the names and the coefficients do not line up,
# e.g. because heart_data was changed after the model was fitted.
if len(feature_names) != len(coefficients):
    raise ValueError(
        f"Expected {len(coefficients)} feature names to match the model "
        f"coefficients, got {len(feature_names)}"
    )

# Create a DataFrame to display coefficients.
# NOTE: the model was fitted on unscaled features, so the absolute values
# depend on each feature's units and are not directly comparable.
coefficients_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients,
    'Absolute Importance': np.abs(coefficients)
}).sort_values(by='Absolute Importance', ascending=False)

print(coefficients_df)



                     Feature  Coefficient  Absolute Importance
9      dataset_VA Long Beach    -3.248612             3.248612
8        dataset_Switzerland    -2.340766             2.340766
10        cp_atypical angina    -1.898630             1.898630
7            dataset_Hungary    -1.550039             1.550039
6                   sex_Male     1.216222             1.216222
12         cp_typical angina    -1.162861             1.162861
11            cp_non-anginal    -0.815836             0.815836
15  restecg_st-t abnormality    -0.682488             0.682488
5                    oldpeak     0.679972             0.679972
16                exang_True     0.575395             0.575395
14            restecg_normal    -0.543474             0.543474
13                  fbs_True     0.214250             0.214250
4                     thalch    -0.019206             0.019206
1                        age     0.012941             0.012941
2                   trestbps     0.006871             0