In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn import preprocessing
import pickle
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Load and preprocess the data
data = pd.read_csv("heart_attack_prediction_dataset.csv")
data = data.drop(columns=['Patient ID', 'Blood Pressure', 'Exercise Hours Per Week', 'Sedentary Hours Per Day',  
                          'Triglycerides', 'Physical Activity Days Per Week', 'Sleep Hours Per Day', 'Cholesterol', 
                          'Stress Level', 'Continent', 'Hemisphere', 'Country'])


In [29]:

# Use LabelEncoder for categorical features
label_encoder = preprocessing.LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])
data['Diet'] = label_encoder.fit_transform(data['Diet'])
# data['Continent'] = label_encoder.fit_transform(data['Continent'])
# data['Hemisphere'] = label_encoder.fit_transform(data['Hemisphere'])
# data['Country'] = label_encoder.fit_transform(data['Country'])
print(data.columns.tolist())
print(data)

['Age', 'Sex', 'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 'Obesity', 'Alcohol Consumption', 'Diet', 'Previous Heart Problems', 'Medication Use', 'Income', 'BMI', 'Heart Attack Risk']
      Age  Sex  Heart Rate  Diabetes  Family History  Smoking  Obesity  \
0      67    1          72         0               0        1        0   
1      21    1          98         1               1        1        1   
2      21    0          72         1               0        0        0   
3      84    1          73         1               1        1        0   
4      66    1          93         1               1        1        1   
...   ...  ...         ...       ...             ...      ...      ...   
8758   60    1          61         1               1        1        0   
8759   28    0          73         1               0        0        1   
8760   47    1         105         0               1        1        1   
8761   36    1          60         1               0        1    

In [30]:

# Define the features and target variable
X = data.drop('Heart Attack Risk', axis=1)
y = data['Heart Attack Risk']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the hyperparameter distribution
param_dist = {
    'C': uniform(loc=0.001, scale=100),  # Continuous distribution for C
    'penalty': ['l1', 'l2'],  
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],  
}

# Create the RandomizedSearchCV object
model = LogisticRegression(random_state=42)
random_search = RandomizedSearchCV(model, param_dist, n_iter=50, cv=5, scoring='accuracy', return_train_score=True)

# Perform randomized search hyperparameter tuning
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the validation accuracy for the best model
validation_accuracy = random_search.best_score_
print("Validation Accuracy:", validation_accuracy)

# Use the best model for predictions
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, zero_division=1)


print(f"\nTest Accuracy: {test_accuracy}")
print("Classification Report:")
print(classification_rep)

model_pkl_file = "final.pkl"  

with open(model_pkl_file, 'wb') as file:  
    pickle.dump(best_model, file)

100 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\LENOVO\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\LENOVO\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\LENOVO\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solve

Best Hyperparameters: {'C': 22.022506952313154, 'penalty': 'l1', 'solver': 'saga'}
Validation Accuracy: 0.6411802949141862

Test Accuracy: 0.6432103461392165
Classification Report:
              precision    recall  f1-score   support

           0       0.64      1.00      0.78      1691
           1       1.00      0.00      0.00       938

    accuracy                           0.64      2629
   macro avg       0.82      0.50      0.39      2629
weighted avg       0.77      0.64      0.50      2629



