In [19]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler,  LabelEncoder
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor


df=pd.read_csv('/Users/edham/OneDrive/Documents/GitHub/Assg_ML/heart_disease_uci.csv')
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [20]:
df.isnull().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [21]:
df = df.drop(columns=['id', 'dataset'])
df = df.replace(r'^\s*$', np.nan, regex=True)
print(df.isnull().sum())

age           0
sex           0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


In [22]:
categorical_cols = ['thal', 'ca', 'slope', 'exang', 'restecg','fbs', 'cp', 'sex', 'num']
bool_cols = ['fbs', 'exang']
numerical_cols = [ 'trestbps', 'chol', 'thalch', 'oldpeak']

In [23]:
label_encoder = preprocessing.LabelEncoder()
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])
df[bool_cols] = pd.get_dummies(df[bool_cols], dtype=int)
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].mean())
df.isnull().sum()


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [24]:
X = df.drop(columns = 'num')
scaler = MinMaxScaler()
Xs = scaler.fit_transform(X)

y = df['num']


In [25]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [27]:
# Define models
models = {}
models['KNN'] = KNeighborsRegressor()
models['LR'] = LogisticRegression()
models['RF'] = RandomForestRegressor()
models['GB'] = GradientBoostingRegressor()
models['MLP'] = MLPRegressor()

print("Result")
for n in models:
    results = cross_val_score(models[n], Xs, y, cv=kfold)
    print(f"Mean: {results.mean():.2%}, std: ({results.std():.2%})")


Result
Mean: 33.67%, std: (6.57%)
Mean: 57.28%, std: (3.45%)
Mean: 39.34%, std: (5.33%)
Mean: 40.06%, std: (3.94%)




Mean: 42.51%, std: (3.49%)




In [None]:
best_model = None
best_accuracy = 0.0


In [None]:
for name, model in models:
    pipeline = Pipeline([
        ('model',model)
    ])
    

In [None]:
scores = cross_val_score(pipeline, X_train, y_train, cv=5)

In [None]:
mean_accuracy = scores.mean()

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
y_pred = pipeline.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)

In [None]:
print("Model:", name)
print("Cross-validation Accuracy:", mean_accuracy)
print("Test Accuracy:", accuracy)
print()

In [None]:
if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline

In [None]:
print("Best Model:", best_model)

In [None]:
import pickle
pickle.dump(best_model, open('heart_disease_model.pkl', 'wb'))

In [None]:
# Use spotchecking to quickly evaluate the performance of different algorithms
models = {}
models['lgr'] = LogisticRegression()
models['knn'] = KNeighborsRegressor()
models['rfc'] = RandomForestClassifier()
models['gbc'] = GradientBoostingClassifier()
models['mlp'] = MLPClassifier()

kf = KFold(n_splits=3, shuffle=True, random_state=42)
for n in models:
    scores = cross_val_score(models[n], X_train_RRscaled, y_train, cv=kf, n_jobs=-1)
    print(f"{n}:  {scores.mean():.3%}, {scores.std():.3%}")

In [None]:
#Logistic regression
from sklearn.model_selection import train_test_split as split
X_train_RRscaled, X_test_RRscaled, y_train_RRscaled, y_test_RRscaled = split(X, y, test_size=0.25, random_state=42)
lgr = LogisticRegression().fit(X_train_RRscaled, y_train)
print(f'R2 score: {lnr.score(X_test_RRscaled, y_test):.2f}')

In [None]:
# MLP Classification
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
X_train_RRscaled, X_test_RRscaled, y_train_RRscaled, y_test_RRscaled = split(X, y, test_size=0.25, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(10, 10), random_state=42).fit(X_train, y_train)
print(f'Accuracy: {mlp.score(X_test_RRscaled, y_test):.2%}')

In [None]:
# k-NN Regressor
from sklearn.model_selection import train_test_split as split
X_train_RRscaled, X_test_RRscaled, y_train_RRscaled, y_test_RRscaled = split(X, y, test_size=0.25, random_state=42)
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train_RRscaled, y_train_RRscaled)
print(f'R2 score: {knn.score(X_test_RRscaled, y_test):.2f}')

In [None]:
 # Random Forest Classifier
from sklearn.model_selection import train_test_split as split
X_train_RRscaled, X_test_RRscaled, y_train_RRscaled, y_test_RRscaled = split(X, y, test_size=0.25, random_state=42)
rfc = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_RRscaled, y_train_RRscaled)
print(f'Accuracy: {rfc.score(X_test_RRscaled, y_test):.2%}')

In [None]:
# Gradient Boosting Classifier
from sklearn.model_selection import train_test_split as split
X_train_RRscaled, X_test_RRscaled, y_train_RRscaled, y_test_RR_RRscaled = split(X, y, test_size=0.25, random_state=42)
gbc = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train_RRscaled, y_train_RRscaled)
print(f'Accuracy: {gbc.score(X_test_RRscaled, y_test):.2%}')