In [None]:
#import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve)
from sklearn.metrics import auc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


#load the heart_disease_uci dataset (path is relative to the working directory)
heart_data = pd.read_csv('heart_disease_uci(1).csv')

#encode categorical columns
#every object-dtype column is one-hot encoded; drop='first' removes one dummy
#per column so the encoded features are not perfectly collinear
categorical_cols = heart_data.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_array = encoder.fit_transform(heart_data[categorical_cols])
encoded_df = pd.DataFrame(
    encoded_array,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=heart_data.index,  #same row labels as the source so concat aligns row-for-row
)
df_encoded = pd.concat([heart_data.drop(columns=categorical_cols), encoded_df], axis=1)

#remove NaNs
#reassign instead of inplace=True so re-running this step always starts from
#the freshly encoded frame
#NOTE(review): if the categorical columns contain missing values, the encoder
#presumably turned them into their own dummy category above, so this only
#drops rows with NaNs in the remaining columns - confirm that is intended
df = df_encoded.dropna()

#preview the cleaned frame instead of printing every row
print(df.shape)
print(df.head())


#regression setup: the 'chol' column is the target, every other column is a feature
y = df['chol']
X = df.drop('chol', axis=1)

#hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

#standardise the features: the scaler is fitted on the training rows only and
#the same fitted scaler is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

#hyperparameter grid: 10 log-spaced alphas (10**0 .. 10**1) and
#10 evenly spaced l1_ratios (0 .. 1)
alphas = np.logspace(0, 1, 10)
l1_ratios = np.linspace(0, 1, 10)

#one [alpha, l1_ratio, score] row is collected per fitted model
r2_scores, rmse_scores = [], []

#fit an ElasticNet for every (alpha, l1_ratio) pair and score it on the test split
#NOTE(review): the scores come from the test split, so anything selected from
#them later is tuned on test data - confirm this is acceptable
for alpha, l1_ratio in ((a, r) for a in alphas for r in l1_ratios):
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2_scores.append([alpha, l1_ratio, r2_score(y_test, y_pred)])
    rmse_scores.append([alpha, l1_ratio, np.sqrt(mean_squared_error(y_test, y_pred))])

print(r2_scores)
print(rmse_scores)

#tabulate the collected rows so they can be pivoted and ranked
grid_columns = ['alpha', 'l1_ratio']
r2_df = pd.DataFrame(r2_scores, columns=grid_columns + ['R2'])
rmse_df = pd.DataFrame(rmse_scores, columns=grid_columns + ['RMSE'])
print(r2_df)
print(rmse_df)

#Pivot for heatmap: rows (index) = alpha, columns = l1_ratio
r2_pivot = r2_df.pivot(index="alpha", columns="l1_ratio", values="R2")
rmse_pivot = rmse_df.pivot(index="alpha", columns="l1_ratio", values="RMSE")


def _plot_score_heatmap(pivot, score_name, cmap):
    """Draw an annotated heatmap of one metric over the hyperparameter grid.

    pivot: DataFrame whose index holds the alpha values (rows) and whose
        columns hold the l1_ratio values.
    score_name: metric name used in the title and as the colour-bar label.
    cmap: matplotlib colormap name.
    """
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        pivot,
        annot=True,
        fmt='.4f',
        cmap=cmap,
        #round the float grid values so the tick labels stay readable
        xticklabels=np.round(pivot.columns.to_numpy(), 2),
        yticklabels=np.round(pivot.index.to_numpy(), 2),
        cbar_kws={'label': score_name},
    )
    plt.title(f'{score_name} for ElasticNet')
    plt.xlabel('L1_Ratio')  #pivot columns run along the x-axis
    plt.ylabel('Alpha')     #pivot index runs along the y-axis (was a second xlabel call)
    plt.show()


#plot R2 heatmap
_plot_score_heatmap(r2_pivot, 'R2', 'viridis')

#plot RMSE heatmap
_plot_score_heatmap(rmse_pivot, 'RMSE', 'mako')


#find the top-performing configuration for each metric:
#highest R2 and lowest RMSE (first occurrence wins on ties)
r2_best_row = r2_df['R2'].idxmax()
rmse_best_row = rmse_df['RMSE'].idxmin()

top_r2 = r2_df.loc[r2_best_row]
top_rmse = rmse_df.loc[rmse_best_row]

print(top_r2)
print(top_rmse)



In [None]:
#---Classification models---

#classification setup: the 'num' column (cast to int) is the target,
#every other column of df is a feature
y = df['num'].astype(int)
X = df.drop('num', axis=1)

#hold out 20% of the rows for testing (fixed seed so the split is repeatable)
#NOTE(review): the split is not stratified; if a class of 'num' ends up missing
#from the test rows the multi-class AUROC computed later may fail - consider stratify=y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

#standardise the features: the scaler is fitted on the training rows only and
#the same fitted scaler is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

#Logistic regression: search over penalty/solver combinations
#two sub-grids because the l1 penalty is only paired with liblinear and saga here
#NOTE(review): newer scikit-learn releases appear to deprecate 'liblinear' for
#multi-class targets - confirm against the installed version
logreg_param_grid = [
    dict(penalty=['l1'], solver=['liblinear', 'saga']),
    dict(penalty=['l2'], solver=['liblinear', 'saga', 'lbfgs']),
]

#5-fold cross-validated grid search on the training split, ranked by accuracy
logreg = LogisticRegression(max_iter=1000)
logreg_grid = GridSearchCV(
    logreg,
    logreg_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

#estimator refitted with the best parameter combination
best_logreg = logreg_grid.best_estimator_
print(f"Best Logistic Regression params: {logreg_grid.best_params_}")

#hard labels and per-class probabilities for the test split
y_pred_logreg = best_logreg.predict(X_test)
y_prob_logreg = best_logreg.predict_proba(X_test)

#test-split metrics: accuracy, support-weighted F1, one-vs-rest AUROC
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
logreg_f1 = f1_score(y_test, y_pred_logreg, average='weighted')
logreg_auroc = roc_auc_score(y_test, y_prob_logreg, multi_class='ovr')

for metric_label, metric_value in (
    ('Accuracy:', logreg_accuracy),
    ('F1 Score:', logreg_f1),
    ('AUROC:', logreg_auroc),
):
    print(metric_label, metric_value)


#k-Nearest neighbors (k-NN): search over neighbourhood size, vote weighting
#and distance metric
knn_param_grid = dict(
    n_neighbors=[1, 5, 10, 15, 20],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

#5-fold cross-validated grid search on the training split, ranked by accuracy
knn = KNeighborsClassifier()
knn_grid = GridSearchCV(
    knn,
    knn_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

#estimator refitted with the best parameter combination
best_knn = knn_grid.best_estimator_
print(f"Best k-NN params: {knn_grid.best_params_}")

#hard labels and per-class probabilities for the test split
y_pred_knn = best_knn.predict(X_test)
y_prob_knn = best_knn.predict_proba(X_test)

#test-split metrics: accuracy, support-weighted F1, one-vs-rest AUROC
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_f1 = f1_score(y_test, y_pred_knn, average='weighted')
knn_auroc = roc_auc_score(y_test, y_prob_knn, multi_class='ovr')

for metric_label, metric_value in (
    ('Knn Accuracy:', knn_accuracy),
    ('Knn F1 Score:', knn_f1),
    ('Knn AUROC:', knn_auroc),
):
    print(metric_label, metric_value)

#AUROC & AUPRC CURVES
#roc_curve and precision_recall_curve only accept binary targets, whereas
#predict_proba returns one column per class. The curves are therefore
#micro-averaged: every (sample, class) pair becomes one binary decision.
def _micro_average_inputs(y_true, y_prob, classes):
    """Return flat (indicator, score) arrays usable by binary curve functions.

    y_true: 1-D array-like of true class labels.
    y_prob: array of shape (n_samples, n_classes) whose columns are ordered
        like `classes`.
    classes: class labels in column order (an estimator's `classes_`).

    With exactly two classes the score of `classes[1]` is used directly;
    otherwise the one-vs-rest indicator matrix and the probability matrix
    are flattened in the same order.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    classes = np.asarray(classes)
    if classes.size == 2:
        return (y_true == classes[1]).astype(int), y_prob[:, 1]
    indicators = (y_true[:, None] == classes[None, :]).astype(int)
    return indicators.ravel(), y_prob.ravel()


y_true_logreg, y_score_logreg = _micro_average_inputs(y_test, y_prob_logreg, best_logreg.classes_)
y_true_knn, y_score_knn = _micro_average_inputs(y_test, y_prob_knn, best_knn.classes_)

#Precision-recall curve for Logistic Regression
precision_logreg, recall_logreg, _ = precision_recall_curve(y_true_logreg, y_score_logreg)
pr_auc_logreg = auc(recall_logreg, precision_logreg)

#Precision-recall curve for K-NN
precision_knn, recall_knn, _ = precision_recall_curve(y_true_knn, y_score_knn)
pr_auc_knn = auc(recall_knn, precision_knn)

#ROC curves (each model is evaluated on its own predicted probabilities)
fpr_logreg, tpr_logreg, _ = roc_curve(y_true_logreg, y_score_logreg)
fpr_knn, tpr_knn, _ = roc_curve(y_true_knn, y_score_knn)
roc_auc_logreg = auc(fpr_logreg, tpr_logreg)
roc_auc_knn = auc(fpr_knn, tpr_knn)

#Plot AUROC Curves
plt.figure(figsize=(10, 6))
plt.plot(fpr_logreg, tpr_logreg, label=f'Logistic Regression (AUROC = {roc_auc_logreg:.3f})')
plt.plot(fpr_knn, tpr_knn, label=f'k-NN (AUROC = {roc_auc_knn:.3f})')
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('AUROC Curve')
plt.legend()
plt.show()

#plot AUPRC curves (own figure so the lines are not drawn onto the ROC axes)
plt.figure(figsize=(10, 6))
plt.plot(recall_logreg, precision_logreg, label=f'Logistic Regression (AUPRC = {pr_auc_logreg:.3f})')
plt.plot(recall_knn, precision_knn, label=f'k-NN (AUPRC = {pr_auc_knn:.3f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('AUPRC Curve')
plt.legend()
plt.show()





