In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import pickle

# Location of the CSV; pandas downloads it over HTTPS when the cell runs.
data_url = "https://github.com/JCJ8914/ML_examination/raw/main/data.csv"

# Parse the remote file into the frame used by every later cell.
df = pd.read_csv(filepath_or_buffer=data_url)

# Preview the first five rows as a quick sanity check of the parse.
df.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,2
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,1
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,2
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,1
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,1


In [15]:
# Size of the loaded frame as a (rows, columns) tuple.
df.shape

(270, 14)

In [16]:
# Print each column's non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       270 non-null    int64  
 1   sex       270 non-null    int64  
 2   cp        270 non-null    int64  
 3   trestbps  270 non-null    int64  
 4   chol      270 non-null    int64  
 5   fbs       270 non-null    int64  
 6   restecg   270 non-null    int64  
 7   thalach   270 non-null    int64  
 8   exang     270 non-null    int64  
 9   oldpeak   270 non-null    float64
 10  slope     270 non-null    int64  
 11  ca        270 non-null    int64  
 12  thal      270 non-null    int64  
 13  target    270 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 29.7 KB


In [17]:
# Count the missing entries in every column; a column of zeros means there is
# nothing to impute or drop before modelling.
missing_per_column = df.isnull().sum()
missing_per_column

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [19]:
# Compare five classifiers on one held-out split, report their scores, and
# persist the most accurate model together with the scaler it depends on.

RANDOM_STATE = 42          # shared seed for the split and every estimator
TARGET_COLUMN = "target"   # label column; every other column is a feature
# scikit-learn's default positive class, spelled out because the target column
# holds the labels 1 and 2 rather than 0 and 1.
# NOTE(review): confirm that class 1 is the class precision/recall/F1 should
# describe; if the class of interest is 2, change this single constant.
POSITIVE_LABEL = 1


def evaluate_classifier(model, X_train, y_train, X_test, y_test, pos_label=POSITIVE_LABEL):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        Fitted in place.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used for scoring.
    pos_label : label treated as the positive class by precision, recall and F1.

    Returns
    -------
    tuple
        ``(metrics, y_pred)`` where ``metrics`` is a dict with the keys
        "Accuracy", "Precision", "Recall" and "F1 Score", and ``y_pred`` is the
        model's predictions for ``X_test``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, pos_label=pos_label),
        "Recall": recall_score(y_test, y_pred, pos_label=pos_label),
        "F1 Score": f1_score(y_test, y_pred, pos_label=pos_label),
    }
    return metrics, y_pred


# Split the data into features (X) and target variable (y). Selecting by name
# instead of by position means a reordered CSV cannot silently turn a feature
# into the label.
X = df.drop(columns=TARGET_COLUMN)
y = df[TARGET_COLUMN]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Scale the features. The scaler is fitted on the training rows only and then
# applied unchanged to the test rows, so no test statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Candidate estimators. The individual names are kept so that other cells can
# still refer to a specific fitted model.
logreg = LogisticRegression(random_state=RANDOM_STATE)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
svm = SVC(random_state=RANDOM_STATE)
nn = MLPClassifier(random_state=RANDOM_STATE)

models = {
    "Logistic Regression": logreg,
    "Random Forest": rf,
    "Decision Tree": dt,
    "SVM": svm,
    "Neural Network": nn,
}

# Performance metrics and test-set predictions for each model, keyed by name
model_metrics = {}
model_predictions = {}
for model_name, estimator in models.items():
    model_metrics[model_name], model_predictions[model_name] = evaluate_classifier(
        estimator, X_train, y_train, X_test, y_test
    )

# Per-model prediction names this cell has always exposed; dicts preserve
# insertion order, so the unpacking follows the order of ``models``.
logreg_y_pred, rf_y_pred, dt_y_pred, svm_y_pred, nn_y_pred = (
    model_predictions[model_name] for model_name in models
)

# Find the model with the highest accuracy (the first one listed wins a tie).
# NOTE(review): the winner is picked using test-set accuracy, so its reported
# score is optimistically biased; consider selecting via cross-validation on
# the training split and keeping the test split for the final estimate only.
best_model = max(model_metrics, key=lambda name: model_metrics[name]["Accuracy"])

# Print the performance metrics for each model
for model_name, scores in model_metrics.items():
    print(f"{model_name}:")
    for metric_name, value in scores.items():
        print(f"{metric_name}: {value}")
    print()

# Save the best model to a file in pkl format
model_filename = "model.pkl"
with open(model_filename, "wb") as file:
    pickle.dump(models[best_model], file)

# The model was trained on standardised features, so whoever loads model.pkl
# must apply this exact fitted scaler to new rows before calling predict().
scaler_filename = "scaler.pkl"
with open(scaler_filename, "wb") as file:
    pickle.dump(scaler, file)

print("Best Model:", best_model)

Logistic Regression:
Accuracy: 0.9074074074074074
Precision: 0.9117647058823529
Recall: 0.9393939393939394
F1 Score: 0.9253731343283583

Random Forest:
Accuracy: 0.7592592592592593
Precision: 0.7777777777777778
Recall: 0.8484848484848485
F1 Score: 0.8115942028985507

Decision Tree:
Accuracy: 0.6851851851851852
Precision: 0.7857142857142857
Recall: 0.6666666666666666
F1 Score: 0.721311475409836

SVM:
Accuracy: 0.8888888888888888
Precision: 0.8857142857142857
Recall: 0.9393939393939394
F1 Score: 0.9117647058823529

Neural Network:
Accuracy: 0.8888888888888888
Precision: 0.8857142857142857
Recall: 0.9393939393939394
F1 Score: 0.9117647058823529

Best Model: Logistic Regression


