In [53]:
# Step 1: Load the dataset
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
#Libraries
import pandas as pd
import numpy as np




In [55]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [56]:
#Readinf CSV File
data = pd.read_csv('heart.csv')

In [57]:
#Print first 5 rows
print("Dataset Preview:")
print(data.head())

Dataset Preview:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0     125.0   212    0        1    168.0      0      1.0      2   
1   53    1   0     140.0   203    1        0    155.0      1      3.1      0   
2   70    1   0     145.0   174    0        1    125.0      1      2.6      0   
3   61    1   0     148.0   203    0        1    161.0      0      0.0      2   
4   62    0   0     138.0   294    1        1    106.0      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  


In [58]:
#Statistical summary
print("\nDataset Summary:")
print(data.describe())


Dataset Summary:
               age          sex           cp     trestbps        chol  \
count  1025.000000  1025.000000  1025.000000  1024.000000  1025.00000   
mean     54.434146     0.695610     0.942439   131.611328   246.00000   
std       9.072290     0.460373     1.029641    17.525273    51.59251   
min      29.000000     0.000000     0.000000    94.000000   126.00000   
25%      48.000000     0.000000     0.000000   120.000000   211.00000   
50%      56.000000     1.000000     1.000000   130.000000   240.00000   
75%      61.000000     1.000000     2.000000   140.000000   275.00000   
max      77.000000     1.000000     3.000000   200.000000   564.00000   

               fbs      restecg      thalach        exang      oldpeak  \
count  1025.000000  1025.000000  1024.000000  1025.000000  1024.000000   
mean      0.149268     0.529756   149.140625     0.336585     1.069629   
std       0.356527     0.527878    23.001333     0.472772     1.174079   
min       0.000000     0.000

In [59]:
#Display info including data types and NAN values
print("\nDataset Info:")
print(data.info())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1024 non-null   float64
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1024 non-null   float64
 8   exang     1025 non-null   int64  
 9   oldpeak   1024 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(3), int64(11)
memory usage: 112.2 KB
None


In [60]:
#REmoving duplicates
data.drop_duplicates(inplace=True)

In [61]:
#Replacing NAN numbers
#Imputation
data.replace("?", np.nan, inplace=True)
print("\nMissing Values Count:")
print(data.isnull().sum())


Missing Values Count:
age         0
sex         0
cp          0
trestbps    1
chol        0
fbs         0
restecg     0
thalach     1
exang       0
oldpeak     1
slope       0
ca          0
thal        0
target      0
dtype: int64


In [62]:
#Droping NAN rows
data.dropna(inplace=True)

In [63]:
#Setting age column as target
# Ma'am you mentioned to set age as a target varriable but age is the continous value in the case of classification it will not work properly
# because as we read in class that classification algorithm wokrs on the the binary values
X = data.drop(columns=['age'])
y = data['age']

In [64]:
#80:20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
#Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [66]:
#Declaring dictionary globally
results = []

In [67]:

# Function to train predict and evaluate each model and also finding the accuracy precision recall f1
# paasing the model that have to be evaluated and also the name of that model to save it
def evaluate_model(model, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)


    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted') #setting the average weighted because our targeted colum has continous values
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)


    # Storing all the above calculated result in the results list with their model name
    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "Confusion Matrix": cm
    })



In [68]:
#Clearing the list because if i don't clear it here it will append the result in further indexes it clear the old save data if we run it multiple time it can save that's why
results = []

In [69]:
#Initializing Modela
#logistic is more suitable for binary values
log_reg = LogisticRegression()
decision_tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()
naive_bayes = GaussianNB()
# i use Gaussian Naive because we are using the age as a target value and age is continous value the MultinomialNB is giving error on this column

In [70]:
#Passing Model and Model Names
evaluate_model(log_reg, "Logistic Regression")
evaluate_model(decision_tree, "Decision Tree")
evaluate_model(knn, "K-Nearest Neighbors")
evaluate_model(naive_bayes, "Naive Bayes")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [71]:
#logging results in dataframe to used it for camparison
results_df = pd.DataFrame(results)

In [72]:
#Printing results of all the models
print("\nModel Comparison:")
print(results_df[['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']])


Model Comparison:
                 Model  Accuracy  Precision    Recall  F1 Score
0  Logistic Regression  0.081967   0.054918  0.081967  0.063752
1        Decision Tree  0.081967   0.145902  0.081967  0.075098
2  K-Nearest Neighbors  0.032787   0.028689  0.032787  0.030445
3          Naive Bayes  0.049180   0.077049  0.049180  0.049727


In [73]:
#Analyzing according to accuracy printing the best model which has highest accuracy
best_model = results_df.loc[results_df['Accuracy'].idxmax()]
print("\nBest Model in terms of Accuracy:")
print(best_model)


Best Model in terms of Accuracy:
Model                                             Logistic Regression
Accuracy                                                     0.081967
Precision                                                    0.054918
Recall                                                       0.081967
F1 Score                                                     0.063752
Confusion Matrix    [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
Name: 0, dtype: object


In [74]:
#Providing detail with conditions
print("\nModel Performance Analysis:")
print(f"The {best_model['Model']} performed best because:")
if best_model['Model'] == "Logistic Regression":
    print("It handles binary classification well")
    print("The data might have linear decision boundaries")
elif best_model['Model'] == "Decision Tree":
    print("It captured non-linear patterns in the data")
    print("The data might have clear decision rules")
elif best_model['Model'] == "K-Nearest Neighbors":
    print("The data points in same class might be clustering together")
    print("The feature space has well-defined neighborhoods")
elif best_model['Model'] == "Naive Bayes":
    print("The features might be relatively independent")
    print("The data follows Gaussian distribution")


Model Performance Analysis:
The Logistic Regression performed best because:
It handles binary classification well
The data might have linear decision boundaries
