In [87]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

---

# Project 1 - Classification 
- dataset: heart_disease.csv 
- predict whether person have heart disease or not

In [88]:
# 1) Export data

heart_disease = pd.read_csv("heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


- **age** - Age (possibly of a person or property)
- **sex** - Sex/Gender (likely binary: 0 or 1, possibly 0=female, 1=male)
- **cp** - Chest Pain type (values 1-4, different types of chest pain)
- **trestbps** - Resting Blood Pressure (in mm Hg on admission to the hospital)
- **chol** - Serum Cholesterol in mg/dl
- **fbs** - Fasting Blood Sugar (likely > 120 mg/dl, 1=true, 0=false)
- **restecg** - Resting Electrocardiographic results (values 0, 1, 2)
- **thalach** - Maximum Heart Rate Achieved
- **exang** - Exercise Induced Angina (1=yes, 0=no)
- **oldpeak** - ST depression induced by exercise relative to rest
- **slope** - Slope of the peak exercise ST segment
- **ca** - Number of major vessels (0-3) colored by fluoroscopy
- **thal** - Thalassemia (blood disorder, values 0-3)
- **target** - Target variable (likely 0=no disease, 1=disease present)

In [89]:
# 2) Split the data into [input] and [expected output]

# Create X (feature matrix)
X = heart_disease.drop("target", axis = 1)

# Create y (labels)
Y = heart_disease["target"] # we want to predict the output based on the input

In [90]:
# 3) Again split the data into training and test set
#  By default, train_test_split() randomly shuffles your data and splits it 75% training / 25% testing


from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

len(X_train)


227

In [91]:
# 4) Select suitable model to train on with right hyperparameter
# What is hyperparameters 
# - dail to tune the model

# Since we are dealing with classification problem, we use Random Forest Classification
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

# We'll keep the default hyperparameters
clf.get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [92]:
# 5) Training Phase - Fit model
# Fit is another word for the model the learn the pattern

clf.fit(X_train, Y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [93]:
# 6) Training Phase - Use test set to get the prediction results.  
# The results will be evaluated later

Y_Predict = clf.predict(X_test)
Y_Predict_Prob = clf.predict_proba(X_test)[:, 1] 

Y_result = pd.DataFrame({
    "Prediction": Y_Predict,
    "Probability": Y_Predict_Prob,
    "Expected Output": Y_test
})


Y_result["Correctness"] = Y_result["Prediction"] == Y_result["Expected Output"]

ones = (Y_result['Correctness'] == 1).sum() / len(Y_result) * 100
zeros = (Y_result ['Correctness'] == 0).sum() / len(Y_result) * 100

ones, zeros




(np.float64(88.1578947368421), np.float64(11.842105263157894))

In [94]:
# 7) Evaluate the model on the training data

Train_result = clf.score(X_train, Y_train)
Test_result = clf.score(X_test, Y_test)

Train_result, Test_result

(1.0, 0.881578947368421)

1) Accuracy Score

    Correct Prediction / Total = Percentage %

2) ROC Curve & AUC Score

    ROC Curve: Plot True Positive Rate vs False Positive Rate at different threshold
    AUC (Area Under Curve): Single number summarizing the ROC curve

    - 1 = Perfrect model
    - 0.5 = Random huessing (coin flip)
    - < 0.5 = Worse random

3) Confusion Matrix
    TN - True Negative
    FP - False Positive
    FN - False Negative
    TP - True Positive

4) Classification report



- Accuracy: Quick overview, good for balanced datasets
- ROC/AUC: Comparing models, works well with imbalanced data
- Confusion Matrix: Understanding specific error types
 Classification Report: Comprehensive view of precision/recall trade-offs

In [98]:
# Accuracy (same result as above)

from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test, Y_Predict))

# Receiver Operating Character (ROC curve)

from sklearn.metrics import roc_curve, roc_auc_score

false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_test, Y_Predict_Prob)
print("-----------------------------")
print(roc_auc_score(Y_test, Y_Predict))

# Confusion metrix

from sklearn.metrics import confusion_matrix
print("-----------------------------")
print(confusion_matrix(Y_test, Y_Predict))


# Classification report

from sklearn.metrics import classification_report
print("-----------------------------")
print(classification_report(Y_test, Y_Predict))


0.881578947368421
-----------------------------
0.8833333333333333
-----------------------------
[[33  3]
 [ 6 34]]
-----------------------------
              precision    recall  f1-score   support

           0       0.85      0.92      0.88        36
           1       0.92      0.85      0.88        40

    accuracy                           0.88        76
   macro avg       0.88      0.88      0.88        76
weighted avg       0.88      0.88      0.88        76



---

# Project 2 - Regression - dataset: load_boston (from scikit learn) - predict house prices

In [None]:
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
home_price = pd.read_csv('housing.csv', delim_whitespace=True, names=column_names, header=None)

home_price

  home_price = pd.read_csv('housing.csv', delim_whitespace=True, names=column_names, header=None)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0
