### Regular EDA

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (
    RocCurveDisplay,
    classification_report,
    confusion_matrix,
    f1_score,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor


##### Load Data

In [None]:
# Read the heart-disease CSV from the local ./data folder into `df`,
# then show the first rows as a quick sanity check of the load.
df = pd.read_csv('./data/heart-disease.csv')
df.head()

In [None]:
# Show the last rows of the frame (complements the head() preview).
df.tail()

In [None]:
# Class balance: how many rows carry each value of the `target` column.
df.target.value_counts()

In [None]:
# Same class counts as a bar chart, one colour per class.
df.target.value_counts().plot.bar(color=['salmon', 'lightblue'])

In [None]:
# Print pandas' concise summary of the frame (DataFrame.info).
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Descriptive statistics of the frame (DataFrame.describe).
df.describe()

In [None]:
# How many rows carry each value of the `sex` column.
df.sex.value_counts()

In [None]:
# Contingency table: one row per `sex` value, one column per `target` value.
pd.crosstab(index=df.sex, columns=df.target)

In [None]:
# The crosstab puts `sex` on the rows and `target` on the columns, so in the
# bar chart the x-axis groups are the sex values and the legend entries are
# the target values. The labels below follow that orientation.
pd.crosstab(df['sex'], df['target']).plot(kind='bar', figsize=(10,6), color=['salmon', 'lightblue'])
plt.title('Gender vs Disease')
# Sex coding (0 = Female, 1 = Male) is taken from the label order this cell
# used before -- TODO confirm against the dataset's data dictionary.
plt.xlabel('Sex (0 = Female, 1 = Male)')
plt.ylabel('Amount')
# Legend entries follow the crosstab columns: target 0, then target 1.
plt.legend(['No Disease', 'Disease'])
plt.xticks(rotation=0);

### Age vs Max Heart Rate

In [None]:
# Row selectors for the two target classes.
with_disease = df.target == 1
without_disease = df.target == 0

plt.figure(figsize=(10, 6))
# Draw the positive class first (green), then the negative class (red), so the
# legend order below lines up with the order the point sets were added.
for mask, colour in ((with_disease, 'g'), (without_disease, 'r')):
    plt.scatter(df.age[mask], df.thalach[mask], color=colour)
plt.title('Heart Disease in function of Age and Max Heart Rate')
plt.xlabel('Age')
plt.ylabel('Max Heart Rate')
plt.legend(['Disease', 'No Disease']);

### cp - chest pain type
###### Levels: 0, 1, 2, 3

In [None]:
# Contingency table: one row per chest-pain level, one column per target value.
pd.crosstab(index=df.cp, columns=df.target)

In [None]:
# Counts of each target value within every chest-pain level.
cp_by_target = pd.crosstab(df.cp, df.target)

# Grouped bars: x-axis = chest-pain level, one bar per target value.
cp_by_target.plot.bar(figsize=(10, 6), color=['salmon', 'lightblue'])
plt.title('Chest Pain Type vs Disease')
plt.xlabel('Chest Pain Type')
plt.ylabel('Amount')
plt.legend(['No Disease', 'Disease'])
plt.xticks(rotation=0)
plt.show()
### cp -chest pain

### Modeling

### Try 3 Machine Learning Models
###### 1. Logistic Regression
###### 2. K-Nearest Neighbours Classification
###### 3. Random Forest Classification

In [None]:
# Label column on one side, every remaining column as a feature on the other.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline comparison of the three classifiers announced above.
# `target` is a 0/1 label, so classifiers and classification metrics are used
# here; regressors scored with MSE / R2 do not answer the question being asked.
baseline_models = {
    # Higher iteration cap so the solver is less likely to stop before
    # converging on the unscaled features.
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
}

# The dict has its own name so the loop variable no longer overwrites it.
# After the loop `model` still refers to the last fitted estimator.
for name, model in baseline_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'----{name}----')
    print('Accuracy:', model.score(X_test, y_test))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('F1 Score:', f1_score(y_test, y_pred))
    print()

In [None]:
# Estimators compared by evaluate_model(). Each key now names the estimator it
# actually holds: the target is a 0/1 label, so these are classifiers and
# their .score() is mean accuracy.
models = {
    # Higher iteration cap so the solver is less likely to stop before
    # converging on the unscaled features.
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
}
def evaluate_model(models, X_train, X_test, y_train, y_test):
    """Fit every estimator on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an object exposing ``fit(X, y)`` and
        ``score(X, y)``.
    X_train, y_train
        Data passed to each estimator's ``fit``.
    X_test, y_test
        Data passed to each estimator's ``score``.

    Returns
    -------
    dict
        ``{name: score}`` in the iteration order of ``models``, where each
        value is whatever the estimator's ``score`` returned.
    """
    # Reset NumPy's global RNG so repeated calls start from the same state.
    np.random.seed(42)

    def _fit_then_score(estimator):
        # Train first, then evaluate on the held-out data.
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _fit_then_score(estimator) for label, estimator in models.items()}

In [None]:
# Fit and score every entry of `models` on the train/test split made earlier.
model_score = evaluate_model(models, X_train, X_test, y_train, y_test)

In [None]:
# Display the {model name: score} mapping returned by evaluate_model().
model_score

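**Precision** = TP / (TP + FP). Example: if the model flags 10 patients as having the disease and 8 of them really have it, precision = 8/10 = 80%.

**Recall** = TP / (TP + FN). Example: if there are 20 real cases of the disease and the model catches 15 of them, recall = 15/20 = 75%.

***The F1 score balances precision and recall in a single value; it is the harmonic mean of precision and recall.***

Formula: $F_1 = 2 \cdot \dfrac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$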

A model's first predictions aren't always what we should base our next steps on.
Let's look at:
- Hyperparameter tuning
- Feature importance
- Confusion matrix
- Cross-validation
- Precision
- Recall
- F1 score
- Classification report
- ROC curve
- AUC - Area under the curve

### Hyperparameter tuning (by hand)

In [None]:
# Tune `n_neighbors` for KNN by hand: refit for k = 1..20 and record the
# accuracy on the training and test splits.
# A classifier is used because `target` is a 0/1 label; a regressor's .score()
# would report R2 rather than accuracy.
train_scroe = []  # NOTE: misspelled name kept in case a later cell reads it
test_score = []
neighbors = range(1, 21)
knn = KNeighborsClassifier()

for i in neighbors:
    knn.set_params(n_neighbors=i)
    knn.fit(X_train, y_train)
    train_scroe.append(knn.score(X_train, y_train))
    test_score.append(knn.score(X_test, y_test))

# Report the best setting instead of dumping the raw score list.
best_k = neighbors[int(np.argmax(test_score))]
print(f'Best test accuracy: {max(test_score) * 100:.2f}% at n_neighbors={best_k}')

plt.plot(neighbors, train_scroe, label='Train Score')
plt.plot(neighbors, test_score, label='Test Score')
plt.xticks(np.arange(1, 21, 1))
plt.xlabel('Number of Neighbors')
plt.ylabel('Model Score')
plt.legend();


Hyperparameter tuning with RandomizedSearchCV
 We are going to tune:
- Logistic Regression
- RandomForest classifier

using RandomizedSearchCV

In [None]:
# create a hyperparameter grid for LogisticRegression
log_reg_grid = {'C' : np.logspace(-4, 4, 20), 'solver' : ['liblinear']} 
# create a hyperparameter grid for RandomForestClassifier
rf_grid = {'n_estimators' : np.arange(10, 1000, 50),
           'max_depth' : [None, 3, 5, 10],
           'min_samples_split' : np.arange(2,20,2),
           'min_samples_leaf' : np.arange(1,20,2)}

rf_grid

Let's tune using RandomizedSearchCV

In [None]:
# With random_state unset, the search samples from NumPy's global RNG, so it
# is seeded before each search to keep the sampled candidates repeatable.
np.random.seed(42)
# Tune LogisticRegression over log_reg_grid. `rs_log_reg` now holds the
# logistic-regression search its name promises, and log_reg_grid is used.
rs_log_reg = RandomizedSearchCV(LogisticRegression(),
                                param_distributions=log_reg_grid,
                                cv=5, n_iter=20, verbose=True)
rs_log_reg.fit(X_train, y_train)

np.random.seed(42)
# Tune RandomForestClassifier over rf_grid. This is the search that used to
# run under the `rs_log_reg` name; it now has its own variable.
rs_rf = RandomizedSearchCV(RandomForestClassifier(),
                           param_distributions=rf_grid,
                           cv=5, n_iter=20, verbose=True)
rs_rf.fit(X_train, y_train)


In [None]:

# Score of the fitted search object on the held-out test split.
rs_log_reg.score(X_test, y_test)

In [None]:
# Evaluate the tuned search object on the held-out test split.
y_preds = rs_log_reg.predict(X_test)

# Per-class precision / recall / F1 plus overall accuracy.
print(classification_report(y_test, y_preds))

# Raw counts behind those metrics (rows = true class, columns = predicted).
print(confusion_matrix(y_test, y_preds))

# Two lines were dropped from this cell: the re-run of evaluate_model(), which
# refit every baseline model only to rebuild the `model_score` computed
# earlier, and the trailing sklearn.metrics import, which duplicated names
# already imported in the notebook's import cell.