In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (10, 6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [None]:
# Load the dataset from a CSV file located next to the notebook.
heart_df = pd.read_csv('heart.csv')

In [None]:
# Display the loaded frame.
heart_df

In [None]:
# Column names, dtypes and non-null counts.
heart_df.info()

In [None]:
# Summary statistics of the columns.
heart_df.describe()

In [None]:
# Last rows of the frame.
heart_df.tail()

In [None]:
sns.pairplot(heart_df, hue = 'target', 
             vars = ['age', 'sex', 'cp', 'chol', 'thal'] )

In [None]:
sns.relplot(heart_df, y="chol", x="target")

In [None]:
sns.relplot(heart_df, x="age", y="trestbps", kind="line")

In [None]:
sns.displot(heart_df, x="oldpeak",hue='target')

In [None]:
heart_df.corr()

In [None]:
# Name of the label column the models predict.
target_col = 'target'

# Every column other than the target is a model input. Selecting by name
# (instead of the previous positional slice [1:-1]) keeps the first column,
# which the slice silently dropped, and no longer relies on the target being
# the last column.
# NOTE(review): assumes heart.csv has no ID/index column that should be
# excluded from the inputs; confirm against the file.
input_cols = [col for col in heart_df.columns if col != target_col]

In [None]:
train_val_df, test_df = train_test_split(heart_df, test_size=0.2, random_state=42)

train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)

In [None]:
train_inputs, train_targets = train_df[input_cols].copy(), train_df[target_col].copy()
val_inputs, val_targets = val_df[input_cols].copy(), val_df[target_col].copy()
test_inputs, test_targets = test_df[input_cols].copy(), test_df[target_col].copy()

In [None]:
# Fit the scaler on the training rows only. Fitting on the full frame (as was
# done before) lets the min/max of validation and test rows leak into the
# preprocessing that the model is later evaluated with.
scaler = MinMaxScaler()
scaler.fit(train_df[input_cols])

In [None]:

train_inputs = scaler.transform(train_inputs)
val_inputs = scaler.transform(val_inputs)
test_inputs = scaler.transform(test_inputs)

In [None]:
X_train = train_inputs
X_val = val_inputs
X_test = test_inputs

In [None]:
# Baseline decision tree with no size limits; random_state fixes the seed.
model = DecisionTreeClassifier(random_state=42)

In [None]:
%%time
# Fit on the scaled training features.
model.fit(X_train, train_targets)

In [None]:
# NOTE(review): confusion_matrix is imported here but not used in the visible cells.
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Predicted labels for the training rows.
train_preds = model.predict(X_train)

In [None]:
train_preds

In [None]:
# How many training rows were assigned to each predicted label.
pd.Series(train_preds).value_counts()

In [None]:
# Per-class predicted probabilities for the training rows.
train_probs = model.predict_proba(X_train)

In [None]:
train_probs

In [None]:
# Accuracy on the data the tree was trained on.
accuracy_score(train_targets, train_preds)

In [None]:
# Accuracy on the validation split.
model.score(X_val, val_targets)

In [None]:
# Share of each label in the validation targets, for comparison with the accuracy above.
val_targets.value_counts() / len(val_targets)

### Visualization



In [None]:
from sklearn.tree import plot_tree, export_text

In [None]:
plt.figure(figsize=(80,20))
plot_tree(model, feature_names=train_df.columns, max_depth=2, filled=True);

In [None]:
tree_text = export_text(model, max_depth=10, feature_names=list(heart_df[input_cols]))
print(tree_text[:5000])

In [None]:
model.feature_importances_

In [None]:
importance_df = pd.DataFrame({
    'feature': input_cols,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

In [None]:
importance_df.head(10)

In [None]:
plt.title('Feature Importance')
sns.barplot(data=importance_df.head(10), x='importance', y='feature');

In [None]:
model = DecisionTreeClassifier(max_depth=3, random_state=42)

In [None]:
model.fit(X_train, train_targets)

In [None]:
model.score(X_train, train_targets)

In [None]:
model.score(X_val, val_targets)

In [None]:
model.classes_

In [None]:
plt.figure(figsize=(80,20))
plot_tree(model, feature_names=train_df.columns, max_depth=2, filled=True);

In [None]:
print(export_text(model, feature_names=list(heart_df[input_cols])))

In [None]:
def max_depth_error(md):
    """Fit a decision tree limited to depth `md` and report its error rates.

    Returns a dict with the keys 'Max Depth', 'Training Error' and
    'Validation Error'; each error is 1 minus the model's score on that split.
    """
    tree = DecisionTreeClassifier(random_state=42, max_depth=md).fit(X_train, train_targets)
    training_error = 1 - tree.score(X_train, train_targets)
    validation_error = 1 - tree.score(X_val, val_targets)
    return {
        'Max Depth': md,
        'Training Error': training_error,
        'Validation Error': validation_error,
    }

In [None]:
%%time
errors_df = pd.DataFrame([max_depth_error(md) for md in range(1, 21)])

In [None]:
errors_df

In [None]:
plt.figure()
plt.plot(errors_df['Max Depth'], errors_df['Training Error'])
plt.plot(errors_df['Max Depth'], errors_df['Validation Error'])
plt.title('Training vs. Validation Error')
plt.xticks(range(0,21, 2))
plt.xlabel('Max. Depth')
plt.ylabel('Prediction Error (1 - Accuracy)')
plt.legend(['Training', 'Validation'])

In [None]:
model = DecisionTreeClassifier(max_depth=7, random_state=42).fit(X_train, train_targets)
model.score(X_val, val_targets)

In [None]:
model = DecisionTreeClassifier(max_leaf_nodes=128, random_state=42)

In [None]:
model.fit(X_train, train_targets)

In [None]:
model.score(X_train, train_targets)

In [None]:
model.score(X_val, val_targets)

In [None]:
model.tree_.max_depth

In [None]:
model_text = export_text(model, feature_names=list(heart_df[input_cols]))
print(model_text[:3000])

## Training a Random Forest

In [None]:
model = RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
%%time
model.fit(X_train, train_targets)

In [None]:
model.score(X_train, train_targets)

In [None]:
model.score(X_val, val_targets)

In [None]:
train_probs = model.predict_proba(X_train)
train_probs

In [None]:
model.estimators_[0]

In [None]:
plt.figure(figsize=(80,20))
plot_tree(model.estimators_[0], max_depth=2, feature_names=train_df.columns, filled=True, rounded=True);

In [None]:
plt.figure(figsize=(80,20))
plot_tree(model.estimators_[20], max_depth=2, feature_names=train_df.columns, filled=True, rounded=True);

In [None]:
len(model.estimators_)

In [None]:
importance_df = pd.DataFrame({
    'feature': input_cols,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

In [None]:
importance_df.head(10)

In [None]:
plt.title('Feature Importance')
sns.barplot(data=importance_df.head(10), x='importance', y='feature');

In [None]:
base_model = RandomForestClassifier(random_state=42, n_jobs=-1).fit(X_train, train_targets)

In [None]:
base_train_acc = base_model.score(X_train, train_targets)
base_val_acc = base_model.score(X_val, val_targets)

In [None]:
base_accs = base_train_acc, base_val_acc
base_accs

In [None]:
model = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=10)

In [None]:
model.fit(X_train, train_targets)

In [None]:
model.score(X_train, train_targets), model.score(X_val, val_targets)

In [None]:
base_accs

In [None]:
model = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=500)
model.fit(X_train, train_targets)

In [None]:
model.score(X_train, train_targets)

In [None]:
model.score(X_val, val_targets)

In [None]:
base_accs

In [None]:
def test_params(**params):
    """Fit a random forest with the given hyperparameters and score it.

    `params` are forwarded to RandomForestClassifier on top of the fixed
    random_state=42 and n_jobs=-1. Returns a tuple of
    (training score, validation score).
    """
    forest = RandomForestClassifier(n_jobs=-1, random_state=42, **params)
    forest.fit(X_train, train_targets)
    train_score = forest.score(X_train, train_targets)
    val_score = forest.score(X_val, val_targets)
    return train_score, val_score

In [None]:
# Each call returns (training score, validation score) for one setting;
# `base_accs` is re-displayed after each group for comparison.

# Tree depth limits.
test_params(max_depth=5)

In [None]:
test_params(max_depth=26)

In [None]:
# Leaf-count limits.
test_params(max_leaf_nodes=2**5)

In [None]:
test_params(max_leaf_nodes=2**20)

In [None]:
base_accs # no max depth or max leaf nodes

In [None]:
# Number of features considered per split.
test_params(max_features='log2')

In [None]:
test_params(max_features=3)

In [None]:
test_params(max_features=6)

In [None]:
base_accs

In [None]:
# Minimum sample counts required to split a node / form a leaf.
test_params(min_samples_split=3, min_samples_leaf=2)

In [None]:
test_params(min_samples_split=100, min_samples_leaf=60)

In [None]:
base_accs

In [None]:
# Minimum impurity decrease required for a split.
test_params(min_impurity_decrease=1e-7)

In [None]:
test_params(min_impurity_decrease=1e-2)

In [None]:
base_accs

In [None]:
# Disable bootstrap sampling of the training rows.
test_params(bootstrap=False)

In [None]:
base_accs

In [None]:
# Draw 90% of the training rows for each tree.
test_params(max_samples=0.9)

In [None]:
# Final model combining the hyperparameters chosen above.
#
# class_weight keys must be the actual labels found in the target column; the
# previous hard-coded 'No'/'Yes' keys only work when the target holds exactly
# those strings. np.unique returns the labels sorted, so the first label keeps
# weight 1 and the second gets weight 1.5 (for a 'No'/'Yes' target this is the
# same mapping as before).
# NOTE(review): assumes a binary target whose later-sorted label is the class
# that should be up-weighted; confirm.
model = RandomForestClassifier(n_jobs=-1,
                               random_state=42,
                               n_estimators=500,
                               max_features=7,
                               max_depth=30,
                               class_weight=dict(zip(np.unique(train_targets).tolist(), (1, 1.5))))

# Train the model here; previously it was constructed but never fitted before
# being saved with joblib further down.
model.fit(X_train, train_targets);

In [None]:
import joblib

In [None]:
heart_disease = {
    'model': model,
    'scaler': scaler,
    'input_cols': input_cols,
    'target_col': target_col}
   

In [None]:
joblib.dump(heart_disease, 'heart_disease.joblib')