In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [1]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()


In [2]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [None]:
## 5.1 Tuning hyperparameters by hand



In [18]:
def evaluate_preds(y_true,y_preds):
    accuracy = accuracy_score(y_true,y_preds)
    precision = precision_score(y_true,y_preds)
    recall = recall_score(y_true,y_preds)
    f1 = f1_score(y_true,y_preds)
    metric_dict= {'accuracy': round(accuracy, 2),
                  'precision': round(precision, 2),
                  'recall':round(recall, 2),
                  'f1':round(f1,2)}
    print(f'Acc: {accuracy * 100:.2f}%')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1 score: {f1:.2f}')  
    
    return metric_dict

In [20]:
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)
heart_disease = pd.read_csv('scikit-learn-data/heart-disease.csv')

#shuffle data
heart_disease_shuffled = heart_disease.sample(frac=1)

#split into X and y
X = heart_disease_shuffled.drop('target', axis=1)
y = heart_disease_shuffled['target']    

train_split = round(0.7 * len(heart_disease_shuffled))   #70% of data
valid_split = round(train_split + 0.15 * len(heart_disease_shuffled))   #15% of data

X_train, y_train = X[:train_split], y[:train_split]
X_valid, y_valid = X[train_split:valid_split], y[train_split:valid_split]  
X_test, y_test = X[valid_split:], y[valid_split:]   

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

y_preds = clf.predict(X_valid)

baseline_metrics = evaluate_preds(y_valid, y_preds)
baseline_metrics

Acc: 82.22%
Precision: 0.81
Recall: 0.88
F1 score: 0.85


  X_train, y_train = X[:train_split], y[:train_split]
  X_valid, y_valid = X[train_split:valid_split], y[train_split:valid_split]
  X_test, y_test = X[valid_split:], y[valid_split:]


{'accuracy': 0.82, 'precision': 0.81, 'recall': 0.88, 'f1': 0.85}

In [16]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [21]:
np.random.seed(42)

#create a second classifier with different hyperparameters

clf_2 = RandomForestClassifier(n_estimators=100)
clf_2.fit(X_train, y_train)

#make predictions
y_preds_2 = clf_2.predict(X_valid)

#Evaluate the 2nd classifier
clf_2_metrics = evaluate_preds(y_valid, y_preds_2)

Acc: 80.00%
Precision: 0.83
Recall: 0.80
F1 score: 0.82


In [23]:
np.random.seed(42)

clf_3 = RandomForestClassifier(n_estimators=100, max_depth=10)
clf_3.fit(X_train, y_train)

y_preds_3 = clf_3.predict(X_test)

clf_3_metrics = evaluate_preds(y_test, y_preds_3)

Acc: 84.78%
Precision: 0.88
Recall: 0.84
F1 score: 0.86
