In [1]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### 5. Model Improving:
First predictions = baseline predictions
First Model = baseline model

Improving Factors (Data Perspective)
- collect and fit more data (data with better pattern for matching)
- improve data (distribute/add more features to add depth)

Improving Factors (Model Perspective)
- select better, more complex model
- improve current model (changing hyperparameters)

Note:
parameters = model find these patterns in data
hyperparameters = setting on a model to adjust  or improve its ability to find parameters

In [16]:
# Classificaiton
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)

heart_disease = pd.read_csv("./data/heart-disease.csv")

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier()

clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8524590163934426

In [8]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### Improving Model With Hyperparameters:
- changing hyperparameters manually
- randomly with RandomSearchCV
- Exhaustively with GridSearchCV

Practice
- max_depth
- max_features
- min_sample_leaf
- min_sample_split
- n_estimators

In [21]:
# function for changing hyperparameters
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def evaluate_preds(y_true, y_preds):
    """
    Performs evaluation comparison on y_true (y_test) labels vs y_preds labels
    on a classificaiton problem
    """
    
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)
    f1 = f1_score(y_true, y_preds)
    metrics_dict = {"accuracy": round(accuracy, 2),
                   "precision": round(precision, 2),
                   "recall": round(recall, 2),
                   "f1": round(f1, 2)}
    print(f"Acc: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    
    return metrics_dict

In [23]:
y_preds = clf.predict(X_test)
evaluate_preds(y_test, y_preds)

Acc: 85.25%
Precision: 0.85
Recall: 0.88
F1 Score: 0.86


{'accuracy': 0.85, 'precision': 0.85, 'recall': 0.88, 'f1': 0.86}

In [29]:
# Manual Adjust (Change) hyperparameter to check if model can be improved (or degrade) 
clf_2 = RandomForestClassifier(n_estimators=20, max_depth=70)
clf_2.fit(X_train, y_train)

y_preds_2 = clf_2.predict(X_test)
evaluate_preds(y_test, y_preds_2)

Acc: 86.89%
Precision: 0.88
Recall: 0.88
F1 Score: 0.88


{'accuracy': 0.87, 'precision': 0.88, 'recall': 0.88, 'f1': 0.88}

### Automatic Hyperparameter Adjustment For Model Improvement:
- Randomized Search CV
- Grid Search CV:

In [30]:
from sklearn.model_selection import RandomizedSearchCV

# define a dictionary with the variation of hyperparamenter to use
grid = {"n_estimators": [10,100,200,500,1000, 1200],
       "max_depth": [None, 5, 10, 20, 30],
       "max_features": ["auto", "sqrt"],
       "min_samples_split": [2,4,6],
       "min_samples_leaf": [1,2,3]}

np.random.seed(42)

# Instantiate Model/Estimator, 
# n_jobs specifies the number of jobs to run in parallel, -1 means all
clf = RandomForestClassifier(n_jobs=-1)

# Setup RandomizedSerarchCV
rs_clf = RandomizedSearchCV(estimator=clf, 
                            param_distributions=grid, 
                            n_iter=10, # number of models to try with different combination of the grid
                            cv=5,
                            verbose=2)

# Fit the RandomizedSearchCV version of clf
rs_clf.fit(X_train, y_train) # this will try all those and make rs_clf the best combination

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=6, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   2.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimato

15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mainuldip/ongoing/AI-ML-Study/apps/1-pandas/env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mainuldip/ongoing/AI-ML-Study/apps/1-pandas/env/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/home/mainuldip/ongoing/AI-ML-Study/apps/1-pandas/env/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/home/mainuldip/ongoing/AI-ML

In [31]:
# Get the best parameters setting
rs_clf.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': None}

In [32]:
# make prediction with the best hyperparameters (by default rs_clf is the best combination)
rs_y_preds = rs_clf.predict(X_test)

# Evaluate the predictions
rs_metrics = evaluate_preds(y_test, rs_y_preds)
rs_metrics

Acc: 85.25%
Precision: 0.85
Recall: 0.88
F1 Score: 0.86


{'accuracy': 0.85, 'precision': 0.85, 'recall': 0.88, 'f1': 0.86}

### Change Hyperparameters With Grid Search CV:
RandomizedSearchCV vs GridSearchCV
- its a brute force apply (will try every single combination)

In [37]:
# make second grid of the hyperparameters for trying with GridSearchCV
grid_2 = {'n_estimators': [100,200,500],
         'min_samples_split': [2],
         'min_samples_leaf': [1,2],
         'max_features': ['auto','sqrt'],
         'max_depth':[None]}

In [38]:
from sklearn.model_selection import GridSearchCV

np.random.seed(42)

# Instantiate Model/Estimator, 
# n_jobs specifies the number of jobs to run in parallel, -1 means all
clf = RandomForestClassifier(n_jobs=-1)

# Setup RandomizedSerarchCV
gs_clf = GridSearchCV(estimator=clf, 
                            param_grid=grid_2, 
                            cv=5,
                            verbose=2)

# Fit the Grid Search CV version of clf
gs_clf.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.0s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, 

30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mainuldip/ongoing/AI-ML-Study/apps/1-pandas/env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mainuldip/ongoing/AI-ML-Study/apps/1-pandas/env/lib/python3.10/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/home/mainuldip/ongoing/AI-ML-Study/apps/1-pandas/env/lib/python3.10/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/home/mainuldip/ongoing/AI-ML

In [39]:
gs_clf.best_params_

{'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 100}

In [41]:
gs_y_preds = gs_clf.predict(X_test)

gs_metrics = evaluate_preds(y_test, gs_y_preds)
gs_metrics

Acc: 85.25%
Precision: 0.85
Recall: 0.88
F1 Score: 0.86


{'accuracy': 0.85, 'precision': 0.85, 'recall': 0.88, 'f1': 0.86}

In [6]:
# Regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import cross_val_score

housing = fetch_california_housing()
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
# attatching target column, aka the y
housing_df["target"] = housing["target"]

X = housing_df.drop("target", axis=1)
y = housing_df["target"]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.80119717681911

### Corelation Analyses:
Finding which attribute/column has higher/lower corelations with another attribute/column. With high corelation between columns makes no improvement to the model, so similar can be removed from data to further improve the model.
### Forward/Backword Attribute Selection:
Forward Attribute Selection: Start with all the columns and slowly reduce
Backword: Start with minimal (one) columns and add keep adding 1 column next time while training to check the impact to better improve the model.