CMP6202

# Objectives
In this lab you will implement:
* Hyperparameter Tuning
* GridSearchCV
* RandomSearchCV
* RandomForestClassifier

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [6]:
df = pd.read_csv('/content/drive/MyDrive/CMP6202/datasets/Heart.csv')

In [7]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,1,63,1,typical,145,233,1,2,150,0,2.3,3,0.0,fixed,No
1,2,67,1,asymptomatic,160,286,0,2,108,1,1.5,2,3.0,normal,Yes


In [9]:
df.drop(columns = 'Unnamed: 0', axis=1, inplace= True)

In [10]:
le = LabelEncoder()
df['AHD'] = le.fit_transform(df['AHD'])
df['ChestPain'] = le.fit_transform(df['ChestPain'])
df['Thal'] = le.fit_transform(df['Thal'])


In [11]:
df.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal,AHD
0,63,1,3,145,233,1,2,150,0,2.3,3,0.0,0,0
1,67,1,0,160,286,0,2,108,1,1.5,2,3.0,1,1
2,67,1,0,120,229,0,2,129,1,2.6,2,2.0,2,1
3,37,1,1,130,250,0,0,187,0,3.5,3,0.0,1,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,1,0


In [21]:
df.isna().sum()

Unnamed: 0,0
Age,0
Sex,0
ChestPain,0
RestBP,0
Chol,0
Fbs,0
RestECG,0
MaxHR,0
ExAng,0
Oldpeak,0


In [22]:
df = df.dropna()

In [24]:
# df.isna().sum()

In [12]:
X = df.drop(columns = 'AHD', axis= 1)
y = df['AHD']

In [13]:
X.head()

Unnamed: 0,Age,Sex,ChestPain,RestBP,Chol,Fbs,RestECG,MaxHR,ExAng,Oldpeak,Slope,Ca,Thal
0,63,1,3,145,233,1,2,150,0,2.3,3,0.0,0
1,67,1,0,160,286,0,2,108,1,1.5,2,3.0,1
2,67,1,0,120,229,0,2,129,1,2.6,2,2.0,2
3,37,1,1,130,250,0,0,187,0,3.5,3,0.0,1
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,1


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [27]:
sc = MinMaxScaler()

In [28]:
X_train = sc.fit_transform(X_train)

In [29]:
X_test = sc.transform(X_test)

# RandoForest Classifier

In [30]:
rf = RandomForestClassifier()

In [31]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

rf_acc = accuracy_score(y_pred, y_test)

print('Accuracy score ', rf_acc )


Accuracy score  0.8360655737704918


# [Hyperparameter Tuning](https://scikit-learn.org/stable/modules/grid_search.html)

In [32]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

# [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

In [None]:
from sklearn.model_selection import GridSearchCV

**Hyperparameter tuning** for a *RandomForestClassifier* in scikit-learn using **GridSearchCV** involves defining a grid of hyperparameter values and searching through this grid to find the best combination of hyperparameters.

In [36]:
param_grid = {
    'n_estimators': [3, 10, 30], # The number of trees in the forest.
    'max_features' : [2, 4, 6, 8]


}

# Use the grid of hyperparameters to search
* param_grid = {
*     'n_estimators': [10, 50, 100, 200],
*    'max_depth': [None, 10, 20, 30],
*    'min_samples_split': [2, 5, 10],
*    'min_samples_leaf': [1, 2, 4]
*}

In [37]:
grid_search = GridSearchCV(rf, param_grid= param_grid, cv =5,
                           scoring ='accuracy',
                           return_train_score= True) # we need to get positive score,

### **Be cautious about the computational cost of grid search, especially for a large parameter space.**

In [38]:
grid_search.fit(X_train, y_train)

In [39]:
grid_search.best_estimator_

In [40]:
# best combination found
grid_search.best_params_

{'max_features': 4, 'n_estimators': 30}

In [41]:
# best score achieved
grid_search.best_score_

0.8056972789115647

In [42]:
best_rf = grid_search.best_estimator_

In [43]:
best_rf.score(X_test, y_test)

0.8852459016393442

In [44]:
y_pred = best_rf.predict(X_test)

In [45]:
y_pred

array([0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0])

In [54]:
accuracy_score(y_pred, y_test)

0.819672131147541

# More improvements

In [None]:
# param_grid = {
#     'n_estimators': [30, 50, 100 ],
#     'max_features' : [8, 12, 20],
#     'min_samples_split' : [2, 4, 6, 8]


# }

In [46]:
param_grid = {
    'n_estimators': [3, 5],
    'max_features' : [2, 3],
    'min_samples_split' : [2, 4]


}

In [47]:
grid_search = GridSearchCV(rf, param_grid= param_grid, cv =5,
                           scoring ='accuracy',
                           return_train_score= True) # we need to get positive score,

In [48]:
grid_search.fit(X_train, y_train)

In [49]:
grid_search.best_estimator_

In [50]:
grid_search.best_params_

{'max_features': 3, 'min_samples_split': 2, 'n_estimators': 5}

In [51]:
rf_best = grid_search.best_estimator_

In [52]:
y_pred = rf_best.predict(X_test)

In [53]:
accuracy_score(y_pred, y_test)

0.819672131147541

# [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

 #### Hyperparameter tuning for a **RandomForestClassifier** using **RandomizedSearchCV** involves searching through a range of hyperparameter values to find the combination that optimizes the performance of the model.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [55]:
rf = RandomForestClassifier(random_state=42)

In [None]:
# param_grid = {
#     'n_estimators': [30, 50],
#     'max_features' : [2, 4]

# }

In [56]:
param_grid = {
    'n_estimators': [3, 5],
    'max_features' : [2, 3],
}

## Adjust the param_grid according to your specific needs and the nature of your dataset :
* n_estimators: The number of trees in the forest.
* max_features: The number of features to consider at every split.
* max_depth: The maximum depth of the tree.
* min_samples_split: The minimum number of samples required to split an internal node.
* min_samples_leaf: The minimum number of samples required to be at a leaf node.

## The RandomizedSearchCV will then randomly sample from the provided hyperparameter grid, allowing you to efficiently explore a wide range of hyperparameter combinations.


In [57]:
random_search = RandomizedSearchCV(rf, param_distributions = param_grid, cv=5, random_state=42,
                          # scoring ='neg_mean_squared_error',
                          scoring ='accuracy', # Use a suitable scoring metric for your problem
                          n_jobs=-1,  # Use all available CPU cores
                           return_train_score= True)

In [None]:
# ?random_search


In [58]:
random_search.fit(X_train,y_train)

In [59]:
random_search.best_estimator_

In [60]:
random_search.best_params_

{'n_estimators': 5, 'max_features': 2}

In [61]:
best_random_search_estimator = random_search.best_estimator_

In [62]:
y_pred = best_random_search_estimator.predict(X_test)

In [63]:
accuracy_score(y_pred, y_test)

0.8032786885245902

In [64]:
best_random_search_estimator.score(X_train, y_train)

0.9586776859504132

In [65]:
best_random_search_estimator.score(X_test, y_test)

0.8032786885245902

# References
* [California Housing Prices](https://www.kaggle.com/datasets/camnugent/california-housing-prices/code)
* [Hyper-parameter tuning for scikit learn models](https://medium.com/dvt-engineering/hyper-parameter-tuning-for-scikit-learn-ml-models-860747bc3d72)
* [GridSearchCV vs RandomizedSearchCV](https://www.kdnuggets.com/hyperparameter-tuning-gridsearchcv-and-randomizedsearchcv-explained)