## RandomSearchCV
**When you have too many parameters to test, your training time is longer. So, Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. It helps reduce the cost of computation.** <br>
ie., n_iter - Number of parameter settings that are sampled - from the given set of hyperparameters, no. of combinaions of parameters to be applied and searched <br>
**Refer : https://www.youtube.com/watch?v=w4frwjt8uCo** <br>
**Refer Jupyter Notebooks:** 16_hyper_parameter_tuning and 16_hyper_parameter_tuning_digits

#### Difference:
**GridSearchCV** - Exhaustively tries all combinations within the sample space <br>
**RandomSearchCV** - Randomly selects a subset of combinations within the sample space (n_iter)

## Random Forest Classification with RandomizedSearchCV

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Importing the dataset
dataset = pd.read_csv('Datasets/07_social_network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

In [3]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state =0)

In [4]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [5]:
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 50)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=50)

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

rf_p_dist={'max_depth':[3,5,10,None],
              'n_estimators':[10,100,200,300,400,500],
              'max_features':randint(1,3),
               'criterion':['gini','entropy'],
               'bootstrap':[True,False],
               'min_samples_leaf':randint(1,4),
              }

In [7]:
rdmsearch = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1) , param_distributions = rf_p_dist,
                              n_jobs=-1, n_iter=40, cv=9)
rdmsearch_fit = rdmsearch.fit(X_train,y_train)

In [8]:
rdmsearch_fit.best_estimator_
# Estimator that was chosen by the search, 
# i.e. estimator which gave highest score (or smallest loss if specified) on the left out data. 

RandomForestClassifier(criterion='entropy', max_depth=3, max_features=2,
                       n_estimators=500, n_jobs=-1)

In [9]:
rdmsearch_fit.best_score_
# Mean cross-validated score of the best_estimator.

0.9037433155080213

In [10]:
rdmsearch_fit.best_params_
# Parameter setting that gave the best results on the hold out data.

{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 3,
 'max_features': 2,
 'min_samples_leaf': 1,
 'n_estimators': 500}

In [11]:
rdmsearch_fit.best_index_
# The index (of the cv_results_ arrays) which corresponds to the best candidate parameter setting.

23

In [12]:
rdmsearch_fit.n_splits_
# The number of cross-validation splits (folds/iterations).

9

In [13]:
df = pd.DataFrame(rdmsearch_fit.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_n_estimators,...,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,mean_test_score,std_test_score,rank_test_score
0,0.40796,0.052675,0.044884,0.007114,True,gini,5.0,1,1,100,...,0.794118,0.909091,0.909091,0.848485,0.909091,0.969697,0.969697,0.890572,0.056948,11
1,1.980353,0.387481,0.160894,0.008391,True,entropy,3.0,1,1,500,...,0.823529,0.878788,0.909091,0.878788,0.909091,1.0,0.969697,0.900475,0.055533,2
2,0.858299,0.07222,0.073812,0.004703,True,gini,5.0,2,2,200,...,0.823529,0.878788,0.848485,0.818182,0.909091,1.0,0.969697,0.883739,0.061793,25
3,0.520634,0.082183,0.042729,0.003776,True,gini,5.0,1,1,100,...,0.794118,0.909091,0.848485,0.848485,0.909091,1.0,0.969697,0.890473,0.063867,12
4,0.045911,0.005874,0.020022,0.00348,False,entropy,3.0,1,1,10,...,0.823529,0.878788,0.818182,0.878788,0.909091,0.969697,0.969697,0.887007,0.055484,23
5,0.86904,0.153947,0.075373,0.004789,True,gini,,1,3,200,...,0.823529,0.878788,0.878788,0.848485,0.909091,1.0,0.969697,0.893741,0.057711,7
6,0.040643,0.004125,0.016876,0.003759,False,entropy,5.0,2,2,10,...,0.735294,0.878788,0.848485,0.818182,0.909091,1.0,0.878788,0.857298,0.074285,32
7,1.969371,0.244154,0.136998,0.01077,True,gini,5.0,2,2,400,...,0.823529,0.878788,0.878788,0.848485,0.909091,1.0,0.969697,0.890473,0.05743,12
8,0.049054,0.00332,0.014426,0.00189,True,gini,5.0,2,1,10,...,0.852941,0.909091,0.848485,0.848485,0.878788,0.969697,0.969697,0.883739,0.051048,25
9,0.35658,0.015706,0.05143,0.007323,False,entropy,,1,3,100,...,0.823529,0.878788,0.878788,0.818182,0.909091,1.0,0.969697,0.890374,0.057806,17


In [17]:
# sort by test score
df.sort_values('rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_n_estimators,...,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,mean_test_score,std_test_score,rank_test_score
23,2.158455,0.363353,0.163097,0.018438,True,entropy,3.0,2,1,500,...,0.823529,0.878788,0.909091,0.878788,0.909091,1.0,0.969697,0.903743,0.056948,1
1,1.980353,0.387481,0.160894,0.008391,True,entropy,3.0,1,1,500,...,0.823529,0.878788,0.909091,0.878788,0.909091,1.0,0.969697,0.900475,0.055533,2
12,2.298426,0.20511,0.174672,0.010456,True,gini,,1,3,500,...,0.823529,0.909091,0.909091,0.848485,0.909091,1.0,0.969697,0.900475,0.05734,2
26,0.92,0.110671,0.073403,0.007298,True,entropy,10.0,2,3,200,...,0.882353,0.878788,0.909091,0.818182,0.909091,1.0,0.969697,0.900277,0.056238,4
36,0.557942,0.060911,0.046562,0.00237,True,gini,,1,3,100,...,0.823529,0.878788,0.878788,0.878788,0.909091,1.0,0.969697,0.897108,0.055826,5
20,0.380085,0.055656,0.046008,0.003809,False,gini,10.0,1,3,100,...,0.823529,0.878788,0.878788,0.848485,0.909091,1.0,0.969697,0.897009,0.052582,6
30,1.209174,0.156225,0.098462,0.020529,True,entropy,5.0,2,3,300,...,0.823529,0.878788,0.909091,0.818182,0.909091,1.0,0.969697,0.893741,0.061145,7
27,0.840798,0.102998,0.083306,0.011267,True,entropy,5.0,1,2,200,...,0.823529,0.878788,0.878788,0.848485,0.909091,1.0,0.969697,0.893741,0.057711,7
5,0.86904,0.153947,0.075373,0.004789,True,gini,,1,3,200,...,0.823529,0.878788,0.878788,0.848485,0.909091,1.0,0.969697,0.893741,0.057711,7
39,0.05331,0.011145,0.01687,0.002501,True,gini,3.0,2,2,10,...,0.823529,0.878788,0.878788,0.878788,0.878788,1.0,0.939394,0.893642,0.054139,10


In [14]:
claasifier = RandomForestClassifier(max_depth=5, max_features=1, min_samples_leaf=3,
                       n_estimators=400, n_jobs=-1)

In [15]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [16]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.92