# Random Forest Classifier with Pipeline and Hyperparameter Tuning

In [48]:
import seaborn as sns
import pandas as pd
# Load the 'tips' example dataset through seaborn; the later cells all work from `df`.
df = sns.load_dataset('tips')

In [49]:
# Preview the first rows to see the column names and the kind of values they hold.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [50]:
'''
Problem: input features: [total_bill, tip, sex, smoker, day, size]
         Output feature: time

Using a random forest classifier, the goal is to predict whether the meal time is Dinner or Lunch.
'''

# Show the first rows again as a reference for the problem statement above.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [51]:
# Count the missing values in each column.
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [52]:
df.time        # target column (dependent feature) that the model will predict

0      Dinner
1      Dinner
2      Dinner
3      Dinner
4      Dinner
        ...  
239    Dinner
240    Dinner
241    Dinner
242    Dinner
243    Dinner
Name: time, Length: 244, dtype: category
Categories (2, object): ['Lunch', 'Dinner']

In [53]:
# Class balance of the target: number of rows per meal time.
df['time'].value_counts()

Dinner    176
Lunch      68
Name: time, dtype: int64

In [54]:
# Convert the categorical `time` target into integer labels with scikit-learn's
# LabelEncoder.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

# Fit the encoder and overwrite df['time'] in place with the encoded labels.
# Comparing the value counts before and after this cell shows Dinner -> 0 and
# Lunch -> 1.
# Gotcha: because the column is overwritten, running this cell a second time
# refits `encoder` on the already-encoded integers instead of the original labels.
df['time'] = encoder.fit_transform(df['time'])
df.time

0      0
1      0
2      0
3      0
4      0
      ..
239    0
240    0
241    0
242    0
243    0
Name: time, Length: 244, dtype: int64

In [55]:
# Check the class counts of the encoded target.
df['time'].value_counts()

0    176
1     68
Name: time, dtype: int64

In [56]:
# Confirm that `time` now holds integer labels.
df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3


In [57]:
# Split the frame into model inputs and target.
# The inputs are selected by name (every column except the target) rather than
# by position, so the selection stays correct if the column order ever changes.
x = df.drop(columns=['time'])   # independent features
y = df['time']                  # dependent feature (encoded meal time)

In [58]:
# Display the feature frame and the target series together as a tuple.
x,y

(     total_bill   tip     sex smoker   day  size
 0         16.99  1.01  Female     No   Sun     2
 1         10.34  1.66    Male     No   Sun     3
 2         21.01  3.50    Male     No   Sun     3
 3         23.68  3.31    Male     No   Sun     2
 4         24.59  3.61  Female     No   Sun     4
 ..          ...   ...     ...    ...   ...   ...
 239       29.03  5.92    Male     No   Sat     3
 240       27.18  2.00  Female    Yes   Sat     2
 241       22.67  2.00    Male    Yes   Sat     2
 242       17.82  1.75    Male     No   Sat     2
 243       18.78  3.00  Female     No  Thur     2
 
 [244 rows x 6 columns],
 0      0
 1      0
 2      0
 3      0
 4      0
       ..
 239    0
 240    0
 241    0
 242    0
 243    0
 Name: time, Length: 244, dtype: int64)

In [59]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# same split is produced on every run.
# NOTE(review): the target classes are imbalanced (176 vs. 68 in the counts
# above) and the split is not stratified — consider whether stratify=y is wanted.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.20, random_state=42)

In [60]:
# Shapes of the training features and the training target.
x_train.shape, y_train.shape

((195, 6), (195,))

In [61]:
# Pipelines automate the preprocessing steps: handling missing values, feature scaling, and categorical encoding.

# importing the libraries
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer         # for Handling missing values
from sklearn.preprocessing import StandardScaler       # for Feature scalling
from sklearn.preprocessing import OneHotEncoder        # Categorical to Numerical
from sklearn.compose import ColumnTransformer          # combining Pipelines

In [62]:
# Look at the columns again before assigning them to preprocessing groups.
df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3


In [63]:
# List the column names to sort into categorical and numerical groups.
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [64]:
# Group the feature columns by type so each group can be given its own
# preprocessing pipeline. The target column `time` is in neither list.
categorical_cols = ['sex', 'smoker', 'day']
numerical_cols = ['total_bill', 'tip','size']

# Feature Engineering Automation

In [65]:
# Numerical features: fill missing values with the column median, then
# standardise with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),   # missing values
        ('scaler', StandardScaler()),                    # feature scaling
    ])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' makes a category that was not
# present when the encoder was fitted encode as all zeros at transform time
# instead of raising an error.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),        # missing values
        ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),   # categorical to numerical
    ])

In [66]:
# Route each column group through its own pipeline and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

preprocessor

In [67]:
# Fit the preprocessor on the training rows only, then apply the fitted
# transformation to the test rows.
# The raw rows are re-selected from `x` through the split's index labels rather
# than read back from x_train / x_test: this cell overwrites those two names
# with the transformed output, so reading them back would make a second run of
# the cell fail. Selecting from `x` keeps the cell safe to re-run.
x_train = preprocessor.fit_transform(x.loc[y_train.index])
x_test = preprocessor.transform(x.loc[y_test.index])

In [83]:
# importing random forest classifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Model Training Automation

In [84]:
# Candidate classifiers to compare, keyed by display name.
# random_state is pinned on the randomised learners (same seed as the
# train/test split) so the comparison gives the same scores on every run.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
}

In [85]:
def evaluate_model(x_train, y_train, x_test, y_test, models):
    """Fit every model on the training data and score it on the test data.

    Parameters
    ----------
    x_train, y_train
        Training features and labels, passed to each model's ``fit``.
    x_test, y_test
        Test features and labels; ``predict`` is called on ``x_test`` and the
        predictions are compared with ``y_test``.
    models : dict
        Maps a display name to an estimator object exposing ``fit`` and
        ``predict``. The estimators are fitted in place.

    Returns
    -------
    dict
        Maps each model name to its ``accuracy_score`` on the test data, in
        the same order as ``models``. Empty if ``models`` is empty.
    """
    report = {}
    # Walk the name/estimator pairs directly instead of rebuilding
    # list(models.keys()) and list(models.values()) on every iteration.
    for name, model in models.items():
        model.fit(x_train, y_train)
        y_test_pred = model.predict(x_test)
        report[name] = accuracy_score(y_test, y_test_pred)
    return report

In [86]:
# Fit and score every candidate model on the preprocessed train/test data.
evaluate_model(x_train, y_train, x_test, y_test, models)

{'Random Forest': 0.9591836734693877,
 'Logistic Regression': 1.0,
 'Decision Tree': 0.9387755102040817}

In [87]:
# Base estimator for the hyperparameter search. random_state is fixed so the
# forest built for a given parameter setting is the same on every run.
classifier = RandomForestClassifier(random_state=42)

In [93]:
# Hyperparameter search space for the random forest
# (4 x 3 x 2 = 24 possible combinations).
param = {
    'max_depth' : [3,5,10,None],
    'n_estimators' : [100,200,300],
    'criterion' : ['gini', 'entropy']
}

In [94]:
from sklearn.model_selection import RandomizedSearchCV

In [95]:
# Randomised search over `param` with 5-fold cross-validation, scored by accuracy.
# random_state fixes which parameter combinations are sampled, so the same
# candidates are tried on every run.
cv = RandomizedSearchCV(
    classifier,
    param_distributions=param,
    scoring='accuracy',
    cv=5,
    verbose=3,
    random_state=42,
)
cv.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=10, n_estimators=300;, score=0.974 total time=   0.6s
[CV 2/5] END criterion=gini, max_depth=10, n_estimators=300;, score=0.923 total time=   0.6s
[CV 3/5] END criterion=gini, max_depth=10, n_estimators=300;, score=1.000 total time=   0.6s
[CV 4/5] END criterion=gini, max_depth=10, n_estimators=300;, score=0.949 total time=   0.6s
[CV 5/5] END criterion=gini, max_depth=10, n_estimators=300;, score=0.949 total time=   0.6s
[CV 1/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.974 total time=   0.4s
[CV 2/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.923 total time=   0.4s
[CV 3/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=1.000 total time=   0.4s
[CV 4/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.923 total time=   0.4s
[CV 5/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.923 total time=  

In [96]:
# Best hyperparameter combination found by the search.
cv.best_params_

{'n_estimators': 300, 'max_depth': 10, 'criterion': 'gini'}