# Random Forest Classifier with Pipline and Hyperparameter Tuning


### My Output feature is `time` 

In [367]:
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline # for pipeline
from sklearn.impute import SimpleImputer # to handle missing values
from sklearn.preprocessing import StandardScaler # for feature scaling
from sklearn.preprocessing import OneHotEncoder # To encode Categorical data to numerical data
from sklearn.compose import ColumnTransformer # To concat piplines
from sklearn.ensemble import RandomForestClassifier ,RandomForestRegressor
from sklearn.metrics import accuracy_score # to check accuracy
from sklearn.tree import DecisionTreeClassifier , DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV , GridSearchCV # for hyper parameter tuning
from sklearn.metrics import r2_score # for RandomForestRegressor accuracy score
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import LogisticRegression


In [368]:
df = sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [369]:
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [370]:
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


### label encoding in time feature

In [371]:

df['time']=LabelEncoder().fit_transform(df['time'])

In [372]:
df['time'].value_counts()

time
0    176
1     68
Name: count, dtype: int64

### segregating into independet and dependent features

In [373]:
# segregating into independet and dependent features
X = df.drop('time',axis=1)
y = df['time']

### Train test split

In [374]:
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.20, random_state=42)

In [375]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.50,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,0,3
240,27.18,2.00,Female,Yes,Sat,0,2
241,22.67,2.00,Male,Yes,Sat,0,2
242,17.82,1.75,Male,No,Sat,0,2


In [376]:
cat_cols = ['sex','smoker','day']
num_cols = ['total_bill','tip','size']

In [377]:
df[cat_cols]

Unnamed: 0,sex,smoker,day
0,Female,No,Sun
1,Male,No,Sun
2,Male,No,Sun
3,Male,No,Sun
4,Female,No,Sun
...,...,...,...
239,Male,No,Sat
240,Female,Yes,Sat
241,Male,Yes,Sat
242,Male,No,Sat


In [378]:
df[num_cols]

Unnamed: 0,total_bill,tip,size
0,16.99,1.01,2
1,10.34,1.66,3
2,21.01,3.50,3
3,23.68,3.31,2
4,24.59,3.61,4
...,...,...,...
239,29.03,5.92,3
240,27.18,2.00,2
241,22.67,2.00,2
242,17.82,1.75,2


## Feature Engineering Automation

### Numercal Pipelines

In [379]:
num_pipeline = Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')), # To handle missing values of numerical features
    ('scaler',StandardScaler()) # to do Scaling
    ]
)

In [380]:
cat_pipeline = Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')), # to handle mising values of categorical features
    ('onehotencoder',OneHotEncoder()) # to do encoding of nominal categorical features
    ]
)

In [381]:
preprocessor = ColumnTransformer(
    transformers=[
    ('num_pipeline',num_pipeline,num_cols),
    ('cat_pipeline',cat_pipeline,cat_cols)
    ]
)

In [382]:
X_train_preprocessor= preprocessor.fit_transform(X_train) # we do fit and transform X_train data
X_test_preprocessor = preprocessor.transform(X_test) # we do only transform X_test data

In [383]:
X_test_preprocessor.shape, X_train_preprocessor.shape

((49, 11), (195, 11))

## Model Trainig Automation

In [384]:
models = {
    'Random Forest' : RandomForestClassifier()
}
models.update({'Decision Tree': DecisionTreeClassifier()})

In [385]:
def evaluate_model(X_train,y_train,X_test,y_test,models):
    report = {}
    for i in range(len(models)):
        model = list(models.values())[i] # this i is giving index of model
        # Train model 
        model.fit(X_train,y_train)

        # Predict Testing data
        y_pred_test = model.predict(X_test)

        # accuracy scores of test data
        test_model_score = accuracy_score(y_true= y_test,y_pred=y_pred_test)

        report[list(models.keys())[i]] = test_model_score

    return report

In [386]:
model_report = evaluate_model(X_train=X_train_preprocessor,y_train=y_train,y_test=y_test,X_test=X_test_preprocessor,models=models)
model_report

{'Random Forest': 0.9591836734693877, 'Decision Tree': 0.9387755102040817}

## Hyper parameter Tuning
with RandomizedSearchCV

In [387]:
parametes = {
    'max_depth' : [3,5,10,15,None],
    'criterion' : ['gini','entropy'],
    'random_state' : [None,23,44,42,3],
    'n_estimators' : [100,200,300]
}

In [388]:

randCV =RandomizedSearchCV(estimator=RandomForestClassifier(),param_distributions=parametes,n_jobs=-1,cv=10,verbose=3)

In [389]:
import warnings 
# warnings.filterwarnings('ignore')

In [390]:
randCV.fit(X_train_preprocessor,y_train)


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [391]:
randCV.best_params_

{'random_state': 42,
 'n_estimators': 200,
 'max_depth': 3,
 'criterion': 'entropy'}

In [392]:
randCV.predict(X_test_preprocessor)

array([0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1])

In [393]:
accuracy_score(y_test,randCV.predict(X_test_preprocessor))

1.0

# Random Forest Regressor with Pipline and Hyperparameter Tuning
## my output feature is total_bill


In [394]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.50,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,0,3
240,27.18,2.00,Female,Yes,Sat,0,2
241,22.67,2.00,Male,Yes,Sat,0,2
242,17.82,1.75,Male,No,Sat,0,2


### segregating Dependent and independent features

In [395]:
df = sns.load_dataset('tips')

In [396]:
X = df.iloc[:,1:]
y = df.iloc[:,:1]

In [397]:
X , y

(      tip     sex smoker   day    time  size
 0    1.01  Female     No   Sun  Dinner     2
 1    1.66    Male     No   Sun  Dinner     3
 2    3.50    Male     No   Sun  Dinner     3
 3    3.31    Male     No   Sun  Dinner     2
 4    3.61  Female     No   Sun  Dinner     4
 ..    ...     ...    ...   ...     ...   ...
 239  5.92    Male     No   Sat  Dinner     3
 240  2.00  Female    Yes   Sat  Dinner     2
 241  2.00    Male    Yes   Sat  Dinner     2
 242  1.75    Male     No   Sat  Dinner     2
 243  3.00  Female     No  Thur  Dinner     2
 
 [244 rows x 6 columns],
      total_bill
 0         16.99
 1         10.34
 2         21.01
 3         23.68
 4         24.59
 ..          ...
 239       29.03
 240       27.18
 241       22.67
 242       17.82
 243       18.78
 
 [244 rows x 1 columns])

## Train test split

In [398]:
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.33, random_state=42)

In [399]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [400]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [401]:
# for piplines 
categorical_cols = ['sex', 'smoker', 'day', 'time']
numerical_cols = ['tip', 'size']

## Feature Engineering Automation

In [402]:
categorical_pipline = Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehotencoder',OneHotEncoder())
    ]
)

numerical_pipline = Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
    ]
)

In [403]:
preprocessor_reg = ColumnTransformer(
    transformers=[
    ('categorical_pipline',categorical_pipline,categorical_cols),
    ('numerical_pipline',numerical_pipline,numerical_cols)
    ]
)

In [404]:
X_train_preprocessor_reg = preprocessor_reg.fit_transform(X_train)
X_test_preprocessor_reg = preprocessor_reg.transform(X_test)

In [405]:
X_train_preprocessor_reg.shape ,X_test_preprocessor_reg.shape

((163, 12), (81, 12))

In [406]:
y_test

Unnamed: 0,total_bill
24,19.82
6,8.77
153,24.55
211,25.89
198,13.00
...,...
180,34.65
5,25.29
56,38.01
125,29.80


## Hyper parameter Tuning
with RandomizedSearchCV

In [407]:
parametes_reg = {
    'max_depth' : [3,5,10,15,None],
    'criterion' : ['friedman_mse','absolute_error','squared_error'],
     'max_features' : ['sqrt','log2'],
    'n_estimators' : [100,200,300]
}

In [408]:
randCV_reg = RandomizedSearchCV(estimator=RandomForestRegressor(),
                                param_distributions=parametes_reg,
                                n_iter=10,scoring='neg_mean_absolute_error',
                                verbose=3,cv=5)

In [409]:
randCV_reg.fit(X_train_preprocessor_reg,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=absolute_error, max_depth=10, max_features=log2, n_estimators=200;, score=-3.898 total time=   0.4s
[CV 2/5] END criterion=absolute_error, max_depth=10, max_features=log2, n_estimators=200;, score=-4.110 total time=   0.3s
[CV 3/5] END criterion=absolute_error, max_depth=10, max_features=log2, n_estimators=200;, score=-4.593 total time=   0.2s
[CV 4/5] END criterion=absolute_error, max_depth=10, max_features=log2, n_estimators=200;, score=-6.416 total time=   0.4s
[CV 5/5] END criterion=absolute_error, max_depth=10, max_features=log2, n_estimators=200;, score=-4.417 total time=   0.4s
[CV 1/5] END criterion=absolute_error, max_depth=3, max_features=sqrt, n_estimators=300;, score=-3.320 total time=   0.4s
[CV 2/5] END criterion=absolute_error, max_depth=3, max_features=sqrt, n_estimators=300;, score=-5.984 total time=   0.6s
[CV 3/5] END criterion=absolute_error, max_depth=3, max_features=sqrt, n_estimat

In [410]:
randCV_reg.best_params_

{'n_estimators': 200,
 'max_features': 'sqrt',
 'max_depth': None,
 'criterion': 'friedman_mse'}

## Model Trainig Automation

In [417]:
models_reg = {
    'Random Forest Regressor' : RandomForestRegressor(random_state=9),
    'DecisionTree Regressor' : DecisionTreeRegressor(random_state=34)
    
}


In [419]:
models_regCv = {
    'Random Forest Regressor' : RandomForestRegressor(n_estimators=200,criterion='friedman_mse',random_state=9,max_depth= None,max_features='sqrt'),
    'DecisionTree Regressor' : DecisionTreeRegressor(random_state=34)
    
}


In [413]:
def modelAuto(X_train,X_test,y_train,y_test,models):
    model_report_reg = {}
    for i in range(len(models)):
        modelsTrain = list(models.values())[i]
        modelsTrain.fit(X_train,y_train)

        y_pred_test_reg = modelsTrain.predict(X_test)

        acc_score = r2_score(y_true=y_test,y_pred=y_pred_test_reg) 
        
        accAdjusted_r2_score =  1-(1-acc_score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
        model_report_reg[list(models.keys())[i]] = acc_score , accAdjusted_r2_score

    return model_report_reg

In [414]:
modelAuto(X_train=X_train_preprocessor_reg,y_train=y_train,X_test=X_test_preprocessor_reg,y_test=y_test,models=models_reg)

{'Random Forest Regressor': (0.42825412274806485, 0.32735779146831157),
 'DecisionTree Regressor': (0.26491985905464044, 0.13519983418192993)}

In [420]:
modelAuto(X_train=X_train_preprocessor_reg,y_train=y_train,X_test=X_test_preprocessor_reg,y_test=y_test,models=models_regCv)


{'Random Forest Regressor': (0.4278066877895865, 0.3268313973995136),
 'DecisionTree Regressor': (0.26491985905464044, 0.13519983418192993)}