# Random Forest Classifier with Pipeline and Hyperparameter Tuning

In [1]:
import seaborn as sns


In [15]:
# Load the 'tips' example dataset through seaborn into a DataFrame.
df = sns.load_dataset('tips')

In [16]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [17]:
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [18]:
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [19]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [20]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the target column `time`. The column is replaced inside
# `df` itself, so every later cell sees the encoded values.
encoder = LabelEncoder()
df['time'] = encoder.fit_transform(df['time'])

In [21]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.50,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,0,3
240,27.18,2.00,Female,Yes,Sat,0,2
241,22.67,2.00,Male,Yes,Sat,0,2
242,17.82,1.75,Male,No,Sat,0,2


In [22]:
# Features are every column except the target; the target is the encoded `time`.
X = df.drop(columns=['time'])
y = df['time']

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
from sklearn.pipeline import Pipeline

In [25]:
from sklearn.impute import SimpleImputer # to handle missing values

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
from sklearn.preprocessing import OneHotEncoder

In [28]:
from sklearn.compose import ColumnTransformer

In [29]:
# Column groups routed to the numeric and categorical preprocessing branches.
num_features = ['total_bill', 'tip', 'size']
cat_features = ['sex', 'smoker', 'day']

In [30]:
# Numerical pipeline: fill missing values with the column median, then
# standardize each feature.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' makes transform() encode a
# category that was absent from the fitted data as all zeros instead of
# raising, so a rare level missing from the training split cannot break
# scoring on the test split.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [31]:
# Apply the numeric pipeline to the numeric columns and the categorical
# pipeline to the categorical columns, concatenating the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_features),
        ('cat_pipeline', cat_pipeline, cat_features),
    ]
)

In [32]:
# Learn the preprocessing statistics on the training split only, then reuse
# them on the test split so no test information leaks into the fit.
# NOTE: this overwrites the raw X_train / X_test frames; re-run the
# train/test split cell before re-running this one.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [33]:
y_train

228    0
208    0
96     0
167    0
84     1
      ..
106    0
14     0
92     0
179    0
102    0
Name: time, Length: 195, dtype: int64

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [38]:
# Candidate classifiers to compare. The tree-based models are randomized, so
# they are seeded to keep the comparison reproducible across kernel restarts.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'DTC': DecisionTreeClassifier(random_state=42),
    'LR': LogisticRegression(),
}

In [39]:
from sklearn.metrics import accuracy_score

In [40]:
def train(X_train,y_train,X_test,y_test,models):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to each model's ``fit``.
    X_test, y_test
        Held-out features and labels used for scoring.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.

    Returns
    -------
    dict
        Test-set accuracy keyed by the same names as ``models``.
    """
    result = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # accuracy_score expects (y_true, y_pred). Accuracy happens to be
        # symmetric, but keeping the documented order stays correct if the
        # metric is later swapped for an asymmetric one (precision, recall).
        result[name] = accuracy_score(y_test, y_pred)
    return result
        
        
        


In [41]:
train(X_train,y_train,X_test,y_test,models)

{'Random Forest': 0.9591836734693877, 'DTC': 0.9387755102040817, 'LR': 1.0}

In [42]:
# Hyperparameter tuning

In [52]:
# Hyperparameter search space for the random forest.
params = {
    'max_depth': [3, 5, 10, None],
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
}

In [53]:
from sklearn.model_selection import RandomizedSearchCV


In [54]:
# Randomized search over `params` with 5-fold cross-validation, scored by
# accuracy. Both the sampler and the forest are seeded so the sampled
# candidates and the reported best parameters are the same on every run.
# `cv` is the fitted search object (not to be confused with the cv=5 argument).
cv = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=params,
    scoring='accuracy',
    cv=5,
    verbose=3,
    random_state=42,
)
cv.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.974 total time=   0.2s
[CV 2/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.923 total time=   0.2s
[CV 3/5] END criterion=gini, max_depth=10, n_estimators=100;, score=1.000 total time=   0.2s
[CV 4/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.949 total time=   0.2s
[CV 5/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.949 total time=   0.2s
[CV 1/5] END criterion=entropy, max_depth=None, n_estimators=100;, score=0.974 total time=   0.2s
[CV 2/5] END criterion=entropy, max_depth=None, n_estimators=100;, score=0.923 total time=   0.2s
[CV 3/5] END criterion=entropy, max_depth=None, n_estimators=100;, score=1.000 total time=   0.2s
[CV 4/5] END criterion=entropy, max_depth=None, n_estimators=100;, score=0.949 total time=   0.2s
[CV 5/5] END criterion=entropy, max_depth=None, n_estimators=100;, score=0.923 tot

In [55]:
cv.best_params_

{'n_estimators': 100, 'max_depth': 10, 'criterion': 'gini'}

In [56]:
# Build the final forest from the hyperparameters the search selected rather
# than hard-coded values that can disagree with `cv.best_params_`.
# oob_score=True additionally records the out-of-bag estimate; the seed keeps
# the fitted model reproducible.
clf = RandomForestClassifier(**cv.best_params_, oob_score=True, random_state=42)

In [57]:
clf.fit(X_train,y_train)

In [58]:
y_pred=clf.predict(X_test)

In [59]:
# Test-set accuracy of the tuned forest; arguments follow the documented
# (y_true, y_pred) order.
accuracy_score(y_test, y_pred)

0.9591836734693877