# Random forest classifier implementation with pipelines and hyperparamenter tuning

In [23]:
import seaborn as sns
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


We can perform the following operations.

1. Handling missing values
2. Handling outliers
3. Feature scaling

EDA cannot be automated. It is all about analyzing data. We can automate feature engineering, model training, model evaluation etc.

In [24]:
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
df['time'] = LE.fit_transform(df['time'])

# Divide dataset into independent and dependent features

In [25]:
X = df.drop(columns='time')
y = df['time']

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=963258)

# Pipeline
Automating Feature Engineering

In [27]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [28]:
categorical_cols = ['sex', 'smoker', 'day']
numerical_cols = ['total_bill', 'tip', 'size']

In [29]:
num_pipeline = Pipeline(
                           steps = [
                                       ('imputer', SimpleImputer(strategy='median')),
                                       ('scaler', StandardScaler())
                                   ]
                       )

In [30]:
cat_pipeline = Pipeline(
                           steps = [
                                       ('imputer', SimpleImputer(strategy='most_frequent')),
                                       ('OH_Encoder', OneHotEncoder())
                                   ]
                       )

In [31]:
# Creating a wrapper
preprocessor = ColumnTransformer([
                                     ('num_pipeline', num_pipeline, numerical_cols),
                                     ('cat_pipeline', cat_pipeline, categorical_cols)
                                ])

In [32]:
X_train = preprocessor.fit_transform(X_train)

In [33]:
X_test = preprocessor.transform(X_test)

# Automate model training

In [44]:
# import all required models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [45]:
models = {
             'Random forest classifier' : RandomForestClassifier(),
             'Decision Tree classifier' : DecisionTreeClassifier(),
             'Support Vector Classifier' : SVC()
         }

In [46]:
from sklearn.metrics import accuracy_score

In [47]:
def evaluate_model (X_train, y_train, X_test, y_test, models) :
    report = {}
    for i in range(len(models)) :
        model = list(models.values())[i]
        # Train model
        model.fit(X_train, y_train)
        # prediction
        y_pred = model.predict(X_test)
        # accuracy
        acc = accuracy_score(y_test, y_pred)
        report[list(models.keys())[i]] = acc
    return report

In [48]:
evaluate_model(X_train, y_train, X_test, y_test, models)

{'Random forest classifier': 0.8979591836734694,
 'Decision Tree classifier': 0.9183673469387755,
 'Support Vector Classifier': 0.9387755102040817}

After knowing which model is performing better you can do hyperparameter tuning

In [55]:
clf = SVC()
params = {
             'max_iter' : [3, 5, 10, 15],
             'random_state' : [100, 200, 300],
         }   

In [56]:
from sklearn.model_selection import RandomizedSearchCV
RSCV = RandomizedSearchCV(clf, param_distributions=params, cv = 5, scoring='accuracy', verbose=3)
RSCV.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ......max_iter=3, random_state=300;, score=1.000 total time=   0.0s
[CV 2/5] END ......max_iter=3, random_state=300;, score=0.974 total time=   0.0s
[CV 3/5] END ......max_iter=3, random_state=300;, score=0.897 total time=   0.0s
[CV 4/5] END ......max_iter=3, random_state=300;, score=0.872 total time=   0.0s
[CV 5/5] END ......max_iter=3, random_state=300;, score=0.872 total time=   0.0s
[CV 1/5] END .....max_iter=10, random_state=200;, score=0.949 total time=   0.0s
[CV 2/5] END .....max_iter=10, random_state=200;, score=0.923 total time=   0.0s
[CV 3/5] END .....max_iter=10, random_state=200;, score=0.923 total time=   0.0s
[CV 4/5] END .....max_iter=10, random_state=200;, score=0.923 total time=   0.0s
[CV 5/5] END .....max_iter=10, random_state=200;, score=0.974 total time=   0.0s
[CV 1/5] END ......max_iter=5, random_state=300;, score=0.897 total time=   0.0s
[CV 2/5] END ......max_iter=5, random_state=300;



In [57]:
RSCV.best_params_

{'random_state': 300, 'max_iter': 15}

# Internal Assignment
Use total_bill as dependent feature