# Random Forest Implementation

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Load the 'tips' dataset through seaborn and preview its first rows.

df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [None]:
# Count the missing values in each column.
df.isnull().sum()

Unnamed: 0,0
total_bill,0
tip,0
sex,0
smoker,0
day,0
time,0
size,0


In [None]:
# List the distinct values of the 'day' column.
df['day'].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [None]:
# List the distinct values of the 'time' column (used as the target below).
df['time'].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

# Steps:

1. Convert the target column (`time`) into numeric labels using `LabelEncoder`.

2. Segregate the independent and dependent features.

3. Split the data into training and test sets.

In [None]:
# Step 1: replace the string labels in the target column 'time' with integer codes.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
# NOTE(review): LabelEncoder numbers the sorted class labels, so this should give
# Dinner -> 0 and Lunch -> 1; inspect encoder.classes_ to confirm the mapping.
# This overwrites df['time'] in place, so the original strings are only
# recoverable through the fitted encoder.
df['time'] = encoder.fit_transform(df['time'])

In [None]:
# Check the distinct values of 'time' after encoding.
df.time.unique()

array([0, 1])

In [None]:
# Step 2: separate the target (y) from the remaining feature columns (x)
y = df['time']
x = df.drop(columns=['time'])

In [None]:
# Step 3: hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)


# NOTE:

1.) In a production project, we don't perform feature scaling and feature engineering manually, step by step, the way we have done until now.

2.) Instead, we automate the whole process. This is done by using Pipeline.

3.) We will define all the preprocessing steps — handling numerical features, handling categorical features, and handling missing values — in a **Pipeline**.

4.) And we will run this **Pipeline** in an automated way.

# Pipeline technique implementation

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer   #  Handles missing values      # only used in Automation
from sklearn.preprocessing import StandardScaler # Feature Scaling
from sklearn.preprocessing import OneHotEncoder # categorical to numerical
from sklearn.compose import ColumnTransformer

In our application, the model is not trained just once. New data is generated continuously, so model training will be repeated regularly whenever new data arrives.

# Segregating categorical and numerical features

In [None]:
# Feature columns grouped by the preprocessing they receive below.
# The target column 'time' is intentionally absent from both lists.
categorical_cols = ['sex', 'smoker', 'day']
numerical_cols = ['total_bill', 'tip', 'size']

# Feature Engineering Automation

In [None]:
# Numerical features: fill missing values with the column median, then standardize.
num_Pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy = 'median')),          # Handling missing values
        ('scaler', StandardScaler() )                             # feature scaling
    ]
)


# Categorical features: fill missing values with the most frequent category,
# then one-hot encode.
# handle_unknown='ignore' makes a category that was not present when the
# encoder was fitted encode as all zeros instead of raising at transform time,
# which matters when the pipeline is re-run on newly arriving data.
cat_Pipeline = Pipeline(
    steps = [
        ('imputer', SimpleImputer(strategy = 'most_frequent')),   # Handling missing values
        ('onehotencoder', OneHotEncoder(handle_unknown = 'ignore') )   # Categorical features into numerical
    ]
)


# Using Column Transformer to join the 2 pipelines

In [None]:
# Route each column group through its own sub-pipeline and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_Pipeline, numerical_cols),
        ('cat_pipeline', cat_Pipeline, categorical_cols),
    ]
)

In [None]:
 # Display the assembled preprocessor.
 preprocessor

In [None]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformers to the test split, so test rows do not influence the
# imputation, scaling or encoding that the model is trained on.
X_train = preprocessor.fit_transform(x_train)
X_test = preprocessor.transform(x_test)

# Model training Automation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Candidate models keyed by display name.
# The forest is seeded so that the reported accuracy is the same on every
# Restart & Run All instead of varying from run to run.
models = {
    'Random Forest': RandomForestClassifier(random_state=42)
}

In [None]:
# Show the container type of `models`.
type(models)

dict

In [None]:
# Look at the first model object stored in `models`.
list(models.values())[0]

In [None]:
def Evaluate_model(x_train, y_train, x_test, y_test, models):
    """Fit each model on the training data and score it on the test data.

    Parameters
    ----------
    x_train, y_train
        Training features and labels, passed to each model's ``fit``.
    x_test, y_test
        Held-out features passed to ``predict`` and the labels the
        predictions are compared against.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    dict
        Maps each model name to its ``accuracy_score`` on the test data.
    """
    report = {}
    for name, model in models.items():
        # Use the arguments rather than the notebook-level X_train / X_test
        # globals, so the function evaluates exactly the data it was given.
        model.fit(x_train, y_train)

        # Predict on the held-out data
        y_test_pred = model.predict(x_test)

        # Accuracy of the test-set predictions
        report[name] = accuracy_score(y_test, y_test_pred)

    return report


In [None]:
# Train and score every entry of `models` on the preprocessed train/test arrays.
Evaluate_model(X_train, y_train, X_test, y_test, models)

{'Random Forest': 0.9591836734693877}

# Random Forest with Hyperparameter tuning

In [None]:
# Hyperparameter search space for the random forest:
# maximum tree depth (None is one of the options), number of trees, and split criterion.
parameters = {
    'max_depth' : [3,5,10, None],
    'n_estimators' : [100, 200, 300],
    'criterion' : ['gini', 'entropy']
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Base estimator for the hyperparameter search; seeded so that each candidate's
# cross-validation score does not change between runs.
classifier = RandomForestClassifier(random_state=42)

In [None]:
# Randomized search over `parameters`, scored by 5-fold cross-validated accuracy.
# random_state fixes which parameter combinations are sampled, so the search
# (and the best parameters it reports) is repeatable.
classifier_cv = RandomizedSearchCV(
    classifier,
    param_distributions=parameters,
    scoring='accuracy',
    cv=5,
    verbose=3,
    random_state=42,
)

In [None]:
# Run the search on the preprocessed training data only; the test set stays untouched.
classifier_cv.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=entropy, max_depth=3, n_estimators=200;, score=0.974 total time=   0.4s
[CV 2/5] END criterion=entropy, max_depth=3, n_estimators=200;, score=0.949 total time=   0.5s
[CV 3/5] END criterion=entropy, max_depth=3, n_estimators=200;, score=0.974 total time=   0.5s
[CV 4/5] END criterion=entropy, max_depth=3, n_estimators=200;, score=0.923 total time=   0.5s
[CV 5/5] END criterion=entropy, max_depth=3, n_estimators=200;, score=0.949 total time=   0.5s
[CV 1/5] END criterion=entropy, max_depth=None, n_estimators=200;, score=0.974 total time=   0.5s
[CV 2/5] END criterion=entropy, max_depth=None, n_estimators=200;, score=0.923 total time=   0.5s
[CV 3/5] END criterion=entropy, max_depth=None, n_estimators=200;, score=1.000 total time=   0.4s
[CV 4/5] END criterion=entropy, max_depth=None, n_estimators=200;, score=0.949 total time=   0.3s
[CV 5/5] END criterion=entropy, max_depth=None, n_estimators=200;, score

In [None]:
# Parameter combination selected by the search.
classifier_cv.best_params_

{'n_estimators': 300, 'max_depth': 3, 'criterion': 'entropy'}

# Create Random Forest model with best parameters

In [None]:
# Build the final model from whatever the search selected instead of copying
# the values by hand, so this cell stays correct whenever the search is re-run.
# The seed keeps the final test accuracy reproducible.
best_classifier = RandomForestClassifier(**classifier_cv.best_params_, random_state=42)

In [None]:
# Train the tuned model on the preprocessed training data.
best_classifier.fit(X_train, y_train)

In [None]:
# Predict labels for the preprocessed test data.
y_pred = best_classifier.predict(X_test)

In [None]:
# Accuracy of the tuned model's predictions on the test set.
score = accuracy_score(y_test, y_pred)
score

1.0

# Analysis:

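Here, the test accuracy increases from 95.9% to 100% after hyperparameter tuning. Note that the test set is small (49 rows), so this gap corresponds to only two predictions and should be treated as indicative rather than conclusive.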