In [3]:
import pandas as pd
import numpy as np

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.metrics import root_mean_squared_error

In [2]:
# Load the tips dataset bundled with plotly express and preview its first rows
data = px.data.tips()
data.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Separate the features (X) from the target (y)

X = data.drop(columns=['tip']).copy()
y = data['tip']

# Encode the categorical variables in X as integer codes.
# One LabelEncoder is kept per column so that every column's fitted mapping
# (encoders[col].classes_) remains available afterwards; a single shared
# encoder would only remember the last column it was fitted on.
cat_col = ['sex', 'smoker', 'day', 'time']
encoders = {}
for col in cat_col:
    encoders[col] = LabelEncoder()
    X[col] = encoders[col].fit_transform(X[col])

# `encoder` stays bound to the encoder fitted on the last categorical column,
# which is what the name referred to before the per-column dict was introduced.
encoder = encoders[cat_col[-1]]


# Standardise every feature column in X.
# NOTE(review): the scaler is fitted on all rows here, i.e. before the
# train/test split performed in a later cell, so test-set statistics
# influence the scaling. Fit on the training rows only if that matters.
scaler = StandardScaler()
columns = list(X.columns)  # remember the column names; the scaler returns a bare array
X = pd.DataFrame(
    data=scaler.fit_transform(X),
    columns=columns,
    index=X.index,  # keep the original row labels so X stays aligned with y
)

X.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,-0.314711,-1.343353,-0.784789,0.279158,-0.621582,-0.600193
1,-1.063235,0.744406,-0.784789,0.279158,-0.621582,0.453383
2,0.13778,0.744406,-0.784789,0.279158,-0.621582,0.453383
3,0.438315,0.744406,-0.784789,0.279158,-0.621582,-0.600193
4,0.540745,-1.343353,-0.784789,0.279158,-0.621582,1.506958


In [6]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
)

In [21]:
# train the model

# Sweep over the maximum tree depth: fit one random forest per depth and
# record the RMSE on both the training and the test split.
N_ESTIMATORS = 30  # number of trees, shared by the model and the log message
no_depth = list(range(1, 10))

# {train_rmse: test_rmse} -- original structure, kept so existing uses still work
train_test_rmse = {}
# {max_depth: (train_rmse, test_rmse)} -- keyed by depth so each score can be
# traced back to the setting that produced it (float keys cannot do that and
# would silently overwrite each other if two depths gave the same train RMSE)
rmse_by_depth = {}

for depth in no_depth:

    model = RandomForestRegressor(random_state=23,
                                  n_estimators=N_ESTIMATORS, max_depth=depth)
    model.fit(X_train, y_train)
    train_preds = model.predict(X_train)  # predictions on the rows the model was fitted on
    test_preds = model.predict(X_test)  # predictions on the held-out rows

    # evaluate the model performance
    train_rmse = root_mean_squared_error(y_train, train_preds)
    test_rmse = root_mean_squared_error(y_test, test_preds)

    train_test_rmse[train_rmse] = test_rmse
    rmse_by_depth[depth] = (train_rmse, test_rmse)

    print(f'At n_estimator = {N_ESTIMATORS}, max_depth: {depth}...Train rmse: {train_rmse}.....Test_rmse: {test_rmse}')

At n_estimator = 30, max_depth: 1...Train rmse: 1.0659140552173725.....Test_rmse: 1.1571148910917834
At n_estimator = 30, max_depth: 2...Train rmse: 0.9349870551111064.....Test_rmse: 1.0171788086975255
At n_estimator = 30, max_depth: 3...Train rmse: 0.8611649437666431.....Test_rmse: 1.0404996707813297
At n_estimator = 30, max_depth: 4...Train rmse: 0.7951444661158821.....Test_rmse: 1.0535821463769888
At n_estimator = 30, max_depth: 5...Train rmse: 0.718807855502031.....Test_rmse: 1.0249475760907472
At n_estimator = 30, max_depth: 6...Train rmse: 0.6436767564997581.....Test_rmse: 1.0178111937306094
At n_estimator = 30, max_depth: 7...Train rmse: 0.5806796167253975.....Test_rmse: 1.0103168813799817
At n_estimator = 30, max_depth: 8...Train rmse: 0.5276710100800655.....Test_rmse: 0.9746665956751261
At n_estimator = 30, max_depth: 9...Train rmse: 0.4962017415385506.....Test_rmse: 0.9773269479345692


2.99827868852459

In [15]:
# Show the hyperparameters of the estimator currently bound to `model`
# (when the notebook is run top to bottom, that is the last forest fitted
# in the depth sweep above).
model.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 23,
 'verbose': 0,
 'warm_start': False}

### Classification 

In [23]:
# Load the iris dataset bundled with plotly express and preview its first rows
data_flower = px.data.iris()
data_flower.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1
