In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
import plotly.express as px

In [2]:
# load the tips dataset that ships with plotly express
data = px.data.tips()
# preview the first few rows
data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# separate the categorical and continuous variables
cat_cols = ['sex', 'smoker', 'day', 'time']
# everything that is not categorical counts as continuous; because this is
# built from all of data's columns, the target 'tip' is still in the list
cont_cols = [name for name in data.columns if name not in cat_cols]

In [None]:
# separate the target (y) from the features (X)
y = data['tip']
X = data.drop(columns='tip')  # drop returns a new frame; data itself is left intact

In [6]:
# encode the categorical variables
#
# Each encoder is fitted on the raw labels in `data`, not on X. X[col] is
# overwritten with integer codes below, so fitting on X would make a second run
# of this cell learn the codes instead of the original labels, and the saved
# encoders could no longer translate raw strings at inference time. X was
# derived from `data` by dropping a column only, so the rows line up one to one.
label_encoders = {}

for col in cat_cols:
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(data[col])
    # keep the fitted encoder so the same mapping can be reused later
    label_encoders[col] = encoder

In [9]:
# hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

In [12]:
# inspect the feature columns of the training split
# NOTE(review): the scaling cell rebinds X_train to the scaler's output, so this
# presumably only works before that cell has run — confirm on a fresh kernel
X_train.columns

Index(['total_bill', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [13]:
# standardise the features: the scaler is fitted on the training split only and
# then applied to both splits
scaler = StandardScaler().fit(X_train)

# X_train / X_test are rebound to the transformed output here, so this cell is
# meant to run once per fresh split (a re-run would fit on already scaled data)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [19]:
# fit a random forest regressor on the training split
model = RandomForestRegressor(
    n_estimators=100, max_depth=10, random_state=23
).fit(X_train, y_train)

# predict on both splits so the two errors can be compared side by side
train_preds, test_preds = model.predict(X_train), model.predict(X_test)

# root mean squared error per split
train_score = root_mean_squared_error(y_train, train_preds)
test_score = root_mean_squared_error(y_test, test_preds)

print(f'Train: {train_score}')
print(f'Test: {test_score}')

Train: 0.45493211007137724
Test: 0.9753777497397842


In [20]:
# package the model's artifacts for production inference

import joblib

# the fitted encoders and scaler are saved next to the model so the same
# preprocessing objects can be loaded together with it
joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(model, 'model_v1.pkl')

['model_v1.pkl']