In [1]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
import seaborn as sns

In [3]:
# Load the `tips` example dataset through seaborn and preview the first rows.
df = sns.load_dataset("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Target is `total_bill`; every other column is a feature.
# Columns are selected by name rather than by position so the feature/target
# split does not silently change if the column order of `df` ever differs.
X = df.drop(columns=["total_bill"])
y = df["total_bill"]

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and every result derived from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
## Pipelining
# Numeric columns: fill missing values (NaN) with the column median, then
# standardise the result with StandardScaler.
numeric_preprocessor = Pipeline(
    steps=[
        # Step name now matches the strategy actually used (median, not mean).
        ("imputation_median", SimpleImputer(missing_values=np.nan, strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [7]:
from sklearn import set_config

# Ask scikit-learn to render estimators as diagrams, then show the
# numeric preprocessing pipeline as this cell's output.
set_config(display="diagram")
numeric_preprocessor

In [8]:
# Categorical columns: replace missing entries with the literal label
# "missing", then one-hot encode. Categories not seen during fit are ignored
# at transform time (handle_unknown="ignore") instead of raising.
categorical_preprocessor = Pipeline(
    steps=[
        ("imputation_constant", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [9]:
# In order to combine the numerical and categorical preprocessing branches,
# we use a ColumnTransformer that routes each group of columns to its pipeline.
categorical_columns = ["sex", "smoker", "day", "time"]
numerical_columns = ["tip", "size"]

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, categorical_columns),
        ("numerical", numeric_preprocessor, numerical_columns),
    ]
)

preprocessor

In [10]:
# Full model: column-wise preprocessing followed by a random forest regressor.
# The forest is seeded so that repeated fits (and the grid search built on
# this pipeline) give the same results on every run.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=42)),
    ]
)

pipe

In [11]:
# Fit the preprocessing steps and the regressor on the training split.
pipe.fit(X_train, y_train)

In [12]:
# Predict total_bill for the held-out test rows with the baseline pipeline.
pipe.predict(X_test)

array([23.5523    , 11.75598667, 29.0026    , 14.66700667, 11.6894    ,
       12.5197    , 23.89368   , 20.66575   , 21.5812875 , 27.5222    ,
       28.769     , 17.565175  ,  9.40657429, 13.94798333, 22.82336667,
       22.59486667, 20.37522   , 20.70886667, 28.2085    , 40.5225    ,
       18.0806    , 33.0478    , 22.02105333, 14.01855381, 11.99472333,
       17.75293333, 27.1045    , 21.1279    , 11.33744714, 14.11418667,
        8.86357143, 12.60018167, 22.77      , 22.8261    ,  9.85924   ,
       13.26142667, 12.7263334 , 41.4689    , 21.5854    , 15.86795   ,
       21.5225    ,  9.40657429, 29.344     , 17.36549   , 27.975     ,
       16.2613    , 22.82336667, 18.4964    , 29.2558    ])

In [17]:
import warnings
warnings.filterwarnings('ignore')

In [18]:
# Hyper-parameter tuning
# Keys use the "<step name>__<parameter>" convention so GridSearchCV can reach
# the RandomForestRegressor inside the pipeline.
param_grid = {
    "regressor__n_estimators": [200, 500],
    # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3; for a
    # regressor it meant "use all features", which is spelled 1.0.
    "regressor__max_features": [1.0, "sqrt", "log2"],
    "regressor__max_depth": [4, 5, 6, 7, 8],
}

In [19]:
# Exhaustive search over param_grid on the full pipeline, run in a single
# process; cross-validation and scoring are left at GridSearchCV's defaults.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=1,
)

In [20]:
# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

In [21]:
# Show the hyper-parameter combination selected by the grid search.
grid_search.best_params_

{'regressor__max_depth': 5,
 'regressor__max_features': 'log2',
 'regressor__n_estimators': 200}

In [23]:
# Reuse the winning pipeline instead of hard-coding its hyper-parameters.
# GridSearchCV (refit=True by default) has already refit the best
# configuration on the whole training split, so `pipe` always matches
# grid_search.best_params_, even when a re-run selects different values.
pipe = grid_search.best_estimator_

pipe

In [24]:
# Predict total_bill for the held-out test rows with the tuned pipeline.
pipe.predict(X_test)

array([22.80233874, 15.94459364, 17.9316557 , 17.25868419, 13.55673708,
       12.38989485, 16.08080824, 21.0315569 , 28.5862895 , 14.69024164,
       14.05267423, 28.80404865, 27.45995966, 21.99496825, 21.87928338,
       16.46771831, 24.16112689, 20.34505887, 18.6360371 , 15.68532037,
       14.19245519, 16.09372051, 22.73614238, 29.13756235, 21.86800199,
       14.3575783 , 21.78284332, 16.80356836, 26.67123264, 15.82378287,
       17.10865695, 14.9470686 , 29.20652911, 16.13566664, 21.76107684,
       20.79196609, 18.80139388, 16.2860431 , 12.97774628, 15.15191106,
       19.85709417, 22.71217854, 19.37737807, 17.2697693 , 13.74359484,
       27.2420357 , 13.89316064, 21.24880053, 37.42015593])

In [None]:
# Next step: evaluate the tuned model on the held-out test set, e.g. with R^2
# (pipe.score(X_test, y_test)) or another regression metric such as MAE or RMSE.