In [1]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
import seaborn as sns

In [3]:
# Load seaborn's bundled "tips" example dataset and display it.
df = sns.load_dataset("tips")
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [5]:
# Target is the bill amount. It is the first column of the frame, so the
# features are every column from index 1 onward.
y = df["total_bill"]
x = df.iloc[:, 1:]

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split (and therefore everything fitted on it) repeatable across kernel
# restarts; without it every run trains and scores on different rows.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [8]:
# Numeric columns: fill NaNs with the column mean, then apply StandardScaler.
numeric_prepossor = Pipeline(
    steps=[
        ("imputation mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scalar", StandardScaler()),
    ]
)

In [9]:
# Ask scikit-learn to render estimators as diagrams when a cell displays them.
from sklearn import set_config

set_config(display="diagram")

In [10]:
# Display the numeric preprocessing pipeline (imputer followed by scaler).
numeric_prepossor

In [11]:
# Categorical columns: replace missing values with the literal "missing",
# then one-hot encode with handle_unknown="ignore".
categorical_preprocessor = Pipeline(
    steps=[
        ("imputation_constant", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [12]:
# Display the categorical preprocessing pipeline (imputer followed by one-hot encoder).
categorical_preprocessor

In [13]:
# Route each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, ["sex", "smoker", "day", "time"]),
        ("numerical", numeric_prepossor, ["tip", "size"]),
    ]
)

In [14]:
# End-to-end model: column preprocessing followed by a random-forest regressor.
# The forest is seeded so that repeated fits (including the clones made by a
# later grid search) give the same trees and the same predictions on re-run.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=42)),
    ]
)

In [15]:
# Display the full preprocessing + regressor pipeline.
pipe

In [16]:
# Fit the preprocessing steps and the regressor on the training split.
pipe.fit(x_train,y_train)

In [17]:
# Predict total_bill for the held-out rows with the baseline (untuned) pipeline.
pipe.predict(x_test)

array([15.90770849, 15.49552905, 12.43528333, 17.44845333, 30.4257    ,
       18.06105   , 18.06105   , 22.34      , 17.2067    , 24.5135    ,
       22.5633    , 10.43338   , 21.7816    , 16.47698214, 18.516625  ,
       20.2195    , 15.63796762, 21.4796    , 19.836     , 18.97635   ,
       20.276     , 14.06319905, 11.96855437, 14.55570667, 33.4993    ,
       15.63796762, 20.24124   , 14.93202   , 11.12158024, 23.99365   ,
       12.61302   , 24.3441    , 19.27266667, 14.06319905, 20.24124   ,
       14.67875024, 17.6947    , 19.1573    , 24.5718    , 15.4263    ,
       11.1281    , 16.22759048, 25.62182333, 20.1986    , 11.7368    ,
       24.4035    , 20.89588333, 19.3317    , 16.2339    ])

In [19]:
# Hyper-parameter grid for the "regressor" step of the pipeline
# (keys use the <step name>__<parameter> convention).
#
# max_features previously listed "auto". For RandomForestRegressor that value
# meant "use all features"; it was deprecated in scikit-learn 1.1 and removed
# in 1.3, where every candidate using it fails to fit. 1.0 (the fraction of
# features to consider) is the equivalent, still-supported spelling.
param_grid = {
    "regressor__n_estimators": [200, 500],
    "regressor__max_features": [1.0, "sqrt", "log2"],
    "regressor__max_depth": [2, 3, 4, 5],
}

In [20]:
# Exhaustive search over param_grid on the whole pipeline; n_jobs=-1 runs the
# candidate fits in parallel.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=-1,
)

In [21]:
# Run the grid search on the training split only, so the test rows stay unseen.
grid_search.fit(x_train,y_train)

In [22]:
# Show the parameter combination the search selected.
grid_search.best_params_

{'regressor__max_depth': 5,
 'regressor__max_features': 'sqrt',
 'regressor__n_estimators': 200}

In [23]:
# Rebuild the pipeline with whatever hyper-parameters the grid search selected,
# instead of hand-copying values from an earlier run's output (those go stale
# as soon as the search is re-run and picks a different combination).
# best_params_ keys are already in "regressor__<param>" form, so they can be
# passed straight to set_params. The forest is seeded for repeatable results.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=42)),
    ]
).set_params(**grid_search.best_params_)

In [24]:
# Fit the tuned pipeline on the training split.
pipe.fit(x_train,y_train)

In [27]:
# Predict total_bill for the held-out rows with the tuned pipeline.
pipe.predict(x_test)

array([13.97278259, 14.9681757 , 15.60370314, 14.65673575, 27.31065221,
       16.94982476, 16.94982476, 15.43873501, 18.85065887, 23.4249129 ,
       22.77665645, 14.56535445, 18.00342791, 15.87073   , 15.20663656,
       19.14069239, 16.33526436, 20.5059638 , 21.19509288, 17.8678071 ,
       20.15306039, 20.3233935 , 12.83079859, 13.69411452, 27.15372419,
       16.33526436, 20.53194636, 15.23326777, 12.56354579, 22.59957536,
       13.49010413, 23.7144694 , 19.19013055, 20.3233935 , 20.53194636,
       16.13130902, 16.42348888, 20.52383432, 22.76781428, 14.42880093,
       12.50286283, 16.12955618, 26.40091789, 18.98394507, 13.71155663,
       23.29579622, 19.54284608, 20.57140993, 18.86909641])