In [2]:
import os 
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split 

import seaborn as sns 

In [3]:
os.chdir('../')
%pwd 

'd:\\pythonProjects\\SurgeSense'

In [55]:
data=pd.read_csv('dataset/cleaned_dataset.csv')
data.head()
data.columns

Index(['distance', 'cab_type', 'destination', 'source', 'price',
       'surge_multiplier', 'name', 'date_time', 'temp', 'location', 'clouds',
       'pressure', 'rain', 'humidity', 'wind', 'day', 'hour', 'month'],
      dtype='object')

In [61]:
from sklearn.ensemble import RandomForestRegressor
categorical_columns=['cab_type','destination','source','name']
numerical_columns=['distance','surge_multiplier','temp','clouds','pressure','rain','humidity','wind','day','hour','month']

numerical_preprocessor=Pipeline(
    steps=[
        ('imputation_menu',SimpleImputer(missing_values=np.nan,strategy='median').set_output(transform='pandas')),
        ('scalar',StandardScaler())
    ]
)

categorical_preprocessor=Pipeline(
    steps=[
        ('imputation_constant',SimpleImputer(strategy='most_frequent').set_output(transform='pandas')),
        ('encode',OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor=ColumnTransformer(
    transformers=[
        ('categorical_columns',categorical_preprocessor,categorical_columns),
        ('numerical_columns',numerical_preprocessor,numerical_columns)
    ]
)

pipe=Pipeline(
    steps=[
        ('preprocessor',preprocessor),
        ('model', GradientBoostingRegressor())
    ]
)
pipe 

In [62]:
x=data.drop(columns=['price','date_time','location'])
y=data['price']
# xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.4,random_state=43)
# xtrain.shape
x.head()

Unnamed: 0,distance,cab_type,destination,source,surge_multiplier,name,temp,clouds,pressure,rain,humidity,wind,day,hour,month
0,0.44,Lyft,North Station,Haymarket Square,1.0,Shared,38.46,0.29,1022.25,0.0,0.76,7.68,6,9,12
1,0.44,Lyft,North Station,Haymarket Square,1.0,Lux,44.31,1.0,1003.17,0.1123,0.9,13.69,1,2,11
2,0.44,Lyft,North Station,Haymarket Square,1.0,Lux,43.82,0.99,1002.59,0.0997,0.89,11.57,1,2,11
3,0.44,Lyft,North Station,Haymarket Square,1.0,Lux Black XL,35.08,0.0,1013.71,0.0,0.7,5.25,4,4,11
4,0.44,Lyft,North Station,Haymarket Square,1.0,Lyft XL,37.58,0.42,998.64,0.0,0.71,11.3,3,3,11


In [63]:
# transformed_data=preprocessor.fit_transform(x)
# transformed_data

In [28]:
# xtrain,xtest,ytrain,ytest=train_test_split(transformed_data,y,test_size=0.3,random_state=43)

In [29]:
#model=RandomForestRegressor(random_state=42)
# model.fit(xtrain,ytrain)

In [64]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import KFold 

param_grid=[
    {
        'model': [RandomForestRegressor(random_state=42)],
        'model__n_estimators': [50, 100],
        'model__max_depth': [None, 5],
    },
    {
        'model': [GradientBoostingRegressor(random_state=42)],
        'model__n_estimators': [50, 100],
        'model__learning_rate': [0.1, 0.05],
        'model__max_depth': [3, 5],
    },
    {
        'model': [XGBRegressor(random_state=42)],
        'model__n_estimators': [50, 100],
        'model__learning_rate': [0.1, 0.05],
        'model__max_depth': [3, 5],
    }
]

grid_search=GridSearchCV(pipe,
                         param_grid,
                         cv=3,
                         scoring='neg_mean_squared_error',
                         n_jobs=-1
                         )

In [54]:
grid_search.fit(x[:10000],y[:10000])

ValueError: Invalid parameter 'model' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('categorical_columns',
                                                  Pipeline(steps=[('imputation_constant',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encode',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['cab_type', 'destination',
                                                   'source', 'name']),
                                                 ('numerical_columns',
                                                  Pipeline(steps=[('imputation_menu',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scalar',
                                                                   StandardScaler())]),
                                                  ['distance',
                                                   'surge_multiplier', 'temp',
                                                   'clouds', 'pressure', 'rain',
                                                   'humidity', 'wind', 'day',
                                                   'hour', 'month'])]))]). Valid parameters are: ['memory', 'steps', 'transform_input', 'verbose'].

In [51]:
grid_search.best_params_

{'model': GradientBoostingRegressor(random_state=42),
 'model__learning_rate': 0.1,
 'model__max_depth': 5,
 'model__n_estimators': 100}