In [4]:
import numpy as np
import pandas as pd
import sklearn 

In [5]:
import warnings

In [6]:
pd.set_option("display.max_columns",None)

In [7]:
sklearn.set_config(transform_output="pandas")

In [8]:
warnings.filterwarnings("ignore")

In [9]:
path = "/Users/mukulagarwal/Desktop/Python_Code/flights_sagemaker_project/Data/train.csv"
train = pd.read_csv(path)

In [10]:
train = train.dropna()

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 639 entries, 0 to 639
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airline          639 non-null    object 
 1   date_of_journey  639 non-null    object 
 2   source           639 non-null    object 
 3   destination      639 non-null    object 
 4   dep_time         639 non-null    object 
 5   arrival_time     639 non-null    object 
 6   duration         639 non-null    int64  
 7   total_stops      639 non-null    float64
 8   additional_info  639 non-null    object 
 9   price            639 non-null    int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 54.9+ KB


In [12]:
X_train = train.drop(columns=['price'])
y_train = train['price']

## Airline

In [13]:
X_train['airline']

0            Jet Airways
1                 Indigo
2            Jet Airways
3      Multiple Carriers
4                 Indigo
             ...        
635            Air India
636            Air India
637          Jet Airways
638          Jet Airways
639          Jet Airways
Name: airline, Length: 639, dtype: object

In [14]:
train['airline'].value_counts()

airline
Jet Airways          236
Indigo               125
Air India             94
Multiple Carriers     73
Spicejet              55
Vistara               26
Goair                 16
Air Asia              14
Name: count, dtype: int64

In [15]:
from sklearn.impute import SimpleImputer
from feature_engine.encoding import RareLabelEncoder
from sklearn.preprocessing import (
    OneHotEncoder
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [16]:
l1 = list(train['airline'].value_counts().index[:4])
t1 = train.copy()
t1.loc[[True if v not in l1 else False for v in train['airline'].values],'airline'] = 'other'

In [17]:
air_transformer = Pipeline(steps=[("imputer",SimpleImputer(strategy= 'most_frequent')),
                                  ("grouper",RareLabelEncoder(tol = 0.1,replace_with="other",n_categories=2)),
                                  ("ohe",OneHotEncoder(sparse_output=False,handle_unknown='ignore'))])

In [18]:
air_transformer.fit_transform(X_train.loc[:,['airline']])

Unnamed: 0,airline_Air India,airline_Indigo,airline_Jet Airways,airline_Multiple Carriers,airline_other
0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...
635,1.0,0.0,0.0,0.0,0.0
636,1.0,0.0,0.0,0.0,0.0
637,0.0,0.0,1.0,0.0,0.0
638,0.0,0.0,1.0,0.0,0.0


## Date of journey

In [19]:
X_train['date_of_journey']

0      2019-12-03
1      2019-03-06
2      2019-09-05
3      2019-05-27
4      2019-03-18
          ...    
635    2019-06-06
636    2019-09-06
637    2019-06-24
638    2019-05-21
639    2019-05-21
Name: date_of_journey, Length: 639, dtype: object

In [20]:
from feature_engine.datetime import DatetimeFeatures
from sklearn.preprocessing import MinMaxScaler
features_to_extract = ['month','week','day_of_week','day_of_year']

daoj_transformer = Pipeline(steps=[
    ("dt",DatetimeFeatures(features_to_extract = features_to_extract,yearfirst = True,format = "mixed")),
    ("scaler",MinMaxScaler())
])

daoj_transformer.fit_transform(X_train.loc[:,['date_of_journey']])

Unnamed: 0,date_of_journey_month,date_of_journey_week,date_of_journey_day_of_week,date_of_journey_day_of_year
0,1.000000,1.000000,0.166667,0.991098
1,0.181818,0.187500,0.333333,0.183976
2,0.727273,0.729167,0.500000,0.727003
3,0.363636,0.437500,0.000000,0.427300
4,0.181818,0.229167,0.000000,0.219585
...,...,...,...,...
635,0.454545,0.458333,0.500000,0.456973
636,0.727273,0.729167,0.666667,0.729970
637,0.454545,0.520833,0.000000,0.510386
638,0.363636,0.416667,0.166667,0.409496


## Source and Destination

In [21]:
location_subset = X_train.loc[:,['source','destination']]
location_subset

Unnamed: 0,source,destination
0,Banglore,New Delhi
1,Mumbai,Hyderabad
2,Kolkata,Banglore
3,Delhi,Cochin
4,Chennai,Kolkata
...,...,...
635,Kolkata,Banglore
636,Delhi,Cochin
637,Delhi,Cochin
638,Delhi,Cochin


In [22]:
from feature_engine.encoding import MeanEncoder
from sklearn.preprocessing import PowerTransformer

location_pipe1 = Pipeline(steps = [
    ('grouper',RareLabelEncoder(tol = 0.1,replace_with='other',n_categories=2)),
    ('encoder',MeanEncoder()),
    ('symm_transf',PowerTransformer())
])

location_pipe1.fit_transform(location_subset,y_train)

Unnamed: 0,source,destination
0,-0.750539,-1.331615
1,-1.900962,-1.331615
2,-0.255939,-0.091071
3,1.048279,1.046065
4,-1.900962,-1.331615
...,...,...
635,-0.255939,-0.091071
636,1.048279,1.046065
637,1.048279,1.046065
638,1.048279,1.046065


In [23]:
def is_north(X):
    columns = X.columns.to_list()
    return (
                X.assign(**{ f"{col}_is_north" : X.loc[:,col].isin(['New Delhi','Kolkata','Delhi','Mumbai']).astype(int)  
                             for col in columns}
                         )
                .drop(columns=['source','destination'])
            )
is_north(location_subset)

Unnamed: 0,source_is_north,destination_is_north
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
635,1,0
636,1,0
637,1,0
638,1,0


In [24]:
from sklearn.preprocessing import FunctionTransformer
FunctionTransformer(is_north).fit_transform(location_subset)

Unnamed: 0,source_is_north,destination_is_north
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
635,1,0
636,1,0
637,1,0
638,1,0


In [25]:
from sklearn.pipeline import FeatureUnion
location_transformer = FeatureUnion(transformer_list=[
    ("part1",location_pipe1),
    ("part2",FunctionTransformer(is_north))
])

location_transformer.fit_transform(location_subset,y_train)

Unnamed: 0,source,destination,source_is_north,destination_is_north
0,-0.750539,-1.331615,0,1
1,-1.900962,-1.331615,1,0
2,-0.255939,-0.091071,1,0
3,1.048279,1.046065,1,0
4,-1.900962,-1.331615,0,1
...,...,...,...,...
635,-0.255939,-0.091071,1,0
636,1.048279,1.046065,1,0
637,1.048279,1.046065,1,0
638,1.048279,1.046065,1,0


## Dep_Time and Arrival_Time

In [26]:
time_subset = X_train[['dep_time','arrival_time']]
time_subset

Unnamed: 0,dep_time,arrival_time
0,08:55:00,21:20:00
1,01:40:00,03:10:00
2,20:00:00,04:40:00
3,11:30:00,19:15:00
4,07:55:00,10:15:00
...,...,...
635,08:20:00,21:50:00
636,09:45:00,23:00:00
637,02:15:00,04:25:00
638,13:00:00,04:25:00


In [27]:
time_pipe1 = Pipeline(steps=[
    ('dt',DatetimeFeatures(features_to_extract=['hour','minute'])),
    ('scaler',MinMaxScaler())
])
time_pipe1.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute
0,0.347826,1.000000,0.913043,0.363636
1,0.043478,0.727273,0.130435,0.181818
2,0.869565,0.000000,0.173913,0.727273
3,0.478261,0.545455,0.826087,0.272727
4,0.304348,1.000000,0.434783,0.272727
...,...,...,...,...
635,0.347826,0.363636,0.913043,0.909091
636,0.391304,0.818182,1.000000,0.000000
637,0.086957,0.272727,0.173913,0.454545
638,0.565217,0.000000,0.173913,0.454545


In [28]:
def part_of_day(X,morning=4,noon=12,evening = 16, night = 20):
    temp_x = X.assign(
        **{
            col : pd.to_datetime(X[col]).dt.hour
            for col in X.columns
        }
    )
    return temp_x.assign(
        **{
            f"{col}_part_of_day": np.select(
                [temp_x.loc[:,col].between(morning,noon,inclusive = 'left'),
                 temp_x.loc[:,col].between(noon,evening,inclusive = 'left'),
                 temp_x.loc[:,col].between(evening,night,inclusive = 'left')],
                ['morning','noon','evening'],
                default='night'
            )
            for col in temp_x.columns
        }
    ).drop(columns = ['dep_time','arrival_time'])

In [29]:
from feature_engine.encoding import CountFrequencyEncoder
time_pipe2 = Pipeline(steps=[
    ('part_of_day',FunctionTransformer(part_of_day)),
    ('count_encoder',CountFrequencyEncoder()),
    ('scale',MinMaxScaler())
])

In [30]:
time_transformer = FeatureUnion(
    transformer_list=[
        ('part1',time_pipe1),
        ('part2',time_pipe2)
    ]
)

time_transformer.fit_transform(time_subset)

Unnamed: 0,dep_time_hour,dep_time_minute,arrival_time_hour,arrival_time_minute,dep_time_part_of_day,arrival_time_part_of_day
0,0.347826,1.000000,0.913043,0.363636,1.000000,1.000000
1,0.043478,0.727273,0.130435,0.181818,0.085714,1.000000
2,0.869565,0.000000,0.173913,0.727273,0.085714,0.956989
3,0.478261,0.545455,0.826087,0.272727,1.000000,0.655914
4,0.304348,1.000000,0.434783,0.272727,1.000000,0.956989
...,...,...,...,...,...,...
635,0.347826,0.363636,0.913043,0.909091,1.000000,1.000000
636,0.391304,0.818182,1.000000,0.000000,1.000000,1.000000
637,0.086957,0.272727,0.173913,0.454545,0.085714,0.956989
638,0.565217,0.000000,0.173913,0.454545,0.000000,0.956989


In [31]:
X_train[['duration']].quantile([0.25,0.5,0.75]).values

array([[170. ],
       [480. ],
       [902.5]])

## Duration

In [32]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel
class RBFPercentileSimilarity(BaseEstimator,TransformerMixin):
    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma
    
    def fit(self,X,y=None):
        if not self.variables:
            self.variables = X.select_dtypes(include="number").columns.to_list()
        reference_values = X[self.variables].quantile(self.percentiles).values
        self.reference_values_ = {self.variables[i] : reference_values[:,i].reshape(-1,1) for i in range(len(self.variables))}
        return self
    
    def transform(self,X):
        objects = []
        for col in self.variables:           
            columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns)
            objects.append(obj)      
        return pd.concat(objects,axis=1)  

In [33]:
def duration_cat(X,short=180,medium=400):
    return(
        X.assign(
            duration_cat = np.select([X['duration'].lt(short),
                                      X['duration'].between(short,medium,inclusive = 'left')],
                                     ['short','medium'],
                                     default='long')
        )
        .drop(columns = ['duration'])
    )
duration_cat(X_train[['duration']])    

Unnamed: 0,duration_cat
0,long
1,short
2,long
3,long
4,short
...,...
635,long
636,long
637,long
638,long


In [34]:
def is_over(X,value = 1000):
    return (
        X.assign(
            duration_is_over = lambda df_ : df_['duration'].ge(1000).astype(int)
        )
        .drop(columns = 'duration')
    )
    
is_over(X_train[['duration']])

Unnamed: 0,duration_is_over
0,0
1,0
2,0
3,0
4,0
...,...
635,0
636,0
637,1
638,0


In [35]:
RBFPercentileSimilarity(['duration']).fit_transform(X_train)

Unnamed: 0,duration_rbf_25,duration_rbf_50,duration_rbf_75
0,0.000000e+00,0.000000e+00,0.000000e+00
1,1.125982e-278,0.000000e+00,0.000000e+00
2,0.000000e+00,3.257489e-70,0.000000e+00
3,0.000000e+00,1.691898e-10,0.000000e+00
4,8.194013e-40,0.000000e+00,0.000000e+00
...,...,...,...
634,0.000000e+00,0.000000e+00,0.000000e+00
635,0.000000e+00,0.000000e+00,0.000000e+00
636,0.000000e+00,0.000000e+00,0.000000e+00
637,0.000000e+00,0.000000e+00,1.032385e-22


In [36]:
from feature_engine.outliers import Winsorizer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
duration_pipe1 = Pipeline(steps=[
	#("rbf", RBFPercentileSimilarity()),
	("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
	("cat", FunctionTransformer(func=duration_cat)),
	("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
	#("part1", duration_pipe1),
	("part2", duration_pipe2),
	("part3", FunctionTransformer(func=is_over)),
	("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
	("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
	("imputer", SimpleImputer(strategy="median")),
	("union", duration_union)
])

duration_transformer.fit_transform(X_train.loc[:, ["duration"]])

Unnamed: 0,duration_cat,duration_is_over,duration
0,2.0,0,0.264261
1,0.0,0,-1.065469
2,2.0,0,-0.192516
3,2.0,0,-0.304173
4,0.0,0,-0.963962
...,...,...,...
635,2.0,0,0.396219
636,2.0,0,0.365767
637,2.0,1,1.939112
638,2.0,0,0.629683


In [37]:
X_train['total_stops'].astype(int) == 0

0      False
1       True
2      False
3      False
4       True
       ...  
635    False
636    False
637    False
638    False
639    False
Name: total_stops, Length: 639, dtype: bool

In [38]:
def is_more_than_one(X):
    return(
        X.assign(
            is_direct_flight = lambda df_: (df_['total_stops'].astype(int) == 0).astype(int)   
        )
    )
is_more_than_one(X_train)

total_stops_transformer = Pipeline(steps=[
    ('is_direct',FunctionTransformer(is_more_than_one))
])
total_stops_transformer.fit_transform(X_train[['total_stops']])

Unnamed: 0,total_stops,is_direct_flight
0,1.0,0
1,0.0,1
2,1.0,0
3,1.0,0
4,0.0,1
...,...,...
635,2.0,0
636,1.0,0
637,1.0,0
638,1.0,0


In [39]:
info_pipe1 = Pipeline(steps=[
	("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
	("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

info_pipe1.fit_transform(X_train.loc[:, ["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
...,...,...,...
635,0.0,1.0,0.0
636,0.0,1.0,0.0
637,1.0,0.0,0.0
638,1.0,0.0,0.0


In [40]:
def have_info(X):
	return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

In [41]:
info_union = FeatureUnion(transformer_list=[
	("part1", info_pipe1),
	("part2", FunctionTransformer(func=have_info))
])

In [42]:
info_transformer = Pipeline(steps=[
	("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
	("union", info_union)
])

info_transformer.fit_transform(X_train.loc[:, ["additional_info"]])

Unnamed: 0,additional_info_In-flight meal not included,additional_info_No Info,additional_info_Other,additional_info
0,0.0,1.0,0.0,0
1,0.0,1.0,0.0,0
2,1.0,0.0,0.0,1
3,0.0,1.0,0.0,0
4,0.0,1.0,0.0,0
...,...,...,...,...
635,0.0,1.0,0.0,0
636,0.0,1.0,0.0,0
637,1.0,0.0,0.0,1
638,1.0,0.0,0.0,1


## Column Transformer

In [43]:
column_transformer = ColumnTransformer(transformers=[
    ('air',air_transformer,["airline"]),
    ('doj_column',daoj_transformer,["date_of_journey"]),
    ('location',location_transformer,["source","destination"]),
    ('time',time_transformer,["dep_time","arrival_time"]),
    ('dur',duration_transformer,["duration"]),
    ('stops',total_stops_transformer,["total_stops"]),
    ("info", info_transformer, ["additional_info"])
],remainder='passthrough')

In [44]:
val_data = pd.read_csv("/Users/mukulagarwal/Desktop/Python_Code/flights_sagemaker_project/Data/val.csv")
val_data = column_transformer.fit_transform(val_data.drop(columns='price'),val_data['price'])

In [45]:
val_data

Unnamed: 0,air__airline_Air India,air__airline_Indigo,air__airline_Jet Airways,air__airline_Multiple Carriers,air__airline_other,doj_column__date_of_journey_month,doj_column__date_of_journey_week,doj_column__date_of_journey_day_of_week,doj_column__date_of_journey_day_of_year,location__source,location__destination,location__source_is_north,location__destination_is_north,time__dep_time_hour,time__dep_time_minute,time__arrival_time_hour,time__arrival_time_minute,time__dep_time_part_of_day,time__arrival_time_part_of_day,dur__duration_cat,dur__duration_is_over,dur__duration,stops__total_stops,stops__is_direct_flight,info__additional_info_In-flight meal not included,info__additional_info_No Info,info__additional_info_Other,info__additional_info
0,0.0,0.0,0.0,0.0,1.0,0.181818,0.187500,0.000000,0.178042,-0.599433,-1.530193,0,1,0.363636,0.545455,0.521739,0.363636,1.000000,0.00,0.0,0,-0.857499,0.0,1,0.0,1.0,0.0,0
1,0.0,1.0,0.0,0.0,0.0,0.181818,0.208333,0.666667,0.210682,-0.320260,-0.262098,1,0,0.863636,0.454545,1.000000,0.090909,0.243902,1.00,0.0,0,-0.878341,0.0,1,0.0,1.0,0.0,0
2,0.0,1.0,0.0,0.0,0.0,0.181818,0.166667,1.000000,0.175074,-0.320260,-0.262098,1,0,0.863636,0.454545,1.000000,0.090909,0.243902,1.00,0.0,0,-0.878341,0.0,1,0.0,1.0,0.0,0
3,0.0,0.0,1.0,0.0,0.0,0.000000,0.000000,0.833333,0.005935,-1.936792,-1.410424,1,0,0.818182,0.636364,0.913043,0.090909,0.024390,1.00,0.0,0,-1.024231,0.0,1,1.0,0.0,0.0,1
4,0.0,0.0,1.0,0.0,0.0,0.454545,0.458333,0.500000,0.456973,-0.599433,-1.530193,0,1,0.272727,0.181818,0.434783,0.181818,1.000000,0.88,1.0,0,-0.836658,0.0,1,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,0.0,0.0,0.0,0.0,1.0,0.454545,0.479167,0.833333,0.483680,1.227177,0.772974,1,0,0.409091,0.636364,0.826087,0.636364,1.000000,0.36,2.0,0,-0.086362,1.0,0,0.0,1.0,0.0,0
156,0.0,0.0,1.0,0.0,0.0,0.727273,0.729167,0.666667,0.729970,-0.320260,-0.262098,1,0,0.590909,0.090909,1.000000,0.636364,0.000000,1.00,2.0,0,-0.023838,1.0,0,1.0,0.0,0.0,1
157,0.0,0.0,1.0,0.0,0.0,0.727273,0.729167,0.333333,0.724036,-0.599433,-1.530193,0,1,0.818182,0.909091,0.956522,0.909091,0.024390,1.00,1.0,0,-0.836658,0.0,1,1.0,0.0,0.0,1
158,0.0,0.0,0.0,0.0,1.0,0.363636,0.416667,0.666667,0.418398,-0.599433,-1.530193,0,1,0.363636,0.545455,0.521739,0.363636,1.000000,0.00,0.0,0,-0.857499,0.0,1,0.0,1.0,0.0,0


Model Training 

In [46]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import (
    RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor)
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from catboost import CatBoostRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

models = {
    'lr' : LinearRegression(),
    'gb' : GradientBoostingRegressor(),
    'rf' : RandomForestRegressor(),
    'cat' : CatBoostRegressor(verbose=False),
    'ada' : AdaBoostRegressor(),
    'hist' : HistGradientBoostingRegressor()
}

metric_dict = {}

for estimator in models:
     model_pipeline = Pipeline(steps=[
         ('transform',column_transformer),
         ('reg',models[estimator])
     ])
     
     val_scores = cross_val_score(model_pipeline,X_train,y_train,cv=5,scoring = 'r2')
     val_score = np.mean(val_scores)
     metric_dict[models[estimator].__class__.__name__]=val_score
     print(models[estimator].__class__.__name__,"-",{val_score})     

LinearRegression - {0.6381748943503358}
GradientBoostingRegressor - {0.7620897376251874}
RandomForestRegressor - {0.7430377104098382}
CatBoostRegressor - {0.7592985528101351}
AdaBoostRegressor - {0.5510951574258561}
HistGradientBoostingRegressor - {0.7333421003479877}


In [47]:
best_model = pd.DataFrame(data = metric_dict.items(),columns=['model','r2_score']).sort_values(by='r2_score',ascending=False).reset_index().loc[0,'model']
metric_dict[best_model]

0.7620897376251874

In [48]:
model_pipeline = Pipeline(steps=[
         ('transform',column_transformer),
         ('reg',GradientBoostingRegressor())
     ])

In [49]:
model_pipeline.get_params()

{'memory': None,
 'steps': [('transform',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('air',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('grouper',
                                                     RareLabelEncoder(n_categories=2,
                                                                      replace_with='other',
                                                                      tol=0.1)),
                                                    ('ohe',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse_output=False))]),
                                    ['airline']),
                                   ('doj_column',
                                    Pipelin

### Model Hyperparameter Tuning

In [50]:
import optuna
from functools import partial
def objective_func(trial,model,X_train,y_train):
    trial.suggest_float('reg__subsample',0,1)
    trial.suggest_float('reg__min_samples_split',0,1)
    trial.suggest_int('reg__max_depth',1,10,step=1)
    trial.suggest_int('reg__n_estimators',100,1200,step = 100)
    trial.suggest_categorical('reg__criterion',['friedman_mse', 'squared_error'])
    return cross_val_score(model,
                           X_train,
                           y_train,
                           scoring='r2',
                           cv=5,
                           n_jobs=-1).mean()
    
optimization_func = partial(objective_func,
                            model = model_pipeline,
                            X_train = X_train,
                            y_train = y_train) 

In [51]:
study = optuna.create_study(direction='maximize')
study.optimize(optimization_func, n_trials=100)

[I 2024-09-14 19:18:15,185] A new study created in memory with name: no-name-9a2a8bac-a334-4917-8fb8-82453ce829b3
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
[I 2024-09-14 19:18:17,845] Trial 0 finished with value: 0.7638158709488464 and parameters: {'reg__subsample': 0.8814642565465054, 'reg__min_samples_split': 0.369808408129637, 'reg__max_depth': 2, 'reg__n_estimators': 1100, 'reg__criterion': 'squared_error'}. Best is trial 0 with value: 0.7638158709488464.
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  pd.to_datetime(
  

In [52]:
study.best_params

{'reg__subsample': 0.7866091744400742,
 'reg__min_samples_split': 0.9459835769151456,
 'reg__max_depth': 10,
 'reg__n_estimators': 800,
 'reg__criterion': 'friedman_mse'}

In [53]:
study.best_value

0.7678327193512121

### Save Model

In [57]:
import joblib
path = '/Users/mukulagarwal/Desktop/Python_Code/flights_sagemaker_project/model.joblib'
model_pipeline = model_pipeline.set_params(**study.best_params).fit(X_train,y_train)
joblib.dump(model_pipeline,path)

['/Users/mukulagarwal/Desktop/Python_Code/flights_sagemaker_project/model.joblib']

### Ignore this piece (Practice/Experiments)

In [53]:
from sklearn.model_selection import RandomizedSearchCV
params = {
    
    'reg__n_estimators': [n for n in range(50, 201, 8)],
    'reg__criterion': ['poisson', 'friedman_mse', 'squared_error', 'absolute_error'],
    'reg__max_depth': [3, 4, 5, 6, 7, 8],
    'reg__max_features': ['auto', 'log2']    
}

rnfcgs = RandomizedSearchCV(estimator=model_pipeline, 
                            param_distributions=params, 
                            n_iter=100, 
                            cv=5, 
                            random_state=42)

rnfcgs.fit(X_train, y_train)

In [54]:
rnfcgs.best_params_

{'reg__n_estimators': 162,
 'reg__max_features': 'log2',
 'reg__max_depth': 8,
 'reg__criterion': 'absolute_error'}

In [55]:
rnfcgs.best_score_

0.7053514238311547

In [565]:
class transformer_name(BaseEstimator,TransformerMixin):
    def __init__(self,threshold,column_name):
        self.threshold = threshold
        self.column_name = column_name
        
    def fit(self,X):
        pass
    
    def transform(self,X):
        return pd.DataFrame(data = X.loc[:,self.column_name].ge(self.threshold).astype(int).values,
                            columns=['duration_'])

In [566]:
X_train.loc[:,'duration'].ge(1000).astype(int)

0      0
1      0
2      0
3      0
4      0
      ..
635    0
636    0
637    1
638    0
639    0
Name: duration, Length: 639, dtype: int64

In [567]:
a = transformer_name(1000,'duration')

In [568]:
a.transform(X_train)

Unnamed: 0,duration_
0,0
1,0
2,0
3,0
4,0
...,...
634,0
635,0
636,1
637,0


In [570]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
classifer = RandomForestClassifier()
param_grid = {
    "n_estimators" : [100,200,300,400],
    "max_depth" : [1,3,5,7],
    "criterion" : ['gini','entropy']
}

model = GridSearchCV(
    estimator = classifer,
    param_grid = param_grid,
    scoring = "accuracy",
    verbose = 10,
    cv = 5
)

In [571]:
from sklearn import decomposition
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

scl = StandardScaler()
pca = decomposition.PCA()
rf = RandomForestClassifier()

classifier = Pipeline(steps=[
    ('scaling' , scl),
    ('pca',pca),
    ('rf',rf)
])

param_grid = {
    "pca__n_components" : [1,4,6,7,8,9],
    "rf__n_estimators" : [100,200,300,400],
    "rf__max_depth" : [1,3,5,7],
    "rf__criterion" : ['gini','entropy']
}

model = GridSearchCV(
    estimator = classifer,
    param_grid = param_grid,
    scoring = "accuracy",
    verbose = 10,
    cv = 5
)