 ## Importing Libraries


In [1]:
#!pip install feature-engine

In [47]:
# !pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.0-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 991.0 kB/s eta 0:02:06
   ---------------------------------------- 1.1/124.9 MB 13.9 MB/s eta 0:00:09
    --------------------------------------- 3.1/124.9 MB 24.5 MB/s eta 0:00:05
   - -------------------------------------- 5.0/124.9 MB 28.8 MB/s eta 0:00:05
   -- ------------------------------------- 7.0/124.9 MB 32.0 MB/s eta 0:00:04
   -- ------------------------------------- 8.6/124.9 MB 32.3 MB/s eta 0:00:04
   --- ------------------------------------ 10.6/124.9 MB 40.9 MB/s eta 0:00:03
   ---- ----------------------------------- 12.7/124.9 MB 40.9 MB/s eta 0:00:03
   ---- ----------------------------------- 14.7/124.9 MB 43.7 MB/s eta 0:00:03
   ----- ---------------------------------- 16.8/124.9 MB 43.7 MB/

In [11]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import r2_score
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    StandardScaler,
    PowerTransformer,
    FunctionTransformer,
    OrdinalEncoder
)
from sklearn.ensemble import RandomForestRegressor

from feature_engine.outliers import Winsorizer
from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder
)
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance

import matplotlib.pyplot as plt

import warnings

import os
import pickle

import boto3 #sdk

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinousParameter,
    HyperparameterTuner
)

import xgboost as xgb

## Display Settings

In [12]:
pd.set_option("display.max_columns",None)

In [13]:
sklearn.set_config(transform_output="pandas") 

In [14]:
warnings.filterwarnings("ignore")

## Reading datasets

In [15]:
train = pd.read_csv("train.csv")
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-06-03,Delhi,Cochin,13:15:00,09:25:00,1210,2.0,No Info,10467
1,Jet Airways,2019-05-27,Delhi,Cochin,19:15:00,04:25:00,550,1.0,In-flight meal not included,12898
2,Jet Airways,2019-05-21,Kolkata,Banglore,06:30:00,19:50:00,800,1.0,No Info,14781
3,Jet Airways,2019-05-18,Delhi,Cochin,11:00:00,14:15:00,195,0.0,No Info,9564
4,Air India,2019-05-15,Kolkata,Banglore,14:15:00,16:45:00,150,0.0,No Info,5510
...,...,...,...,...,...,...,...,...,...,...
6690,Spicejet,2019-06-24,Kolkata,Banglore,17:10:00,19:40:00,150,0.0,No Info,3873
6691,Multiple Carriers,2019-06-15,Delhi,Cochin,10:20:00,19:15:00,535,1.0,No Info,9526
6692,Indigo,2019-04-21,Delhi,Cochin,02:00:00,07:45:00,345,1.0,No Info,6258
6693,Jet Airways,2019-03-24,Kolkata,Banglore,21:10:00,08:10:00,660,1.0,In-flight meal not included,11134


In [16]:
val = pd.read_csv("validation.csv")
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-06-12,Chennai,Kolkata,05:15:00,07:40:00,145,0.0,No Info,3540
1,Spicejet,2019-03-24,Chennai,Kolkata,08:20:00,10:35:00,135,0.0,No Info,3859
2,Air Asia,2019-05-09,Kolkata,Banglore,19:55:00,23:30:00,215,1.0,No Info,5162
3,Indigo,2019-05-27,Chennai,Kolkata,13:15:00,15:35:00,140,0.0,No Info,3597
4,Air India,2019-05-09,Delhi,Cochin,06:05:00,09:25:00,1640,1.0,No Info,7480
...,...,...,...,...,...,...,...,...,...,...
1669,Air India,2019-06-12,Kolkata,Banglore,05:50:00,23:15:00,1045,2.0,No Info,12723
1670,Jet Airways,2019-04-03,Delhi,Cochin,11:00:00,14:15:00,195,0.0,In-flight meal not included,4886
1671,Air India,2019-03-12,Chennai,Kolkata,11:40:00,13:55:00,135,0.0,No Info,7082
1672,Air India,2019-03-03,Banglore,New Delhi,10:00:00,12:45:00,165,0.0,No Info,8588


In [17]:
test = pd.read_csv("test.csv")
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-05-24,Kolkata,Banglore,10:00:00,01:20:00,920,2.0,No Info,14083
1,Indigo,2019-03-12,Kolkata,Banglore,21:25:00,00:05:00,160,0.0,No Info,4148
2,Multiple Carriers,2019-03-09,Delhi,Cochin,07:30:00,19:45:00,735,1.0,No Info,12615
3,Air India,2019-05-21,Mumbai,Hyderabad,21:05:00,22:25:00,80,0.0,No Info,3100
4,Jet Airways,2019-05-01,Kolkata,Banglore,21:10:00,10:05:00,775,1.0,In-flight meal not included,10844
...,...,...,...,...,...,...,...,...,...,...
2088,Multiple Carriers,2019-05-15,Delhi,Cochin,09:00:00,21:00:00,720,1.0,No Info,13727
2089,Jet Airways,2019-03-09,Delhi,Cochin,16:00:00,04:25:00,745,1.0,No Info,16289
2090,Air India,2019-06-27,Delhi,Cochin,09:45:00,23:00:00,795,1.0,No Info,8907
2091,Indigo,2019-05-21,Kolkata,Banglore,15:30:00,18:05:00,155,0.0,No Info,4804


## Preprocessing Operations

In [18]:
# airline
airline_transformer = Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="most_frequent")),
    ("grouper",RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False,handle_unknown="ignore"))
])


#doj
features_to_extract = ["month","week","day_of_week","day_of_year"]

doj_transformer = Pipeline(steps= [
    ("dt",DatetimeFeatures(features_to_extract=features_to_extract,yearfirst=True,format="mixed")),
    ("scaler",MinMaxScaler())
])

# source and destination
location_pipe1 = Pipeline(steps=[
    ("grouper",RareLabelEncoder(tol=0.1,replace_with="other",n_categories=2)),
    ("encoder",MeanEncoder()),
    ("scaler",PowerTransformer())
])

def is_north(X):
    columns = X.columns.to_list()
    north_cities = ["Delhi","New Delhi","Kolkata","Mumbai"]
    return(
        X
        .assign(**{
            f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )

location_transformer = FeatureUnion(transformer_list=[
    ("part1",location_pipe1),
    ("part2",FunctionTransformer(func=is_north))
])


#dep_time and arrival_time
time_pipe1 = Pipeline(steps=[
    ("dt",DatetimeFeatures(features_to_extract=["hour","minute"])),
    ("scaler", MinMaxScaler())
])

def part_of_the_day(X,morning=4,afternoon=12,evening=16,night=20):
    columns = X.columns.to_list()
    X_temp = X.assign(**{
        col: pd.to_datetime(X.loc[:,col]).dt.hour
        for col in columns
    })

    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day": np.select(
                [X_temp.loc[:,col].between(morning,afternoon,inclusive="left"),
                X_temp.loc[:,col].between(afternoon,evening,inclusive="left"),
                X_temp.loc[:,col].between(evening,night,inclusive="left"),],
                ["morning","afternoon","evening"],
                default="night"
            )
            for col in columns
        })
        .drop(columns=columns)
    )

time_pipe2 = Pipeline(steps=[
    ("part",FunctionTransformer(func=part_of_the_day)),
    ("encoder",CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
    
])

time_transformer = FeatureUnion(transformer_list=[
    ("time_part1",time_pipe1),
    ("time_part2",time_pipe2)
])

# Duration
def flight_duration_categories(X,short=180,medium=400):
    return(
        X
        .assign(flight_duration_categories =  np.select([X.duration.lt(short),
                                                        X.duration.between(short,medium,inclusive="left"),],
                                                        ["short","medium"],
                                                        default="long")
               )
        .drop(columns="duration")
    )

def is_over(X,value=1000):
    return (
        X
        .assign(**{
            f"duration_over_{value}": X.duration.ge(value).astype(int)
        })
        .drop(columns="duration")
    )

duration_pipe1 = Pipeline(steps=[
    ("categories",FunctionTransformer(func=flight_duration_categories)),
    ("encoding",OrdinalEncoder(categories=[["short","medium","long"]]))
])

class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None, percentiles=[0.25,0.5,0.75],gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma

    def fit(self,X,y=None):
        if not self.variables:
            self.variables = X.select_dtypes(include="number").columns.to_list()

        self.reference_values_ = {
            col: (
                X
                .loc[:,col]
                .quantile(self.percentiles)
                .values
                .reshape(-1,1)
            )
            for col in self.variables
        }
        
        return self

    def transform(self,X):
        objects = []
        for col in self.variables:
            columns = [f"{col}_rbf_{int(percentile*100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:,[col]],Y=self.reference_values_[col],gamma=self.gamma),
                columns=columns
            )
            objects.append(obj)
        return pd.concat(objects,axis=1)

duration_pipe2 = Pipeline(steps=[
    ("rbf",RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

duration_union = FeatureUnion(transformer_list=[
    ("part1",duration_pipe1),
    ("part2",duration_pipe2),
    ("part3",FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
    ("outliers",Winsorizer(capping_method="iqr",fold=1.5)),
    ("imputer",SimpleImputer(strategy="median")),
    ("union",duration_union)
])

# total_stops
def is_direct(X):
    return (
        X
        .assign(
            is_direct_flight=X.total_stops.eq(0).astype(int)
        )
    )

total_stops_transformer = Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="most_frequent")),
    ("direct",FunctionTransformer(func=is_direct))
])

additional_info_pipe1 = Pipeline(steps=[
    ("group",RareLabelEncoder(tol=0.1,n_categories=2,replace_with="other")),
    ("encoder",OneHotEncoder(handle_unknown="ignore",sparse_output=False))
])

def have_info(X):
    return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

info_union = FeatureUnion(transformer_list=[
    ("part1",additional_info_pipe1),
    ("part2",FunctionTransformer(func=have_info))
])

additional_info_transformer = Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="constant",fill_value="unknown")),
    ("union",info_union)
])


# column transformer
column_transformer = ColumnTransformer(transformers=[
    ("airline",airline_transformer,["airline"]),
    ("doj",doj_transformer,["date_of_journey"]),
    ("location",location_transformer,["source","destination"]),
    ("time",time_transformer,["dep_time","arrival_time"]),
    ("dur",duration_transformer,["duration"]),
    ("stops",total_stops_transformer,["total_stops"]),
    ("info",additional_info_transformer,["additional_info"])
],remainder="passthrough")


# feature selector
estimator = RandomForestRegressor(n_estimators=10,max_depth=3,random_state=9)

selector = SelectBySingleFeaturePerformance(
    estimator=estimator,
    scoring="r2",
    threshold=0.1
)

# final preprocessor pipeline
preprocessor = Pipeline(steps=[
    ("column_tranformer",column_transformer),
    ("selector",selector)
])

In [19]:
preprocessor.fit(
    train.drop(columns="price"),
    train.price.copy()
)

In [20]:
preprocessor.transform(train.drop(columns="price"))

Unnamed: 0,airline__airline_Indigo,airline__airline_Jet Airways,airline__airline_Other,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__flight_duration_categories,dur__duration_rbf_25,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,0.0,0.0,0.823529,0.796610,1.040220,1.038009,2.0,-0.360200,1,1.170295,2.0,0
1,0.0,1.0,0.0,0.764706,0.737288,1.040220,1.038009,2.0,-0.360200,0,-0.159932,1.0,0
2,0.0,1.0,0.0,0.705882,0.686441,-0.175987,-0.196275,2.0,-0.360200,0,0.343942,1.0,0
3,0.0,1.0,0.0,0.647059,0.661017,1.040220,1.038009,1.0,-0.360200,0,-0.875432,0.0,1
4,0.0,0.0,0.0,0.647059,0.635593,-0.175987,-0.196275,0.0,-0.360200,0,-0.966129,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,0.0,0.0,1.0,1.000000,0.974576,-0.175987,-0.196275,0.0,-0.360200,0,-0.966129,0.0,1
6691,0.0,0.0,0.0,0.882353,0.898305,1.040220,1.038009,2.0,-0.360200,0,-0.190164,1.0,0
6692,1.0,0.0,0.0,0.411765,0.432203,1.040220,1.038009,1.0,-0.360200,0,-0.573108,1.0,0
6693,0.0,1.0,0.0,0.176471,0.194915,-0.175987,-0.196275,2.0,-0.360200,0,0.061773,1.0,0


## Preprocessing data and Uploading to S3 Bucket

In [21]:
BUCKET_NAME = "sagemaker-flights-bucket"
DATA_PREFIX = "data"

In [22]:
def get_file_name(name):
    return f"{name}_preprocessed.csv"

In [23]:
def export_data(data,name,pre):
    # splitting data into X and y
    X = data.drop(columns="price")
    y = data.price.copy()

    # transforming data
    X_pre = pre.transform(X)

    #exporting
    file_name = get_file_name(name)
    
    #Sagemaker needs [target,othercolumns...]
    (
        y
        .to_frame()
        .join(X_pre)
        .to_csv(file_name,index=False,header=False)
    )    

In [24]:
def upload_to_bucket(name):
    file_name = get_file_name(name)
    (
        boto3
        .Session()
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(os.path.join(DATA_PREFIX,f"{name}/{name}.csv"))
        .upload_file(file_name)
    )
    

In [25]:
def export_and_upload_to_bucket(data,name,pre):
    export_data(data,name,pre)
    upload_to_bucket(name)

In [26]:
export_and_upload_to_bucket(train,"train",preprocessor)

NoCredentialsError: Unable to locate credentials

In [27]:
export_and_upload_to_bucket(val,"validation",preprocessor)

NoCredentialsError: Unable to locate credentials

In [28]:
export_and_upload_to_bucket(test,"test",preprocessor)

NoCredentialsError: Unable to locate credentials

## Model and Hyperparameter tuning setup

#### Try using optuna on local for full hyperparameter training then use the final params to train on sagemaker

In [29]:
session = sagemaker.Session()
region_name = session.boto_region_name

ValueError: Must setup local AWS configuration with a region supported by SageMaker.

In [30]:
output_path = f"s3://{BUCKET_NAME}/model/output"

In [32]:
model = Estimator(
    image_uri = sagemaker.image_uris.retrieve("xgboost",region_name,"1.2-1"),
    role = sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
    sagemaker_session=session
)

NameError: name 'region_name' is not defined

In [34]:
model.set_hyperparameters(
    objective="reg:linear",     #mean squared error
    num_round=10,    # base estimators
    eta=0.1,
    max_depth=5,
    subsample_bytree=0.8,    # rows random sampling
    col=0.8,     #colums random sampling
    alpha=0.1
)

NameError: name 'model' is not defined

In [37]:
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05,0.2),
    "aplha": ContinuousParameter(0,1),
    "max_depth": IntegerParameter(3,5)
}

NameError: name 'ContinuousParameter' is not defined

In [36]:
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize"
)

NameError: name 'HyperparameterTuner' is not defined

## Data Channels

In [38]:
def get_data_channel(name):
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}"
    return TrainingInput(bucket_path,content_type="csv")

In [39]:
train_data_channel = get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x1e1ed1fd820>

In [40]:
val_data_channel = get_data_channel("validation")
val_data_channel

<sagemaker.inputs.TrainingInput at 0x1e1ed524aa0>

In [42]:
data_channels = {
    "train": train_data_channel,
    "validation": val_data_channel
}

## Training and Tuning model

In [44]:
tuner.fit(data_channels)

NameError: name 'tuner' is not defined

In [46]:
#### tuner.best_estimator().deploy()    ## to deploy model on sagemaker and use it but i have deployed it on a streamlit app to avoid aws charges

### Model Evaluation

In [51]:
with open("xgboost-model","rb") as f:
    best_model = pickle.load(f)

best_model

FileNotFoundError: [Errno 2] No such file or directory: 'xgboost-model'

In [56]:
def evaluate_model(name):
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
    X = xgb.DMatrix(data.iloc[:,1:])
    y = data.iloc[:,0].copy()
    
    pred = best_model.predict(X)
    
    return r2_score(y,pred)

In [None]:
evaluate_model("train")

In [None]:
evaluate_model("validation")

In [None]:
evaluate_model("test")