## 1. Import libraries

In [257]:
# !pip install xgboost


Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3


In [262]:
import os
import pickle 
import boto3

import numpy as np

import pandas as pd

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import matplotlib.pyplot as plt

import warnings

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

import xgboost as xgb

## 2. Display settings

In [181]:
pd.set_option("display.max_columns", None)
sklearn.set_config(transform_output = "pandas")
warnings.filterwarnings("ignore")

## 3. Read Datasets

In [182]:
train = pd.read_csv("train.csv")
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-12,Delhi,Cochin,20:55:00,19:00:00,1325,1.0,In-flight meal not included,10262
1,Jet Airways,2019-05-27,Banglore,Delhi,15:15:00,18:10:00,175,0.0,In-flight meal not included,3502
2,Jet Airways,2019-04-01,Kolkata,Banglore,20:25:00,18:00:00,1295,1.0,No Info,11467
3,Jet Airways,2019-04-01,Kolkata,Banglore,09:35:00,21:05:00,690,1.0,No Info,10953
4,Spicejet,2019-04-01,Banglore,Delhi,05:55:00,08:35:00,160,0.0,No check-in baggage included,3919
...,...,...,...,...,...,...,...,...,...,...
635,Air India,2019-04-24,Banglore,Delhi,13:20:00,16:10:00,170,0.0,No Info,6121
636,Indigo,2019-04-01,Mumbai,Hyderabad,09:10:00,10:40:00,90,0.0,No Info,3342
637,Indigo,2019-06-27,Chennai,Kolkata,14:45:00,17:05:00,140,0.0,No Info,3597
638,Indigo,2019-03-06,Mumbai,Hyderabad,02:30:00,04:00:00,90,0.0,No Info,3175


In [183]:
val = pd.read_csv("val.csv")
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-03-12,Chennai,Kolkata,22:05:00,00:30:00,145,0.0,No Info,6297
1,Jet Airways,2019-06-06,Banglore,Delhi,19:50:00,22:50:00,180,0.0,No Info,8016
2,Air India,2019-04-21,Banglore,Delhi,10:00:00,12:45:00,165,0.0,No Info,5228
3,Jet Airways,2019-05-24,Kolkata,Banglore,20:00:00,12:00:00,960,1.0,No Info,14571
4,Jet Airways,2019-05-21,Delhi,Cochin,20:55:00,04:25:00,450,1.0,No Info,16079
...,...,...,...,...,...,...,...,...,...,...
155,Jet Airways,2019-05-01,Kolkata,Banglore,21:10:00,10:05:00,775,1.0,In-flight meal not included,10844
156,Jet Airways,2019-05-18,Delhi,Cochin,19:10:00,19:00:00,1430,2.0,No Info,16704
157,Spicejet,2019-06-09,Kolkata,Banglore,09:00:00,11:25:00,145,0.0,No check-in baggage included,3841
158,Air India,2019-05-24,Kolkata,Banglore,09:25:00,10:30:00,1505,2.0,No Info,12797


In [184]:
test = pd.read_csv("test.csv")
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-04-21,Kolkata,Banglore,20:20:00,22:55:00,155,0.0,No Info,4804
1,Air Asia,2019-05-21,Kolkata,Banglore,09:45:00,12:15:00,150,0.0,No Info,4409
2,Indigo,2019-03-27,Delhi,Cochin,07:35:00,16:10:00,515,1.0,No Info,6442
3,Vistara,2019-03-09,Banglore,New Delhi,07:00:00,09:40:00,160,0.0,No Info,7975
4,Jet Airways,2019-05-18,Delhi,Cochin,20:55:00,19:00:00,1325,1.0,In-flight meal not included,12373
...,...,...,...,...,...,...,...,...,...,...
195,Spicejet,2019-06-09,Chennai,Kolkata,09:45:00,12:00:00,135,0.0,No Info,3597
196,Indigo,2019-03-03,Banglore,New Delhi,18:25:00,21:20:00,175,0.0,No Info,8855
197,Jet Airways,2019-06-01,Delhi,Cochin,18:15:00,12:35:00,1100,1.0,In-flight meal not included,10262
198,Multiple Carriers,2019-06-12,Delhi,Cochin,18:00:00,01:30:00,450,1.0,No Info,7198


## 4. Preprocessing Operations

In [185]:
# airline
air_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

#doj
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
    ("scaler", MinMaxScaler())
])

# source & destination
location_pipe1 = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
])

def is_north(X):
    columns = X.columns.to_list()
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    return (
        X
        .assign(**{
            f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )

location_transformer = FeatureUnion(transformer_list=[
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north))
])

# dep_time & arrival_time
time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler())
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    columns = X.columns.to_list()
    X_temp = X.assign(**{
        col: pd.to_datetime(X.loc[:, col]).dt.hour
        for col in columns
    })

    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day": np.select(
                [X_temp.loc[:, col].between(morning, noon, inclusive="left"),
                 X_temp.loc[:, col].between(noon, eve, inclusive="left"),
                 X_temp.loc[:, col].between(eve, night, inclusive="left")],
                ["morning", "afternoon", "evening"],
                default="night"
            )
            for col in columns
        })
        .drop(columns=columns)
    )

time_pipe2 = Pipeline(steps=[
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
])

time_transformer = FeatureUnion(transformer_list=[
    ("part1", time_pipe1),
    ("part2", time_pipe2)
])

# duration
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma


    def fit(self, X, y=None):
        if not self.variables:
            self.variables = X.select_dtypes(include="number").columns.to_list()

        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables
        }

        return self


    def transform(self, X):
        objects = []
        for col in self.variables:
            columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns
            )
            objects.append(obj)
        return pd.concat(objects, axis=1)
    

def duration_category(X, short=180, med=400):
    return (
        X
        .assign(duration_cat=np.select([X.duration.lt(short),
                                    X.duration.between(short, med, inclusive="left")],
                                    ["short", "medium"],
                                    default="long"))
        .drop(columns="duration")
    )

def is_over(X, value=1000):
    return (
        X
        .assign(**{
            f"duration_over_{value}": X.duration.ge(value).astype(int)
        })
        .drop(columns="duration")
    )

duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=duration_category)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", duration_union)
])

# total_stops
def is_direct(X):
    return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))


total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("", FunctionTransformer(func=is_direct))
])

# additional_info
info_pipe1 = Pipeline(steps=[
    ("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

def have_info(X):
    return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

info_union = FeatureUnion(transformer_list=[
("part1", info_pipe1),
("part2", FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
("union", info_union)
])

# column transformer
column_transformer = ColumnTransformer(transformers=[
("air", air_transformer, ["airline"]),
("doj", doj_transformer, ["date_of_journey"]),
("location", location_transformer, ["source", 'destination']),
("time", time_transformer, ["dep_time", "arrival_time"]),
("dur", duration_transformer, ["duration"]),
("stops", total_stops_transformer, ["total_stops"]),
("info", info_transformer, ["additional_info"])
], remainder="passthrough")

# feature selector
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
estimator=estimator,
scoring="r2",
threshold=0.1
) 

# preprocessor
preprocessor = Pipeline(steps=[
("ct", column_transformer),
("selector", selector)
])

In [186]:
preprocessor.fit(
    train.drop(columns = "price"),
    train.price.copy()
    
)

In [208]:
preprocessor.transform(train.drop(columns = "price"))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Other,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,1.0,0.0,0.882353,0.872881,0.919922,0.920491,-0.330408,2.0,1,1.315738,1.0,0
1,0.0,1.0,0.0,0.764706,0.737288,-1.114170,-1.858790,3.204046,0.0,0,-0.958002,0.0,1
2,0.0,1.0,0.0,0.294118,0.262712,0.148166,0.161202,-0.330408,2.0,1,1.256423,1.0,0
3,0.0,1.0,0.0,0.294118,0.262712,0.148166,0.161202,-0.330408,2.0,0,0.060238,1.0,0
4,0.0,0.0,1.0,0.294118,0.262712,-1.114170,-1.858790,-0.330408,0.0,0,-0.987660,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,0.0,0.0,0.470588,0.457627,-1.114170,-1.858790,2.946623,0.0,0,-0.967888,0.0,1
636,1.0,0.0,0.0,0.294118,0.262712,-1.888971,-1.188332,-0.330408,0.0,0,-1.126061,0.0,1
637,1.0,0.0,0.0,1.000000,1.000000,-1.888971,-1.188332,-0.330408,0.0,0,-1.027203,0.0,1
638,1.0,0.0,0.0,0.058824,0.042373,-1.888971,-1.188332,-0.330408,0.0,0,-1.126061,0.0,1


## 5. Preprocess Data and Upload to Bucket 

In [209]:
BUCKET_NAME = "sagemaker-flights-bucket-01"

DATA_PREFIX = "data"

In [210]:
def get_file_name(name):
    return f"{name}-pre.csv"

In [211]:
def export_data(data, name, pre):
    # split data into X and y subsets
    X = data.drop(columns="price")
    y = data.price.copy()
    
    # transformation
    X_pre = pre.transform(X)
    
    # exporting
    file_name = get_file_name(name)
    (
        y
        .to_frame()
        .join(X_pre)
        .to_csv(file_name, index=False, header=False)
    )

In [212]:
def upload_to_bucket(name):
    file_name = get_file_name(name)
    
    (
        boto3
        .Session()
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(os.path.join(DATA_PREFIX, f"{name}/{name}.csv"))
        .upload_file(file_name)
    )

In [213]:
def export_and_upload_bucket(data, name, pre):
    export_data(data, name, pre)
    upload_to_bucket(name)

In [214]:
export_and_upload_bucket(train,"train", preprocessor)

In [215]:
export_and_upload_bucket(val, "val", preprocessor)


In [216]:
export_and_upload_bucket(val, "test", preprocessor)


In [217]:
pd.read_csv("train-pre.csv")

Unnamed: 0,10262,0.0,1.0,0.0.1,0.8823529411764705,0.8728813559322035,0.9199224633560013,0.9204909257936135,-0.33040797104120173,2.0,1,1.3157377447783654,1.0.1,0
0,3502,0.0,1.0,0.0,0.764706,0.737288,-1.114170,-1.858790,3.204046,0.0,0,-0.958002,0.0,1
1,11467,0.0,1.0,0.0,0.294118,0.262712,0.148166,0.161202,-0.330408,2.0,1,1.256423,1.0,0
2,10953,0.0,1.0,0.0,0.294118,0.262712,0.148166,0.161202,-0.330408,2.0,0,0.060238,1.0,0
3,3919,0.0,0.0,1.0,0.294118,0.262712,-1.114170,-1.858790,-0.330408,0.0,0,-0.987660,0.0,1
4,5192,1.0,0.0,0.0,0.705882,0.711864,0.148166,0.161202,-0.330408,1.0,0,-0.730628,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,6121,0.0,0.0,0.0,0.470588,0.457627,-1.114170,-1.858790,2.946623,0.0,0,-0.967888,0.0,1
635,3342,1.0,0.0,0.0,0.294118,0.262712,-1.888971,-1.188332,-0.330408,0.0,0,-1.126061,0.0,1
636,3597,1.0,0.0,0.0,1.000000,1.000000,-1.888971,-1.188332,-0.330408,0.0,0,-1.027203,0.0,1
637,3175,1.0,0.0,0.0,0.058824,0.042373,-1.888971,-1.188332,-0.330408,0.0,0,-1.126061,0.0,1


## 5.  Model and Hyperparameter Tuning Set-up

In [229]:
# session = sagemaker.Session()
# region_name = session.boto_region_name

# Set the region
region = 'eu-central-1'
boto_session = sagemaker.Session(boto3.Session(region_name=region))


In [230]:
#where to save the model 
output_path = f"s3://{BUCKET_NAME}/model/output"

In [243]:
model = Estimator(
    image_uri=sagemaker.image_uris.retrieve("xgboost", region_name, "1.2-1"),  #docker images of algorithm     #uri - unofrom resource identifier

    role=sagemaker.get_execution_role(),  # Iamrole
    instance_count=1,                     # how many ec2 instance we need
    instance_type="ml.m5.xlarge",         # Instance type instance_type  # or any other available instance type

    volume_size=5,                        # size of s3 (5gb)
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,                          # 300 sec
    max_wait=600,                         # 600 sec to save rhe cost
    sagemaker_session=boto_session
)

In [244]:
model.set_hyperparameters(
    objective="reg:linear", # mean squared error
    num_round=10,           # base estimator
    eta=0.1,                # learning rate 
    max_depth=5,
    subsample=0.8,           # randomly sample 80% of rows for training individual tree
    colsample_bytree=0.8,    # 80% of the columns
    alpha=0.1                # L2 regularization
)

In [245]:
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05, 0.2),
    "alpha": ContinuousParameter(0, 1),
    "max_depth": IntegerParameter(3, 5)
}

In [246]:
# Set the region
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize",

)

## 6. Data Channels

In [247]:
#connecting model to s3 bucket

def get_data_channel(name):
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}"
    return TrainingInput(bucket_path, content_type="csv")

In [248]:
train_data_channel = get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7f4f11993b50>

In [249]:
val_data_channel = get_data_channel("val")
val_data_channel

<sagemaker.inputs.TrainingInput at 0x7f4f11993e20>

In [250]:
data_channels = {
    "train": train_data_channel,
    "validation": val_data_channel
}

## 7. Train and Tune the Model

In [251]:
tuner.fit(inputs=data_channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


...........................!


In [253]:
# how to deploy model on sagemaker
# tuner.best_estimator().deploy()

## 8 Model Evaluation

In [258]:
with open("xgboost-model", "rb") as f:
    best_model = pickle.load(f)
    
best_model

<xgboost.core.Booster at 0x7f4f10fa1360>

In [260]:
def evaluate_model(name):
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
    X = xgb.DMatrix(data.iloc[:, 1:])                             # specific data format for xgb model
    y = data.iloc[:, 0].copy()
    
    pred = best_model.predict(X) 
    
    return r2_score(y, pred)

In [263]:
evaluate_model("train")


-0.0753380597865243

In [264]:
evaluate_model("val")


-0.11946285004791002

In [265]:
evaluate_model("test")


-0.11946285004791002