## 1. Import Libraries

In [41]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-manylinux2014_x86_64.whl (297.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.1/297.1 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.3


In [4]:
!pip install feature-engine



In [77]:
import os

import boto3

import pickle

import warnings

import numpy as np

import pandas as pd

import xgboost as xgb

import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

## 2. Display Settings

In [6]:
# to show all cpls of DataFrame
pd.set_option("display.max_columns", None)

In [7]:
# to ensure that the o/p of sklearn transformer are pandas DataFrame
sklearn.set_config(transform_output="pandas")

In [8]:
# to ignore warnings
warnings.filterwarnings("ignore")

## 3. Read the Datasets

In [9]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-05-21,Delhi,Cochin,14:05:00,17:55:00,230,0.0,No Info,6094
1,Indigo,2019-03-27,Delhi,Cochin,10:45:00,21:00:00,615,1.0,No Info,6838
2,Jet Airways,2019-05-12,Banglore,Delhi,19:50:00,22:50:00,180,0.0,No Info,7229
3,Jet Airways,2019-06-06,Kolkata,Banglore,20:25:00,14:25:00,1080,1.0,No Info,12692
4,Multiple Carriers,2019-05-15,Delhi,Cochin,11:40:00,19:15:00,455,1.0,No Info,9001


In [10]:
train.shape

(1600, 10)

In [11]:
val = pd.read_csv("val.csv")
val.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Vistara,2019-03-03,Banglore,New Delhi,16:00:00,18:35:00,155,0.0,No Info,13383
1,Jet Airways,2019-05-15,Kolkata,Banglore,08:25:00,20:45:00,740,1.0,No Info,14781
2,Air Asia,2019-03-01,Kolkata,Banglore,19:35:00,22:05:00,150,0.0,No Info,5964
3,Jet Airways,2019-06-27,Delhi,Cochin,06:45:00,12:35:00,1790,2.0,In-flight meal not included,10919
4,Multiple Carriers,2019-05-27,Delhi,Cochin,11:30:00,21:00:00,570,1.0,In-flight meal not included,6943


In [12]:
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Multiple Carriers,2019-06-12,Delhi,Cochin,14:00:00,01:30:00,690,1.0,No Info,13377
1,Jet Airways,2019-05-12,Kolkata,Banglore,20:25:00,21:05:00,1480,1.0,In-flight meal not included,10703
2,Air India,2019-04-01,Kolkata,Banglore,16:50:00,18:30:00,1540,1.0,No Info,7893
3,Air India,2019-06-12,Banglore,Delhi,21:05:00,23:55:00,170,0.0,No Info,5228
4,Goair,2019-06-06,Banglore,Delhi,11:40:00,14:30:00,170,0.0,No Info,3898


## 4. Preprocessing Operations

In [13]:
# airline

air_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

#doj

feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
    ("scaler", MinMaxScaler())
])

# source and destination

location_pipe1 = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
])

def is_north(X):
    columns = X.columns.to_list()
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    return (
        X
        .assign(**{
            f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )

location_transformer = FeatureUnion(transformer_list=[
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north))
])

# dep_time & arrival_time
time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler())
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    columns = X.columns.to_list()
    X_temp = X.assign(**{
        col: pd.to_datetime(X.loc[:, col]).dt.hour
        for col in columns
    })

    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day": np.select(
                [X_temp.loc[:, col].between(morning, noon, inclusive="left"),
                 X_temp.loc[:, col].between(noon, eve, inclusive="left"),
                 X_temp.loc[:, col].between(eve, night, inclusive="left")],
                ["morning", "afternoon", "evening"],
                default="night"
            )
            for col in columns
        })
        .drop(columns=columns)
    )

time_pipe2 = Pipeline(steps=[
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
])

time_transformer = FeatureUnion(transformer_list=[
    ("part1", time_pipe1),
    ("part2", time_pipe2)
])

# duration
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma


    def fit(self, X, y=None):
        if not self.variables:
            self.variables = X.select_dtypes(include="number").columns.to_list()

        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables
        }

        return self


    def transform(self, X):
        objects = []
        for col in self.variables:
            columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns
            )
            objects.append(obj)
        return pd.concat(objects, axis=1)
    

def duration_category(X, short=180, med=400):
    return (
        X
        .assign(duration_cat=np.select([X.duration.lt(short),
                                    X.duration.between(short, med, inclusive="left")],
                                    ["short", "medium"],
                                    default="long"))
        .drop(columns="duration")
    )

def is_over(X, value=1000):
    return (
        X
        .assign(**{
            f"duration_over_{value}": X.duration.ge(value).astype(int)
        })
        .drop(columns="duration")
    )

duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=duration_category)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", duration_union)
])

# total_stops
def is_direct(X):
    return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))


total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("", FunctionTransformer(func=is_direct))
])

# additional_info
info_pipe1 = Pipeline(steps=[
    ("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

def have_info(X):
    return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

info_union = FeatureUnion(transformer_list=[
("part1", info_pipe1),
("part2", FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
("union", info_union)
])

# column transformer
column_transformer = ColumnTransformer(transformers=[
("air", air_transformer, ["airline"]),
("doj", doj_transformer, ["date_of_journey"]),
("location", location_transformer, ["source", 'destination']),
("time", time_transformer, ["dep_time", "arrival_time"]),
("dur", duration_transformer, ["duration"]),
("stops", total_stops_transformer, ["total_stops"]),
("info", info_transformer, ["additional_info"])
], remainder="passthrough")

# feature selector

estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
estimator=estimator,
scoring="r2",
threshold=0.1
) 

# preprocessor

preprocessor = Pipeline(steps=[
("ct", column_transformer),
("selector", selector)
])

In [14]:
preprocessor.fit(
    train.drop(columns="price"),
    train.price.copy()
)

In [15]:
preprocessor.transform(train.drop(columns="price"))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Other,doj__date_of_journey_month,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,0.0,0.0,0.666667,0.705882,0.686441,1.019799,1.018865,-0.328770,1.0,0,-0.807925,0.0,1
1,1.0,0.0,0.0,0.000000,0.235294,0.220339,1.019799,1.018865,-0.328770,2.0,0,-0.027156,1.0,0
2,0.0,1.0,0.0,0.666667,0.588235,0.610169,-0.938081,-1.862572,1.412739,1.0,0,-0.909323,0.0,1
3,0.0,1.0,0.0,1.000000,0.823529,0.822034,-0.121222,-0.131749,-0.328770,2.0,1,0.915851,1.0,0
4,0.0,0.0,0.0,0.666667,0.647059,0.635593,1.019799,1.018865,-0.328770,2.0,0,-0.351631,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,0.0,1.0,0.0,0.666667,0.529412,0.533898,-0.938081,-1.862572,-0.272984,0.0,0,-0.939743,0.0,1
1596,0.0,1.0,0.0,0.666667,0.588235,0.559322,-0.121222,-0.131749,-0.328770,2.0,0,0.429138,1.0,0
1597,1.0,0.0,0.0,0.000000,0.000000,0.016949,1.019799,1.018865,-0.328770,2.0,0,-0.422610,1.0,0
1598,0.0,0.0,0.0,0.000000,0.176471,0.169492,1.019799,1.018865,-0.328770,2.0,0,-0.148834,1.0,0


## 5. Preprocess Data and Upload to Bucket

In [17]:
BUCKET_NAME = "sagemaker-flight-buckets"

DATA_PREFIX = "data"

In [18]:
def get_file_name(name):
    return f"{name}-pre.csv"

In [19]:
def export_data(data, name, pre):
    # split data into X and y subsets
    X = data.drop(columns="price")
    y = data.price.copy()
    
    # transformation
    X_pre = pre.transform(X)
    
    # exporting
    file_name = get_file_name(name)
    (
        y
        .to_frame()
        .join(X_pre)
        .to_csv(file_name, index=False, header=False)
    )

In [20]:
# X_tr = train.drop(columns="price")
# y_tr = train.price.copy()

# (
#     y_tr
#     .to_frame()
#     .join(X_tr)
# )

In [21]:
def upload_to_bucket(name):
    file_name = get_file_name(name)
    
    (
        boto3
        .Session()
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(os.path.join(DATA_PREFIX, f"{name}/{name}.csv"))
        .upload_file(file_name)
    )

In [22]:
def export_and_upload_bucket(data, name, pre):
    export_data(data, name, pre)
    upload_to_bucket(name)

In [23]:
export_and_upload_bucket(train, "train", preprocessor)

In [24]:
# to see the uploaded data
# columns names are in decimals or digits 
# sagemaker does not take cols name from data

# pd.read_csv("train-pre.csv")

In [25]:
export_and_upload_bucket(val, "val", preprocessor)

In [26]:
export_and_upload_bucket(test, "test", preprocessor)

## 6. Model and Hyperparameter Tuning Set-up

In [27]:
# informing on which environment we are working
session = sagemaker.Session()
# telling sagemaker to use our model
region_name = session.boto_region_name

In [28]:
# where to save the model
output_path = f"s3://{BUCKET_NAME}/model/output"

In [29]:
# our model using "xgboost" algorithm
model = Estimator(
    image_uri=sagemaker.image_uris.retrieve("xgboost", region_name, "1.2-1"),
    # IAM role
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
    sagemaker_session=session
)

In [30]:
# with below parameters
model.set_hyperparameters(
    objective="reg:linear",
    num_round=10,
    eta=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    alpha=0.1
)

In [31]:
# setting limits for hyperparameter tuner
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05, 0.2),
    "alpha": ContinuousParameter(0, 1),
    "max_depth": IntegerParameter(3, 5)
}

In [32]:
# setting up hyperparameter tuner

tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize"
)

## 7. Data Channels

In [33]:
# data location
def get_data_channel(name):
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}"
    return TrainingInput(bucket_path, content_type="csv")

In [34]:
train_data_channel = get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7f7506881240>

In [35]:
val_data_channel = get_data_channel("val")

In [36]:
data_channels = {
    "train": train_data_channel,
    "validation": val_data_channel
}

## 8. Train and Tune the Model

In [37]:
tuner.fit(data_channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


......................................!


In [38]:
# in above cell training done

In [39]:
# to get best model

# tuner.best_estimator()

In [40]:
# to deploy

# tuner.best_estimator().deploy()

## 9. Model Evaluation

In [72]:
with open("xgboost-model", "rb") as f:
    best_model = pickle.load(f)
    
best_model

<xgboost.core.Booster at 0x7f7504f8be80>

In [61]:
# import tarfile

# # Path to the downloaded tar.gz file
# tar_path = r"model.tar.gz"
# extract_path = r"xgboost-model"

# # Extract the tar.gz file
# with tarfile.open(tar_path, 'r:gz') as tar:
#     tar.extractall(path=extract_path)

# model_file_path = 'xgboost-model/xgboost-model'

# # Load the model
# with open(model_file_path, 'rb') as model_file:
#     best_model = pickle.load(model_file)

# best_model    

In [74]:
pd.read_csv("train-pre.csv").head()

Unnamed: 0,6094,0.0,0.0.1,0.0.2,0.6666666666666665,0.7058823529411765,0.6864406779661016,1.019798846218085,1.0188650540533362,-0.3287703411251796,1.0,0,-0.8079248964036058,0.0.3,1
0,6838,1.0,0.0,0.0,0.0,0.235294,0.220339,1.019799,1.018865,-0.32877,2.0,0,-0.027156,1.0,0
1,7229,0.0,1.0,0.0,0.666667,0.588235,0.610169,-0.938081,-1.862572,1.412739,1.0,0,-0.909323,0.0,1
2,12692,0.0,1.0,0.0,1.0,0.823529,0.822034,-0.121222,-0.131749,-0.32877,2.0,1,0.915851,1.0,0
3,9001,0.0,0.0,0.0,0.666667,0.647059,0.635593,1.019799,1.018865,-0.32877,2.0,0,-0.351631,1.0,0
4,12898,0.0,1.0,0.0,0.666667,0.705882,0.686441,1.019799,1.018865,-0.32877,2.0,1,0.763753,1.0,0


In [75]:
# to get target column
pd.read_csv("train-pre.csv").iloc[:,0]

0        6838
1        7229
2       12692
3        9001
4       12898
        ...  
1594     7229
1595    12121
1596    14871
1597     8401
1598     2753
Name: 6094, Length: 1599, dtype: int64

In [76]:
# to get all other cols
pd.read_csv("train-pre.csv").iloc[:,1:]

Unnamed: 0,0.0,0.0.1,0.0.2,0.6666666666666665,0.7058823529411765,0.6864406779661016,1.019798846218085,1.0188650540533362,-0.3287703411251796,1.0,0,-0.8079248964036058,0.0.3,1
0,1.0,0.0,0.0,0.000000,0.235294,0.220339,1.019799,1.018865,-0.328770,2.0,0,-0.027156,1.0,0
1,0.0,1.0,0.0,0.666667,0.588235,0.610169,-0.938081,-1.862572,1.412739,1.0,0,-0.909323,0.0,1
2,0.0,1.0,0.0,1.000000,0.823529,0.822034,-0.121222,-0.131749,-0.328770,2.0,1,0.915851,1.0,0
3,0.0,0.0,0.0,0.666667,0.647059,0.635593,1.019799,1.018865,-0.328770,2.0,0,-0.351631,1.0,0
4,0.0,1.0,0.0,0.666667,0.705882,0.686441,1.019799,1.018865,-0.328770,2.0,1,0.763753,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594,0.0,1.0,0.0,0.666667,0.529412,0.533898,-0.938081,-1.862572,-0.272984,0.0,0,-0.939743,0.0,1
1595,0.0,1.0,0.0,0.666667,0.588235,0.559322,-0.121222,-0.131749,-0.328770,2.0,0,0.429138,1.0,0
1596,1.0,0.0,0.0,0.000000,0.000000,0.016949,1.019799,1.018865,-0.328770,2.0,0,-0.422610,1.0,0
1597,0.0,0.0,0.0,0.000000,0.176471,0.169492,1.019799,1.018865,-0.328770,2.0,0,-0.148834,1.0,0


In [78]:
def evaluate_model(name):
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
    X = xgb.DMatrix(data.iloc[:, 1:])
    y = data.iloc[:, 0].copy()
    
    pred = best_model.predict(X)
    
    return r2_score(y, pred)

In [79]:
evaluate_model("train")

0.6773635541384203

In [80]:
evaluate_model("val")

0.5497829679647479

In [81]:
evaluate_model("test")

0.6464836845612261