# XGBoost

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import xgboost as xgb

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Project level modules
import modules.preprocessing_functions as ppf

In [2]:
# Random state: a single seed reused by the train/test split and the
# XGBoost parameter dict below so results are repeatable.
rs: int = 99

## Load Data

In [3]:
# Folder holding the monthly flight CSV files (user-relative path).
google_drive_path = (
    '~/Google Drive/My Drive/Lighthouse Labs/'
    'Mid-term Project/Data-Jan/'
)

In [4]:
# Load a small sample of the January 2018 file: only the first 10 data rows
# are read, and the first CSV column is parsed as datetimes.
jan_18_csv = google_drive_path + '2018-01.csv'
jan_18_df = pd.read_csv(jan_18_csv, nrows=10, parse_dates=[0])

In [6]:
# Sanity check: the first fl_date value should be a parsed Timestamp
type(jan_18_df.loc[0, 'fl_date'])

pandas._libs.tslibs.timestamps.Timestamp

## Data Split

In [5]:
# Predictor features set, selected by the project helper.
# NOTE(review): the meaning of `purged=True` is defined in
# modules.preprocessing_functions and is not visible here — confirm.
X = ppf.flight_test_features(jan_18_df, purged=True)
# Target variable: the `arr_delay` column of the loaded frame
y = jan_18_df['arr_delay']

In [6]:
# Display the predictor feature frame built above
X

Unnamed: 0,fl_date,op_unique_carrier,tail_num,origin,origin_city_name,dest,dest_city_name,crs_dep_time,crs_arr_time,distance
0,2018-01-01,9E,N292PQ,TVC,"Traverse City, MI",DTW,"Detroit, MI",615,738,207
1,2018-01-01,9E,N931XJ,AVL,"Asheville, NC",ATL,"Atlanta, GA",615,724,164
2,2018-01-01,9E,N916XJ,DTW,"Detroit, MI",PIT,"Pittsburgh, PA",1200,1318,201
3,2018-01-01,9E,N937XJ,PIT,"Pittsburgh, PA",DTW,"Detroit, MI",1330,1455,201
4,2018-01-01,9E,N316PQ,ATL,"Atlanta, GA",GSO,"Greensboro/High Point, NC",1500,1618,306
5,2018-01-01,9E,N604LR,BNA,"Nashville, TN",DTW,"Detroit, MI",835,1118,456
6,2018-01-01,9E,N176PQ,DTW,"Detroit, MI",CVG,"Cincinnati, OH",2015,2151,229
7,2018-01-01,9E,N903XJ,ILM,"Wilmington, NC",ATL,"Atlanta, GA",700,846,377
8,2018-01-01,9E,N308PQ,DTW,"Detroit, MI",DSM,"Des Moines, IA",2000,2110,533
9,2018-01-01,B6,N954JB,JFK,"New York, NY",FLL,"Fort Lauderdale, FL",1055,1405,1069


## Preprocessing

In [None]:
# Preprocessing plan:
#   1. Replace missing values (NA)
#   2. Split X into categorical and numeric features
#   3. Numeric features: scale and transform
#   4. Categorical features: one-hot encode (dummy variables)

### Categorical

In [8]:
# Partition the predictors into a numeric frame and a categorical frame
# using the project helper (its exact split rule lives in
# modules.preprocessing_functions — presumably dtype based; verify).
X_numerical, X_categorical = ppf.numerical_categorical_split(df=X)

In [9]:
# Display the categorical partition returned by the split
X_categorical

Unnamed: 0,fl_date,op_unique_carrier,tail_num,origin,origin_city_name,dest,dest_city_name
0,2018-01-01,9E,N292PQ,TVC,"Traverse City, MI",DTW,"Detroit, MI"
1,2018-01-01,9E,N931XJ,AVL,"Asheville, NC",ATL,"Atlanta, GA"
2,2018-01-01,9E,N916XJ,DTW,"Detroit, MI",PIT,"Pittsburgh, PA"
3,2018-01-01,9E,N937XJ,PIT,"Pittsburgh, PA",DTW,"Detroit, MI"
4,2018-01-01,9E,N316PQ,ATL,"Atlanta, GA",GSO,"Greensboro/High Point, NC"
5,2018-01-01,9E,N604LR,BNA,"Nashville, TN",DTW,"Detroit, MI"
6,2018-01-01,9E,N176PQ,DTW,"Detroit, MI",CVG,"Cincinnati, OH"
7,2018-01-01,9E,N903XJ,ILM,"Wilmington, NC",ATL,"Atlanta, GA"
8,2018-01-01,9E,N308PQ,DTW,"Detroit, MI",DSM,"Des Moines, IA"
9,2018-01-01,B6,N954JB,JFK,"New York, NY",FLL,"Fort Lauderdale, FL"


In [10]:
# Display the numeric partition returned by the split
X_numerical

Unnamed: 0,crs_dep_time,crs_arr_time,distance
0,615,738,207
1,615,724,164
2,1200,1318,201
3,1330,1455,201
4,1500,1618,306
5,835,1118,456
6,2015,2151,229
7,700,846,377
8,2000,2110,533
9,1055,1405,1069


#### One hot encoding
One-hot encoding is applied before the train/test split, following the Machine Learning Mastery tutorial [Data Preparation for Gradient Boosting with XGBoost in Python](https://machinelearningmastery.com/data-preparation-gradient-boosting-xgboost-python/).

In [11]:
# One hot encoding
# Name the columns to encode explicitly instead of relying on `fl_date`
# being the first column: every column except `fl_date` is encoded, and
# `fl_date` is passed through untouched however many distinct dates it holds.
# Dummy column names default to "<column>_<value>", so no `prefix` is needed.
one_hot_columns = [col for col in X_categorical.columns if col != 'fl_date']
X_cat_one_hot = pd.get_dummies(
    X_categorical,
    columns=one_hot_columns,
    # Drop the first level of each feature to avoid redundant columns
    drop_first=True
)
X_cat_one_hot

Unnamed: 0,fl_date,op_unique_carrier_B6,tail_num_N292PQ,tail_num_N308PQ,tail_num_N316PQ,tail_num_N604LR,tail_num_N903XJ,tail_num_N916XJ,tail_num_N931XJ,tail_num_N937XJ,...,dest_DTW,dest_FLL,dest_GSO,dest_PIT,"dest_city_name_Cincinnati, OH","dest_city_name_Des Moines, IA","dest_city_name_Detroit, MI","dest_city_name_Fort Lauderdale, FL","dest_city_name_Greensboro/High Point, NC","dest_city_name_Pittsburgh, PA"
0,2018-01-01,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
1,2018-01-01,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2018-01-01,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
3,2018-01-01,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
4,2018-01-01,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
5,2018-01-01,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
6,2018-01-01,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,2018-01-01,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,2018-01-01,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
9,2018-01-01,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0


In [12]:
# Reconstitute the full feature table: one-hot encoded columns first,
# numeric columns appended on the right (rows are aligned by index).
X_one_hot = pd.concat(
    [X_cat_one_hot, X_numerical],
    axis='columns'
)
X_one_hot

Unnamed: 0,fl_date,op_unique_carrier_B6,tail_num_N292PQ,tail_num_N308PQ,tail_num_N316PQ,tail_num_N604LR,tail_num_N903XJ,tail_num_N916XJ,tail_num_N931XJ,tail_num_N937XJ,...,dest_PIT,"dest_city_name_Cincinnati, OH","dest_city_name_Des Moines, IA","dest_city_name_Detroit, MI","dest_city_name_Fort Lauderdale, FL","dest_city_name_Greensboro/High Point, NC","dest_city_name_Pittsburgh, PA",crs_dep_time,crs_arr_time,distance
0,2018-01-01,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,615,738,207
1,2018-01-01,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,615,724,164
2,2018-01-01,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,1200,1318,201
3,2018-01-01,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,1330,1455,201
4,2018-01-01,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1500,1618,306
5,2018-01-01,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,835,1118,456
6,2018-01-01,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,2015,2151,229
7,2018-01-01,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,700,846,377
8,2018-01-01,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,2000,2110,533
9,2018-01-01,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1055,1405,1069


### Train Test Split

In [50]:
# 80 % / 20 % train/test partition, seeded with the notebook random state
splits = train_test_split(
    X_one_hot,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=rs,
)
X_train, X_test, y_train, y_test = splits

In [51]:
# Give every split a fresh 0..n-1 index. The scaled numeric frames built
# later get a default index, and the column-wise concat aligns on index.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

### Numerical

In [82]:
# Columns treated as numeric from here on: the numeric partition plus
# `fl_date`, which is converted to a number in a later cell.
numeric_features = [*X_numerical.columns, 'fl_date']

In [94]:
# Numeric columns are copied so they can be modified independently;
# whatever remains is the one hot encoded part.
X_train_num = X_train.loc[:, numeric_features].copy()
X_train_cat = X_train.drop(columns=numeric_features)

X_test_num = X_test.loc[:, numeric_features].copy()
X_test_cat = X_test.drop(columns=numeric_features)

In [95]:
def _fl_date_to_number(frame):
    """Replace the datetime `fl_date` column of `frame` with a float, in place.

    The value is nanoseconds since the Unix epoch divided by 1e11, the same
    scale the notebook used before. `Series.view(int)` is avoided because it
    is deprecated in pandas and its integer width is platform dependent.

    The conversion only runs while the column is still a datetime, so
    re-running the cell leaves already-converted values untouched.
    Assumes tz-naive datetimes, as produced by `pd.read_csv(parse_dates=...)`.
    """
    if frame['fl_date'].dtype.kind == 'M':
        # Pin the unit to nanoseconds before taking the integer value
        epoch_ns = frame['fl_date'].astype('datetime64[ns]').astype('int64')
        # Whole-column assignment swaps the dtype cleanly (no in-place upcast)
        frame['fl_date'] = epoch_ns / 1e11


_fl_date_to_number(X_train_num)
_fl_date_to_number(X_test_num)

In [96]:
# Inspect the numeric training features after the fl_date conversion
X_train_num

Unnamed: 0,crs_dep_time,crs_arr_time,distance,fl_date
0,1500,1618,306,15147648.0
1,1200,1318,201,15147648.0
2,2015,2151,229,15147648.0
3,700,846,377,15147648.0
4,615,738,207,15147648.0
5,1055,1405,1069,15147648.0
6,1330,1455,201,15147648.0
7,615,724,164,15147648.0


In [97]:
# Scale numeric features
# The min-max scaler is fitted on the training data only and then applied to
# both splits, so the test data does not influence the scaling.
scaler = MinMaxScaler()
train_scaled_values = scaler.fit_transform(X=X_train_num)
test_scaled_values = scaler.transform(X=X_test_num)

# Wrap the returned arrays back into frames with the original column names
X_train_num_scaled = pd.DataFrame(
    train_scaled_values, columns=X_train_num.columns
)
X_test_num_scaled = pd.DataFrame(
    test_scaled_values, columns=X_test_num.columns
)

In [98]:
# Inspect the scaled numeric test features
X_test_num_scaled

Unnamed: 0,crs_dep_time,crs_arr_time,distance,fl_date
0,0.989286,0.971268,0.407735,0.0
1,0.157143,0.276104,0.322652,0.0


In [99]:
# Reconstitute train and test datasets: one hot encoded columns followed by
# the scaled numeric columns. This rebinds X_train / X_test.
X_train = pd.concat(
    [X_train_cat, X_train_num_scaled],
    axis='columns'
)
X_test = pd.concat(
    [X_test_cat, X_test_num_scaled],
    axis='columns'
)

In [100]:
# Inspect the recombined training frame
X_train

Unnamed: 0,op_unique_carrier_B6,tail_num_N292PQ,tail_num_N308PQ,tail_num_N316PQ,tail_num_N604LR,tail_num_N903XJ,tail_num_N916XJ,tail_num_N931XJ,tail_num_N937XJ,tail_num_N954JB,...,"dest_city_name_Cincinnati, OH","dest_city_name_Des Moines, IA","dest_city_name_Detroit, MI","dest_city_name_Fort Lauderdale, FL","dest_city_name_Greensboro/High Point, NC","dest_city_name_Pittsburgh, PA",crs_dep_time,crs_arr_time,distance,fl_date
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0.632143,0.626489,0.156906,0.0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0.417857,0.416258,0.040884,0.0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1.0,1.0,0.071823,0.0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0.060714,0.085494,0.235359,0.0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0.0,0.009811,0.047514,0.0
5,1,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0.314286,0.477225,1.0,0.0
6,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0.510714,0.512263,0.040884,0.0
7,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0.0,0.0,0.0,0.0


In [101]:
# DMatrix : XGBoost optimized data structure
def _as_dmatrix(features, target):
    """Wrap a feature frame and its labels in an `xgb.DMatrix`.

    Feature names are taken from the frame's columns.
    """
    return xgb.DMatrix(
        data=features,
        label=target,
        feature_names=features.columns
    )


DMat_train = _as_dmatrix(X_train, y_train)
DMat_test = _as_dmatrix(X_test, y_test)

## Training

In [None]:
# Reference parameter set for the native XGBoost API.
# Full parameter list:
# https://xgboost.readthedocs.io/en/stable/parameter.html

parameters = {
    # General parameters
    'booster': 'gbtree',  # default gbtree
    # Tree Booster Parameters
    'eta': 0.3,  # learning_rate default 0.3 range [0, 1]
    'max_depth': 6,  # default 6 typical 1–10
    'colsample_bytree': 0.3,  # default 1 range (0, 1]
    'reg_lambda': 1,  # L2 regularization default 1
    'alpha': 0,  # L1 regularization default 0
    # Learning Task Parameters
    'objective': 'reg:squarederror',  # default reg:squarederror
    'eval_metric': 'rmse',  # regression default 'rmse'
    'seed': rs,  # random seed
}


In [20]:
# SKlearn wrapper
# 'reg:squarederror' replaces the deprecated 'reg:linear' alias (same
# squared-error loss) and matches the objective in the parameter dict above.
# `reg_alpha` is the wrapper's name for the L1 term, and the notebook seed is
# passed so the column subsampling is repeatable.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    reg_alpha=10,
    n_estimators=10,
    random_state=rs
)

In [None]:
# Fit the regressor on the preprocessed training split
xg_reg.fit(X=X_train, y=y_train)

## Predicting 

In [None]:
# Predicted arrival delays for the held-out test split
y_pred = xg_reg.predict(X=X_test)

## Model Evaluation

In [None]:
# Root mean squared error: square root of the mean squared error between
# the true and predicted test targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

## Cross validation

In [None]:
# Booster parameters shared by the cross validation here and the
# `xgb.train` call in the Visualize section.
params = {
    # 'reg:linear' is the deprecated alias of the squared-error objective
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,  # L1 regularization
}

# 3-fold cross validation on the training DMatrix. The previous
# `data_dmatrix` name was never defined in this notebook, so the cell raised
# NameError on a fresh kernel. Only training data is used, keeping the test
# split unseen.
cv_results = xgb.cv(
    dtrain=DMat_train,
    params=params,
    nfold=3,
    num_boost_round=50,
    early_stopping_rounds=10,
    metrics="rmse",
    as_pandas=True,
    seed=rs  # notebook-wide random state instead of an ad-hoc seed
)

## Visualize

In [None]:
# Train a native Booster for the plots below. `DMat_train` replaces the
# undefined `data_dmatrix`. Note that `xg_reg` is rebound here from the
# scikit-learn wrapper to a Booster.
xg_reg = xgb.train(
    params=params,
    dtrain=DMat_train,
    num_boost_round=10
)

In [None]:
# Draw the first boosted tree on the wide axes created here. Without
# `ax=ax`, plot_tree opens its own default-sized figure and this one stays
# empty.
fig, ax = plt.subplots(figsize=[50, 10])
xgb.plot_tree(booster=xg_reg, num_trees=0, ax=ax)
plt.show()

In [None]:
# Feature Importance
# Pass the axes explicitly so the chart is drawn on the 5x5 figure created
# here rather than on a new figure made by plot_importance.
fig, ax = plt.subplots(figsize=[5, 5])
xgb.plot_importance(xg_reg, ax=ax)
plt.show()
plt.show()