# XGBoost

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import xgboost as xgb

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Project level modules
import modules.preprocessing_functions as ppf

In [2]:
# Random state: single seed reused by the seeded steps of this notebook
# (e.g. the train/test split) so that results are reproducible
rs = 99

## Load Data

In [69]:
# Directory of the raw monthly csv files (same location that the sampling
# shell commands read from)
google_drive_path = (
    '~/Google Drive/My Drive/Lighthouse Labs/'
    'Mid-term Project/Data-Jan/'
)

In [131]:
# Use the command line to take a random sample of the input data without
# loading the entire file into memory.
# On MacOS Monterey gshuf is available by installing coreutils
# https://formulae.brew.sh/formula/coreutils
# NOTE: gshuf is called without a fixed random source, so every run of this
# cell writes a different sample.

# Keep the header: write only the first line (overwrites ../data/sample.csv)
!head -n 1 ~/Google\ Drive/My\ Drive/Lighthouse\ Labs/Mid-term\ Project/Data-Jan/2018-01.csv > ../data/sample.csv
# Skip the header, randomly sample 10000 rows and append them below the header
!tail -n +2 ~/Google\ Drive/My\ Drive/Lighthouse\ Labs/Mid-term\ Project/Data-Jan/2018-01.csv | gshuf -n 10000 >> ../data/sample.csv

In [3]:
def load_and_process(csv_path: str) -> pd.DataFrame:
    """
    Load the flights csv, process NAN values in the target variable, and
    drop flights that were cancelled or diverted.

    Parameters
    ----------
    csv_path : str
        Path of the csv file. The first column is parsed as dates. The
        file must contain the 'cancelled' and 'diverted' columns used for
        filtering; 'arr_delay' is handed to ppf.process_nan_values.

    Returns
    -------
    df : pandas.DataFrame
        Independent copy holding only the rows where both 'cancelled' and
        'diverted' equal 0. The original row index is preserved.
    """

    # Load csv and parse the first column as dates
    df = pd.read_csv(csv_path, parse_dates=[0])

    # Set NAN values in the target feature to 0
    df = ppf.process_nan_values(
        df=df,
        features_to_zero=['arr_delay']
    )

    # Keep only flights that were neither cancelled nor diverted
    completed = (df['cancelled'] == 0) & (df['diverted'] == 0)

    # .copy() detaches the result from the unfiltered frame, so later column
    # assignments on it cannot trigger SettingWithCopyWarning
    return df.loc[completed].copy()

In [4]:
# Base name (without extension) of the sampled csv stored under ../data
file_name = 'sample'
path = f'../data/{file_name}.csv'

In [5]:
# Load the sampled flights; cancelled and diverted flights are removed
data = load_and_process(csv_path=path)

## Data Split

In [6]:
# Predictor features set
X = ppf.flight_test_features(data, purged=True)
# Target variable
y = data['arr_delay']

In [7]:
# Sanity check: predictors and target must have the same number of rows
X.shape, y.shape

((9982, 10), (9982,))

## Preprocessing

In [None]:
# Preprocessing plan:
# - Replace NA values
# - Split X into categorical and numeric features
# - Numeric features: scale and transform
# - Categorical features: one hot encode (dummy variables)

### Categorical

In [8]:
# Separate the predictors into a numeric frame and a categorical frame
X_numerical, X_categorical = ppf.numerical_categorical_split(df=X)

#### One hot encoding
One hot encoding is applied before the train/test split, following the Machine Learning Mastery tutorial [Data Preparation for Gradient Boosting with XGBoost in Python](https://machinelearningmastery.com/data-preparation-gradient-boosting-xgboost-python/). Encoding first also guarantees that the train and test sets end up with the same dummy columns.

In [9]:
# One hot encoding
# The columns to encode are named explicitly instead of relying on fl_date
# being the first column of X_categorical. fl_date is excluded and passes
# through unchanged, however many distinct dates it holds.
columns_to_encode = X_categorical.columns.drop('fl_date').tolist()
X_cat_one_hot = pd.get_dummies(
    X_categorical,
    # each dummy column is named '<original column>_<level>'
    columns=columns_to_encode,
    # drop the first level of every feature to avoid a redundant column
    drop_first=True
)
X_cat_one_hot.head(2)

Unnamed: 0,fl_date,op_unique_carrier_9K,op_unique_carrier_AA,op_unique_carrier_AS,op_unique_carrier_AX,op_unique_carrier_B6,op_unique_carrier_C5,op_unique_carrier_CP,op_unique_carrier_DL,op_unique_carrier_EV,...,"dest_city_name_West Palm Beach/Palm Beach, FL","dest_city_name_White Plains, NY","dest_city_name_Wichita Falls, TX","dest_city_name_Wichita, KS","dest_city_name_Williamsport, PA","dest_city_name_Williston, ND","dest_city_name_Wilmington, NC","dest_city_name_Worcester, MA","dest_city_name_Yakima, WA","dest_city_name_Yuma, AZ"
0,2018-01-04,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2018-01-23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Reconstitute the original data table: one hot encoded categorical features
# first, numeric features last (rows are aligned on the index)
X_one_hot = pd.concat([X_cat_one_hot, X_numerical], axis=1)
X_one_hot.head(2)

Unnamed: 0,fl_date,op_unique_carrier_9K,op_unique_carrier_AA,op_unique_carrier_AS,op_unique_carrier_AX,op_unique_carrier_B6,op_unique_carrier_C5,op_unique_carrier_CP,op_unique_carrier_DL,op_unique_carrier_EV,...,"dest_city_name_Wichita, KS","dest_city_name_Williamsport, PA","dest_city_name_Williston, ND","dest_city_name_Wilmington, NC","dest_city_name_Worcester, MA","dest_city_name_Yakima, WA","dest_city_name_Yuma, AZ",crs_dep_time,crs_arr_time,distance
0,2018-01-04,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,845,1125,1790
1,2018-01-23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1334,1655,875


### Train Test Split

In [11]:
# Hold out 20% of the rows for testing; random_state=rs makes the split
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_one_hot, y,
    train_size=0.8,
    test_size=0.2,
    random_state=rs
)

In [12]:
# Renumber the rows of every partition from 0 so that frames rebuilt with a
# default index later on line up with them when concatenated
X_train, X_test, y_train, y_test = (
    partition.reset_index(drop=True)
    for partition in (X_train, X_test, y_train, y_test)
)

### Numerical

In [13]:
# Separate numeric and categorical features

# The flight date is handled together with the numeric predictors
numeric_features = [*X_numerical.columns, 'fl_date']

# Train partition: numeric block (copied so it can be modified safely) and
# the remaining one hot encoded block
X_train_num = X_train.loc[:, numeric_features].copy()
X_train_cat = X_train.drop(columns=numeric_features)

# Test partition, split the same way
X_test_num = X_test.loc[:, numeric_features].copy()
X_test_cat = X_test.drop(columns=numeric_features)

In [14]:
# Express the flight date as a number so it can be scaled like the other
# numeric features: integer timestamp divided by 1e11.
# astype('int64') replaces Series.view(int), which is deprecated in recent
# pandas and whose integer width depends on the platform. Plain column
# assignment replaces the column (and its dtype) instead of writing floats
# into the existing datetime column.
X_train_num['fl_date'] = X_train_num['fl_date'].astype('int64') / 1e11
X_test_num['fl_date'] = X_test_num['fl_date'].astype('int64') / 1e11

In [15]:
# Scale numeric features
# The scaler is fitted on the training rows only; the fitted scaler is then
# applied to both partitions so no information from the test rows is used
scaler = MinMaxScaler()
scaler.fit(X_train_num)

X_train_num_scaled = pd.DataFrame(
    scaler.transform(X_train_num), columns=X_train_num.columns
)
X_test_num_scaled = pd.DataFrame(
    scaler.transform(X_test_num), columns=X_test_num.columns
)

In [16]:
# Reconstitute train and test datasets: one hot encoded columns first,
# scaled numeric columns last
X_train = pd.concat([X_train_cat, X_train_num_scaled], axis=1)
X_test = pd.concat([X_test_cat, X_test_num_scaled], axis=1)

In [114]:
# DMatrix : XGBoost optimized data structure, one per partition
DMat_train = xgb.DMatrix(X_train, label=y_train)
DMat_test = xgb.DMatrix(X_test, label=y_test)

## Training

### Default

In [17]:
# Baseline model: no hyperparameters are passed, so XGBoost's defaults apply
xg_reg_default = xgb.XGBRegressor()
xg_reg_default.fit(X=X_train, y=y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [18]:
# Predict the held-out test rows with the baseline model
y_pred_default = xg_reg_default.predict(X_test)

In [20]:
# Root mean squared error of the baseline model on the test set.
# Computed as sqrt(MSE) instead of passing `squared=False`, which newer
# scikit-learn releases deprecate and then remove; this form works on both
# old and new versions.
rmse = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=y_pred_default)
)
print(f'RMSE: {rmse}')

RMSE: 54.3046192617478


In [None]:
# Reference parameter set.
# See documentation for all XGBoost parameters
# https://xgboost.readthedocs.io/en/stable/parameter.html

parameters = {
    # General parameters
    'booster': 'gbtree',  # default gbtree
    # Tree Booster Parameters
    'eta': 0.3,  # learning_rate default 0.3 range [0, 1]
    'max_depth': 6,  # default 6 typical 1–10
    'colsample_bytree': 0.3,  # default 1 range (0, 1]
    'reg_lambda': 1,  # L2 regularization default 1
    'alpha': 0,  # L1 regularization default 0
    # Learning Task Parameters
    'objective': 'reg:squarederror',  # default reg:squarederror
    'eval_metric': 'rmse',  # regression default 'rmse'
    'seed': rs,  # random seed
}


In [20]:
# SKlearn wrapper
# 'reg:squarederror' is the current name of the squared-error objective; the
# old alias 'reg:linear' is deprecated. reg_alpha is the wrapper's own name
# for the L1 penalty, and random_state pins the column subsampling that
# colsample_bytree < 1 introduces.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    reg_alpha=10,
    n_estimators=10,
    random_state=rs
)

In [None]:
# Fit the configured regressor on the training partition
xg_reg.fit(X_train, y_train)

## Predicting 

In [None]:
# Predict the held-out test rows with the configured regressor
y_pred = xg_reg.predict(X_test)

## Model Evaluation

In [None]:
# Root mean squared error on the test set: square root of the MSE
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

## Cross validation

In [None]:
# Booster parameters for the native XGBoost API.
# 'reg:squarederror' replaces the deprecated alias 'reg:linear'.
params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
}

# 3-fold cross validation on the training DMatrix (DMat_train); boosting
# stops early once the validation RMSE has not improved for 10 rounds.
# The notebook-wide seed rs keeps the fold assignment reproducible.
cv_results = xgb.cv(
    params=params,
    dtrain=DMat_train,
    nfold=3,
    num_boost_round=50,
    early_stopping_rounds=10,
    metrics='rmse',
    as_pandas=True,
    seed=rs
)
# Show the last boosting rounds that were kept
cv_results.tail()

## Visualize

In [None]:
# Train a native Booster on the training DMatrix (DMat_train) for the plots
# in this section.
# NOTE: this rebinds xg_reg, which previously held the sklearn-style
# XGBRegressor, to a Booster.
xg_reg = xgb.train(
    params=params,
    dtrain=DMat_train,
    num_boost_round=10
)

In [None]:
# Draw the first tree of the model on the wide axes created here. Without
# ax=ax, plot_tree opens its own default-sized figure and this one stays
# empty.
fig, ax = plt.subplots(figsize=[50, 10])
xgb.plot_tree(booster=xg_reg, num_trees=0, ax=ax)
plt.show()

In [None]:
# Feature Importance
# ax=ax draws the chart on the 5x5 axes created here instead of on a new,
# separately created figure
fig, ax = plt.subplots(figsize=[5, 5])
xgb.plot_importance(xg_reg, ax=ax)
plt.show()