In [5]:
import pickle
from datetime import datetime
import pandas as pd
import numpy as np

## 1. Import the XGB Model

In [14]:

# File holding the pickled model object.
filename = 'xgb_best_model'

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
# The context manager closes the file even if unpickling raises.
with open(filename, 'rb') as infile:
    xgb_best_model = pickle.load(infile)

In [15]:
# Confirm what kind of object was unpickled.
type(xgb_best_model)

sklearn.model_selection._search.GridSearchCV

## 2. Import the Test Data with Features

In [11]:
# Load the zipped CSV of test flights, then list its columns.
flight_sample = pd.read_csv('flights_test_2020.zip', compression='zip')
flight_sample.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time',
       'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance',
       'flight_speed', 'flight_haul_type', 'flight_month',
       'arrival_hour_of_day', 'departure_hour_of_day', 'percent_delay',
       'percent_flights', 'avg_flights', 'avg_passengers', 'total_distance',
       'total_passengers', 'avg_fuel', 'state', 'avgCloud', 'avgDryDays',
       'avgRainDays', 'avgSnowDays', 'avgThunderDays'],
      dtype='object')

## 3. Remove unneeded features

Features kept as model inputs:  
['crs_dep_time',  
 'crs_arr_time',  
 'crs_elapsed_time',  
 'flights',  
 'distance',  
 'flight_speed',  
 'flight_haul_type',  
 'flight_month',  
 'arrival_hour_of_day',  
 'departure_hour_of_day',  
 'percent_delay',  
 'percent_flights',  
 'avg_flights',  
 'avg_passengers',   
 'total_distance',  
 'total_passengers',  
 'avg_fuel',  
 'avgCloud',  
 'avgDryDays',  
 'avgRainDays',  
 'avgSnowDays',  
 'avgThunderDays']  
 

In [12]:
# Drop every column that is not fed to the model; what remains is the
# feature set. The last line displays the surviving column names.
pruned_data = flight_sample.drop(
    [
        'mkt_carrier', 'mkt_unique_carrier',
        'op_carrier_fl_num', 'origin', 'origin_city_name',
        'dest', 'dest_city_name', 'state',
        'branded_code_share', 'mkt_carrier_fl_num',
        'op_unique_carrier', 'tail_num',
        'origin_airport_id', 'dest_airport_id',
        'dup', 'fl_date',
    ],
    axis="columns",
)
pruned_data.columns

Index(['crs_dep_time', 'crs_arr_time', 'crs_elapsed_time', 'flights',
       'distance', 'flight_speed', 'flight_haul_type', 'flight_month',
       'arrival_hour_of_day', 'departure_hour_of_day', 'percent_delay',
       'percent_flights', 'avg_flights', 'avg_passengers', 'total_distance',
       'total_passengers', 'avg_fuel', 'avgCloud', 'avgDryDays', 'avgRainDays',
       'avgSnowDays', 'avgThunderDays'],
      dtype='object')

In [19]:
# The pruned frame is the feature matrix; show its (rows, columns) shape.
X = pruned_data
X.shape

(144711, 22)

In [20]:
def convert_pred_y(y_pred, offset=63):
    """Map log-scale predictions back to the original scale.

    Computes ``exp(y_pred) - offset`` element-wise.

    NOTE(review): presumably the training target was ``log(delay + 63)``,
    with the shift keeping the logarithm's argument positive -- confirm
    against the training notebook before changing ``offset``.

    Parameters
    ----------
    y_pred : array-like or scalar
        Predictions on the log scale.
    offset : float, default 63
        Constant subtracted after exponentiating.

    Returns
    -------
    numpy.ndarray or numpy scalar
        ``exp(y_pred) - offset``.
    """
    return np.exp(y_pred) - offset

## Convert to Arrays

In [21]:
# Build the array from `pruned_data` rather than from `X` itself so this cell
# can be re-run: once X is an ndarray it no longer has a .to_numpy() method.
X = pruned_data.to_numpy()

## 4. Run the model on the test data

In [22]:
# Predict with the best estimator found by the grid search.
y_pred = xgb_best_model.best_estimator_.predict(X)

## 5. Transform the prediction back from the log

In [23]:
# Undo the log transform on the raw predictions.
y_final = convert_pred_y(y_pred)

In [28]:
# Average back-transformed prediction across all test rows.
y_final.mean()

-9.142537

In [34]:
# Identifier columns for the submission. Select first, then copy, so only these
# five columns are duplicated and the result is independent of flight_sample.
flight_orig = flight_sample[['fl_date', 'mkt_carrier',
                             'mkt_carrier_fl_num', 'origin', 'dest']].copy()

In [36]:
# Attach the back-transformed predictions as a new column.
flight_orig['predicted_delay'] = y_final

In [37]:
# Preview the first rows of the submission frame.
# NOTE(review): the five rows displayed all carry the same predicted_delay;
# worth checking that the model is not returning a near-constant prediction.
flight_orig.head()

Unnamed: 0,fl_date,mkt_carrier,mkt_carrier_fl_num,origin,dest,predicted_delay
0,2020-01-01,WN,5888,ONT,SFO,-10.331104
1,2020-01-01,WN,5722,ONT,SMF,-10.331104
2,2020-01-01,WN,1499,ONT,PHX,-10.331104
3,2020-01-02,WN,2966,ONT,DEN,-10.331104
4,2020-01-02,WN,2047,ONT,SJC,-10.331104


In [38]:
# Write the submission file.
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column
# by default; pass index=False if the expected format has no index -- confirm.
flight_orig.to_csv('submission_xgb.csv')  