In [None]:

import pandas as pd
import numpy as np
from sklearn import linear_model, datasets

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn import metrics
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score

In [None]:
# Mount Google Drive at /content/drive (Colab-only) so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import from Google Drive: read the flights CSV from the mounted Drive folder into a DataFrame.
df_flights_final = pd.read_csv('/content/drive/My Drive/df_flights_final.csv')

In [None]:
# Build the modelling frame by removing 'Unnamed: 0' (presumably an index column written
# by an earlier to_csv -- TODO confirm) together with fl_date, mkt_carrier and
# mkt_carrier_fl_num (these columns will only appear in sample_submission.csv).
df_flights_modelling = df_flights_final.drop(
    ['Unnamed: 0', 'fl_date', 'mkt_carrier', 'mkt_carrier_fl_num'],
    axis='columns',
)

# Show the first row as a quick check of the remaining columns
df_flights_modelling.head(1)

Unnamed: 0,origin,dest,arr_delay,crs_elapsed_time,distance,daily_arr_delay_mean,daily_carrier_delay_mean,daily_weather_delay_mean,daily_nas_delay_mean,daily_security_delay_mean,daily_late_aircraft_delay_mean,daily_arr_delay_std,daily_carrier_delay_std,daily_weather_delay_std,daily_nas_delay_std,daily_security_delay_std,daily_late_aircraft_delay_std,dep_mean_hourly_delay,arr_mean_hourly_delay,dep_std_hourly_delay,arr_std_hourly_delay,mean_mkt_carrier_delay,mean_op_carrier_delay,std_mkt_carrier_delay,std_op_carrier_delay,mean_tail_num_arr_delay,std_tail_num_arr_delay
0,DSM,CLT,-7.0,156.0,815.0,-8.077677,1.972592,0.067005,0.446991,0.015104,1.71087,25.955746,13.840267,3.313573,3.589225,1.152993,14.014627,-0.449436,-3.107996,10.583675,22.461241,4.441386,3.721582,19.546037,18.803913,3.590909,22.829314


In [None]:
def data_splitter(df, sample_num=0, keep_categorical=False, scaling_method='None', random_state=None):
    '''
    Given a dataframe of specific structure, create X and y test/train splits.

        Arguments:
                df: the dataframe to be split; it must contain the target column
                    'arr_delay' and the categorical columns 'origin' and 'dest'
                sample_num: number of rows to sample for faster regressions
                    (default 0: no sampling)
                keep_categorical: whether to keep or drop categorical data
                    (False = drop, True = keep as one-hot encoded columns)
                scaling_method: one of 'None', 'Standard', or 'MinMax'; any other
                    value prints a message and leaves the features unscaled
                random_state: optional seed forwarded to both the sampling and the
                    train/test split (default None: no explicit seeding here)

        Returns:
                X_train, X_test, y_train, y_test, using a 75% / 25% split.
                X_train / X_test get a fresh 0..n-1 index whenever encoding or
                scaling is applied; y_train / y_test keep the index of `df`.

        Order of Operations:
            1. Sample (if applicable)
            2. Split
            3. Encode (if applicable)
            4. Scale  (if applicable)
    '''
    categorical_columns = ['origin', 'dest']

    # Sample (if applicable)
    if sample_num > 0:
        df = df.sample(n=sample_num, random_state=random_state)

    # Split
    X = df.loc[:, df.columns != 'arr_delay']
    y = df.loc[:, 'arr_delay']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=random_state
    )

    # Encode (if applicable)
    if not keep_categorical:
        X_train = X_train.drop(columns=categorical_columns)
        X_test = X_test.drop(columns=categorical_columns)
    else:
        # Encode categorical features as a one-hot numeric array. The encoder is
        # fitted on the training rows only; categories that appear only in the
        # test rows are encoded as all zeros (handle_unknown='ignore').
        enc = OneHotEncoder(handle_unknown='ignore')
        enc.fit(X_train[categorical_columns])

        # Give the one-hot columns string names (e.g. 'origin_DSM'). Leaving them
        # as integer labels mixes str and int column names in one frame, which
        # recent scikit-learn versions refuse when the scaler is fitted below.
        encoded_names = [
            '{}_{}'.format(column, category)
            for column, categories in zip(categorical_columns, enc.categories_)
            for category in categories
        ]

        def _append_one_hot(features):
            # Replace the categorical columns of `features` by their one-hot encoding.
            encoded = pd.DataFrame(
                enc.transform(features[categorical_columns]).toarray(),
                columns=encoded_names,
            )
            # drop=True discards the old index regardless of its name, so both
            # parts line up positionally for the column-wise concat.
            remaining = features.drop(columns=categorical_columns).reset_index(drop=True)
            return pd.concat([remaining, encoded], axis=1)

        X_train = _append_one_hot(X_train)
        X_test = _append_one_hot(X_test)

    # Scaling (if applicable)
    scaler_classes = {'Standard': StandardScaler, 'MinMax': MinMaxScaler}
    if scaling_method in scaler_classes:
        scaler = scaler_classes[scaling_method]()
        # Learn the scaling statistics from the training data only and reuse them
        # for the test data. Re-fitting on the test set would scale the two sets
        # differently and leak test-set statistics into the evaluation.
        X_train = pd.DataFrame(scaler.fit_transform(X_train.astype(float)), columns=X_train.columns)
        X_test = pd.DataFrame(scaler.transform(X_test.astype(float)), columns=X_test.columns)
    elif scaling_method != 'None':
        print('No valid scaling method specified!')

    return X_train, X_test, y_train, y_test

In [None]:
# XGBoost model: default-parameter regressor on one-hot encoded, min-max scaled features

from xgboost import XGBRegressor

# data_splitter does not seed its train/test split in this call, so seed NumPy's
# global RNG (which scikit-learn falls back to) to make the split, and therefore
# the reported score, reproducible.
np.random.seed(42)
X_train_XGBoost, X_test_XGBoost, y_train_XGBoost, y_test_XGBoost = data_splitter(df_flights_modelling, sample_num = 0, keep_categorical = True, scaling_method = 'MinMax')

# verbosity=0 replaces the deprecated `silent=True` flag, which current XGBoost
# releases no longer recognise.
model_XGBoost = XGBRegressor(random_state=42, verbosity=0)
model_XGBoost.fit(X_train_XGBoost, y_train_XGBoost)

# Predict the response for test dataset
y_pred_XGBoost = model_XGBoost.predict(X_test_XGBoost)

In [None]:
# XGBoost metrics: R2 of the current XGBoost predictions against the held-out test targets
print('XGBoost R2 score:', metrics.r2_score(y_test_XGBoost, y_pred_XGBoost))




XGBoost R2 score: 0.17529148563650543


In [None]:
# Grid Search for XGBoost
#params_XGBoost = {"learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 #"max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
 #"min_child_weight" : [ 1, 3, 5, 7 ],
 #"gamma"            : [ 0.0, 0.2 , 0.4 ] }

#grid_search_XGBoost = GridSearchCV(estimator = model_XGBoost, param_grid = params_XGBoost)
#grid_search_XGBoost.fit(X_train_XGBoost, y_train_XGBoost)

In [None]:
#print(grid_search_XGBoost.best_estimator_)

In [None]:
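# NOTE: precision and recall are classification metrics, whereas the target here
# (arr_delay) is continuous and the model is a regressor; y_pred_XGBoost_grid_search
# is also not defined in the cells above. Left disabled.
#print("Precision:",metrics.precision_score(y_test_XGBoost, y_pred_XGBoost_grid_search))
#print("Recall:",metrics.recall_score(y_test_XGBoost, y_pred_XGBoost_grid_search))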

In [None]:

# XGBoost with a lower learning rate and deeper trees, on standard-scaled numeric
# features only (origin/dest dropped).

# data_splitter does not seed its train/test split in this call, so seed NumPy's
# global RNG (which scikit-learn falls back to) to make the split reproducible.
np.random.seed(42)
X_train_XGBoost, X_test_XGBoost, y_train_XGBoost, y_test_XGBoost = data_splitter(df_flights_modelling, sample_num = 0, keep_categorical = False, scaling_method = 'Standard')

# verbosity=0 replaces the deprecated `silent=True` flag, which current XGBoost
# releases no longer recognise.
model_XGBoost = XGBRegressor(learning_rate=0.05, random_state=42, max_depth=15, verbosity=0)
model_XGBoost.fit(X_train_XGBoost, y_train_XGBoost)

# Predict the response for test dataset
y_pred_XGBoost = model_XGBoost.predict(X_test_XGBoost)

In [None]:
# R2 of the most recently computed XGBoost predictions against the held-out test targets
print('XGBoost R2 score:', metrics.r2_score(y_test_XGBoost, y_pred_XGBoost))

XGBoost R2 score: 0.21412918838853834


In [None]:
# XGBoost with 200 deep trees on one-hot encoded, standard-scaled features.

# data_splitter does not seed its train/test split in this call, so seed NumPy's
# global RNG (which scikit-learn falls back to) to make the split reproducible.
np.random.seed(42)
X_train_XGBoost, X_test_XGBoost, y_train_XGBoost, y_test_XGBoost = data_splitter(df_flights_modelling, sample_num = 0, keep_categorical = True, scaling_method = 'Standard')

# verbosity=0 replaces the deprecated `silent=True` flag, which current XGBoost
# releases no longer recognise.
model_XGBoost = XGBRegressor(learning_rate=0.05, random_state=42, max_depth=15, n_estimators=200, verbosity=0)
model_XGBoost.fit(X_train_XGBoost, y_train_XGBoost)

# Predict the response for test dataset
y_pred_XGBoost = model_XGBoost.predict(X_test_XGBoost)
print('XGBoost R2 score:', metrics.r2_score(y_test_XGBoost, y_pred_XGBoost))

In [None]:
# Test-set error of the latest XGBoost predictions; RMSE is the square root of the MSE,
# so the MSE is computed once and reused.
mse_XGBoost = metrics.mean_squared_error(y_test_XGBoost, y_pred_XGBoost)
print('MSE:', mse_XGBoost)
print('RMSE:', mse_XGBoost**0.5)