# Machine Learning

In this file, instructions how to approach the challenge can be found.

We are going to work on different types of Machine Learning problems:

- **Regression Problem**: The goal is to predict delay of flights.
- **(Stretch) Multiclass Classification**: If the plane was delayed, we will predict what type of delay it is (will be).
- **(Stretch) Binary Classification**: The goal is to predict if the flight will be cancelled.

In [1]:
# import pandas
import pandas as pd
import numpy as np
import copy

import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor, XGBClassifier, plot_importance
from sklearn.metrics import r2_score, mean_squared_error

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from pandas import MultiIndex, Int64Index


### Read Preprocessed Data

In [36]:
# load data
df = pd.read_csv("data/flights_preprocessed_42k.csv", index_col=0)
df.head(3)

Unnamed: 0,fl_date,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,dep_time,dep_delay,taxi_out,...,arr_hr_sin,arr_hr_cos,fl_mnth_sin,fl_mnth_cos,fl_wkday_sin,fl_wkday_cos,day_num_of_flights,num_flights_6hrs,inbound_fl_num,inbound_fl
0,2019-06-07,WN,N8683D,2784,12889,11066,1900-01-01 18:30:00,1825.0,-5.0,18.0,...,0.258819,0.965926,1.224647e-16,-1.0,-0.433884,-0.900969,25,9,0,0
1,2019-07-26,B6,N535JB,1347,11278,14843,1900-01-01 08:00:00,808.0,8.0,15.0,...,0.258819,-0.965926,-0.5,-0.866025,-0.433884,-0.900969,4,2,0,0
2,2019-06-03,CP,N217NN,6092,12892,11292,1900-01-01 19:43:00,2014.0,31.0,14.0,...,-0.258819,0.965926,1.224647e-16,-1.0,0.0,1.0,15,12,1,1


In [37]:
df.shape

(42127, 62)

In [38]:
# reset dtypes
categorical_features = ['op_unique_carrier',
                        'tail_num', 
                        'op_carrier_fl_num',
                        'origin_airport_id',
                        'dest_airport_id',
                        # 'share_code',
                        'origin_city',
                        'origin_state',
                        'dest_city',
                        'dest_state',
                        'fl_month',
                        'fl_weekday',
                        'season',
                        'inbound_fl']

df[categorical_features] = df[categorical_features].astype('str')
# df_train[categorical_features] = df_train[categorical_features].astype('str')
# df_test[categorical_features] =df_test[categorical_features].astype('str')

#### More Feature Engineering

##### Transform some new features by using 'arr_delay'

##### Target Encoding before splitting dataset

In [42]:
def leave_one_out_pct(df, i, d='arr_delay'):
    """
    Caculate group occurance percentage with cross calculation for interested categorical column, and imput leave_one_out_mean value into dataframe
    PARAMS:
        df (pd.DataFrame): 
        i (str): categorial independent variable
        d (str): dependent variable
    RETURNS (pd.Series):
        pandas series containing leave-one-out occurance percentage
    """
    data = df.copy()[[i, d]]
    group_ct = data.groupby(i, as_index=False).count().rename(columns={d: 'ct'})
    group_delay_ct = data[data[d] >= np.log(15 - diff)].groupby(i, as_index=False).count().rename(columns={d: 'delay_ct'})
    data = pd.merge(data, group_ct, how='left', on=i)
    data = pd.merge(data, group_delay_ct, how='left', on=i)
    data['leftout_pct'] = (data['delay_ct'] - 1) / (data['ct'] - 1)
        
    data = data.fillna(0)
    return data['leftout_pct']

In [43]:
def leave_one_out_mean(df, i, d='arr_delay'):
    """
    Caculate group means with cross calculation for interested categorical column, and imput leave_one_out_mean value into dataframe
    PARAMS:
        df (pd.DataFrame): 
        i (str): categorial independent variable
        d (str): dependent variable
    RETURNS (pd.Series):
        pandas series containing leave-one-out mean values
    """
    data = df.copy()[[i, d]]
    group_sum_count = data.groupby(i)[d].agg(['sum', 'count']).reset_index()
    data = pd.merge(data, group_sum_count, how='left', on=i)
    data['leftout_sum'] = data['sum'] - data[d]
    data['leftout_mean'] = data['leftout_sum'] / (data['count'] - 1)
    data = data.fillna(0)
    return data['leftout_mean']

In [44]:
df.shape

(42127, 62)

In [45]:
# calculate how many delay count percentage ('arr_delay' > 15) happened on each carrier/flight_num/tail_num/carrier/origin_airport/dest_airport/origin_city/origin_state/dest_city/dest_state 
# calculate average delay time of each ... (same as above)
# merge with df

tran_features = ['op_unique_carrier', 'tail_num',  'op_carrier_fl_num', 'origin_airport_id', 'dest_airport_id', 'origin_city', 'origin_state', 'dest_city', 'dest_state']

for col in tran_features:    
    df[f'{col}_leftout_pct'] = leave_one_out_pct(df, col)
    df[f'{col}_leftout_mean'] = leave_one_out_mean(df, col)

In [46]:
df.shape

(42127, 80)

In [47]:
df.iloc[:, -9:].isnull().sum()

dest_airport_id_leftout_mean    0
origin_city_leftout_pct         0
origin_city_leftout_mean        0
origin_state_leftout_pct        0
origin_state_leftout_mean       0
dest_city_leftout_pct           0
dest_city_leftout_mean          0
dest_state_leftout_pct          0
dest_state_leftout_mean         0
dtype: int64

## Main Task: Regression Problem

#### XGBoost

In [50]:
avail_features = [
    # 'fl_date',
    # 'op_unique_carrier',
    # 'tail_num', 
    # 'op_carrier_fl_num',
    # 'origin_airport_id',
    # 'dest_airport_id',
    # 'crs_dep_time',
    # 'crs_arr_time',
    # 'crs_elapsed_time',
    'distance',
    'share_code',
    # 'origin_city',
    # 'origin_state',
    # 'dest_city',
    # 'dest_state',
    # 'arr_date',
    # 'dep_datetime',
    # 'arr_datetime',
    # 'fl_month',
    # 'fl_weekday',
    # 'season',
    # 'day_num_of_flights',
    'num_flights_6hrs',
    'inbound_fl_num',
    # 'inbound_fl',
    # 'dep_min_of_day',
    # 'arr_min_of_day',
    # 'dep_hr',
    # 'arr_hr',
    'arr_min_sin',
    'arr_min_cos',
    # 'arr_hr_sin',
    # 'arr_hr_cos',
    'dep_min_sin', 
    'dep_min_cos', 
    # 'dep_hr_sin', 
    # 'dep_hr_cos',
    'fl_mnth_sin', 
    'fl_mnth_cos',
    'fl_wkday_sin',
    'fl_wkday_cos',
    'op_unique_carrier_leftout_pct',
    'op_unique_carrier_leftout_mean',
    'tail_num_leftout_pct', 
    'tail_num_leftout_mean',
    'op_carrier_fl_num_leftout_pct',
    'op_carrier_fl_num_leftout_mean',
    'origin_airport_id_leftout_pct', 
    'origin_airport_id_leftout_mean',
    'dest_airport_id_leftout_pct',
    'dest_airport_id_leftout_mean',
    # 'origin_city_leftout_pct',
    'origin_city_leftout_mean',
    # 'origin_state_leftout_pct', 
    'origin_state_leftout_mean',
    # 'dest_city_leftout_pct', 
    'dest_city_leftout_mean',
    'dest_state_leftout_pct',
    # 'dest_state_leftout_mean'
]

In [51]:
X_train, X_test, y_train, y_test = train_test_split(df[avail_features], df['arr_delay'], train_size=0.7, test_size=0.3, random_state=888)

xg_reg = XGBRegressor(objective ='reg:squarederror',
                      learning_rate = 0.05,
                      max_depth = 3,
                      reg_lambda = 15,
                      gamma = 10,
                      n_estimators = 150)
xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)
# y_pred = np.exp(xg_reg.predict(X_test)) + diff

In [52]:
r2_score(y_test, y_pred)

0.3434403074894703

In [53]:
xg_reg.score(X_train, y_train)

0.38102631575297397

In [28]:
# X_train = df_train[avail_features]
# # y_train = target_train_log
# y_train = target_train
# X_test = df_test[avail_features]
# y_test = target_test

# xg_reg = XGBRegressor(objective ='reg:squarederror',
#                       learning_rate = 0.1,
#                       max_depth = 6,
#                       # reg_lambda = 10,
#                       n_estimators = 300)
# xg_reg.fit(X_train, y_train)
# y_pred = xg_reg.predict(X_test)
# y_pred = np.exp(xg_reg.predict(X_test)) + diff

In [29]:
# xg_reg.score(X_train, y_train)

0.9530710560765032

In [30]:
# xg_reg.score(X_test, y_test)

0.49758117556661596

In [31]:
## Best Score got so far
# r2_score(y_test, y_pred)

0.49758117556661596

##### PCA

In [20]:
# pca_features = [
#     # 'op_unique_carrier',
#     # 'tail_num'.
#     # 'op_carrier_fl_num',
#     # 'origin_airport_id',
#     # 'dest_airport_id',
#     'crs_elapsed_time',
#     'distance',
#     'share_code',
#     # 'origin_city',
#     # 'origin_state',
#     # 'dest_city',
#     # 'dest_state',
#     'fl_month',
#     'fl_weekday',
#     'season',
#     'day_num_of_flights',
#     'num_flights_6hr',
#     'inbound_fl_num',
#     'inbound_fl',
#     'dep_min_of_day',
#     'arr_min_of_day',
#     'dep_hr',
#     'arr_hr',
#     'arr_hr_sin',
#     'arr_hr_cos',
#     'arr_min_sin',
#     'arr_min_cos',
#     'dep_min_sin',
#     'dep_min_cos',
#     'dep_hr_sin',
#     'dep_hr_cos',
#     'fl_mnth_sin',
#     'fl_mnth_cos',
#     'fl_wkday_sin',
#     'fl_wkday_cos',
#     'op_unique_carrier_delayct',
#     'op_unique_carrier_delaymedian',
#     'tail_num_delayct',
#     'tail_num_delaymedian',
#     'op_carrier_fl_num_delayct',
#     'op_carrier_fl_num_delaymedian',
#     'origin_airport_id_delayct',
#     'origin_airport_id_delaymedian',
#     'dest_airport_id_delayct',
#     'dest_airport_id_delaymedian',
#     'origin_city_delayct',
#     'origin_city_delaymedian',
#     'origin_state_delayct',
#     'origin_state_delaymedian',
#     'dest_city_delayct',
#     'dest_city_delaymedian',
#     'dest_state_delayct',
#     'dest_state_delaymedian'
# ]

In [21]:
# df_X = pd.concat([df_train[pca_features], df_test[pca_features]])
# df_train.shape[0]

10609

In [25]:
# X_scaled = scaler.fit_transform(df_X)

# pca = PCA(n_components='mle')
# pca.fit(X_scaled)
# X_pca = pca.transform(X_scaled)

In [26]:
# X_scaled_train = X_pca[:10609, :]
# X_scaled_test = X_pca[10609:, :]
# y_train = target_train_log
# y_test = target_test

# xg_reg = XGBRegressor(objective ='reg:squarederror',
#                       learning_rate = 0.1,
#                       max_depth = 6,
#                       # reg_lambda = 10,
#                       n_estimators = 300)
# xg_reg.fit(X_scaled_train, y_train)
# # y_pred = xg_reg.predict(X_test)
# y_pred = np.exp(xg_reg.predict(X_scaled_test)) + diff

In [27]:
# r2_score(y_test, y_pred)

0.11846731863060067

In [43]:
# features = [
#     # 'op_unique_carrier',
#     # 'tail_num'.
#     # 'op_carrier_fl_num',
#     # 'origin_airport_id',
#     # 'dest_airport_id',
#     # 'crs_elapsed_time',
#     'distance',
#     'share_code',
#     # 'origin_city',
#     # 'origin_state',
#     # 'dest_city',
#     # 'dest_state',
#     # 'fl_month',
#     # 'fl_weekday',
#     # 'season',
#     # 'day_num_of_flights',
#     # 'num_flights_6hr',
#     # 'inbound_fl_num',
#     # 'inbound_fl',
#     # 'dep_min_of_day',
#     # 'arr_min_of_day',
#     # 'dep_hr',
#     # 'arr_hr',
#     # 'arr_hr_sin',
#     # 'arr_hr_cos',
#     # 'arr_min_sin',
#     # 'arr_min_cos',
#     'dep_min_sin',
#     # 'dep_min_cos',
#     # 'dep_hr_sin',
#     # 'dep_hr_cos',
#     # 'fl_mnth_sin',
#     # 'fl_mnth_cos',
#     # 'fl_wkday_sin',
#     # 'fl_wkday_cos',
#     # 'op_unique_carrier_delayct',
#     # 'op_unique_carrier_delaymedian',
#     'tail_num_delayct',
#     # 'tail_num_delaymedian',
#     'op_carrier_fl_num_delayct',
#     # 'op_carrier_fl_num_delaymedian',
#     # 'origin_airport_id_delayct',
#     # 'origin_airport_id_delaymedian',
#     # 'dest_airport_id_delayct',
#     # 'dest_airport_id_delaymedian',
#     # 'origin_city_delayct',
#     'origin_city_delaymedian',
#     # 'origin_state_delayct',
#     'origin_state_delaymedian',
#     'dest_city_delayct',
#     # 'dest_city_delaymedian',
#     # 'dest_state_delayct',
#     'dest_state_delaymedian'
# ]

In [44]:
# scores = []
# for f in features:
#     X_train = df_train[[f]]
#     y_train = target_train_log
#     X_test = df_test[[f]]
#     y_test = target_test
    
#     xg_reg = XGBRegressor(objective ='reg:squarederror',
#                       learning_rate = 0.1,
#                       max_depth = 6,
#                       # reg_lambda = 10,
#                       n_estimators = 300)
#     xg_reg.fit(X_train, y_train)
#     y_pred = np.exp(xg_reg.predict(X_test)) + diff
#     # y_pred = xg_reg.predict(X_test)
    
#     scores.append([f, xg_reg.score(X_train, y_train), r2_score(y_test, y_pred)])

In [45]:
# s = pd.DataFrame(scores)
# s[s[2]==s[2].max()]

Unnamed: 0,0,1,2
5,origin_city_delaymedian,0.824356,0.240021
