# Ensembles on Flight data

## Import libraries

In [1]:
import pandas as pd
import numpy as np

import warnings
# Silence every warning for the rest of the session. This keeps cell output
# short, but it also hides warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

## Load and Analyze the data

In [2]:
# Read the flight records from the CSV file and summarise the columns
# (dtype and non-null count per column).
csv_path = 'flight-data.csv'
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 25 columns):
carrier         10000 non-null object
flight          10000 non-null int64
tailnum         9926 non-null object
origin          10000 non-null object
dest            10000 non-null object
air_time        9716 non-null float64
distance        10000 non-null int64
hour            10000 non-null int64
minute          10000 non-null int64
temp            9952 non-null float64
dewp            9952 non-null float64
humid           9952 non-null float64
wind_dir        9731 non-null float64
wind_speed      9951 non-null float64
wind_gust       2387 non-null float64
precip          9954 non-null float64
pressure        8808 non-null float64
visib           9954 non-null float64
type            8499 non-null object
manufacturer    8499 non-null object
model           8499 non-null object
engines         8499 non-null float64
seats           8499 non-null float64
engine          8499 non-n

In [3]:
# (rows, columns) of the loaded frame
df.shape

(10000, 25)

In [4]:
# Preview the first rows to see what the raw values look like
df.head()

Unnamed: 0,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,temp,...,precip,pressure,visib,type,manufacturer,model,engines,seats,engine,arr_delay
0,B6,1201,N643JB,JFK,FLL,135.0,1069,21,50,73.04,...,0.0,1019.4,10.0,Fixed wing multi engine,AIRBUS,A320-232,2.0,200.0,Turbo-fan,1.0
1,DL,1773,N3743H,JFK,LAS,287.0,2248,15,35,71.06,...,0.0,1020.4,10.0,Fixed wing multi engine,BOEING,737-832,2.0,189.0,Turbo-jet,0.0
2,EV,4572,N11107,EWR,GSP,84.0,594,15,9,78.8,...,0.0,,5.0,Fixed wing multi engine,EMBRAER,EMB-145XR,2.0,55.0,Turbo-fan,1.0
3,B6,179,N526JB,JFK,PHX,305.0,2153,17,35,44.06,...,0.0,1009.2,10.0,,,,,,,1.0
4,US,1733,N162UW,LGA,CLT,78.0,544,7,53,69.98,...,0.0,1021.3,10.0,Fixed wing multi engine,AIRBUS INDUSTRIE,A321-211,2.0,199.0,Turbo-jet,0.0


In [5]:
# Count missing cells per column, then add the per-column counts together.
missing_per_column = df.isna().sum()
missing_per_column.sum()

19007

There are some missing values, so we will need to impute the data before modeling.

In [6]:
# All column labels of the frame (features and target)
cols = df.columns
cols

Index(['carrier', 'flight', 'tailnum', 'origin', 'dest', 'air_time',
       'distance', 'hour', 'minute', 'temp', 'dewp', 'humid', 'wind_dir',
       'wind_speed', 'wind_gust', 'precip', 'pressure', 'visib', 'type',
       'manufacturer', 'model', 'engines', 'seats', 'engine', 'arr_delay'],
      dtype='object')

Let's separate the categorical columns from the numerical ones: the categorical columns will be one-hot encoded and the numerical columns will be standardized.

In [7]:
# numerical data
# Use the public select_dtypes API instead of the private
# DataFrame._get_numeric_data(), which is not part of pandas' supported
# interface and may change without notice.
num_cols = df.select_dtypes(include='number').columns
num_cols

Index(['flight', 'air_time', 'distance', 'hour', 'minute', 'temp', 'dewp',
       'humid', 'wind_dir', 'wind_speed', 'wind_gust', 'precip', 'pressure',
       'visib', 'engines', 'seats', 'arr_delay'],
      dtype='object')

In [8]:
# categorical data
# Keep the columns in their original order. A set difference returns them in
# an arbitrary order that can change between kernel restarts, which makes the
# displayed list (and anything copied from it) non-reproducible.
cat_cols = [col for col in cols if col not in num_cols]
cat_cols

['engine',
 'manufacturer',
 'origin',
 'type',
 'model',
 'tailnum',
 'dest',
 'carrier']

## Split off the features and the target variable

In [13]:
# Every column except the target is used as a predictor.
target = 'arr_delay'
features = [name for name in df.columns if name != target]

X = df.loc[:, features]
y = df[target]


The target is `arr_delay`, a binary flag indicating whether or not the flight was delayed.

## Train/Test sets split

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
split = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=20,
)
X_train, X_test, y_train, y_test = split

print(f'Training samples: {X_train.shape[0]}')
print(f'Test samples: {X_test.shape[0]}')

Training samples: 8000
Test samples: 2000


## Create a processing and modeling pipeline to fit logistic regression, decision tree, and SVM using grid search

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Numeric predictors. The target 'arr_delay' is deliberately NOT listed: it is
# not a column of X, so selecting it would make the ColumnTransformer fail.
# (There is also no trailing comma after the list; one would turn `nums`
# into a 1-tuple wrapping the list.)
nums = ['flight', 'air_time', 'distance', 'hour', 'minute', 'temp', 'dewp',
        'humid', 'wind_dir', 'wind_speed', 'wind_gust', 'precip', 'pressure',
        'visib', 'engines', 'seats']

# Categorical (object-typed) predictors.
categories = ['dest',
              'manufacturer',
              'engine',
              'model',
              'carrier',
              'origin',
              'type',
              'tailnum']

# Numeric branch: impute first, THEN scale, as sequential steps. Listing the
# scaler and the imputer as separate ColumnTransformer entries would run them
# in parallel on the raw columns, yielding a scaled copy that still contains
# NaNs next to an unscaled imputed copy.
numeric_pipeline = Pipeline([
    ('impute_missing', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())])

# Categorical branch: several of these columns contain missing values, so
# fill them with an explicit 'missing' level before encoding.
# handle_unknown='ignore' encodes levels unseen during fit as all zeros.
# `drop='first'` is not used because older scikit-learn releases reject it
# in combination with handle_unknown='ignore'.
categorical_pipeline = Pipeline([
    ('impute_missing', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

processing_pipeline = ColumnTransformer(transformers=[
    ('numscaling', numeric_pipeline, nums),
    ('dummys', categorical_pipeline, categories)]
)

# One modeling pipeline per estimator. GridSearchCV clones its estimator, so
# sharing the same (unfitted) preprocessing object between them is safe.
# max_iter is raised because the default cap may be too low for the wide
# one-hot design matrix, and warnings are silenced in this notebook.
lg_reg_pipe = Pipeline([('data_processing', processing_pipeline),
                        ('logreg', LogisticRegression(max_iter=1000))])

# random_state makes the fitted tree reproducible between runs.
tr_pipe = Pipeline([('data_processing', processing_pipeline),
                    ('dt', DecisionTreeClassifier(random_state=0))])

svm_pipe = Pipeline([('data_processing', processing_pipeline),
                     ('model', SVC())])

# The original cell also bound `modeling_pipeline` (last to the SVM pipeline);
# the alias is kept in case another cell relies on it.
modeling_pipeline = svm_pipe

# Hyper-parameter grids; keys are '<step name>__<parameter>'.
lg_reg_param_grid = [
  {'logreg__C': [0.01, 0.1, 1, 10]}
 ]

tr_param_grid = [
  {'dt__max_depth': [2, 5, 10, 15, 20],
   'dt__min_samples_split': [3, 5, 10, 20, 40],
   'dt__min_samples_leaf': [2, 5],
   'dt__class_weight': [None]
  }
 ]

svm_param_grid = [
  {'model__C': [0.01, 0.1, 1, 10, 100], 'model__kernel': ['linear', 'rbf']}
 ]

# Grid searches scored by accuracy; refit=True retrains the best setting on
# the full training data so the search object can be used for prediction.
lg_reg_results = GridSearchCV(estimator=lg_reg_pipe, param_grid=lg_reg_param_grid, scoring='accuracy', refit=True)

tr_results = GridSearchCV(estimator=tr_pipe, param_grid=tr_param_grid, scoring='accuracy', refit=True)

svm_results = GridSearchCV(estimator=svm_pipe, param_grid=svm_param_grid, scoring='accuracy', refit=True)


In [None]:
# Run each grid search on the training split. fit() trains the search object
# in place and returns that same object, so the three names keep pointing at
# the (now fitted) searches.
for search in (lg_reg_results, tr_results, svm_results):
    search.fit(X_train, y_train)

## Performance of the three above models

In [None]:
# Held-out performance of each refit best estimator. The searches were built
# with scoring='accuracy', so score() reports accuracy on the test split.
lg_reg_score = lg_reg_results.score(X_test, y_test)
tr_score = tr_results.score(X_test, y_test)
svm_score = svm_results.score(X_test, y_test)

# Print the variable that was actually assigned above (`lg_reg_score`); the
# previous `lreg_score` did not exist and raised NameError.
print(f'Logistic Regression Score: {lg_reg_score:.2%}')
print(f'Decision Tree Score: {tr_score:.2%}')
print(f'Support Vector Machine Score: {svm_score:.2%}')

## Fit an ensemble using the three above models and evaluate the performance

In [None]:
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# Build the ensemble from the tuned pipelines found by the grid searches.
# Unfitted clones are used so the fitted best estimators stay untouched.
# Soft voting averages predict_proba outputs, and SVC only provides
# predict_proba when probability=True, so that flag is switched on for the
# ensemble's copy of the SVM pipeline (its SVC step is named 'model').
svm_for_voting = clone(svm_results.best_estimator_).set_params(model__probability=True)

ems = [('logreg', clone(lg_reg_results.best_estimator_)),
       ('dt', clone(tr_results.best_estimator_)),
       ('model', svm_for_voting)]
clf = VotingClassifier(estimators=ems, weights=None, voting='soft')

# 10-fold cross-validated ROC AUC of the ensemble on the training split.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')
print(f'ROC AUC {scores.mean():.2f} (+/- {scores.std():.2f}) [Ensemble]')

## Fit the AdaBoost model and evaluate the performance

In [None]:
from sklearn.base import clone
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# Weak learner for boosting. The original cell referenced an undefined `tree`;
# a depth-1 decision stump is assumed here -- TODO confirm the intended
# base learner.
tree = DecisionTreeClassifier(max_depth=1, random_state=0)

# The base learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases.
ada = AdaBoostClassifier(tree, n_estimators=100, learning_rate=0.01, random_state=0)

# The raw frame contains strings and missing values, so AdaBoost must sit
# behind the same imputing/scaling/encoding step as the other models. A clone
# keeps the shared preprocessing object unfitted.
ada_pipe = Pipeline([('data_processing', clone(processing_pipeline)),
                     ('ada', ada)])
ada_pipe.fit(X_train, y_train)

ada_train_predict = ada_pipe.predict(X_train)
ada_test_predict = ada_pipe.predict(X_test)

ada_train_score = accuracy_score(y_train, ada_train_predict)
ada_test_score = accuracy_score(y_test, ada_test_predict)

print(f'AdaBoost Training Score: {ada_train_score:.2%}')
print(f'AdaBoost Test Score: {ada_test_score:.2%}')