# Model Experimentation I: Without Cross Validation
In this notebook we are conducting experiments without cross validation. Experimentation with cross validation takes place in a different notebook ([here](https://adb-731998097721284.4.azuredatabricks.net/?o=731998097721284#notebook/1215577238253368/command/1215577238253388)), but is based off of the results here.

## Notebook Setup

In [0]:
%pip install timezonefinder
%pip install tzfpy

Python interpreter will be restarted.
Collecting timezonefinder
  Downloading timezonefinder-6.1.8.tar.gz (45.9 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Collecting h3<4,>=3.7.6
  Downloading h3-3.7.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
Collecting cffi<2,>=1.15.1
  Using cached cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (441 kB)
Collecting setuptools>=65.5
  Using cached setuptools-65.6.3-py3-none-any.whl (1.2 MB)
Building wheels for collected packages: timezonefinder
  Building wheel for timezonefinder (PEP 517): started
  Building wheel for timezonefinder (PEP 517): finished with status 'done'
  Created wheel for timezonefinder: filename=timezonefinder-6.1.8-c

In [0]:
# General 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import sys
from statistics import mean
import itertools
import mlflow.spark

# PySpark 
from pyspark.sql.functions import col,isnan,when,count
from pyspark.sql.functions import regexp_replace

# SQL Functions
from pyspark.sql import functions as f
from pyspark.sql.functions import monotonically_increasing_id, to_timestamp, to_utc_timestamp, to_date
from pyspark.sql.functions import isnan, when, count, col, isnull, percent_rank, first, dense_rank
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType, FloatType, DecimalType
from pyspark.sql import SQLContext
from pyspark.sql.window import Window
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from functools import reduce
from pyspark.sql.functions import rand,col,when,concat,substring,lit,udf,lower,sum as ps_sum,count as ps_count,row_number
from pyspark.sql.window import *
from pyspark.sql import DataFrame
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import DenseMatrix, Vectors
from pyspark.sql.functions import row_number

# ML
from pyspark.ml.stat import Correlation
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, OneHotEncoder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Misc 
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from timezonefinder import TimezoneFinder
from tzfpy import get_tz



In [0]:
# Display and define where mids-w261 is located
data_BASE_DIR = "dbfs:/mnt/mids-w261/"
# display(dbutils.fs.ls(f"{data_BASE_DIR}"))

# Inspect the Mount's Final Project folder 
data_BASE_DIR = "dbfs:/mnt/mids-w261/datasets_final_project_2022/"
# display(dbutils.fs.ls(f"{data_BASE_DIR}"))

In [0]:
blob_container = "housestark" # The name of your container created in https://portal.azure.com
storage_account = "neilp" # The name of your Storage account created in https://portal.azure.com
secret_scope = "w261_s1g4" # The name of the scope created in your local computer using the Databricks CLI
secret_key = "w261_s1g4_key" # The name of the secret key created in your local computer using the Databricks CLI 
blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"
mount_path = "/mnt/mids-w261"

In [0]:
spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
# df = spark.read.parquet(f"{blob_url}/df_main_3m")
df_full = spark.read.parquet(f"{blob_url}/df_main_fullClean")

## Helper Functions

In [0]:
def preModeling_dataEdit(df):
  '''
  Input: df that has already gone through the final join, cleaning, and feature engineering
  Output: df that includes null imputing and # and % of flights (by tail number) that were delayed and cancelled in the past 90 days --> these depend on window functions, as such they need to be done right after the data is split for modelling and not during feature engineering phase
  '''
  
  ### FINAL CLEANING 
  # Remove rows with null scheduled_departure_UTC because these are rows without a proper timezone (timezonefinder could not find)
  df = df.na.drop(subset=["scheduled_departure_UTC"])
  dropCols = ['TAXI_IN', 'TAXI_OUT']
  df = df.drop(*dropCols) 

  
  ### FINAL FEATURE ADDITIONS
  ## GET NUMBER & PERCENTAGE OF TIMES A PLANE (BY TAIL NUMBER) HAS BEEN DELAYED OR CANCELLED IN THE PAST 3 MONTHS (2 COLUMNS)
  # Make window function
  df = df.withColumn('roundedMonth', f.date_trunc('month', df.scheduled_departure_UTC))
  window_3m = Window().partitionBy('TAIL_NUM').orderBy(f.col('roundedMonth').cast('long')).rangeBetween(-(86400), 0) # changed to 1 day instead of 3 months 

  # Add in Columns
  # Number of flights delayed/cancelled
  df = df.withColumn('no_delays_last1d', when(df.TAIL_NUM.isNotNull(), f.sum('dep_delay_15').over(window_3m)).otherwise(-1)) \
         .withColumn('no_cancellation_last1d', when(df.TAIL_NUM.isNotNull(), f.sum('CANCELLED').over(window_3m)).otherwise(-1)) 
  # Percentage of flights delayed/cancelled
  df = df.withColumn('count_flights_last1d', when(df.TAIL_NUM.isNotNull(), f.count('TAIL_NUM').over(window_3m)).otherwise(-1)) 
  df = df.withColumn('perc_delays_last1d', when(df.count_flights_last1d != -1, (df.no_delays_last1d/ df.count_flights_last1d)).otherwise(-1.0)) \
         .withColumn('perc_cancellation_last1d', when(df.count_flights_last1d != -1, (df.no_cancellation_last1d/ df.count_flights_last1d)).otherwise(-1.0))     
  
  ### HANDLING NULLS
  ## Imputing Hourly Weather Data to the best of our ability (up to 3 hours back)
  window = Window.partitionBy(col("ORIGIN_AIRPORT_ID"))\
                     .orderBy(col("rounded_depTimestamp"))\
                     .rowsBetween(0,3)
  
  cols_to_fill  = ['origin_HourlyAltimeterSetting', 'origin_HourlyDewPointTemperature', 'origin_HourlyDryBulbTemperature', 'origin_HourlyPrecipitation', 'origin_HourlyPressureChange', 'origin_HourlyPressureTendency', 'origin_HourlyRelativeHumidity', 'origin_HourlySeaLevelPressure', 'origin_HourlyStationPressure', 'origin_HourlyVisibility', 'origin_HourlyWetBulbTemperature', 'origin_HourlyWindDirection', 'origin_HourlyWindGustSpeed', 'origin_HourlyWindSpeed', 'origin_HourlySkyConditions_SCT_cnt', 'origin_HourlySkyConditions_OVC_cnt', 'origin_HourlySkyConditions_FEW_cnt', 'origin_HourlySkyConditions_BKN_cnt', 'origin_HourlySkyConditions_VV_cnt', 'origin_HourlySkyConditions_SKC_cnt', 'origin_HourlySkyConditions_CLR_cnt', 'dest_HourlyAltimeterSetting', 'dest_HourlyDewPointTemperature', 'dest_HourlyDryBulbTemperature', 'dest_HourlyPrecipitation', 'dest_HourlyPressureChange', 'dest_HourlyPressureTendency', 'dest_HourlyRelativeHumidity', 'dest_HourlySeaLevelPressure', 'dest_HourlyStationPressure', 'dest_HourlyVisibility', 'dest_HourlyWetBulbTemperature', 'dest_HourlyWindDirection','dest_HourlyWindGustSpeed', 'dest_HourlyWindSpeed', 'dest_HourlySkyConditions_SCT_cnt', 'dest_HourlySkyConditions_OVC_cnt', 'dest_HourlySkyConditions_FEW_cnt', 'dest_HourlySkyConditions_BKN_cnt', 'dest_HourlySkyConditions_VV_cnt', 'dest_HourlySkyConditions_SKC_cnt', 'dest_HourlySkyConditions_CLR_cnt']

  
  for field in cols_to_fill:
      filled_column_start = first(df[field], ignorenulls=True).over(window)
      df = df.withColumn(field, filled_column_start)
  
  ## We are still left with some null values --> will deal with them now in accordance to the table in section VII of this notebook
  impute_minus1int = ['DEP_DELAY_NEW', 'holiday' ,'holiday_in2DayRange']
  df = df.na.fill(value = -1,subset = impute_minus1int)
  
  impute_minus9999int = ['DEP_DELAY']
  df = df.na.fill(value = -9999,subset = impute_minus9999int)
  
  impute_minus1fl = ['perc_delays_last1d', 'perc_cancellation_last1d']
  df = df.na.fill(value = -1.0,subset = impute_minus1fl)
  
  impute_minus9999int = ['elevation_ft']
  df = df.na.fill(value = -9999,subset = impute_minus9999int)
  
  impute_99int = [ 'origin_HourlyRelativeHumidity', 'dest_HourlyRelativeHumidity']
  df = df.na.fill(value = 99 ,subset = impute_99int)
  
  impute_99fl = ['origin_HourlyPrecipitation', 'dest_HourlyPrecipitation']
  df = df.na.fill(value = 99.0 ,subset = impute_99fl)
  
  impute_999int = ['origin_HourlyPressureTendency', 'dest_HourlyPressureTendency']
  df = df.na.fill(value = 999 ,subset = impute_999int)
  
  impute_999fl = ['origin_HourlyPressureChange', 'dest_HourlyPressureChange']
  df = df.na.fill(value = 999.0 ,subset = impute_999fl)
  
  impute_9999int = ['origin_HourlyDewPointTemperature', 'origin_HourlyDryBulbTemperature', 'origin_HourlyWetBulbTemperature', 'origin_HourlyWindGustSpeed', 'dest_HourlyDewPointTemperature', 'dest_HourlyDryBulbTemperature', 'dest_HourlyWetBulbTemperature', 'dest_HourlyWindGustSpeed']
  df = df.na.fill(value = 9999 ,subset = impute_9999int)
    
  impute_99999int = ['origin_HourlyWindDirection', 'origin_HourlyWindSpeed', 'dest_HourlyWindDirection', 'dest_HourlyWindSpeed']
  df = df.na.fill(value = 99999 ,subset = impute_99999int)
  
  impute_99999fl = ['origin_HourlyAltimeterSetting',  'dest_HourlyAltimeterSetting', 'origin_HourlySeaLevelPressure','dest_HourlySeaLevelPressure', 'origin_HourlyStationPressure', 'dest_HourlyStationPressure']
  df = df.na.fill(value = 99999.0 ,subset = impute_99999fl)
  
  impute_999999fl = ['origin_HourlyVisibility', 'dest_HourlyVisibility']
  df = df.na.fill(value = 999999.0 ,subset = impute_999999fl)
  
  impute_str = ['TAIL_NUM', 'type', 'origin_HourlySkyConditions', 'dest_HourlySkyConditions', 'local_timestamp', 'timezone']
  df = df.na.fill(value = 'no_data',subset = impute_str)
  
  imputed_cols  = cols_to_fill + ['perc_delays_last1d', 'perc_cancellation_last1d', 'elevation_ft']
#   'no_delays_last1d', 'no_cancellation_last1d', 'count_flights_last1d', 
  return df,imputed_cols

In [0]:
# Function to create pipeline
def create_pipeline(df, inputCols_cat, inputCols_cont):
  """Creates a feature engineering pipeline for modeling 
  Args:
    inputCols_cat (list): list of categorical input cols
    inputCols_cont (list): list of continuous input cols 
  
  Returns: 
    pipeline (Pipeline): MLlib pipeline with stages  
  """
  
  # String Indexer
  inputCols_categorical_indexed = [f'{i}_index' for i in inputCols_cat]
  string_indexer = StringIndexer(inputCols = inputCols_cat, 
                                 outputCols = inputCols_categorical_indexed).setHandleInvalid('keep')

  # One Hot Encoder  
  inputCols_categorical_encoded = [f'{i}_encoded' for i in inputCols_categorical_indexed]
  one_hot_encoder = OneHotEncoder(inputCols = inputCols_categorical_indexed, 
                                  outputCols = inputCols_categorical_encoded)

  # Vector Assembler (Categorical)
  assembler_cat = VectorAssembler(inputCols = inputCols_categorical_encoded, 
                              outputCol = 'features_cat').setHandleInvalid('keep')
  
  # Vector Assembler (Continuous)
  assembler_cont = VectorAssembler(inputCols = inputCols_cont, 
                              outputCol = 'features_cont').setHandleInvalid('keep')

  # Pipeline
  return Pipeline().setStages([string_indexer, one_hot_encoder, assembler_cat, assembler_cont])

In [0]:
def impute_and_scale_features(df):
  
  # Impute data 
  imputed_df, imputed_cols = preModeling_dataEdit(df)

  # Vector Assembler (Continuous) 
  assembler_cont = VectorAssembler(inputCols = ['features_cont'] + imputed_cols, 
                              outputCol = 'features_cont_all').setHandleInvalid('keep')

  # Standard Scaler 
  scaler = StandardScaler(inputCol = 'features_cont_all',
                          outputCol = 'features_scaled',
                          withMean = True, withStd = True)
  
  # Vector Assembler (Continuous + Categorical) 
  assembler_all = VectorAssembler(inputCols = ['features_scaled', 'features_cat'], 
                              outputCol = 'features_all').setHandleInvalid('keep')

  pipeline = Pipeline().setStages([assembler_cont, scaler, assembler_all])

  # Create features_scaled for all dfs
  pipeline_df = pipeline.fit(imputed_df).transform(imputed_df) 
  
  return  pipeline_df

In [0]:
def get_sampling(train_df, sampling):
  """Modifies the training data to under/over sample 
  Args:
    train_df (df): training data
    sampling (string): if none, no sampling is performed; if under, undersampling is performed; if over, oversampling is performed 
  Returns:
    train_df_sampled (df): modified training data 
  """
  # No sampling 
  if sampling == 'none':
#     train_df = train_df.filter(col('label') != 2)
    return train_df
  
  # Undersampling
  elif sampling == 'under':
    no_delay = train_df.filter(col('label') == 0)
    delay = train_df.filter(col('label') == 1)
    
    class_ratio =  delay.count() / no_delay.count()
    no_delay_sample = no_delay.sample(withReplacement=True, fraction=class_ratio)
    train_df_sampled = delay.unionAll(no_delay_sample)
    
    return train_df_sampled
    
  # Oversampling
  elif sampling == 'over':
    no_delay = train_df.filter(col('label') == 0) #3000
    delay = train_df.filter(col('label') == 1) #700
    
    class_ratio = no_delay.count() / delay.count() #0.2
    delay_sample = delay.sample(withReplacement=True, fraction=class_ratio)
    train_df_sampled = no_delay.unionAll(delay_sample)
    
    return train_df_sampled
  

In [0]:
def get_model(model_type, params):
  """Builds a model based on the given parameters
  Args:
    model_type (string): type of model to be built 
    params (dict): dictionary of parameters specific to the model_type
  Returns:
    model: MLlib model ready to be trained 
    ml_type (string): type of model (classification or regression)
  """
  # Logistic Regression
  if model_type == 'LogisticRegression':
    ml_type = 'c'
    model = LogisticRegression(featuresCol = 'features_all',
                               labelCol = 'label',
                               maxIter = params['maxIter'],
                               regParam = params['regParam'],
                               elasticNetParam = params['elasticNetParam'])

  # Linear Regression
  elif model_type == 'LinearRegression':
    ml_type = 'r'
    model = LinearRegression(featuresCol = 'features_all',
                             labelCol = 'DEP_DELAY_NEW',
                             maxIter = params['maxIter'],
                             regParam = params['regParam'],
                             elasticNetParam = params['elasticNetParam'])

  # Decision Tree Classifier
  elif model_type == 'DecisionTreeClassifier':
    ml_type = 'c'
    model = DecisionTreeClassifier(featuresCol = 'features_all',
                                   labelCol = 'label',
                                   maxDepth = params['maxDepth'],
                                   impurity = params['impurity'],
                                   maxBins = params['maxBins'],
                                   minInfoGain = params['minInfoGain'])
    
  # Decision Tree Regressor
  elif model_type == 'DecisionTreeRegressor':
    ml_type = 'r'
    model = DecisionTreeRegressor(featuresCol = 'features_all',
                                  labelCol = 'DEP_DELAY_NEW',
                                  maxDepth = params['maxDepth'],
                                  minInfoGain = params['minInfoGain'])

  # Random Forest Classifier
  elif model_type == 'RandomForestClassifier':
    ml_type = 'c'
    model = RandomForestClassifier(featuresCol = 'features_all',
                                   labelCol='label',
                                   numTrees= params['numTrees'], 
                                   maxDepth=params['maxDepth'], 
                                   impurity = params['impurity'],
                                   maxBins = params['maxBins'],
                                   minInfoGain = params['minInfoGain'])
  
  # Random Forest Regressor
  elif model_type == 'RandomForestRegressor':
    ml_type = 'r'
    model = RandomForestRegressor(featuresCol = 'features_all',
                                   labelCol='DEP_DELAY_NEW',
                                   numTrees= params['numTrees'], 
                                   maxDepth=params['maxDepth'],
                                   minInfoGain = params['minInfoGain'])

  # Gradient Boosted Tree Regressor 
  elif model_type == 'GBTRegressor':
    ml_type = 'r'
    model = GBTRegressor(featuresCol = 'features_all',
                         labelCol='DEP_DELAY_NEW',
                         maxIter= params['maxIter'], 
                         maxDepth=params['maxDepth'],
                         stepSize = params['stepSize'],
                         minInfoGain = params['minInfoGain'])

  # MLP NN Classifier 
  elif model_type == 'MultilayerPerceptronClassifier':
    ml_type = 'c'
    model = MultilayerPerceptronClassifier(featuresCol = 'features_all',
                         labelCol='label',
                         layers = params['layers'],
                         maxIter= params['maxIter'], 
                         blockSize=params['blockSize'],
                         stepSize = params['stepSize'])

  
  return model, ml_type

In [0]:
def get_param_permutations(params):
  """Given a dictionary of parameters to test in a grid search, returns all possible permutations
  Args:
    params (dict): dictionary of parameters inputted by user
  Returns:
    param_list (list): list of dictionaries to pass to the model
  """
  param_list = []
  vals = params.values()

  # Loop through all permutations 
  for param_vals in list(itertools.product(*vals)):
    # Create a dictionary to hold each permutation of parameters 
    param_dict = {}
    # Loop over the different parameters 
    for i, key in enumerate(params.keys()):
      param_dict[key] = param_vals[i]
    # Add each dictionary to the parameter list 
    param_list.append(param_dict)
  return param_list 

In [0]:
def evaluate_model(predictions, ml_type):
  """Provides evaluation metrics for classification/regression models
  Args:
    predictions (df): dataframe of predicated and actual values 
    ml_type (string): type of model 
  Returns:
    classification: accuracy, precision, recall, f1score
    regression: r2, rmse, mse, mae
  """
  if ml_type == 'c':
    eval_accuracy = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction')
    eval_precision = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='precisionByLabel')
    eval_recall = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='recallByLabel')
    eval_f1 = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='f1')

    accuracy = eval_accuracy.evaluate(predictions)
    precision = eval_precision.evaluate(predictions)
    recall = eval_recall.evaluate(predictions)
    f1score = eval_f1.evaluate(predictions)
    
    return accuracy, precision, recall, f1score
    
  elif ml_type == 'r':
    eval_r2 = RegressionEvaluator(predictionCol='prediction', labelCol='DEP_DELAY_NEW', metricName='r2')
    eval_rmse = RegressionEvaluator(predictionCol='prediction', labelCol='DEP_DELAY_NEW', metricName='rmse')
    eval_mse = RegressionEvaluator(predictionCol='prediction', labelCol='DEP_DELAY_NEW', metricName='mse')
    eval_mae = RegressionEvaluator(predictionCol='prediction', labelCol='DEP_DELAY_NEW', metricName='mae')
    
    r2 = eval_r2.evaluate(predictions)
    rmse = eval_rmse.evaluate(predictions)
    mse = eval_mse.evaluate(predictions)
    mae = eval_mae.evaluate(predictions)
    
    return r2, rmse, mse, mae

## Modeling Functions

In [0]:
def train_model_no_CV(train_df, val_df, model_type, params, train_metrics=False):
  """Splits the df into time series cross validation splits, trains a model, and provides evaluation metrics. Should be used for experimentation to determine best model parameters.
  Args:
    train_df (df): training data that has been through grid_search_test_train_split
    val_df (df): validation data that has been through grid_search_test_train_split
    model_type (string): indicates the type of model that will be trained 
    params (dict): a dictionary of parameters as keys and list of parameter values as values 
        - LogisticRegression: { 'maxIter': [10,20,30], 'regParam': [0.2,0.3,0.4], 'elasticNetParam': [0,0.8,0.9] }
        - LinearRegression: { 'maxIter': [10,20,30], 'regParam': [0.2,0.3,0.4], 'elasticNetParam': [0,0.8,0.9] }
        - DecisionTreeClassifier: { 'numClasses': [3], 'maxDepth': [2], 'impurity': ['gini'], 'maxBins': [32] }
        - DecisionTreeRegressor: { 'maxDepth': [1,2,3] }
  
  Returns: 
    results_df (df):  dataframe of parameters tested and the results from that iteration  
  """

  # ---------- Train Model ---------- #
  param_permutations = get_param_permutations(params)

  results_df = pd.DataFrame()
  for param in param_permutations:
    model, ml_type = get_model(model_type, param)
    trained_model  = model.fit(train_df)
    
    if train_metrics == True:
      training_predictions = trained_model.transform(train_df)
    predictions          = trained_model.transform(val_df)

    # ---------- Evaluate Model ---------- #
    iter_params = pd.DataFrame(param, index=[0])
    
    # Classification 
    if ml_type == 'c':
      if train_metrics == True:
        train_accuracy, train_precision, train_recall, train_f1score = evaluate_model(training_predictions, ml_type)
        train_iter_results = pd.DataFrame({'Train Accuracy': [train_accuracy], 'Train Precision': [train_precision], 'Train Recall': [train_recall], 'Train F1 Score': [train_f1score]})
      val_accuracy, val_precision, val_recall, val_f1score = evaluate_model(predictions, ml_type)
      val_iter_results = pd.DataFrame({'Val Accuracy': [val_accuracy], 'Val Precision': [val_precision], 'Val Recall': [val_recall], 'Val F1 Score': [val_f1score]})

    # Regression
    elif ml_type == 'r':
      if train_metrics == True:
        train_r2, train_rmse, train_mse, train_mae = evaluate_model(training_predictions, ml_type)
        train_iter_results = pd.DataFrame({'Train R2': [train_r2], 'Train RMSE': [train_rmse], 'Train MSE': [train_mse], 'Train MAE': [train_mae]})
      val_r2, val_rmse, val_mse, val_mae = evaluate_model(predictions, ml_type)
      val_iter_results = pd.DataFrame({'Val R2': [val_r2], 'Val RMSE': [val_rmse], 'Val MSE': [val_mse], 'Val MAE': [val_mae]})
      
    if train_metrics == True:
      iter_df = pd.concat([iter_params, train_iter_results, val_iter_results], axis=1)  
    else:
      iter_df = pd.concat([iter_params, val_iter_results], axis=1)
    results_df = pd.concat([results_df,iter_df], axis=0)
  
  return results_df

In [0]:
def grid_search_test_train_split(pipeline_df, sample_size = None, sampling='none'):
  """Splits the dataframe in train and test splits for grid search 
  Args:
    df (dataframe): dataframe to model on; requirements:
      - Has gone through create_pipeline function 
      - Has 'Year' column from 2015 - 2021
      - Has 'features' column (not scaled)
      - Has 'label' column
    sample_size (float): optional parameter to specify if you would like a subset of the data 
    sampling (string): if none, no sampling is performed; if under, undersampling is performed; if over, oversampling is performed 
  
  Returns: 
    results_df (df):  dataframe of parameters tested and the results from that iteration   
  """
  # ---------- Split Data ---------- #
  train = pipeline_df.filter(col('Year') <= 2019)
  train = get_sampling(train, sampling)
  val   = pipeline_df.filter(col('Year') == 2020)
#   test  = pipeline_df.filter(col('Year') == 2021)

  # ---------- Get Subset of Train & Val Data ---------- #
  if sample_size:
    train = train.sample(sample_size)
    val = val.sample(sample_size)
#     test = test.sample(sample_size)

  # ---------- Impute and Scale Features ---------- #
  train_df_full = impute_and_scale_features(train)
  val_df_full   = impute_and_scale_features(val)
#   test_df_full  = impute_and_scale_features(test)
  
  return train_df_full, val_df_full

## Baseline Models

Before conducting any experiments, we developed classification and regression baseline models (below) to have some level of comparison when conducting the experiments. These baseline models were trained on data from 2015-2019 and validated on 2020 data. We decided to hold out our 2021 data for after the experimentation phase to get a more realistic idea of how our final models do when they see unseen data.  

**Classification** <br/>
We decided to use a simple logistic regression model as a basis of comparision for our classification models. It does not have any sampling or regularization parameters added to it, and it goes through maximum 10 iterations of the data. Looking at the scores below it does look like we have relatively strong scores to start off with. The precision and recall on the validation set is relatively high (0.9147 and 0.9838 respectively). In particular the accuracy and precision for the validation set seems to be somewhat significantly higher than training - perhaps indicating the validtaion data was less complex and easier to predict correctly with higher confidence. It is important to note however that in our full dataset, only 18% are labeled as delayed and making our dataset [moderately imbalanced](https://developers.google.com/machine-learning/data-prep/construct/sampling-splitting/imbalanced-data). So the high level metrics could be somewhat more inflated than what they should be. But this is something we will be able to confirm more throughout our experimentation. Moreover, as there is an 82% chance of a flight not being delayed at random (given our dataset imbalance), we would like to develop a model that is better than randomly guessing and has a higher accuracy than 82%.

**Regression**<br/>
The regression baseline model is a linear regression model with no regularization parameters and 10 iterations. As we mentioned in our model metrics section (master notebook), we will mainly be focusing onE, with a secondary focus on RMSE. The MAE in the baseline model is 15.4381, which translates to an error of this baseline model predicting a delay time +/- 15 minutes. It is also interesting to see the RMSE is not too much larger than the MAE, which is a good sign. It indicates that the model is able to somewhat handle outliers. As we moved further down the experiments, we realized there were some results were the RMSE was much much larger than the MAE. We hope that in our final model we can achieve an MAE lower than this so as to predict with higher confidence the number of minutes a flight will be delayed.

In [0]:
inputCols_categorical = ['Year', 'QUARTER', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DISTANCE_GROUP', 'holiday_in2DayRange', 'C19', 'OP_UNIQUE_CARRIER', 'type', 'DEP_TIME_BLK', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']
inputCols_continuous = ['DISTANCE']

pipeline = create_pipeline(df_full, inputCols_categorical, inputCols_continuous)
pipeline_df = pipeline.fit(df_full).transform(df_full)

train_none, val_none = grid_search_test_train_split(pipeline_df, sampling='none')
train_none = train_none.cache()
val_none  = val_none.cache()

train_under, val_under = grid_search_test_train_split(pipeline_df, sampling='under')
train_under = train_under.cache()
val_under  = val_under.cache()

In [0]:
# Logistic
log_reg_c_params = { 'maxIter': [10], 'regParam': [0.0], 'elasticNetParam': [0.0]}
log_reg_c_BL = train_model_no_CV(train_none, val_none,  model_type='LogisticRegression', params=log_reg_c_params, train_metrics = True)
display(log_reg_c_BL)

maxIter,regParam,elasticNetParam,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
10,0.0,0.0,0.757361891239722,0.8261648954692925,0.9886678771458566,0.757361891239722,0.8741729477176691,0.91479740519402,0.9838899276053577,0.8741729477176691


In [0]:
# Linear 
lin_reg_r_params = { 'maxIter': [10], 'regParam': [0.0], 'elasticNetParam': [0.0] }
lin_reg_r_BL = train_model_no_CV(train_none, val_none, model_type='LinearRegression', params=lin_reg_r_params, train_metrics = True)
display(lin_reg_r_BL)

maxIter,regParam,elasticNetParam,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
10,0.0,0.0,0.0435238466863093,41.42611594207078,1716.123082045891,17.982612358685135,-0.020952475924586,35.40427379060742,1253.4626026402914,15.438188537836863


## Experimentation

We ran experiments on 4 different classification models (logistic regression, decision tree, random forest, multiple layer perceptron neural network), and 4 different regression models (linear regression, decision tree, random forest, gradient boosted trees). This notebook contains the experiments on which hyperparameters will work best, without cross validation on all the models except the neural network. The neural network experiments can be found [here](https://adb-731998097721284.4.azuredatabricks.net/?o=731998097721284#notebook/1860389250614781/command/1860389250614814). The experiments done in this notebook are meant to inform which models we would like to run using cross validation. As running cross validation on a dataset as large as ours can be very time costly, we decided to be selective with our models. 

Our plan was to first test on smaller samples and then scale up; specifically from 0.1% sample size, to 10%, and finally the full dataset. The idea was to conduct a larger grid search on 0.1% sample, then a smaller one on 10%, and then finally a much smaller grid search on the full dataset. This way we could iterate more efficiently and effectively. But we found during our experimentation process that as we tried to scale up, our models that were doing the best in 0.1% would degrade very quickly at 10%. So by the time we reached full data experimentation, we decided to run a grid search that was slightly smaller than what we ran at 0.1% and slightly larger than at 10%, but not necessarily always based off of results from 0.1%. 

It should also be noted that we ran into a few issues in the experimentation phase. For one, we realized that our time-based feature engineered variables on the number and percentage of flights delayed and cancelled (4 variables) may have contained some stale data. In that after some reflection, we realized that it might be more important to consider if flights were delayed recently because those consequential effects may have more effect on the current flight prediction than something that happened up to 3 months ago. As such we changed our time based variables to reflect the past 24 hours only, and reran the experimentation again for 0.1% and 10%.

At this point in time we had been running multi-class classification to predict whether a flight would be delayed, cancelled, or not. In the experimentation phase, we also realized that the our models were still performing very poorly and we realized that even with over sampling, it was getting difficult for our models to differentiate between cancellation and delays (see below screen shot where way more flights are being predicted as cancelled than they should be). Especially considering that we had only ~2% of our full dataset with cancellation labels. Even with over and under sampling, our results weren't improving as much, and we knew that having to apply oversampling with our full dataset would be a logistical challenge given the time crunch. We also realized that depending on what rows were being sampled, the validation set may or may not have had rows with cancelled flights. Given this situation, and the fact that we didn't feel we had enough information in our dataset to differentiate between a moderately rare (delay) and very rare (cancelation) events, we decided to go focus on predicting flight delays only (i.e. binary classification) and investigate in future studies on predicting between delays and cancellations. 

<img src="https://raw.githubusercontent.com/brianahart/spark_flight_predictions/main/Screenshot%202022-12-02%20at%208.02.36%20AM.png" width="600px">

<img src="https://raw.githubusercontent.com/brianahart/spark_flight_predictions/main/Screenshot%202022-12-02%20at%208.03.02%20AM.png" width="600px">


The experimental setup and results in this notebook reflect the updated time based features and binary classification. The experimentation was conducted with the following cluster size: 14GB and 10 Cores.

**Input Features** <br/>
We included all of the below features in our models, with the exception of two follow up experiments in logistic and linear regression in which we excluded elevation ft, origin and destination wet bulb temperature, origin and destination wet dew temperature, origin and destination pressue change, and origin and destination wind gust speed due to high correlation with other existing featuress. More detail on the justification of excluding these features can be found in the full experiments section in this notebook. It should also be known that in prior iterations of our experiments we included a few extra features such as both count and time of flights delayed in the past 24 hours. We realized that those two variables bring in similar information and could further confuse our models; as such we decided to stick with percentage of flights delayed (or cancelled). 

|Total number of features by family | Count |
|-------------------------|------|
| Time related | 6 |
| Flight information | 4 |
| Plane History | 2 |
| Origin information | 3 |
| Destination Information | 1 |
| Holiday | 1 |
| Covid 19 | 1 |
| Origin Weather | 22 |
| Destination Weather | 22 | 

|Feature Family | Variable | Data Type | Definition |
|---------------|----------|-----------|------------|
| Classification label | \*label | integer | classification label; 0 = on time/early/delayed by less than 15 minutes, 1 = delayed by 15+ minutes
| Regression Target Variable | \*DEP_DELAY_NEW | integer | regression label; number of minutes flight is delayed 
| Time related | \*Year | integer | year of flight date 
| Time related | \*QUARTER | integer | quarter of flight date
| Time related | \*MONTH | integer | month of flight date
| Time related | \*DAY_OF_MONTH | integer | day of month of flight date
| Time related | \*DAY_OF_WEEK | integer | day of week of flight date
| Time related | \*DEP_TIME_BLK | string |scheduled depature time (hourly blocks)
| Flight information | \*OP_UNIQUE_CARRIER | string | flight airline; identifier assigned by US Department of Transportation for airlines
| Flight information | \*TAIL_NUM | string | identifier for specific aircraft
| Flight information | \*OP_CARRIER_FL_NUM | string | flight number
| Flight information | \*DISTANCE | integer | planned flight distance
| Plane History | \*perc_delays_last1d | double | percentage of flights delayed in past 24 hours per tail number
| Plane History | \*perc_cancellation_last1d | double | percentage of flights cancelled in past 24 hours per tail number
| Origin Information | \*ORIGIN_AIRPORT_ID | string | origin airport ID; assigned by US Department of Transportation 
| Origin Information | \*elevation_ft | integer | origin airport elevation
| Origin Information | \*type | string | airport size (small, medium, large)
| Destination Information | \*DEST_AIRPORT_ID | string | destination airport ID; assigned by US Department of Transportation 
| Holiday | \*holiday_in2DayRange | long | whether flight date is a federal holiday +/- 2 days
| Covid-19 | \*C19 | integer | ordinal variable referring to impact COVID-19 was having on aviation; further information <a href="https://adb-731998097721284.4.azuredatabricks.net/?o=731998097721284#notebook/1325974983871287/command/1325974983871304" target="_blank">here<a/>
| Origin Weather | \*origin_HourlyAltimeterSetting | float | hourly pressure (hectopascals) value which altimeter is set to indicate altitude relative to mean sea level of aircraft on the ground at location for which value was determined; at origin
| Origin Weather | \*origin_HourlyDewPointTemperature | integer | hourly temperature (Celsius) which a given parcel of air must be cooled at constant pressure and water vapor content in order for saturation to occur; at origin
| Origin Weather | \*origin_HourlyDryBulbTemperature | integer | hourly temperature (Celsius) and is another standard way to measure air temperature; at origin
| Origin Weather | \*origin_HourlyPrecipitation | float | hourly rain quantity measured in millimeters; at orgin 
| Origin Weather | \*origin_HourlyPressureChange | float | hourl number indicating air pressure change; measured in hectopascals; at origin
| Origin Weather | \*origin_HourlyPressureTendency | integer | hourly air pressure measured in hectopascals; at origin
| Origin Weather | \*origin_HourlyRelativeHumidity | integer | the amount of water vapour present in air expressed as a percentage of the amount needed for saturation at the same temperature; at origin
| Origin Weather | \*origin_HourlySkyConditions | string | hourly combination of codes which denote tany specific conditions in sky (e.g. SCT:04 45 SCT:04 190); at origin
| Origin Weather | \*origin_HourlySkyConditions_SCT_cnt | integer | number of scattered cloud layers; at origin 
| Origin Weather | \*origin_HourlySkyConditions_OVC_cnt | integer | number of overcast cloud layers; at origin  
| Origin Weather | \*origin_HourlySkyConditions_FEW_cnt | integer | number of 'few' cloud layers; at origin 
| Origin Weather | \*origin_HourlySkyConditions_BKN_cnt | integer | number of broken cloud layers; at origin 
| Origin Weather | \*origin_HourlySkyConditions_VV_cnt | integer | number of vertical visibility cloud layers; at origin 
| Origin Weather | \*origin_HourlySkyConditions_SKC_cnt | integer | number of clear cloud layers; at origin  
| Origin Weather | \*origin_HourlySkyConditions_CLR_cnt | integer | number of clear cloud layers; at origin 
| Origin Weather | \*origin_HourlySeaLevelPressure | float | hourly air pressure (hectopascals) relative to mean sea level; at origin
| Origin Weather | \*origin_HourlyStationPressure | float | hourly atmospheric pressure at a weather station (hectopascals); at origin 
| Origin Weather | \*origin_HourlyVisibility | float | hourly horizontal distance at which an object can be seen/identified; at origin 
| Origin Weather | \*origin_HourlyWetBulbTemperature | integer | hourly average wet bulb temperature (Celsius); at origin 
| Origin Weather | \*origin_HourlyWindDirection | integer | hourly angle measured in blockwise direction between true north and direction from which the wind is blowing; at origin 
| Origin Weather | \*origin_HourlyWindGustSpeed | integer | hourly rate of speed of wind gust measured in meters/second; at origin 
| Origin Weather | \*origin_HourlyWindSpeed | integer | hourly rate of speed of wind, measured in meters/secondl at origin 
| Origin Weather | \*dest_HourlyAltimeterSetting | float | hourly pressure value (hectopascals) which altimeter is set to indicate altitude relative to mean sea level of aircraft on the ground at location for which value was determined; at destination
| Destination Weather | \*dest_HourlyDewPointTemperature | integer | hourly temperature (Celsius) which a given parcel of air must be cooled at constant pressure and water vapor content in order for saturation to occur; at destination
| Destination Weather | \*dest_HourlyDryBulbTemperature | integer | hourly temperature (Celsius) and is another standard way to measure air temperature; at destination 
| Destination Weather | \*dest_HourlyPrecipitation | float | hourly rain quantity measured in millimeters; at destination 
| Destination Weather | \*dest_HourlyPressureChange | float | hourly number indicating air pressure change; at destination 
| Destination Weather | \*dest_HourlyPressureTendency | integer | hourly air pressure measured in hectopascals; at destination
| Destination Weather | \*dest_HourlyRelativeHumidity | integer | the amount of water vapour present in air expressed as a percentage of the amount needed for saturation at the same temperature; at destination
| Destination Weather | \*dest_HourlySkyConditions | string | hourly combination of codes which denote tany specific conditions in sky (e.g. SCT:04 45 SCT:04 190); at destination
| Destination Weather | \*dest_HourlySkyConditions_SCT_cnt | integer | number of scattered cloud layers; at destination 
| Destination Weather | \*dest_HourlySkyConditions_OVC_cnt | integer | number of overcaset cloud layers; at destination 
| Destination Weather | \*dest_HourlySkyConditions_FEW_cnt | integer | number of 'few' cloud layers; at destination 
| Destination Weather | \*dest_HourlySkyConditions_BKN_cnt | integer | number of broken cloud layers; at destination 
| Destination Weather | \*dest_HourlySkyConditions_VV_cnt | integer | number of vertical visibility cloud layers; at destination 
| Destination Weather | \*dest_HourlySkyConditions_SKC_cnt | integer | number of clear cloud layers; at destination 
| Destination Weather | \*dest_HourlySkyConditions_CLR_cnt | integer | number of clear cloud layers; at destination 
| Destination Weather | \*dest_HourlySeaLevelPressure | float | hourly air pressure (hectopascals) relative to mean sea level; at destination
| Destination Weather | \*dest_HourlyStationPressure | float | hourly atmospheric pressure at a weather station (hectopascals); at destination
| Destination Weather | \*dest_HourlyVisibility | float | hourly horizontal distance at which an object can be seen/identified; at destination 
| Destination Weather | \*dest_HourlyWetBulbTemperature | integer | hourly average wet bulb temperature (Celsius); at destination 
| Destination Weather | \*dest_HourlyWindDirection | integer | hourly angle measured in blockwise direction between true north and direction from which the wind is blowing; at destination 
| Destination Weather | \*dest_HourlyWindGustSpeed | integer | hourly rate of speed of wind gust measured in meters/second; at destination 
| Destination Weather | \*dest_HourlyWindSpeed | integer | hourly rate of speed of wind, measured in meters/secondl at destination  




#### Sample Size = 0.1%
**Total Experiments: 408**
| Model Type     | Model                    | Evaluation Metric | Training Time (None/Under/Over Sampling) | Number of Experiments | Sampling        | Hyperparameters | 
| -------------- | ------------------------ | ----------------- | ---------------------------------------- | --------------------- | --------------- | --------------- |
| Classification | Logistic Regression |  F1 Score, Precision, Recall, Accuracy | 6.40/7.27/7.37 min | 36 | None/Over/Under | Max Iterations: 10, 20 <br/> Regularization Param: 0.2, 0.4 <br/> Elastic Net Param: 0.0, 0.3, 0.8 | 
| Classification | Decision Tree Classification |  F1 Score, Precision, Recall, Accuracy | 7.01/6.50/1.67 min | 84 | None/Over/Under| Max Depth: 5, 10 <br/> Impurity: Gini, Entropy <br/> Max Bins: 28,32,40 <br/> Min Information Gain: 0.0, 0.05 |
| Classification | Random Forest Classification | F1 Score, Precision, Recall, Accuracy | 14.92/14.93/14.02 min | 216 | None/Over/Under | Number of Trees: 10, 20, 50 <br/> Max Depth: 3, 5, 7 <br/> Impurity: Gini, Entropy <br/> Max Bins: 32, 40 <br/> Min Information Gain: 0.0, 0.05 |
| Regression | Linear Regression | MAE, RMSE | 1.06 min | 12 | None | Max Iterations: 10, 20 <br/> Regularization Param: 0.2, 0.4 <br/> Elastic Net Param: 0, 0.3, 0.8 |
| Regression | Decision Tree Regression | MAE, RMSE | 1.42 min | 6 | None | Max Depth: 5, 10, 15 <br/> Min Information Gain: 0.0, 0.05 |
| Regression | Random Forest Regression | MAE, RMSE | 14.52 min | 18 | None | Number of Trees: 10, 20, 50 <br/> Max Depth: 3, 5, 7 <br/> Min Information Gain: 0.0, 0.05 |
| Regression | Gradient Boosted Trees Regression | MAE, RMSE | 1.95 min | 36 | None | Max Depth: 3, 5, 7 <br/> Max Iterations: 15, 50 <br/> Step Size (Learning Rate): 0.1, 0.3, 0.7 <br/> Min Information Gain: 0.0, 0.05 |


#### Sample Size = 10%
**Total Experiments: 87** 
| Model Type     | Model                    | Evaluation Metric | Training Time | Number of Experiments | Sampling        | Hyperparameters | 
| -------------- | ------------------------ | ----------------- | ------------- | --------------------- | --------------- | --------------- |
| Classification | Logistic Regression |  F1 Score, Precision, Recall, Accuracy | 5.65/8.65 min | 10 | Over/Under | Max Iterations: 10, 20 <br/> Regularization Param: 0.2, 0.4 <br/> Elastic Net Param: 0.0, 0.3, 0.8 | 
| Classification | Decision Tree Classification |  F1 Score, Precision, Recall, Accuracy | 15.95 min/4.33 min/38.56 sec | 18 | Over/Under/Under followup| Max Depth: 5, 10 <br/> Impurity: Gini, Entropy <br/> Max Bins: 28,32,40 <br/> Min Information Gain: 0.0, 0.05, 0.07 |
| Classification | Random Forest Classification | F1 Score, Precision, Recall, Accuracy | 7.82/1.64 min | 20 | Under/Under followup | Number of Trees: 15, 50, 75 <br/> Max Depth: 3, 5 <br/> Impurity: Gini, Entropy <br/> Max Bins: 32 <br/> Min Information Gain: 0.0, 0.05, 0.07 |
| Regression | Linear Regression | MAE, RMSE | 7.49 min | 8 | None | Max Iterations: 10, 20 <br/> Regularization Param: 0.2, 0.4 <br/> Elastic Net Param: 0.0, 0.8 |
| Regression | Decision Tree Regression | MAE, RMSE | 8.11 min | 7 | None | Max Depth: 5, 10, 15 <br/> Min Information Gain: 0.0, 0.05, 0.07 |
| Regression | Random Forest Regression | MAE, RMSE | 30.22 min | 8 | None | Number of Trees: 15, 50 <br/> Max Depth: 3, 5 <br/> Min Information Gain: 0.0, 0.05 |
| Regression | Gradient Boosted Trees Regression | MAE, RMSE | 1.30 min | 16 | None | Max Depth: 3, 5 <br/> Max Iterations: 15, 30 <br/> Step Size (Learning Rate): 0.1, 0.3 <br/> Min Information Gain: 0.0, 0.05 |

#### Full Data Set
**Total Experiments: 43**
| Model Type     | Model                    | Evaluation Metric | Training Time <br/> (None/Under/Over Sampling) | Number of Experiments | Sampling | Hyperparameters | 
|----------------|--------------------------|-------------------|-------------------|---------------------| --------------- | ------------------------ |
| Classification | Logistic Regression |  F1 Score, Precision, Recall, Accuracy | 1.12 hours/13.76 mins | 8 | None/Under | Max Iterations: 10, 20 <br/> Regularization Param: 0.2, 0.4 <br/> Elastic Net Param: 0.0, 0.8 | 
| Classification | Decision Tree Classification | F1 Score, Precision, Recall, Accuracy | 20.58 min/9.37 min | 4 | None/Under | Max Depth: 5 <br/> Impurity: Gini <br/> Max Bins: 32 <br/> Min. Information Gain: 0.0, 0.01 |
| Classification | Random Forest Classification | F1 Score, Precision, Recall, Accuracy | 42.68/27.44 mins | 16 | None/Under | Number of Trees: 15, 50 <br/> Max Depth: 3, 5 <br/> Impurity: Gini, Entropy <br/> Max Bins: 32 <br/> Min. Information Gain: 0.01 |
| Regression | Linear Regression |  MAE, RMSE | 16.94 min | 8 | None | Max Iterations: 10, 20 <br/> Regularization Param: 0.2, 0.4 <br/> Elastic Net Param: 0, 0.8 |
| Regression | Decision Tree Regression  | MAE, RMSE |  48.21 mins/9.57 min (follow up) | 3 | None | Max Depth: 5, 10 <br/> Min. Information Gain: 0.0, 0.01 |
| Regression | Random Forest Regression  | MAE, RMSE |  1.34 hours | 2 | None | Number of Trees: 50 <br/> Max Depth: 5, <br/> Min. Information Gain: 0.0, 0.01|
| Regression | Gradient Boosted Trees Regression | MAE, RMSE | 1.50 mins | 2 | None | Max Depth: 3 <br/> Max Iterations: 15 <br/> Step Size (Learning Rate): 0.1 <br/> Min. Information Gain: 0.0, 0.01 |

#### Best Results Per Model Per Sample Size
| Sample Size    | Model                    | Best Parameters  | Performance on Validation Set | 
| -------------- | ------------------------ | ---------------- | ------------------ |
| 1%   | Logistic Regression | Sampling: None <br/> Max Iterations: 20 <br/> Regularization Param: 0.2 <br/> Elastic Net Param: 0.0 | F1 Score: 0.9066 <br/> Precision: 0.9307 <br/> Recall: 0.9979 <br/> Accuracy: 0.9066 |
| 1%   | Decision Tree Classification | Sampling: Under <br/> Max Depth: 10 <br/> Impurity: Entropy <br/> Max Bins: 40  <br/> Min Information Gain: 0.05| F1 Score: 0.9024 <br/> Precision: 0.9914 <br/> Recall: 0.93302 <br/> Accuracy: 0.9024 |
| 1%   | Random Forest Classification | Sampling: Over <br/> Number of Trees: 50 <br/> Max Depth: 5 <br/> Impurity: Gini <br/> Max Bins: 40 <br/> Min Information Gain: 0.0 | F1 Score: 0.8997 <br/> Precision: 0.9310 <br/> Recall: 0.9901 <br/> Accuracy: 0.8997 |
| 1%   | Linear Regression | Sampling: None <br/> Max Iterations: 20 <br/> Regularization Param: 0.2 <br/> Elastic Net Param: 0.8 | MAE: 13.4802 <br/> RMSE: 24.68 |
| 1%   | Decision Tree Regression | Sampling: None <br/> Max Depth: 5 <br/> Min Information Gain: 0.05 | MAE: 9.3928 <br/> RMSE: 43.1001 |
| 1%   | Random Forest Regression | Sampling: None  <br/> Number of Trees: 10 <br/> Max Depth: 3 <br/> Min Information Gain: 0.05 | MAE: 7.2266 <br/> RMSE: 21.0994 |
| 1%   | Gradient Boosted Trees Regression | Sampling: None <br/> Max Depth: 3 <br/> Max Iterations: 15 <br/> Step Size (Learning Rate): 0.1 <br/> Min Information Gain: 0 | MAE: 9.2447 <br/> RMSE: 42.5545 |
| 10%  | Logistic Regression | Sampling: Over <br/> Max Iterations: 20 <br/> Regularization Param: 0.4 <br/> Elastic Net Param: 0.8 | F1 Score: 0.7867 <br/> Precision: 0.8539 <br/> Recall: 1 <br/> Accuracy: 0.7867 |
| 10%  | Decision Tree Classification | Sampling: Over <br/> Max Depth: 10 <br/> Impurity: Gini <br/> Max Bins: 32 <br/> Min Information Gain: 0.05 | F1 Score: 0.7867 <br/> Precision: 0.8539 <br/> Recall: 1 <br/> Accuracy: 0.7867 |
| 10%  | Random Forest Classification | Sampling: Under <br/> Number of Trees: 15 <br/> Max Depth: 5 <br/> Impurity: Gini <br/> Max Bins: 34 <br/> Min Information Gain: 0.0| F1 Score: 0.6278 <br/> Precision: 0.8906 <br/> Recall: 0.5879 <br/> Accuracy: 0.5278 |
| 10%  | Linear Regression | Sampling: None <br/> Max Iterations: 10 <br/> Regularization Param: 0.4 <br/> Elastic Net Param: 0.0 | MAE: 15.8405 <br/> RMSE: 34.3119 |
| 10%  | Decision Tree Regression | Sampling: None <br/> Max Depth: 5 <br/> Min Information Gain: 0.0  | MAE: 23.4333 <br/> RMSE: 37.7920 |
| 10%  | Random Forest Regression | Sampling: None <br/> Number of Trees: 50 <br/> Max Depth: 3 <br/> Min Information Gain: 0.05 | MAE: 17.0127 <br/> RMSE: 34.4760 |
| 10%  | Gradient Boosted Trees Regression | Sampling: None <br/> Max Depth: 5 <br/> Max Iterations: 15 <br/> Step Size (Learning Rate): 0.1 <br/> Min Information Gain: 0.0   | MAE: 14.7590 <br/> RMSE: 33.9956 |
| Full | Logistic Regression | Sampling: None <br/> Max Iterations: 10 <br/> Regularization Param: 0.2 <br/> Elastic Net Param: 0.0 | F1 Score: 0.8661 <br/> Precision: 0.9091 <br/> Recall: 0.9998 <br/> Accuracy: 0.8661 |
| Full | Decision Tree Classification | Sampling: None <br/> Max Depth: 5 <br/> Impurity: Gini <br/> Max Bins: 32 <br/> Min Information Gain: 0.0 | F1 Score: 0.8656 <br/> Precision: 0.9090 <br/> Recall: 1 <br/> Accuracy: 0.8656 |
| Full | Random Forest Classification | Sampling: None <br/> Number of Trees: 3 <br/> Max Depth: 15 <br/> Impurity: Gini <br/> Max Bins: 32 <br/> Min Information Gain: 0.01| F1 Score: 0.8656 <br/> Precision: 0.9090 <br/> Recall: 1 <br/> Accuracy: 0.8656 |
| Full | Linear Regression | Sampling: None <br/> Max Iterations: 20 <br/> Regularization Param: 0.2 <br/> Elastic Net Param: 0.8 | MAE: 15.3747 <br/> RMSE: 35.3285 |
| Full | Decision Tree Regression | Sampling: None <br/> Max Depth: 5 <br/> Min Information Gain: 0.0    | MAE: 17.4483 <br/> RMSE: 35.9332 |
| Full | Random Forest Regression | Sampling: None <br/> Number of Trees: 50 <br/> Max Depth: 5 <br/> Min Information Gain: 0.0 | MAE: 17.6723 <br/> RMSE: 35.8424 |
| Full | Gradient Boosted Trees Regression | Sampling: None <br/> Max Depth: 3 <br/> Max Iterations: 15 <br/> Step Size (Learning Rate): 0.0 | MAE: 17.7356 <br/> RMSE: 35.9672 |

In [0]:
# Pipeline Set Up

inputCols_categorical = ['Year', 'QUARTER', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DISTANCE_GROUP', 'holiday_in2DayRange', 'C19', 'OP_UNIQUE_CARRIER', 'type', 'DEP_TIME_BLK', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']
inputCols_continuous = ['DISTANCE']

pipeline = create_pipeline(df_full, inputCols_categorical, inputCols_continuous)
pipeline_df = pipeline.fit(df_full).transform(df_full)

# train_001_none, val_001_none = grid_search_test_train_split(pipeline_df, 0.001, sampling='none')
# train_001_none = train_001_none.cache()
# val_001_none  = val_001_none.cache()

# train_001_over, val_001_over = grid_search_test_train_split(pipeline_df, 0.001, sampling='over')
# train_001_over = train_001_over.cache()
# val_001_over  = val_001_over.cache()

# train_001_under, val_001_under = grid_search_test_train_split(pipeline_df, 0.001, sampling='under')
# train_001_under = train_001_under.cache()
# val_001_under  = val_001_under.cache()


train_10_none, val_10_none = grid_search_test_train_split(pipeline_df, 0.1, sampling='none')
train_10_none = train_10_none.cache()
val_10_none  = val_10_none.cache()

# train_10_over, val_10_over = grid_search_test_train_split(pipeline_df, 0.1, sampling='over')
# train_10_over = train_10_over.cache()
# val_10_over  = val_10_over.cache()

# train_10_under, val_10_under = grid_search_test_train_split(pipeline_df, 0.1, sampling='under')
# train_10_under = train_10_under.cache()
# val_10_under  = val_10_under.cache()

### 0.1% Sample Size

#### Classification

##### Logistic Regression

In [0]:
# Logistic Regression No sampling
log_reg_c_params = { 'maxIter': [10,20], 'regParam': [0.2, 0.4], 'elasticNetParam': [0, 0.3, 0.8]}
log_reg_c_no_sampling01 = train_model_no_CV(train_001_none, val_001_none,  model_type='LogisticRegression', params=log_reg_c_params, train_metrics = True)
display(log_reg_c_no_sampling01)

maxIter,regParam,elasticNetParam,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
10,0.2,0.0,0.9594976695017576,0.9603825136612022,1.0,0.9594976695017576,0.9066250806903574,0.9307186472522312,0.9979853941072776,0.9066250806903574
10,0.2,0.3,0.9579297872463666,0.9593267882187938,1.0,0.9579297872463666,0.9059524488232812,0.9282374941561478,1.0,0.9059524488232812
10,0.2,0.8,0.9579297872463666,0.9593267882187938,1.0,0.9579297872463666,0.9059524488232812,0.9282374941561478,1.0,0.9059524488232812
10,0.4,0.0,0.9518447079870452,0.9526821005081876,1.0,0.9518447079870452,0.9059524488232812,0.9282374941561478,1.0,0.9059524488232812
10,0.4,0.3,0.7143745115822451,0.8023587597489062,1.0,0.7143745115822451,0.9059524488232812,0.9282374941561478,1.0,0.9059524488232812
10,0.4,0.8,0.7143745115822451,0.8023587597489062,1.0,0.7143745115822451,0.7895911024518429,0.8560034490191851,1.0,0.7895911024518429
20,0.2,0.0,0.9595726742840353,0.9604189594322796,1.0,0.9595726742840353,0.9066250806903574,0.9307186472522312,0.9979853941072776,0.9066250806903574
20,0.2,0.3,0.9579297872463666,0.9593267882187938,1.0,0.9579297872463666,0.9059524488232812,0.9282374941561478,1.0,0.9059524488232812
20,0.2,0.8,0.9579297872463666,0.9593267882187938,1.0,0.9579297872463666,0.9059524488232812,0.9282374941561478,1.0,0.9059524488232812
20,0.4,0.0,0.9518447079870452,0.9526821005081876,1.0,0.9518447079870452,0.9059524488232812,0.9282374941561478,1.0,0.9059524488232812


In [0]:
# Logistic Regression Over sampling 
log_reg_c_params = { 'maxIter': [10,20], 'regParam': [0.2, 0.4], 'elasticNetParam': [0, 0.3, 0.8]}
log_reg_c_over_sampling01 = train_model_no_CV(train_001_over, val_001_over,  model_type='LogisticRegression', params=log_reg_c_params, train_metrics = True)
display(log_reg_c_over_sampling01)

maxIter,regParam,elasticNetParam,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
10,0.2,0.0,0.964988941159766,0.969447616047876,0.9601856184682576,0.964988941159766,0.796695479803392,0.9286880783886772,0.8364795292963961,0.796695479803392
10,0.2,0.3,0.953431727734438,0.994893182398502,0.9116362501949774,0.953431727734438,0.9007911579105212,0.9323481874855692,0.9899485167933316,0.9007911579105212
10,0.2,0.8,0.953431727734438,0.994893182398502,0.9116362501949774,0.953431727734438,0.9007911579105212,0.9323481874855692,0.9899485167933316,0.9007911579105212
10,0.4,0.0,0.9644045738802988,0.9685250029507808,0.9599516456091094,0.9644045738802988,0.7708310230774117,0.9265504429837096,0.7948026477077715,0.7708310230774117
10,0.4,0.3,0.953431727734438,0.994893182398502,0.9116362501949774,0.953431727734438,0.9007911579105212,0.9323481874855692,0.9899485167933316,0.9007911579105212
10,0.4,0.8,0.953431727734438,0.994893182398502,0.9116362501949774,0.953431727734438,0.9007911579105212,0.9323481874855692,0.9899485167933316,0.9007911579105212
20,0.2,0.0,0.9649694368378988,0.9694833832099544,0.9601076275152084,0.9649694368378988,0.7968500557099768,0.9287074829931972,0.8367246874233881,0.7968500557099768
20,0.2,0.3,0.953431727734438,0.994893182398502,0.9116362501949774,0.953431727734438,0.9007911579105212,0.9323481874855692,0.9899485167933316,0.9007911579105212
20,0.2,0.8,0.953431727734438,0.994893182398502,0.9116362501949774,0.953431727734438,0.9007911579105212,0.9323481874855692,0.9899485167933316,0.9007911579105212
20,0.4,0.0,0.9643850841966736,0.9685237645577588,0.9599126501325846,0.9643850841966736,0.7706769790376578,0.92652944539737,0.7945574895807797,0.7706769790376578


In [0]:
# Logistic Regression Under sampling 
log_reg_c_params = { 'maxIter': [10,20], 'regParam': [0.2, 0.4], 'elasticNetParam': [0, 0.3, 0.8]}
log_reg_c_under_sampling01 = train_model_no_CV(train_001_under, val_001_under,  model_type='LogisticRegression', params=log_reg_c_params,  train_metrics = True)
display(log_reg_c_under_sampling01)

maxIter,regParam,elasticNetParam,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
10,0.2,0.0,0.9943819987438832,0.9942074776198,0.9947312961011592,0.9943819987438832,0.7888830772788183,0.9282164549972391,0.8222059183174371,0.7888830772788183
10,0.2,0.3,0.990456131474535,0.9817209863769616,0.9998243765367052,0.990456131474535,0.9020677278786652,0.9331951163326424,0.9907067742724384,0.9020677278786652
10,0.2,0.8,0.990456131474535,0.9817209863769616,0.9998243765367052,0.990456131474535,0.9020677278786652,0.9331951163326424,0.9907067742724384,0.9020677278786652
10,0.4,0.0,0.9938468773951716,0.992988606485539,0.9949069195644538,0.9938468773951716,0.7666012761973117,0.9270051933064052,0.7857666911225238,0.7666012761973117
10,0.4,0.3,0.990456131474535,0.9817209863769616,0.9998243765367052,0.990456131474535,0.9020677278786652,0.9331951163326424,0.9907067742724384,0.9020677278786652
10,0.4,0.8,0.990456131474535,0.9817209863769616,0.9998243765367052,0.990456131474535,0.9020677278786652,0.9331951163326424,0.9907067742724384,0.9020677278786652
20,0.2,0.0,0.9943819987438832,0.9942074776198,0.9947312961011592,0.9943819987438832,0.7888830772788183,0.9282164549972391,0.8222059183174371,0.7888830772788183
20,0.2,0.3,0.990456131474535,0.9817209863769616,0.9998243765367052,0.990456131474535,0.9020677278786652,0.9331951163326424,0.9907067742724384,0.9020677278786652
20,0.2,0.8,0.990456131474535,0.9817209863769616,0.9998243765367052,0.990456131474535,0.9020677278786652,0.9331951163326424,0.9907067742724384,0.9020677278786652
20,0.4,0.0,0.9938468773951716,0.992988606485539,0.9949069195644538,0.9938468773951716,0.7666012761973117,0.9270051933064052,0.7857666911225238,0.7666012761973117


##### Decision Tree Classification

In [0]:
# Decision Tree Classifier No sampling 
dt_c_params = { 'maxDepth': [5, 10, 15], 'impurity': ['gini','entropy'], 'maxBins': [28,32,40], 'minInfoGain': [0.0, 0.05] }
dt_c_no_sampling01 = train_model_no_CV(train_001_none, val_001_none, model_type='DecisionTreeClassifier', params=dt_c_params,  train_metrics = True)
display(dt_c_no_sampling01)

maxDepth,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
5,gini,28,0.0,0.983768548841567,0.9971128398428102,0.9825351667456932,0.983768548841567,0.8939891949563691,0.992938209331652,0.9914379249559304,0.8939891949563691
5,gini,28,0.05,0.9587780915157708,0.980083940449794,0.9780701754385964,0.9587780915157708,0.906478199388714,0.9366666666666666,0.9906824477461595,0.906478199388714
5,gini,32,0.0,0.983667850936496,0.9968733715476812,0.9826537063379168,0.983667850936496,0.4353138597870833,0.9950738916256158,0.3052127927474188,0.4353138597870833
5,gini,32,0.05,0.9587780915157708,0.980083940449794,0.9780701754385964,0.9587780915157708,0.906478199388714,0.9366666666666666,0.9906824477461595,0.906478199388714
5,gini,40,0.0,0.9837010026126766,0.9969531751122516,0.9826141931405088,0.9837010026126766,0.4353138597870833,0.9950738916256158,0.3052127927474188,0.4353138597870833
5,gini,40,0.05,0.9587780915157708,0.980083940449794,0.9780701754385964,0.9587780915157708,0.906478199388714,0.9366666666666666,0.9906824477461595,0.906478199388714
5,entropy,28,0.0,0.9819704791237552,0.997866258706067,0.9793741109530584,0.9819704791237552,0.0140171294517465,0.0,0.0,0.0140171294517465
5,entropy,28,0.05,0.9766261728979474,0.9726738152888272,1.0,0.9766261728979474,0.9059524488232812,0.9282374941561478,1.0,0.9059524488232812
5,entropy,32,0.0,0.9821659772202702,0.9981474769441424,0.9793345977556503,0.9821659772202702,0.013385759588771,0.0,0.0,0.013385759588771
5,entropy,32,0.05,0.9768473054939256,0.9729345276998194,0.999960486802592,0.9768473054939256,0.9059524488232812,0.9282374941561478,1.0,0.9059524488232812


In [0]:
# Decision Tree Classifier Over sampling 
dt_c_params = { 'maxDepth': [5, 10, 15], 'impurity': ['gini','entropy'], 'maxBins': [28, 32, 40], 'minInfoGain': [0.0, 0.05] }
dt_c_over_sampling01 = train_model_no_CV(train_001_over, val_001_over, model_type='DecisionTreeClassifier', params=dt_c_params, train_metrics = True)
display(dt_c_over_sampling01)

maxDepth,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
5,gini,28,0.0,0.9595371190249404,0.9410179640718564,0.9805022617376384,0.9595371190249404,0.8534372625578786,0.9293625399950776,0.9257170875214512,0.8534372625578786
5,gini,28,0.05,0.953431727734438,0.994893182398502,0.9116362501949774,0.953431727734438,0.9007911579105212,0.9323481874855692,0.9899485167933316,0.9007911579105212
5,gini,32,0.0,0.9595371190249404,0.9410179640718564,0.9805022617376384,0.9595371190249404,0.8566471134866684,0.9296913277804998,0.9303750919342976,0.8566471134866684
5,gini,32,0.05,0.953431727734438,0.994893182398502,0.9116362501949774,0.953431727734438,0.9007911579105212,0.9323481874855692,0.9899485167933316,0.9007911579105212
5,gini,40,0.0,0.9595178547398546,0.941114813012391,0.9803462798315395,0.9595178547398546,0.8593841480677687,0.9310850439882696,0.9340524638391764,0.8593841480677687
5,gini,40,0.05,0.953431727734438,0.994893182398502,0.9116362501949774,0.953431727734438,0.9007911579105212,0.9323481874855692,0.9899485167933316,0.9007911579105212
5,entropy,28,0.0,0.9591099224518684,0.94143232095988,0.9790984245827484,0.9591099224518684,0.7642362934089315,0.9274521183981428,0.7835253738661436,0.7642362934089315
5,entropy,28,0.05,0.9510189678102668,1.0,0.9021993448759944,0.9510189678102668,0.0156934202997719,0.0,0.0,0.0156934202997719
5,entropy,32,0.0,0.9591099224518684,0.94143232095988,0.9790984245827484,0.9591099224518684,0.7642362934089315,0.9274521183981428,0.7835253738661436,0.7642362934089315
5,entropy,32,0.05,0.9510189678102668,1.0,0.9021993448759944,0.9510189678102668,0.0156934202997719,0.0,0.0,0.0156934202997719


In [0]:
# Decision Tree Classifier Under sampling 
dt_c_params = { 'maxDepth': [5, 10], 'impurity': ['gini','entropy'], 'maxBins': [28, 32, 40], 'minInfoGain': [0.05] }
dt_c_under_sampling01 = train_model_no_CV(train_001_under, val_001_under, model_type='DecisionTreeClassifier', params=dt_c_params, train_metrics = True)
display(dt_c_under_sampling01)

maxDepth,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
5,gini,28,0.05,0.990456131474535,0.9817209863769616,0.9998243765367052,0.990456131474535,0.9020677278786652,0.9331951163326424,0.9907067742724384,0.9020677278786652
5,gini,32,0.05,0.990456131474535,0.9817209863769616,0.9998243765367052,0.990456131474535,0.9020677278786652,0.9331951163326424,0.9907067742724384,0.9020677278786652
5,gini,40,0.05,0.990456131474535,0.9817209863769616,0.9998243765367052,0.990456131474535,0.9020677278786652,0.9331951163326424,0.9907067742724384,0.9020677278786652
5,entropy,28,0.05,0.9903668474404056,0.9813857290589452,1.0,0.9903668474404056,0.9024680434361813,0.9330264672036824,0.991440449987772,0.9024680434361813
5,entropy,32,0.05,0.9903668474404056,0.9813857290589452,1.0,0.9903668474404056,0.9024680434361813,0.9330264672036824,0.991440449987772,0.9024680434361813
5,entropy,40,0.05,0.9903668474404056,0.9813857290589452,1.0,0.9903668474404056,0.9024680434361813,0.9330264672036824,0.991440449987772,0.9024680434361813
10,gini,28,0.05,0.990456131474535,0.9817209863769616,0.9998243765367052,0.990456131474535,0.9020677278786652,0.9331951163326424,0.9907067742724384,0.9020677278786652
10,gini,32,0.05,0.990456131474535,0.9817209863769616,0.9998243765367052,0.990456131474535,0.9020677278786652,0.9331951163326424,0.9907067742724384,0.9020677278786652
10,gini,40,0.05,0.990456131474535,0.9817209863769616,0.9998243765367052,0.990456131474535,0.9020677278786652,0.9331951163326424,0.9907067742724384,0.9020677278786652
10,entropy,28,0.05,0.9903668474404056,0.9813857290589452,1.0,0.9903668474404056,0.9024680434361813,0.9330264672036824,0.991440449987772,0.9024680434361813


##### Random Forest Classification

In [0]:
# Random Forest Classifier - No Sampling
rf_c_params = { 'maxDepth': [3, 5, 7], 'numTrees': [10, 20, 50] , 'impurity': ['gini','entropy'], 'maxBins': [32, 40], 'minInfoGain': [0.0, 0.05] }
rf_c_no_sampling01 = train_model_no_CV(train_001_none, val_001_none, model_type='RandomForestClassifier', params=rf_c_params, train_metrics = True)
display(rf_c_no_sampling01)

maxDepth,numTrees,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
3,10,gini,32,0.0,0.7143745115822451,0.8023587597489062,1.0,0.7143745115822451,0.7895911024518429,0.8560034490191851,1.0,0.7895911024518429
3,10,gini,32,0.05,0.7143745115822451,0.8023587597489062,1.0,0.7143745115822451,0.7895911024518429,0.8560034490191851,1.0,0.7895911024518429
3,10,gini,40,0.0,0.7143745115822451,0.8023587597489062,1.0,0.7143745115822451,0.7895911024518429,0.8560034490191851,1.0,0.7895911024518429
3,10,gini,40,0.05,0.7143745115822451,0.8023587597489062,1.0,0.7143745115822451,0.7895911024518429,0.8560034490191851,1.0,0.7895911024518429
3,10,entropy,32,0.0,0.7143745115822451,0.8023587597489062,1.0,0.7143745115822451,0.7895911024518429,0.8560034490191851,1.0,0.7895911024518429
3,10,entropy,32,0.05,0.7143745115822451,0.8023587597489062,1.0,0.7143745115822451,0.7895911024518429,0.8560034490191851,1.0,0.7895911024518429
3,10,entropy,40,0.0,0.7143745115822451,0.8023587597489062,1.0,0.7143745115822451,0.7895911024518429,0.8560034490191851,1.0,0.7895911024518429
3,10,entropy,40,0.05,0.7143745115822451,0.8023587597489062,1.0,0.7143745115822451,0.7895911024518429,0.8560034490191851,1.0,0.7895911024518429
3,20,gini,32,0.0,0.7143745115822451,0.8023587597489062,1.0,0.7143745115822451,0.7895911024518429,0.8560034490191851,1.0,0.7895911024518429
3,20,gini,32,0.05,0.7143745115822451,0.8023587597489062,1.0,0.7143745115822451,0.7895911024518429,0.8560034490191851,1.0,0.7895911024518429


In [0]:
# Random Forest Classifier - Over Sampling
rf_c_params = { 'maxDepth': [3, 5, 7], 'numTrees': [10, 20, 50] , 'impurity': ['gini','entropy'], 'maxBins': [32, 40], 'minInfoGain': [0.0, 0.05] }
rf_c_over_sampling01 = train_model_no_CV(train_001_over, val_001_over, model_type='RandomForestClassifier', params=rf_c_params, train_metrics = True)
display(rf_c_over_sampling01)

maxDepth,numTrees,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
3,10,gini,32,0.0,0.5903904696378562,0.6029793030715823,0.5350959288722508,0.5903904696378562,0.4675063777092572,0.8644628099173554,0.3846531012503064,0.4675063777092572
3,10,gini,32,0.05,0.3337663686962409,0.0,0.0,0.3337663686962409,0.0152481956949231,0.0,0.0,0.0152481956949231
3,10,gini,40,0.0,0.5915740128389142,0.604242610566999,0.5364997660271409,0.5915740128389142,0.4709837562178892,0.8639085465432771,0.3890659475361608,0.4709837562178892
3,10,gini,40,0.05,0.3337663686962409,0.0,0.0,0.3337663686962409,0.0152481956949231,0.0,0.0,0.0152481956949231
3,10,entropy,32,0.0,0.5903904696378562,0.6029793030715823,0.5350959288722508,0.5903904696378562,0.4675063777092572,0.8644628099173554,0.3846531012503064,0.4675063777092572
3,10,entropy,32,0.05,0.3337663686962409,0.0,0.0,0.3337663686962409,0.0152481956949231,0.0,0.0,0.0152481956949231
3,10,entropy,40,0.0,0.5915740128389142,0.604242610566999,0.5364997660271409,0.5915740128389142,0.4709837562178892,0.8639085465432771,0.3890659475361608,0.4709837562178892
3,10,entropy,40,0.05,0.3337663686962409,0.0,0.0,0.3337663686962409,0.0152481956949231,0.0,0.0,0.0152481956949231
3,20,gini,32,0.0,0.6224072936730394,0.6450613063494234,0.5518639837778818,0.6224072936730394,0.4652553267930893,0.8818129661503156,0.3768080411865653,0.4652553267930893
3,20,gini,32,0.05,0.3337663686962409,0.0,0.0,0.3337663686962409,0.0152481956949231,0.0,0.0,0.0152481956949231


In [0]:
# Random Forest Classifier - Under Sampling
rf_c_params = { 'maxDepth': [3, 5, 7], 'numTrees': [10, 20, 50] , 'impurity': ['gini','entropy'], 'maxBins': [32, 40], 'minInfoGain': [0.0, 0.05] }
rf_c_under_sampling01 = train_model_no_CV(train_001_under, val_001_under, model_type='RandomForestClassifier', params=rf_c_params, train_metrics = True)
display(rf_c_under_sampling01)

maxDepth,numTrees,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
3,10,gini,32,0.0,0.5926378342825189,0.5856567979304398,0.7156656129258869,0.5926378342825189,0.5326243959632854,0.8725850965961361,0.4639276106627537,0.5326243959632854
3,10,gini,32,0.05,0.3419889945435994,0.5077581594435527,1.0,0.3419889945435994,0.787269018204505,0.8543669034684497,1.0,0.787269018204505
3,10,gini,40,0.0,0.5854955791712522,0.5796730632551528,0.716192483315771,0.5854955791712522,0.4349485621231983,0.8643339472068754,0.3443384690633406,0.4349485621231983
3,10,gini,40,0.05,0.3419889945435994,0.5077581594435527,1.0,0.3419889945435994,0.787269018204505,0.8543669034684497,1.0,0.787269018204505
3,10,entropy,32,0.0,0.5848888883189227,0.5787346711259754,0.7293642430628732,0.5848888883189227,0.5708247949345306,0.8541996830427893,0.5272682807532404,0.5708247949345306
3,10,entropy,32,0.05,0.3419889945435994,0.5077581594435527,1.0,0.3419889945435994,0.787269018204505,0.8543669034684497,1.0,0.787269018204505
3,10,entropy,40,0.0,0.5789029773214802,0.575427224513848,0.6859852476290832,0.5789029773214802,0.4883140776596171,0.8453865336658354,0.4145267791636097,0.4883140776596171
3,10,entropy,40,0.05,0.3419889945435994,0.5077581594435527,1.0,0.3419889945435994,0.787269018204505,0.8543669034684497,1.0,0.787269018204505
3,20,gini,32,0.0,0.5918553154283305,0.5832997176280759,0.761854583772392,0.5918553154283305,0.4959923730422953,0.8519429414658141,0.4235754463193935,0.4959923730422953
3,20,gini,32,0.05,0.3419889945435994,0.5077581594435527,1.0,0.3419889945435994,0.787269018204505,0.8543669034684497,1.0,0.787269018204505


#### Regression

##### Linear Regression

In [0]:
# Linear Regression, no sampling
lin_reg_r_params = { 'maxIter': [10, 20], 'regParam': [0.2, 0.4], 'elasticNetParam': [0.0, 0.3, 0.8] }
lin_reg_r_no_sampling01 = train_model_no_CV(train_001_none, val_001_none, model_type='LinearRegression', params=lin_reg_r_params,  train_metrics = True)
display(lin_reg_r_no_sampling01)

maxIter,regParam,elasticNetParam,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
10,0.2,0.0,0.3606671764074005,32.714221860865415,1070.2203119619246,11.846338649702195,0.0522139639436763,25.162843386983088,633.1686873178385,13.588996107340751
10,0.2,0.3,0.3590981153427452,32.75434114957566,1072.8468641427846,11.394495535849217,0.071591845252104,24.904282545186916,620.2232890905018,13.531036761838894
10,0.2,0.8,0.3551370735674787,32.85540300468136,1079.4775066000243,11.00519842404081,0.0874775275561365,24.69029880824676,609.6108552405112,13.481837498292789
10,0.4,0.0,0.3606314262907072,32.71513650254879,1070.2801563804005,11.821811365620997,0.0567047150241442,25.10315996021297,630.1686399880399,13.56567201481066
10,0.4,0.3,0.3566955244829829,32.81567789075651,1076.868715429886,11.113384585957071,0.0860686059652945,24.70935218991238,610.5520856451277,13.491104155694671
10,0.4,0.8,0.3502311189441044,32.98014401751477,1087.6898994160151,10.80226618739102,0.099077460310104,24.532866065730392,601.8615173990657,13.581270196443969
20,0.2,0.0,0.3606671764074005,32.714221860865415,1070.2203119619246,11.84633864970219,0.0522139639436732,25.162843386983138,633.168687317841,13.588996107340918
20,0.2,0.3,0.3591413183498975,32.75323715072866,1072.774543851872,11.4035013388712,0.0714495239718105,24.906191335804987,620.3183668557276,13.50388164907442
20,0.2,0.8,0.3551384653117382,32.85536755036903,1079.4751768698425,11.00705044319584,0.0875758154048933,24.688969075676688,609.5451940197198,13.480238690605884
20,0.4,0.0,0.3606314262907072,32.71513650254879,1070.2801563804005,11.821811365621,0.0567047150241412,25.103159960213024,630.1686399880423,13.565672014810772


##### Decision Tree Regression

In [0]:
# Decision Tree Regressor No sampling 
dt_r_params = { 'maxDepth': [5, 10, 15], 'minInfoGain': [0.0, 0.05] }
dt_r_no_sampling01 = train_model_no_CV(train_001_none, val_001_none, model_type='DecisionTreeRegressor', params=dt_r_params,  train_metrics = True)
display(dt_r_no_sampling01)

maxDepth,minInfoGain,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
5,0.0,0.3839495879010074,32.113025187420824,1031.246386687924,9.917281848564125,-1.7806694956502698,43.10019628049126,1857.6269194168729,9.39405007931845
5,0.05,0.3839400931291269,32.11327265472848,1031.2622805969322,9.920350641877,-1.7806600868098816,43.10012336222263,1857.6206338388083,9.392856357369103
10,0.0,0.5068249800631615,28.73251414143341,825.5573688876708,9.399974639197984,-1.890651446998307,43.94428732192728,1931.1003882320983,10.133408218263137
10,0.05,0.5067317257387822,28.735230524496647,825.7134732959638,9.428904594224534,-1.8903755042363528,43.94218980210473,1930.9160446041967,10.15145089356706
15,0.0,0.6302762734014316,24.8777875440678,618.9043130877748,8.527545907699615,-1.9448047040387333,44.35400189956761,1967.277484506848,10.189791557476305
15,0.05,0.6299243395108478,24.889625104664887,619.4934378507646,8.63108527302733,-1.9439391820957712,44.3474832698151,1966.6992723665303,10.208349597641355


##### Random Forest Regression

In [0]:
# Random Forest Regression - No Sampling
rf_r_params = { 'maxDepth': [3, 5, 7], 'numTrees': [10, 20, 50], 'minInfoGain': [0.0, 0.05]}
rf_r_no_sampling01 = train_model_no_CV(train_001_none, val_001_none, model_type='RandomForestRegressor', params=rf_r_params,train_metrics = True)
display(rf_r_no_sampling01)

maxDepth,numTrees,minInfoGain,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
3,10,0.0,0.3533448808048027,32.90102693342394,1082.477573273888,10.684498177175032,0.320053971586742,21.3128624629585,454.2381063649855,8.054509184046118
3,10,0.05,0.3618189365900754,32.68474118840428,1068.2923065529717,10.372222632291589,0.3336036646814782,21.099436738053257,445.1862306631116,7.226692863708997
3,20,0.0,0.3509486447103181,32.961929356621155,1086.4887869108832,10.851019747602626,0.3680040295116107,20.547629029600586,422.2050587380847,8.046051034938204
3,20,0.05,0.3509474772789571,32.9619590004794,1086.490741149285,10.851220239279115,0.3680024119716467,20.547655324538702,422.2061393360435,8.0461343961404
3,50,0.0,0.3334792706764242,33.40257318079985,1115.7318950986892,11.487632386226249,0.3563119020636283,20.7368268369788,430.01598726684415,7.870901747818076
3,50,0.05,0.3334788855462166,33.40258283116593,1115.732539792901,11.487676836155556,0.3563106001987651,20.736847807175916,430.0168569779765,7.870935446206237
5,10,0.0,0.3943872705828619,31.83981907417211,1013.774078676014,10.102623272098088,0.2992605624572375,21.636292387805764,468.1291482906217,7.309609066363379
5,10,0.05,0.3929968829358359,31.8763476254315,1016.1015379373524,10.134246843042632,0.3072313472885142,21.51288598919344,462.8042635840353,8.233064087294709
5,20,0.0,0.391131344644666,31.92529377165179,1019.2243824062688,10.439143455049564,0.254499926944669,22.316619401573803,498.0315015147002,8.611086359747661
5,20,0.05,0.3974610318082157,31.758915475323946,1008.6287121687708,10.290321985559851,0.2807283684373196,21.920528846797403,480.509584923277,7.705307975169246


##### Gradient Boosted Trees Regression

In [0]:
# Gradient Boosted Trees Regression - No Sampling
gbt_r_params = { 'maxDepth': [3, 5, 7], 'maxIter': [15, 50] , 'stepSize': [0.1, 0.3, 0.7], 'minInfoGain': [0.0, 0.05]}
gbt_r_no_sampling01 = train_model_no_CV(train_001_none, val_001_none, model_type='GBTRegressor', params=gbt_r_params, train_metrics = True)
display(gbt_r_no_sampling01)

maxDepth,maxIter,stepSize,minInfoGain,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
3,15,0.1,0.0,0.4036315526800547,31.59587758899464,998.2994806187336,9.932228758066666,-1.7107147904529985,42.55459624922131,1810.8936619342408,9.244757737551485
3,15,0.1,0.05,0.4036315526800548,31.59587758899464,998.2994806187332,9.93222875806666,-1.710714790452998,42.55459624922131,1810.8936619342408,9.244757737551485
3,15,0.3,0.0,0.4484437328101744,30.385613234148085,923.2854916152352,9.902337140909298,-2.2056537191384296,46.27675151979243,2141.537731224611,20.105550044540102
3,15,0.3,0.05,0.4484437328101744,30.385613234148085,923.2854916152352,9.9023371409093,-2.206023293478537,46.27941902671875,2141.784625450617,20.105550044540102
3,15,0.7,0.0,0.4722443355561176,29.72278810429471,883.4441326928029,9.958823375251184,-1.986917290329369,44.67002135456149,1995.410807816979,10.601579305203607
3,15,0.7,0.05,0.4722443355561175,29.72278810429471,883.4441326928029,9.958823375251187,-1.9848582899717775,44.65462229307592,1994.0352921372728,10.585147368289311
3,50,0.1,0.0,0.4584431557002806,30.10891586958264,906.5468148416056,9.76717638814133,-1.9655518438820103,44.50997209867459,1981.1376162247905,16.226468113882973
3,50,0.1,0.05,0.4584431557002807,30.10891586958265,906.5468148416054,9.76717638814133,-1.9655464274791563,44.50993145125672,1981.133997795572,16.226468113882973
3,50,0.3,0.0,0.5661997964930028,26.94746828265816,726.1660468448674,9.629558353385974,-3.6313027583329287,55.62322277341063,3093.942911700466,32.79830659914202
3,50,0.3,0.05,0.5661997964930028,26.94746828265816,726.1660468448674,9.629558353385974,-3.6313027583329314,55.62322277341063,3093.942911700467,32.79830659914203


### 10% Sample Size

#### Classification

##### Logistic Regression

**No Sampling:**
- No need to run because of imbalanced data set 

**Over Sampling:**
- maxIter: 20
- regParam: 0.4
- elasticNetParam: 0.8 ,0.3

**Under Sampling:**
- maxIter: 20
- regParam: 0.4
- elasticNetParam: 0.8, 0.3

In [0]:
# Logistic Regression Over sampling 
log_reg_c_params = { 'maxIter': [20], 'regParam': [0.4], 'elasticNetParam': [0.3, 0.8]}
log_reg_c_over_sampling10 = train_model_no_CV(train_10_over, val_10_over,  model_type='LogisticRegression', params=log_reg_c_params, train_metrics = True)
display(log_reg_c_over_sampling10)

maxIter,regParam,elasticNetParam,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
20,0.4,0.3,0.6376633011409668,0.6368668951042761,0.6413560623506244,0.6376633011409668,0.5637030688448127,0.8950745061482994,0.4890495094000449,0.5637030688448127
20,0.4,0.8,0.3335838068614388,0.5002253990835009,1.0,0.3335838068614388,0.7867323417435987,0.8539884992036434,1.0,0.7867323417435987


In [0]:
# Logistic Regression Under sampling 
log_reg_c_params = { 'maxIter': [10, 20], 'regParam': [0.2, 0.4], 'elasticNetParam': [0.0, 0.8]}
log_reg_c_under_sampling10 = train_model_no_CV(train_10_under, val_10_under,  model_type='LogisticRegression', params=log_reg_c_params, train_metrics = True)
display(log_reg_c_under_sampling10)

maxIter,regParam,elasticNetParam,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
10,0.2,0.0,0.7387516006813835,0.7445967740478998,0.7266259070024023,0.7387516006813835,0.5819774699234916,0.8842149104539775,0.5207286306389284,0.5819774699234916
10,0.2,0.8,0.7054895725695075,0.7501279875262613,0.622360946860278,0.7054895725695075,0.5630327649547799,0.8966770795897493,0.487087780069279,0.5630327649547799
10,0.4,0.0,0.7364244961883442,0.7428872573699061,0.7229688169181393,0.7364244961883442,0.583177409730368,0.8825388520142285,0.5234767635104018,0.583177409730368
10,0.4,0.8,0.3335403649689837,0.0,0.0,0.3335403649689837,0.013497395321228,0.0,0.0,0.013497395321228
20,0.2,0.0,0.7387714516944599,0.7446497984191437,0.7265818456760859,0.7387714516944599,0.581900885983966,0.8841881994890781,0.5206310193620728,0.581900885983966
20,0.2,0.8,0.7054895725695075,0.7501279875262613,0.622360946860278,0.7054895725695075,0.5630327649547799,0.8966770795897493,0.487087780069279,0.5630327649547799
20,0.4,0.0,0.7364228980089874,0.7428740103295159,0.7229899663547712,0.7364228980089874,0.5832308190054765,0.8825671274028319,0.523544340548225,0.5832308190054765
20,0.4,0.8,0.3335403649689837,0.0,0.0,0.3335403649689837,0.013497395321228,0.0,0.0,0.013497395321228


##### Decision Tree Classification

**No Sampling:**
- entropy bad --> precision and recall going to 0

**Over Sampling:**
- maxDepth: 5
- impurity: gini
- maxBins: 32
- minInfoGain: 0.0, 0.05

**Under Sampling:**
- maxDepth: 5
- impurity: gini and entropy
- maxBins: 28 and 40
- minInfoGain: 0.05

In [0]:
# Decision Tree Classifier Over sampling 
dt_c_params = { 'maxDepth': [5, 10], 'impurity': ['gini', 'entropy'], 'maxBins': [32], 'minInfoGain': [0.0, 0.05] }
dt_c_over_sampling10 = train_model_no_CV(train_10_over, val_10_over, model_type='DecisionTreeClassifier', params=dt_c_params, train_metrics = True)
display(dt_c_over_sampling10)

maxDepth,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
5,gini,32,0.0,0.656904015800025,0.6761952024881092,0.6062467915873808,0.656904015800025,0.2138243714468591,0.7372490053359236,0.1383292137917259,0.2138243714468591
5,gini,32,0.05,0.3335838068614388,0.5002253990835009,1.0,0.3335838068614388,0.7867323417435987,0.8539884992036434,1.0,0.7867323417435987
5,entropy,32,0.0,0.6567102333447659,0.6756876321783426,0.6067073564087516,0.6567102333447659,0.2145045283420853,0.7371293818393789,0.1389159363842908,0.2145045283420853
5,entropy,32,0.05,0.6363585722486187,0.6556346342135866,0.5801591959116857,0.6363585722486187,0.5440133980625272,0.894147651394989,0.4630489601278306,0.5440133980625272
10,gini,32,0.0,0.6741808451330715,0.6859000746283256,0.644292260829822,0.6741808451330715,0.3488470540194655,0.8116845702739607,0.2567547998901455,0.3488470540194655
10,gini,32,0.05,0.3335838068614388,0.5002253990835009,1.0,0.3335838068614388,0.7867323417435987,0.8539884992036434,1.0,0.7867323417435987
10,entropy,32,0.0,0.6742111634954244,0.6878092911405277,0.6399716310836684,0.6742111634954244,0.3443486012614281,0.8113638006796109,0.2521683769006066,0.3443486012614281
10,entropy,32,0.05,0.6363585722486187,0.6556346342135866,0.5801591959116857,0.6363585722486187,0.5440133980625272,0.894147651394989,0.4630489601278306,0.5440133980625272


In [0]:
# Decision Tree Classifier Under sampling 
dt_c_params = { 'maxDepth': [5, 10], 'impurity': ['gini', 'entropy'], 'maxBins': [32], 'minInfoGain': [0.0, 0.05]  }
dt_c_under_sampling10 = train_model_no_CV(train_10_under, val_10_under, model_type='DecisionTreeClassifier', params=dt_c_params, train_metrics = True)
display(dt_c_under_sampling10)

maxDepth,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
5,gini,32,0.0,0.720670742183152,0.7428895548276689,0.6763413589570508,0.720670742183152,0.5974215022169472,0.8971569486791282,0.5354078649660613,0.5974215022169472
5,gini,32,0.05,0.7054895725695075,0.7501279875262613,0.622360946860278,0.7054895725695075,0.5460927369663716,0.8958812773491964,0.4647598262018701,0.5460927369663716
5,entropy,32,0.0,0.720085085265596,0.7455493090864882,0.6701075625098036,0.720085085265596,0.590993002774288,0.8977748148528011,0.5258044170354204,0.590993002774288
5,entropy,32,0.05,0.7052978282963087,0.7509173519710541,0.6207183406152018,0.7052978282963087,0.5439755339529699,0.895710819990296,0.4620417275694291,0.5439755339529699
10,gini,32,0.0,0.7358825808812797,0.7536260694561985,0.7015567747814118,0.7358825808812797,0.5700257673134681,0.8835987198229879,0.5037417656127986,0.5700257673134681
10,gini,32,0.05,0.7054895725695075,0.7501279875262613,0.622360946860278,0.7054895725695075,0.5460927369663716,0.8958812773491964,0.4647598262018701,0.5460927369663716
10,entropy,32,0.0,0.7327866489803101,0.7609775513139565,0.6806699436543759,0.7327866489803101,0.5466288426059606,0.8834093340088949,0.4712972788979436,0.5466288426059606
10,entropy,32,0.05,0.7052978282963087,0.7509173519710541,0.6207183406152018,0.7052978282963087,0.5439755339529699,0.895710819990296,0.4620417275694291,0.5439755339529699


In [0]:
# Decision Tree Classifier Under sampling 
dt_c_params = { 'maxDepth': [5], 'impurity': ['gini'], 'maxBins': [32, 40], 'minInfoGain': [0.07]  }
dt_c_under_sampling10 = train_model_no_CV(train_10_under, val_10_under, model_type='DecisionTreeClassifier', params=dt_c_params, train_metrics = True)
display(dt_c_under_sampling10)

maxDepth,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
5,gini,32,0.07,0.7054895725695075,0.7501279875262613,0.622360946860278,0.7054895725695075,0.5460927369663716,0.8958812773491964,0.4647598262018701,0.5460927369663716
5,gini,40,0.07,0.7054895725695075,0.7501279875262613,0.622360946860278,0.7054895725695075,0.5460927369663716,0.8958812773491964,0.4647598262018701,0.5460927369663716


##### Random Forest Classification

**No Sampling:**
-  not great in comparison 

**Over Sampling:**
- undersampling not worth running. recall much worse 

**Under Sampling:**
- maxDepth: 5 and 7
- numTrees: 20
- impurity: entropy 
- maxBins: 32
- minInfoGain: 0.05

In [0]:
# Random Forest Classifier - Under Sampling
rf_c_params = { 'maxDepth': [3, 5], 'numTrees': [15, 50] , 'impurity': ['entropy', 'gini'], 'maxBins': [32], 'minInfoGain': [0.0, 0.05]  }
rf_c_under_sampling10 = train_model_no_CV(train_10_under, val_10_under, model_type='RandomForestClassifier', params=rf_c_params, train_metrics = True)
display(rf_c_under_sampling10)

maxDepth,numTrees,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
3,15,entropy,32,0.0,0.5939515371498537,0.6034821283916709,0.5514556980988419,0.5939515371498537,0.6099475522152281,0.8678340793257275,0.5808721943015037,0.6099475522152281
3,15,entropy,32,0.05,0.3335403649689837,0.0,0.0,0.3335403649689837,0.013497395321228,0.0,0.0,0.013497395321228
3,15,gini,32,0.0,0.5939515371498537,0.6034821283916709,0.5514556980988419,0.5939515371498537,0.6099475522152281,0.8678340793257275,0.5808721943015037,0.6099475522152281
3,15,gini,32,0.05,0.3335403649689837,0.0,0.0,0.3335403649689837,0.013497395321228,0.0,0.0,0.013497395321228
3,50,entropy,32,0.0,0.6840181490997255,0.6871752469333984,0.6752433507052455,0.6840181490997255,0.6276464932011007,0.8909589437016079,0.5863759686042088,0.6276464932011007
3,50,entropy,32,0.05,0.3335403649689837,0.0,0.0,0.3335403649689837,0.013497395321228,0.0,0.0,0.013497395321228
3,50,gini,32,0.0,0.6862899642703905,0.6867943279216703,0.6845138537622204,0.6862899642703905,0.6095505239697374,0.8902022516688254,0.5590723424704163,0.6095505239697374
3,50,gini,32,0.05,0.3335403649689837,0.0,0.0,0.3335403649689837,0.013497395321228,0.0,0.0,0.013497395321228
5,15,entropy,32,0.0,0.7094372057646333,0.746961866470729,0.6378476218339734,0.7094372057646333,0.5928727128100595,0.8965423774754782,0.5298239993592696,0.5928727128100595
5,15,entropy,32,0.05,0.3335403649689837,0.0,0.0,0.3335403649689837,0.013497395321228,0.0,0.0,0.013497395321228


In [0]:
# Random Forest Classifier - Under Sampling
rf_c_params = { 'maxDepth': [3], 'numTrees': [50, 75] , 'impurity': ['gini'], 'maxBins': [32], 'minInfoGain': [0.05, 0.07]  }
rf_c_under_sampling10 = train_model_no_CV(train_10_under, val_10_under, model_type='RandomForestClassifier', params=rf_c_params, train_metrics = True)
display(rf_c_under_sampling10)

maxDepth,numTrees,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
3,50,gini,32,0.05,0.3335403649689837,0.0,0.0,0.3335403649689837,0.013497395321228,0.0,0.0,0.013497395321228
3,50,gini,32,0.07,0.3335403649689837,0.0,0.0,0.3335403649689837,0.013497395321228,0.0,0.0,0.013497395321228
3,75,gini,32,0.05,0.3335403649689837,0.0,0.0,0.3335403649689837,0.013497395321228,0.0,0.0,0.013497395321228
3,75,gini,32,0.07,0.3335403649689837,0.0,0.0,0.3335403649689837,0.013497395321228,0.0,0.0,0.013497395321228


#### Regression

##### Linear Regression

**No Sampling:**
- looks like our model is slightly overfitting --> RMSE very similar; MAE worse in VAL
- maxIter: 10
- regParam: 0.4
- elasticNetParam: 0.3 and 0.8

In [0]:
# Linear Regression, no sample
lin_reg_r_params = { 'maxIter': [10, 20], 'regParam': [0.2, 0.4], 'elasticNetParam': [0.0, 0.8] }
lin_reg_r_no_sampling10 = train_model_no_CV(train_10_none, val_10_none, model_type='LinearRegression', params=lin_reg_r_params, train_metrics = True)
display(lin_reg_r_no_sampling10)

maxIter,regParam,elasticNetParam,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
10,0.2,0.0,0.0637399379994394,41.20066143250767,1697.4945024761244,17.59547355962504,-0.0216751730793314,34.317763329751834,1177.7088799568598,15.847848834267554
10,0.2,0.8,0.0601839774008327,41.27882832605933,1703.941667972278,17.48562527531469,-0.0174655215600412,34.2469898843919,1172.8563161416416,15.918757681507358
10,0.4,0.0,0.0637114423345223,41.201288411782535,1697.5461667908858,17.58589536289111,-0.0213267524735922,34.31191115882832,1177.307247371327,15.840543795643509
10,0.4,0.8,0.0573916084577227,41.34010639720342,1709.004396932099,17.46127933996274,-0.0171412487734783,34.241532081849805,1172.4825193123495,16.008469867498633
20,0.2,0.0,0.0637399424740858,41.20066133405298,1697.4944943633282,17.595288325953575,-0.0216576312241756,34.31746871565456,1177.6886590499296,15.847894960848103
20,0.2,0.8,0.0603265857281817,41.27569636839367,1703.683110695826,17.486219268759303,-0.0176050878903732,34.24933864349164,1173.0171975165697,15.921448019951304
20,0.4,0.0,0.0637116597068478,41.20128362905806,1697.5457726820878,17.58571026382843,-0.0213096301896524,34.31162354237927,1177.287510113955,15.840596278796554
20,0.4,0.8,0.057400960934683,41.33990131025793,1708.987440341865,17.461655425025217,-0.0170837546234097,34.240564312817035,1172.4162444601595,16.0065790512908


##### Decision Tree Regression

**No Sampling:**
- Not worth pursuing

In [0]:
# Decision Tree Regressor No sampling 
dt_r_params = { 'maxDepth': [5, 10, 15], 'minInfoGain': [0.0, 0.05] }
dt_r_no_sampling10 = train_model_no_CV(train_10_none, val_10_none, model_type='DecisionTreeRegressor', params=dt_r_params,  train_metrics = True)
display(dt_r_no_sampling10)

maxDepth,minInfoGain,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
5,0.0,0.0508803041211178,41.48264425355271,1720.80977426681,17.321574014926412,-0.2390121672883056,37.79204041893739,1428.2383190265973,23.43330650330669
5,0.05,0.050877768070477,41.4826996743899,1720.8143722756276,17.3227702185367,-0.2390121672883054,37.79204041893738,1428.238319026597,23.43330650330669
10,0.0,0.0792962643526427,40.856945171953306,1669.2899687839986,16.943598496977604,-2.633693149159801,64.71972747254912,4188.643124121028,24.323446099310534
10,0.05,0.0792854034390766,40.8571861519908,1669.309660258429,16.94843225635678,-2.6336931491597992,64.71972747254912,4188.64312412103,24.32344609931053
15,0.0,0.1538140011578842,39.16867286056272,1534.1849336577814,16.491393576312127,-4.316478258664653,78.28428257481151,6128.428898252935,27.816088333477666
15,0.05,0.1537914895781124,39.16919387062463,1534.225748474578,16.500352120868538,-4.3164782829836845,78.28428275385842,6128.428926286053,27.816087826593307


In [0]:
# Decision Tree Regressor No sampling 
dt_r_params = { 'maxDepth': [5], 'minInfoGain': [0.07] }
dt_r_no_sampling10 = train_model_no_CV(train_10_none, val_10_none, model_type='DecisionTreeRegressor', params=dt_r_params,  train_metrics = True)
display(dt_r_no_sampling10)

maxDepth,minInfoGain,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
5,0.07,0.050872666336892,41.4828111634008,1720.82362201837,17.324230549539525,-0.2391252935108005,37.793765653484485,1428.368722270504,23.498828059698287


##### Random Forest Regression

**No Sampling:**
- maxDepth: 5, 7
- numTrees: 10
- minInfoGain: 0, 0.05

In [0]:
# Random Forest Regression - No Sampling
rf_r_params = { 'maxDepth': [3, 5], 'numTrees': [15, 50], 'minInfoGain': [0.0, 0.05]}
rf_r_no_sampling10 = train_model_no_CV(train_10_none, val_10_none, model_type='RandomForestRegressor', params=rf_r_params, train_metrics = True)
display(rf_r_no_sampling10)

maxDepth,numTrees,minInfoGain,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
3,15,0.0,0.0421737030909322,41.67247717009267,1736.595353491895,17.76755239729161,-0.036480316698942,34.56551916951927,1194.775115458404,17.2712036936851
3,15,0.05,0.0421737030909326,41.67247717009267,1736.595353491895,17.767552397291613,-0.0364803166989424,34.56551916951927,1194.775115458404,17.27120369368509
3,50,0.0,0.0416021446323606,41.684908810734726,1737.631622559269,17.769770134737133,-0.0311187459882398,34.47600190618817,1188.5947074354904,17.012712316142
3,50,0.05,0.0416021446323605,41.684908810734726,1737.631622559269,17.76977013473713,-0.0311187459882398,34.47600190618817,1188.5947074354906,17.012712316141997
5,15,0.0,0.0517432453393457,41.463781920321146,1719.245211135951,17.49573928549152,-0.0540740145000406,34.857650273923525,1215.0557826191607,18.201940419316408
5,15,0.05,0.0514237533350752,41.47076643747356,1719.824468911483,17.501967131672476,-0.0658802380813758,35.05231936428253,1228.6650928156564,18.68343270151692
5,50,0.0,0.0523768235299555,41.44992758317456,1718.0964966504148,17.451319407916188,-0.048456884829654,34.76464850682591,1208.5807858031528,17.958647893373865
5,50,0.05,0.0522160811869617,41.453442944617976,1718.3879319626976,17.471396292647558,-0.049341562727994,34.77931245321967,1209.600574718681,17.965377512763823


##### Gradient Boosted Trees Regression

In [0]:
# Gradient Boosted Trees Regression - No Sampling
gbt_r_params = { 'maxDepth': [3, 5], 'maxIter': [15, 30] , 'stepSize': [0.1, 0.3], 'minInfoGain': [0.0, 0.05]}
gbt_r_no_sampling10 = train_model_no_CV(train_10_none, val_10_none, model_type='GBTRegressor', params=gbt_r_params, train_metrics = True)
display(gbt_r_no_sampling10)

maxDepth,maxIter,stepSize,minInfoGain,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
3,15,0.1,0.0,0.0587888143397581,40.7706589999373,1662.2466352891688,17.090172679425624,-0.0263492617437326,34.39908370589925,1183.2969598054633,16.748423230580183
3,15,0.1,0.05,0.0587888143397581,40.7706589999373,1662.2466352891688,17.090172679425624,-0.0263492617437328,34.39908370589925,1183.2969598054633,16.74842323058019
3,15,0.3,0.0,0.0663091712803508,40.60745183032135,1648.965144151869,17.002424276636834,-0.0366907794223123,34.5719523014396,1195.2198859330144,16.91267967600886
3,15,0.3,0.05,0.0663091712803511,40.60745183032135,1648.9651441518697,17.002424276636834,-0.0366907794223125,34.5719523014396,1195.219885933015,16.912679676008864
3,30,0.1,0.0,0.0639912525532288,40.65782526528976,1653.0587553028347,16.98853169859159,-0.0311939505491827,34.48017532907867,1188.8824907240053,16.93514897450474
3,30,0.1,0.05,0.0639912525532289,40.657825265289766,1653.0587553028347,16.988531698591586,-0.0311939505491831,34.48017532907867,1188.8824907240055,16.93514897450474
3,30,0.3,0.0,0.0732611847137462,40.45599323360147,1636.6873885172076,16.916387960466153,-0.067639983479357,35.08421002517937,1230.9017930908965,17.433122104894988
3,30,0.3,0.05,0.0732611847137462,40.45599323360147,1636.6873885172076,16.91638796046615,-0.0676399834793575,35.08421002517937,1230.9017930908965,17.433122104894984
5,15,0.1,0.0,0.0687758924967004,40.55377586741722,1644.6087371047115,16.86906120560383,-0.0024134241996749,33.9956010687015,1155.7008920222984,14.759029480259317
5,15,0.1,0.05,0.068775872401494,40.5537763049792,1644.6087725942914,16.869064215580003,-0.0028422909162231,34.002872530943606,1156.1953403555992,14.761173322553224


### Full DataSet Experimentation

In [0]:
inputCols_categorical = ['Year', 'QUARTER', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DISTANCE_GROUP', 'holiday_in2DayRange', 'C19', 'OP_UNIQUE_CARRIER', 'type', 'DEP_TIME_BLK', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']
inputCols_continuous = ['DISTANCE']

pipeline = create_pipeline(df_full, inputCols_categorical, inputCols_continuous)
pipeline_df1 = pipeline.fit(df_full).transform(df_full)
pipeline_df = pipeline_df1.filter(col('label') != 2).cache()

train_none, val_none = grid_search_test_train_split(pipeline_df, sampling='none')
train_none = train_none.cache()
val_none  = val_none.cache()

train_under, val_under = grid_search_test_train_split(pipeline_df, sampling='under')
train_under = train_under.cache()
val_under  = val_under.cache()

pipeline_df.unpersist()

Out[19]: DataFrame[local_timestamp: timestamp, timezone: string, scheduled_departure_UTC: timestamp, rounded_depTimestamp: timestamp, label: int, Year: int, QUARTER: int, MONTH: int, DAY_OF_MONTH: int, DAY_OF_WEEK: int, DEP_TIME_BLK: string, OP_UNIQUE_CARRIER: string, TAIL_NUM: string, OP_CARRIER_FL_NUM: string, dep_delay_15: int, DEP_DELAY: int, DEP_DELAY_NEW: int, CANCELLED: int, ORIGIN_AIRPORT_ID: string, ORIGIN: string, ORIGIN_CITY_NAME: string, ORIGIN_STATE_ABR: string, elevation_ft: int, type: string, DEST_AIRPORT_ID: string, DEST: string, DEST_CITY_NAME: string, DEST_STATE_ABR: string, TAXI_OUT: int, TAXI_IN: int, DISTANCE: int, DISTANCE_GROUP: int, holiday: int, holiday_in2DayRange: bigint, C19: int, CARRIER_DELAY: int, WEATHER_DELAY: int, NAS_DELAY: int, SECURITY_DELAY: int, LATE_AIRCRAFT_DELAY: int, scheduled_departure_UTC_minus_1hr: timestamp, scheduled_departure_UTC_add_2hr: timestamp, origin_HourlyAltimeterSetting: float, origin_HourlyDewPointTemperature: int, origin_Hourl

#### Classification

##### Logistic Regression

In [0]:
# Logistic Regression No sampling 
log_reg_c_params = { 'maxIter': [10, 20], 'regParam': [0.2, 0.4], 'elasticNetParam': [0.0, 0.8]}
log_reg_c_no_sampling = train_model_no_CV(train_none, val_none,  model_type='LogisticRegression', params=log_reg_c_params, train_metrics = True)
display(log_reg_c_no_sampling)

maxIter,regParam,elasticNetParam,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
10,0.2,0.0,0.7367054588578306,0.8182757477823044,0.9999680738982836,0.7367054588578306,0.8661037572015079,0.9091744455622518,0.9998784814017906,0.8661037572015079
10,0.2,0.8,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754
10,0.4,0.0,0.7363700617516982,0.8181618522433426,0.9999999217497508,0.7363700617516982,0.8657775769795434,0.9090430686259922,0.9999957580776806,0.8657775769795434
10,0.4,0.8,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754
20,0.2,0.0,0.7367049181972333,0.8182755615034469,0.9999678782726602,0.7367049181972333,0.8661007421748952,0.9091734761828592,0.999876485203052,0.86610118204684
20,0.2,0.8,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754
20,0.4,0.0,0.7363698308592077,0.8181617736735813,0.9999999217497508,0.7363698308592077,0.8657775769795434,0.9090430686259922,0.9999957580776806,0.8657775769795434
20,0.4,0.8,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754


In [0]:
# Logistic Regression Under sampling 
log_reg_c_params = { 'maxIter': [10, 20], 'regParam': [0.2, 0.4], 'elasticNetParam': [0.0, 0.8]}
log_reg_c_under_sampling = train_model_no_CV(train_under, val_under,  model_type='LogisticRegression', params=log_reg_c_params, train_metrics = True)
display(log_reg_c_under_sampling)

maxIter,regParam,elasticNetParam,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
10,0.2,0.0,0.6629520325155231,0.6698746403303867,0.6427596907142964,0.6629520325155231,0.6868617796688612,0.9515430759974174,0.5939010638990702,0.6868617796688612
10,0.2,0.8,0.3334873011022192,0.0,0.0,0.3334873011022192,0.0151788819552767,0.0,0.0,0.0151788819552767
10,0.4,0.0,0.6595737128776282,0.6663003736262405,0.6395291150698852,0.6595737128776282,0.6845312468340595,0.9502491172254098,0.5914509794723398,0.6845312468340595
10,0.4,0.8,0.3334873011022192,0.0,0.0,0.3334873011022192,0.0151788819552767,0.0,0.0,0.0151788819552767
20,0.2,0.0,0.662971820280807,0.6698912696755049,0.6427901578744841,0.662971820280807,0.6868800918093181,0.9515402617193652,0.5939285116317253,0.6868800918093181
20,0.2,0.8,0.3334873011022192,0.0,0.0,0.3334873011022192,0.0151788819552767,0.0,0.0,0.0151788819552767
20,0.4,0.0,0.6595827014739665,0.6662818426861826,0.6396143526741097,0.6595827014739665,0.6845932281231228,0.9502415811359084,0.5915425550894708,0.6845932281231228
20,0.4,0.8,0.3334873011022192,0.0,0.0,0.3334873011022192,0.0151788819552767,0.0,0.0,0.0151788819552767


**Extra Experimentation with feature changes**

For the below, take action for both origin and destination weather. The numbers refer to the correlation coefficients between the two variables (regardless of origin and destination). 
- remove either elevation ft or hourly station pressure: -0.99
  - % nulls before imputation
    - elevation ft: 0.02%
    - hourly station pressure: 1.27% (origin), 1.38% (destination)
  - Keep: elevation_ft
- remove either hourly dry bulb or hourly dew point temperature: 0.74
  - % nulls before imputation
    - hourly dry bulb: 1.21% (origin), 1.35% (destination)
    - hourly dew point: 1.24% (origin), 1.38% (destination)
  - Keep: hourly dry bulb
- remove either hourly wetbulb or hourly dry bulb temperature: 0.93
  - % nulls before imputation
    - hourly dry bulb: 1.21% (origin), 1.35% (destination)
    - hourly wet bulb: 1.35% (origin), 1.41% (destination)
  - Keep: hourly dry bulb
- remove either hourly wetbulb or hourly dew point temperature: 0.93
  - % nulls before imputation
    - hourly wet bulb: 1.35% (origin), 1.41% (destination)
    - hourly dew point: 1.24% (origin), 1.38% (destination)
  - Keep: remove both as they both are correlated with hourly dry bulb and they have both been chosen to be removed because hourly dry bulb has less % of nulls
- remove either hourly pressure tendency or hourly pressure change: 0.71
  - % nulls before imputation
    - hourly pressure tendency: 70.81% (origin), 70.54% (destination)
    - hourly pressure change: 70.81% (origin), 70.54% (destination)
  - Keep hourly pressure tendency, because even though both have same % of nulls, hourly pressure change has a wider distribution which could be easier for the model to learn from 
- remove either hourly wind speed or hourly wind gust speed: 0.85
  - % nulls before imputation
    - hourly wind speed: 1.25% (origin), 1.38% (destination)
    - hourly wind gust speed: 86.34% (origin), 85.98% (destination)
  - Keep hourly wind speed

In [0]:
def preModeling_dataEdit(df):
  '''
  Input: df that has already gone through the final join, cleaning, and feature engineering
  Output: df that includes null imputing and # and % of flights (by tail number) that were delayed and cancelled in the past 90 days --> these depend on window functions, as such they need to be done right after the data is split for modelling and not during feature engineering phase
  '''
  
  ### FINAL CLEANING 
  # Remove rows with null scheduled_departure_UTC because these are rows without a proper timezone (timezonefinder could not find)
  df = df.na.drop(subset=["scheduled_departure_UTC"])
  dropCols = ['TAXI_IN', 'TAXI_OUT']
  df = df.drop(*dropCols) 

  
  ### FINAL FEATURE ADDITIONS
  ## GET NUMBER & PERCENTAGE OF TIMES A PLANE (BY TAIL NUMBER) HAS BEEN DELAYED OR CANCELLED IN THE PAST 3 MONTHS (2 COLUMNS)
  # Make window function
  df = df.withColumn('roundedMonth', f.date_trunc('month', df.scheduled_departure_UTC))
  window_3m = Window().partitionBy('TAIL_NUM').orderBy(f.col('roundedMonth').cast('long')).rangeBetween(-(86400), 0) # changed to 1 day instead of 3 months 

  # Add in Columns
  # Number of flights delayed/cancelled
  df = df.withColumn('no_delays_last1d', when(df.TAIL_NUM.isNotNull(), f.sum('dep_delay_15').over(window_3m)).otherwise(-1)) \
         .withColumn('no_cancellation_last1d', when(df.TAIL_NUM.isNotNull(), f.sum('CANCELLED').over(window_3m)).otherwise(-1)) 
  # Percentage of flights delayed/cancelled
  df = df.withColumn('count_flights_last1d', when(df.TAIL_NUM.isNotNull(), f.count('TAIL_NUM').over(window_3m)).otherwise(-1)) 
  df = df.withColumn('perc_delays_last1d', when(df.count_flights_last1d != -1, (df.no_delays_last1d/ df.count_flights_last1d)).otherwise(-1.0)) \
         .withColumn('perc_cancellation_last1d', when(df.count_flights_last1d != -1, (df.no_cancellation_last1d/ df.count_flights_last1d)).otherwise(-1.0))     
  
  ### HANDLING NULLS
  ## Imputing Hourly Weather Data to the best of our ability (up to 3 hours back)
  window = Window.partitionBy(col("ORIGIN_AIRPORT_ID"))\
                     .orderBy(col("rounded_depTimestamp"))\
                     .rowsBetween(0,3)
  
  cols_to_fill  = ['origin_HourlyAltimeterSetting', 'origin_HourlyDryBulbTemperature', 'origin_HourlyPrecipitation', 'origin_HourlyPressureTendency', 'origin_HourlyRelativeHumidity', 'origin_HourlySeaLevelPressure', 'origin_HourlyVisibility', 'origin_HourlyWindDirection', 'origin_HourlyWindSpeed', 'origin_HourlySkyConditions_SCT_cnt', 'origin_HourlySkyConditions_OVC_cnt', 'origin_HourlySkyConditions_FEW_cnt', 'origin_HourlySkyConditions_BKN_cnt', 'origin_HourlySkyConditions_VV_cnt', 'origin_HourlySkyConditions_SKC_cnt', 'origin_HourlySkyConditions_CLR_cnt', 'dest_HourlyAltimeterSetting', 'dest_HourlyDryBulbTemperature', 'dest_HourlyPrecipitation', 'dest_HourlyPressureTendency', 'dest_HourlyRelativeHumidity', 'dest_HourlySeaLevelPressure', 'dest_HourlyVisibility', 'dest_HourlyWindDirection', 'dest_HourlyWindSpeed', 'dest_HourlySkyConditions_SCT_cnt', 'dest_HourlySkyConditions_OVC_cnt', 'dest_HourlySkyConditions_FEW_cnt', 'dest_HourlySkyConditions_BKN_cnt', 'dest_HourlySkyConditions_VV_cnt', 'dest_HourlySkyConditions_SKC_cnt', 'dest_HourlySkyConditions_CLR_cnt']

  
  for field in cols_to_fill:
      filled_column_start = first(df[field], ignorenulls=True).over(window)
      df = df.withColumn(field, filled_column_start)
  
  ## We are still left with some null values --> will deal with them now in accordance to the table in section VII of this notebook
  impute_minus1int = ['DEP_DELAY_NEW', 'holiday' ,'holiday_in2DayRange']
  df = df.na.fill(value = -1,subset = impute_minus1int)
  
  impute_minus9999int = ['DEP_DELAY']
  df = df.na.fill(value = -9999,subset = impute_minus9999int)
  
  impute_minus1fl = ['perc_delays_last1d', 'perc_cancellation_last1d']
  df = df.na.fill(value = -1.0,subset = impute_minus1fl)
  
  impute_minus9999int = ['elevation_ft']
  df = df.na.fill(value = -9999,subset = impute_minus9999int)
  
  impute_99int = [ 'origin_HourlyRelativeHumidity', 'dest_HourlyRelativeHumidity']
  df = df.na.fill(value = 99 ,subset = impute_99int)
  
  impute_99fl = ['origin_HourlyPrecipitation', 'dest_HourlyPrecipitation']
  df = df.na.fill(value = 99.0 ,subset = impute_99fl)
  
  impute_999int = ['origin_HourlyPressureTendency', 'dest_HourlyPressureTendency']
  df = df.na.fill(value = 999 ,subset = impute_999int)
  
#   impute_999fl = ['origin_HourlyPressureChange', 'dest_HourlyPressureChange']
#   df = df.na.fill(value = 999.0 ,subset = impute_999fl)
  
  impute_9999int = ['origin_HourlyDryBulbTemperature', 'dest_HourlyDryBulbTemperature']
  df = df.na.fill(value = 9999 ,subset = impute_9999int)
    
  impute_99999int = ['origin_HourlyWindDirection', 'origin_HourlyWindSpeed', 'dest_HourlyWindDirection', 'dest_HourlyWindSpeed']
  df = df.na.fill(value = 99999 ,subset = impute_99999int)
  
  impute_99999fl = ['origin_HourlyAltimeterSetting',  'dest_HourlyAltimeterSetting', 'origin_HourlySeaLevelPressure','dest_HourlySeaLevelPressure']
  df = df.na.fill(value = 99999.0 ,subset = impute_99999fl)
  
  impute_999999fl = ['origin_HourlyVisibility', 'dest_HourlyVisibility']
  df = df.na.fill(value = 999999.0 ,subset = impute_999999fl)
  
  impute_str = ['TAIL_NUM', 'type', 'origin_HourlySkyConditions', 'dest_HourlySkyConditions', 'local_timestamp', 'timezone']
  df = df.na.fill(value = 'no_data',subset = impute_str)
  
  imputed_cols  = cols_to_fill + ['perc_delays_last1d', 'perc_cancellation_last1d', 'elevation_ft']
#   'no_delays_last1d', 'no_cancellation_last1d', 'count_flights_last1d', 
  return df,imputed_cols

In [0]:
inputCols_categorical = ['Year', 'QUARTER', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DISTANCE_GROUP', 'holiday_in2DayRange', 'C19', 'OP_UNIQUE_CARRIER', 'type', 'DEP_TIME_BLK', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']
inputCols_continuous = ['DISTANCE']

pipeline = create_pipeline(df_full, inputCols_categorical, inputCols_continuous)
pipeline_df1 = pipeline.fit(df_full).transform(df_full)
pipeline_df = pipeline_df1.filter(col('label') != 2).cache()

train_none, val_none = grid_search_test_train_split(pipeline_df, sampling='none')
train_none = train_none.cache()
val_none  = val_none.cache()

train_under, val_under = grid_search_test_train_split(pipeline_df, sampling='under')
train_under = train_under.cache()
val_under  = val_under.cache()

pipeline_df.unpersist()

Out[18]: DataFrame[local_timestamp: timestamp, timezone: string, scheduled_departure_UTC: timestamp, rounded_depTimestamp: timestamp, label: int, Year: int, QUARTER: int, MONTH: int, DAY_OF_MONTH: int, DAY_OF_WEEK: int, DEP_TIME_BLK: string, OP_UNIQUE_CARRIER: string, TAIL_NUM: string, OP_CARRIER_FL_NUM: string, dep_delay_15: int, DEP_DELAY: int, DEP_DELAY_NEW: int, CANCELLED: int, ORIGIN_AIRPORT_ID: string, ORIGIN: string, ORIGIN_CITY_NAME: string, ORIGIN_STATE_ABR: string, elevation_ft: int, type: string, DEST_AIRPORT_ID: string, DEST: string, DEST_CITY_NAME: string, DEST_STATE_ABR: string, TAXI_OUT: int, TAXI_IN: int, DISTANCE: int, DISTANCE_GROUP: int, holiday: int, holiday_in2DayRange: bigint, C19: int, CARRIER_DELAY: int, WEATHER_DELAY: int, NAS_DELAY: int, SECURITY_DELAY: int, LATE_AIRCRAFT_DELAY: int, scheduled_departure_UTC_minus_1hr: timestamp, scheduled_departure_UTC_add_2hr: timestamp, origin_HourlyAltimeterSetting: float, origin_HourlyDewPointTemperature: int, origin_Hourl

In [0]:
# Logistic Regression No sampling 
log_reg_c_params = { 'maxIter': [10, 20], 'regParam': [0.2, 0.4], 'elasticNetParam': [0.0, 0.8]}
log_reg_c_no_sampling = train_model_no_CV(train_none, val_none,  model_type='LogisticRegression', params=log_reg_c_params, train_metrics = True)
display(log_reg_c_no_sampling)

maxIter,regParam,elasticNetParam,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
10,0.2,0.0,0.7366986452396131,0.8182734335525216,0.9999688955259012,0.7366986452396131,0.8660984618546442,0.9091723207219766,0.9998802280756868,0.8660984618546442
10,0.2,0.8,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754
10,0.4,0.0,0.7363679226014549,0.8181611247469397,0.9999999608748752,0.7363679226014549,0.8657782464030378,0.9090432954557828,0.9999960076025228,0.8657782464030378
10,0.4,0.8,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754
20,0.2,0.0,0.7366986452396131,0.8182734335525216,0.9999688955259012,0.7366986452396131,0.8660979078102652,0.9091721144415024,0.9998802280756868,0.8660979078102652
20,0.2,0.8,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754
20,0.4,0.0,0.7363679226014549,0.8181611247469397,0.9999999608748752,0.7363679226014549,0.8657782464030378,0.9090432954557828,0.9999960076025228,0.8657782464030378
20,0.4,0.8,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754


In [0]:
# Logistic Regression Under sampling 
log_reg_c_params = { 'maxIter': [10, 20], 'regParam': [0.2, 0.4], 'elasticNetParam': [0.0, 0.8]}
log_reg_c_under_sampling = train_model_no_CV(train_under, val_under,  model_type='LogisticRegression', params=log_reg_c_params, train_metrics = True)
display(log_reg_c_under_sampling)

maxIter,regParam,elasticNetParam,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
10,0.2,0.0,0.6625858317132428,0.6695519908663603,0.6425217419622291,0.6625858317132428,0.6872609814266382,0.9514218345389096,0.594537601771826,0.6872609814266382
10,0.2,0.8,0.3333606170720244,0.0,0.0,0.3333606170720244,0.0151788819552767,0.0,0.0,0.0151788819552767
10,0.4,0.0,0.6593868183057319,0.6661339878999537,0.6395580931324356,0.6593868183057319,0.6854893683957969,0.9501079845378778,0.5928835015920932,0.6854893683957969
10,0.4,0.8,0.3333606170720244,0.0,0.0,0.3333606170720244,0.0151788819552767,0.0,0.0,0.0151788819552767
20,0.2,0.0,0.662585040085502,0.6695487721885622,0.6425273749369591,0.662585040085502,0.6872611041211267,0.951396503052542,0.5945538208865767,0.6872611041211267
20,0.2,0.8,0.3333606170720244,0.0,0.0,0.3333606170720244,0.0151788819552767,0.0,0.0,0.0151788819552767
20,0.4,0.0,0.6593916634527335,0.6661273018338404,0.6395950595291009,0.6593916634527335,0.6855016224468162,0.9501197801850664,0.5928932330609437,0.6855016224468162
20,0.4,0.8,0.3333606170720244,0.0,0.0,0.3333606170720244,0.0151788819552767,0.0,0.0,0.0151788819552767


##### Decision Tree Classification

In [0]:
# Decision Tree Classifier No sampling 
dt_c_params = { 'maxDepth': [5], 'impurity': ['gini'], 'maxBins': [32], 'minInfoGain': [0.0, 0.01]  }
dt_c_no_sampling = train_model_no_CV(train_none, val_none, model_type='DecisionTreeClassifier', params=dt_c_params, train_metrics = True)
display(dt_c_no_sampling)

maxDepth,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
5,gini,32,0.0,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754
5,gini,32,0.01,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754


In [0]:
# Decision Tree Classifier Under sampling 
dt_c_params = { 'maxDepth': [5], 'impurity': ['gini'], 'maxBins': [32], 'minInfoGain': [0.0, 0.01]  }
dt_c_under_sampling = train_model_no_CV(train_under, val_under, model_type='DecisionTreeClassifier', params=dt_c_params, train_metrics = True)
display(dt_c_under_sampling)

maxDepth,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
5,gini,32,0.0,0.6344302627420071,0.6663149979380395,0.5494616734572038,0.6344302627420071,0.6225484190513313,0.9566558778568328,0.5059398141089829,0.6225484190513313
5,gini,32,0.01,0.6133482824315729,0.6290047320374396,0.5581520346515517,0.6133482824315729,0.6622688493146525,0.9523795975650926,0.5596275791823421,0.6622688493146525


##### Random Forest

In [0]:
# Random Forest Classifier - No Sampling
rf_c_params = { 'maxDepth': [3, 5], 'numTrees': [15, 50] , 'impurity': ['entropy', 'gini'], 'maxBins': [32], 'minInfoGain': [0.01]  }
rf_c_no_sampling = train_model_no_CV(train_none, val_none, model_type='RandomForestClassifier', params=rf_c_params, train_metrics = True)
display(rf_c_no_sampling)

maxDepth,numTrees,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
3,15,entropy,32,0.01,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754
3,15,gini,32,0.01,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754
3,50,entropy,32,0.01,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754
3,50,gini,32,0.01,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754
5,15,entropy,32,0.01,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754
5,15,gini,32,0.01,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754
5,50,entropy,32,0.01,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754
5,50,gini,32,0.01,0.7363048199222192,0.8181396554341768,1.0,0.7363048199222192,0.8656765908909754,0.9090052744582758,1.0,0.8656765908909754


In [0]:
# Random Forest Classifier - Under Sampling
rf_c_params = { 'maxDepth': [3, 5], 'numTrees': [15, 50] , 'impurity': ['entropy', 'gini'], 'maxBins': [32], 'minInfoGain': [0.01]  }
rf_c_under_sampling = train_model_no_CV(train_under, val_under, model_type='RandomForestClassifier', params=rf_c_params, train_metrics = True)
display(rf_c_under_sampling)

maxDepth,numTrees,impurity,maxBins,minInfoGain,Train Accuracy,Train Precision,Train Recall,Train F1 Score,Val Accuracy,Val Precision,Val Recall,Val F1 Score
3,15,entropy,32,0.01,0.4104370102501683,0.7602225308379135,0.0816668373186734,0.4104370102501683,0.1194516179064589,0.8869520601571896,0.0664275054227986,0.1194516179064589
3,15,gini,32,0.01,0.3334647825805829,0.5001182968595681,1.0,0.3334647825805829,0.7875172607341671,0.8545419149368073,1.0,0.7875172607341671
3,50,entropy,32,0.01,0.4104370102501683,0.7602225308379135,0.0816668373186734,0.4104370102501683,0.1194516179064589,0.8869520601571896,0.0664275054227986,0.1194516179064589
3,50,gini,32,0.01,0.3334647825805829,0.5001182968595681,1.0,0.3334647825805829,0.7875172607341671,0.8545419149368073,1.0,0.7875172607341671
5,15,entropy,32,0.01,0.4104370102501683,0.7602225308379135,0.0816668373186734,0.4104370102501683,0.1194516179064589,0.8869520601571896,0.0664275054227986,0.1194516179064589
5,15,gini,32,0.01,0.3334647825805829,0.5001182968595681,1.0,0.3334647825805829,0.7875172607341671,0.8545419149368073,1.0,0.7875172607341671
5,50,entropy,32,0.01,0.4104370102501683,0.7602225308379135,0.0816668373186734,0.4104370102501683,0.1194516179064589,0.8869520601571896,0.0664275054227986,0.1194516179064589
5,50,gini,32,0.01,0.3334647825805829,0.5001182968595681,1.0,0.3334647825805829,0.7875172607341671,0.8545419149368073,1.0,0.7875172607341671


#### Regression

##### Linear Regression

In [0]:
# Linear Regression, no sample
lin_reg_r_params = { 'maxIter': [10, 20], 'regParam': [0.2, 0.4], 'elasticNetParam': [0.0, 0.8] }
lin_reg_r_no_sampling = train_model_no_CV(train_none, val_none, model_type='LinearRegression', params=lin_reg_r_params, train_metrics = True)
display(lin_reg_r_no_sampling)

maxIter,regParam,elasticNetParam,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
10,0.2,0.0,0.0440775034375388,41.41412444141641,1715.1297032491243,17.975928210384645,-0.0205511554583734,35.39731467323712,1252.9698860761678,15.413066373228764
10,0.2,0.8,0.0406843772303767,41.487560767319934,1721.217698422064,17.947579129310746,-0.0168589596293988,35.33322565687659,1248.4368353197622,15.388211154261198
10,0.4,0.0,0.0440452712290679,41.41482264521704,1715.1875347347818,17.97144095485093,-0.0202204905999467,35.39157973470384,1252.5639161178992,15.405679423397087
10,0.4,0.8,0.0378282676913661,41.54927400895637,1726.3421706713373,17.9762609354555,-0.0153605158663889,35.30718253143427,1246.5971383080175,15.406482047692208
20,0.2,0.0,0.0440775034375392,41.41412444141641,1715.129703249124,17.97592821038464,-0.0205511554583703,35.39731467323706,1252.969886076164,15.41306637322852
20,0.2,0.8,0.0407951650329372,41.48516507590595,1721.0189213751664,17.94508056932255,-0.0165873754863403,35.328506917600706,1248.1034010269614,15.374720724161303
20,0.4,0.0,0.0440452712290682,41.41482264521704,1715.1875347347818,17.971440954850927,-0.0202204905999465,35.39157973470384,1252.5639161178992,15.405679423397071
20,0.4,0.8,0.0378117142411548,41.54963141962296,1726.3718711065196,17.978192419526163,-0.0150617324354747,35.301987343546344,1246.2303104039063,15.397909869364886


**Extra Experimentation with feature changes**

For the below, take action for both origin and destination weather. The numbers refer to the correlation coefficients between the two variables (regardless of origin and destination). 
- remove either elevation ft or hourly station pressure: -0.99
  - % nulls before imputation
    - elevation ft: 0.02%
    - hourly station pressure: 1.27% (origin), 1.38% (destination)
  - Keep: elevation_ft
- remove either hourly dry bulb or hourly dew point temperature: 0.74
  - % nulls before imputation
    - hourly dry bulb: 1.21% (origin), 1.35% (destination)
    - hourly dew point: 1.24% (origin), 1.38% (destination)
  - Keep: hourly dry bulb
- remove either hourly wetbulb or hourly dry bulb temperature: 0.93
  - % nulls before imputation
    - hourly dry bulb: 1.21% (origin), 1.35% (destination)
    - hourly wet bulb: 1.35% (origin), 1.41% (destination)
  - Keep: hourly dry bulb
- remove either hourly wetbulb or hourly dew point temperature: 0.93
  - % nulls before imputation
    - hourly wet bulb: 1.35% (origin), 1.41% (destination)
    - hourly dew point: 1.24% (origin), 1.38% (destination)
  - Keep: remove both as they both are correlated with hourly dry bulb and they have both been chosen to be removed because hourly dry bulb has less % of nulls
- remove either hourly pressure tendency or hourly pressure change: 0.71
  - % nulls before imputation
    - hourly pressure tendency: 70.81% (origin), 70.54% (destination)
    - hourly pressure change: 70.81% (origin), 70.54% (destination)
  - Keep hourly pressure tendency, because even though both have same % of nulls, hourly pressure change has a wider distribution which could be easier for the model to learn from 
- remove either hourly wind speed or hourly wind gust speed: 0.85
  - % nulls before imputation
    - hourly wind speed: 1.25% (origin), 1.38% (destination)
    - hourly wind gust speed: 86.34% (origin), 85.98% (destination)
  - Keep hourly wind speed

In [0]:
def preModeling_dataEdit(df):
  '''
  Input: df that has already gone through the final join, cleaning, and feature engineering
  Output: df that includes null imputing and # and % of flights (by tail number) that were delayed and cancelled in the past 90 days --> these depend on window functions, as such they need to be done right after the data is split for modelling and not during feature engineering phase
  '''
  
  ### FINAL CLEANING 
  # Remove rows with null scheduled_departure_UTC because these are rows without a proper timezone (timezonefinder could not find)
  df = df.na.drop(subset=["scheduled_departure_UTC"])
  dropCols = ['TAXI_IN', 'TAXI_OUT']
  df = df.drop(*dropCols) 

  
  ### FINAL FEATURE ADDITIONS
  ## GET NUMBER & PERCENTAGE OF TIMES A PLANE (BY TAIL NUMBER) HAS BEEN DELAYED OR CANCELLED IN THE PAST 3 MONTHS (2 COLUMNS)
  # Make window function
  df = df.withColumn('roundedMonth', f.date_trunc('month', df.scheduled_departure_UTC))
  window_3m = Window().partitionBy('TAIL_NUM').orderBy(f.col('roundedMonth').cast('long')).rangeBetween(-(86400), 0) # changed to 1 day instead of 3 months 

  # Add in Columns
  # Number of flights delayed/cancelled
  df = df.withColumn('no_delays_last1d', when(df.TAIL_NUM.isNotNull(), f.sum('dep_delay_15').over(window_3m)).otherwise(-1)) \
         .withColumn('no_cancellation_last1d', when(df.TAIL_NUM.isNotNull(), f.sum('CANCELLED').over(window_3m)).otherwise(-1)) 
  # Percentage of flights delayed/cancelled
  df = df.withColumn('count_flights_last1d', when(df.TAIL_NUM.isNotNull(), f.count('TAIL_NUM').over(window_3m)).otherwise(-1)) 
  df = df.withColumn('perc_delays_last1d', when(df.count_flights_last1d != -1, (df.no_delays_last1d/ df.count_flights_last1d)).otherwise(-1.0)) \
         .withColumn('perc_cancellation_last1d', when(df.count_flights_last1d != -1, (df.no_cancellation_last1d/ df.count_flights_last1d)).otherwise(-1.0))     
  
  ### HANDLING NULLS
  ## Imputing Hourly Weather Data to the best of our ability (up to 3 hours back)
  window = Window.partitionBy(col("ORIGIN_AIRPORT_ID"))\
                     .orderBy(col("rounded_depTimestamp"))\
                     .rowsBetween(0,3)
  
  cols_to_fill  = ['origin_HourlyAltimeterSetting', 'origin_HourlyDryBulbTemperature', 'origin_HourlyPrecipitation', 'origin_HourlyPressureTendency', 'origin_HourlyRelativeHumidity', 'origin_HourlySeaLevelPressure', 'origin_HourlyVisibility', 'origin_HourlyWindDirection', 'origin_HourlyWindSpeed', 'origin_HourlySkyConditions_SCT_cnt', 'origin_HourlySkyConditions_OVC_cnt', 'origin_HourlySkyConditions_FEW_cnt', 'origin_HourlySkyConditions_BKN_cnt', 'origin_HourlySkyConditions_VV_cnt', 'origin_HourlySkyConditions_SKC_cnt', 'origin_HourlySkyConditions_CLR_cnt', 'dest_HourlyAltimeterSetting', 'dest_HourlyDryBulbTemperature', 'dest_HourlyPrecipitation', 'dest_HourlyPressureTendency', 'dest_HourlyRelativeHumidity', 'dest_HourlySeaLevelPressure', 'dest_HourlyVisibility', 'dest_HourlyWindDirection', 'dest_HourlyWindSpeed', 'dest_HourlySkyConditions_SCT_cnt', 'dest_HourlySkyConditions_OVC_cnt', 'dest_HourlySkyConditions_FEW_cnt', 'dest_HourlySkyConditions_BKN_cnt', 'dest_HourlySkyConditions_VV_cnt', 'dest_HourlySkyConditions_SKC_cnt', 'dest_HourlySkyConditions_CLR_cnt']

  
  for field in cols_to_fill:
      filled_column_start = first(df[field], ignorenulls=True).over(window)
      df = df.withColumn(field, filled_column_start)
  
  ## We are still left with some null values --> will deal with them now in accordance to the table in section VII of this notebook
  impute_minus1int = ['DEP_DELAY_NEW', 'holiday' ,'holiday_in2DayRange']
  df = df.na.fill(value = -1,subset = impute_minus1int)
  
  impute_minus9999int = ['DEP_DELAY']
  df = df.na.fill(value = -9999,subset = impute_minus9999int)
  
  impute_minus1fl = ['perc_delays_last1d', 'perc_cancellation_last1d']
  df = df.na.fill(value = -1.0,subset = impute_minus1fl)
  
  impute_minus9999int = ['elevation_ft']
  df = df.na.fill(value = -9999,subset = impute_minus9999int)
  
  impute_99int = [ 'origin_HourlyRelativeHumidity', 'dest_HourlyRelativeHumidity']
  df = df.na.fill(value = 99 ,subset = impute_99int)
  
  impute_99fl = ['origin_HourlyPrecipitation', 'dest_HourlyPrecipitation']
  df = df.na.fill(value = 99.0 ,subset = impute_99fl)
  
  impute_999int = ['origin_HourlyPressureTendency', 'dest_HourlyPressureTendency']
  df = df.na.fill(value = 999 ,subset = impute_999int)
  
#   impute_999fl = ['origin_HourlyPressureChange', 'dest_HourlyPressureChange']
#   df = df.na.fill(value = 999.0 ,subset = impute_999fl)
  
  impute_9999int = ['origin_HourlyDryBulbTemperature', 'dest_HourlyDryBulbTemperature']
  df = df.na.fill(value = 9999 ,subset = impute_9999int)
    
  impute_99999int = ['origin_HourlyWindDirection', 'origin_HourlyWindSpeed', 'dest_HourlyWindDirection', 'dest_HourlyWindSpeed']
  df = df.na.fill(value = 99999 ,subset = impute_99999int)
  
  impute_99999fl = ['origin_HourlyAltimeterSetting',  'dest_HourlyAltimeterSetting', 'origin_HourlySeaLevelPressure','dest_HourlySeaLevelPressure']
  df = df.na.fill(value = 99999.0 ,subset = impute_99999fl)
  
  impute_999999fl = ['origin_HourlyVisibility', 'dest_HourlyVisibility']
  df = df.na.fill(value = 999999.0 ,subset = impute_999999fl)
  
  impute_str = ['TAIL_NUM', 'type', 'origin_HourlySkyConditions', 'dest_HourlySkyConditions', 'local_timestamp', 'timezone']
  df = df.na.fill(value = 'no_data',subset = impute_str)
  
  imputed_cols  = cols_to_fill + ['perc_delays_last1d', 'perc_cancellation_last1d', 'elevation_ft']
#   'no_delays_last1d', 'no_cancellation_last1d', 'count_flights_last1d', 
  return df,imputed_cols

In [0]:
inputCols_categorical = ['Year', 'QUARTER', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'DISTANCE_GROUP', 'holiday_in2DayRange', 'C19', 'OP_UNIQUE_CARRIER', 'type', 'DEP_TIME_BLK', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID']
inputCols_continuous = ['DISTANCE']

pipeline = create_pipeline(df_full, inputCols_categorical, inputCols_continuous)
pipeline_df1 = pipeline.fit(df_full).transform(df_full)
pipeline_df = pipeline_df1.filter(col('label') != 2).cache()

train_none, val_none = grid_search_test_train_split(pipeline_df, sampling='none')
train_none = train_none.cache()
val_none  = val_none.cache()

train_under, val_under = grid_search_test_train_split(pipeline_df, sampling='under')
train_under = train_under.cache()
val_under  = val_under.cache()

pipeline_df.unpersist()

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-1215577238243337>[0m in [0;36m<cell line: 4>[0;34m()[0m
[1;32m      2[0m [0minputCols_continuous[0m [0;34m=[0m [0;34m[[0m[0;34m'DISTANCE'[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m [0;34m[0m[0m
[0;32m----> 4[0;31m [0mpipeline[0m [0;34m=[0m [0mcreate_pipeline[0m[0;34m([0m[0mdf_full[0m[0;34m,[0m [0minputCols_categorical[0m[0;34m,[0m [0minputCols_continuous[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      5[0m [0mpipeline_df1[0m [0;34m=[0m [0mpipeline[0m[0;34m.[0m[0mfit[0m[0;34m([0m[0mdf_full[0m[0;34m)[0m[0;34m.[0m[0mtransform[0m[0;34m([0m[0mdf_full[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      6[0m [0mpipeline_df[0m [0;34m=[0m [0mpipeline_df1[0m[0;34m.[0m[0mfilter[0m[0;34m([0m[0mcol[0m[0;34m

In [0]:
# Linear Regression, no sample
lin_reg_r_params = { 'maxIter': [10, 20], 'regParam': [0.2, 0.4], 'elasticNetParam': [0.0, 0.8] }
lin_reg_r_no_sampling = train_model_no_CV(train_none, val_none, model_type='LinearRegression', params=lin_reg_r_params, train_metrics = True)
display(lin_reg_r_no_sampling)

maxIter,regParam,elasticNetParam,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
10,0.2,0.0,0.0434195663684131,41.428374132609854,1716.3101832714974,17.977524682325253,-0.021361269364069,35.4113610873824,1253.9644940609803,15.42452816173658
10,0.2,0.8,0.0400331180448668,41.50164089395655,1722.3861968909264,17.95212952177156,-0.0176513807035905,35.34699026982609,1249.4097211351802,15.401390643715912
10,0.4,0.0,0.0433883485197136,41.42905013126019,1716.36619477847,17.97304537463326,-0.0210395763167374,35.40578397861134,1253.5695391400914,15.417178677284298
10,0.4,0.8,0.0373143945526457,41.56036776962164,1727.2641691462047,17.981876383313608,-0.0157127620154555,35.31330633693824,1247.0296044464424,15.409252915061968
20,0.2,0.0,0.0434152780698455,41.42846699309082,1716.3178773976158,17.97744762885957,-0.0213778561799911,35.41164862488792,1253.9848583325256,15.424899130433529
20,0.2,0.8,0.0401671237612353,41.49874410068002,1722.145761933724,17.949469876532568,-0.0174550211581139,35.34357993996435,1249.1686429726506,15.390091703662502
20,0.4,0.0,0.0433838067033629,41.42914847990224,1716.374343769786,17.973070828188238,-0.0210450129982533,35.405878240244064,1253.5762139629878,15.41743523505789
20,0.4,0.8,0.037294129021102,41.56080521176619,1727.3005298503715,17.98224768361883,-0.0156652816291351,35.31248095151193,1246.971310950893,15.4078117677116


##### Decision Tree Regression

In [0]:
# Decision Tree Regressor No sampling 
dt_r_params = { 'maxDepth': [5], 'minInfoGain': [0.0, 0.01] }
dt_r_no_sampling = train_model_no_CV(train_none, val_none, model_type='DecisionTreeRegressor', params=dt_r_params,  train_metrics = True)
display(dt_r_no_sampling)

maxDepth,minInfoGain,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
5,0.0,0.0277798607362583,41.76566969376268,1744.3711649684865,18.142051379878204,-0.0516910221644715,35.933292327937664,1291.201497525024,17.448302822292725
5,0.01,0.0277798607362586,41.76566969376268,1744.3711649684865,18.1420513798782,-0.0516910221644715,35.933292327937664,1291.201497525024,17.44830282229272


In [0]:
# Decision Tree Regressor No sampling 
dt_r_params = { 'maxDepth': [10], 'minInfoGain': [0.01] }
dt_r_no_sampling2 = train_model_no_CV(train_none, val_none, model_type='DecisionTreeRegressor', params=dt_r_params,  train_metrics = True)
display(dt_r_no_sampling2)

maxDepth,minInfoGain,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
10,0.01,0.0416514766414928,41.46664340093345,1719.4825149401777,17.794463899369024,-0.3834305820739692,41.212751765905374,1698.4909081181368,19.742400854707025


##### Random Forest Regression

In [0]:
# Random Forest Regression - No Sampling
rf_r_params = { 'maxDepth': [5], 'numTrees': [50], 'minInfoGain': [0.0, 0.01]}
rf_r_no_sampling = train_model_no_CV(train_none, val_none, model_type='RandomForestRegressor', params=rf_r_params, train_metrics = True)
display(rf_r_no_sampling)

maxDepth,numTrees,minInfoGain,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
5,50,0.0,0.0308454960635582,41.699769286864246,1738.8707585777065,18.22613112821767,-0.0463781399810285,35.84241436940363,1284.6786678280316,17.67234266897744
5,50,0.01,0.030846731476764,41.69974270882026,1738.868541981809,18.226097455791216,-0.0463806274867217,35.84245697262392,1284.6817218343972,17.672328088940453


##### Gradient Boosted Tree Regression

In [0]:
# Gradient Boosted Trees Regression - No Sampling
gbt_r_params = { 'maxDepth': [3], 'maxIter': [15] , 'stepSize': [0.1], 'minInfoGain': [0.0, 0.01]}
gbt_r_no_sampling = train_model_no_CV(train_none, val_none, model_type='GBTRegressor', params=gbt_r_params, train_metrics = True)
display(gbt_r_no_sampling)

maxDepth,maxIter,stepSize,minInfoGain,Train R2,Train RMSE,Train MSE,Train MAE,Val R2,Val RMSE,Val MSE,Val MAE
3,15,0.1,0.0,0.0358548747168673,41.59186050586201,1729.882860339084,17.98497022943661,-0.0536777322664114,35.96721643234636,1293.640657891246,17.735607776223443
3,15,0.1,0.01,0.0359892846045334,41.58896127834894,1729.6417002120077,17.978135290171483,-0.0535537164900958,35.96509973509303,1293.4883989551886,17.755337306086513
