# Gap analysis for ensemble 

Train: 2015-2018
Test: 2019


# Set up 

In [0]:
# Coordinates of the team's Azure Blob Storage container and the secret
# scope/key under which its SAS token is stored.
storage_account = "261storage"
blob_container = "261storagecontainer"
secret_key = "team_6_1_key"
secret_scope = "261_team_6_1_spring24_scope"
team_blob_url = "wasbs://{}@{}.blob.core.windows.net".format(
    blob_container, storage_account
)

# Location where the blob storage is mounted.
mids261_mount_path = "/mnt/mids-w261"

# SAS token: registers the team's limited-access credential with Spark so
# that wasbs:// paths under team_blob_url can be read and written.
spark.conf.set(
    "fs.azure.sas.{}.{}.blob.core.windows.net".format(
        blob_container, storage_account
    ),
    dbutils.secrets.get(key=secret_key, scope=secret_scope),
)

# Uncomment to list the blob storage root folder:
# display(dbutils.fs.ls(f"{team_blob_url}"))

# DBFS path of the mount.
data_BASE_DIR = "dbfs:/mnt/mids-w261/"
# display(dbutils.fs.ls(f"{data_BASE_DIR}"))

## Import libraries

In [0]:
%pip install geopandas
%pip install folium


In [0]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import os

from hyperopt import hp, Trials, fmin, tpe, STATUS_OK

from pyspark.sql.functions import col
from pyspark.sql import functions as F, DataFrame, Window
from pyspark.sql.types import FloatType, DoubleType

from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.regression import (
    LinearRegression,
    DecisionTreeRegressor,
    RandomForestRegressor,
    GBTRegressor,
)
from pyspark.ml.classification import LinearSVC, MultilayerPerceptronClassifier
from pyspark.ml.feature import (
    VectorIndexer,
    VectorAssembler,
    StringIndexer,
    OneHotEncoder,
    StandardScaler,
)
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics


# for maps
import geopandas as gpd
import folium
from branca.element import Template, MacroElement

# date handling
import matplotlib.dates as mdates
import datetime



## Import Training and Test results

In [0]:
def spark_shape(self):
    """Display the row count and the column count of a Spark DataFrame.

    Stand-in for pandas' ``df.shape``; the values are passed to ``display``
    rather than returned.
    """
    n_rows = self.count()
    n_cols = len(self.columns)
    display(n_rows, n_cols)


def blob_read(subPath):
    """Load the parquet data stored at ``subPath`` inside the team blob container."""
    source_path = f"{team_blob_url}/{subPath}"
    reader = spark.read.format("parquet").option("header", "true")
    return reader.load(source_path)


def blob_write(df, subPath):
    """Write ``df`` as parquet to ``subPath`` in the team blob container.

    The write uses ``overwrite`` mode, so anything already stored at that
    path is replaced.
    """
    target_path = f"{team_blob_url}/{subPath}"
    writer = df.write.mode("overwrite")
    writer.parquet(target_path)

# Set MLflow's artifact progress-bar switch to "false" for this session.
os.environ["MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR"] = "false"

In [0]:
# Show what is stored in two of the result folders of the team container.
for _sub_path in (
    "MultilayerPerceptronClassifierEnsemble-2015",
    "LinearRegression-L2/2015-train",
):
    display(dbutils.fs.ls(f"{team_blob_url}/{_sub_path}"))



In [0]:
# Stored evaluation metrics of the ensemble model (fbeta, precision, recall)
model_name = "MultilayerPerceptronClassifierEnsemble"
evaluation_df = blob_read(model_name + "/evaluation")
# evaluation_df.display()


In [0]:
# Test-set output of the ensemble model.
test_df = blob_read(model_name + "/test")

In [0]:
# Per-fold outputs live in folders such as
# MultilayerPerceptronClassifierEnsemble-2015/{train,val}
folds = [2015, 2016, 2017, 2018]

train_outputs = ["MultilayerPerceptronClassifierEnsemble-{year}/train"]
validation_outputs = ["MultilayerPerceptronClassifierEnsemble-{year}/val"]

# models[year]["<model name>_train" | "<model name>_val"] -> Spark DataFrame
# tagged with a constant "year" column identifying the fold.
models = {}

for year in folds:
    models[year] = {}

    # Train outputs are registered before validation outputs for each fold.
    for suffix, outputs in (("train", train_outputs), ("val", validation_outputs)):
        for output in outputs:
            # The model name is the part of the path template before "-{year}".
            model_name = output.split("-")[0]
            fold_path = f"{team_blob_url}/{output.format(year=year)}"
            fold_df = spark.read.parquet(fold_path)
            models[year][f"{model_name}_{suffix}"] = fold_df.withColumn(
                "year", F.lit(year)
            )

In [0]:
# A bare expression is only echoed when it is the last statement of a cell,
# so the keys are printed explicitly to make them visible.
print(list(models[2018].keys()))
models[2018]["MultilayerPerceptronClassifierEnsemble_train"].limit(5).display()

In [0]:
# Verify that all models have the same columns.
# The reference is the first DataFrame of the first fold; only the *set* of
# column names is compared, not their order.
reference_columns = list(next(iter(next(iter(models.values())).values())).columns)

for year, model_dict in models.items():
    for model_name, model_df in model_dict.items():
        columns_match = set(reference_columns) == set(model_df.columns)
        assert columns_match, f"Column mismatch in {model_name} for year {year}"

# Evaluation metrics

In [0]:
def evaluate_model(
    model_df, target_column="DEP_DEL15", prediction_column="prediction"
):
    """
    Calculates confusion-matrix counts, precision and recall for a binary classifier.

    All counts are computed in a single aggregation over ``model_df`` instead of
    one Spark job per confusion-matrix cell.

    Args:
        model_df: The DataFrame containing the model's predictions and target values.
        target_column: Column with target values (compared against 0 and 1).
        prediction_column: Column with model's predictions (compared against 0 and 1).

    Returns:
        dict: A dictionary containing the evaluation metrics (TP, FP, FN, TN,
        sum_check, precision, recall). ``sum_check`` is the total row count, so
        it exceeds TP + FP + FN + TN when rows hold nulls or values other than
        0/1. Precision and recall are 0 when their denominator is 0.
    """
    predicted = F.col(prediction_column)
    actual = F.col(target_column)

    def _count_if(condition):
        # Rows where the condition is false or null contribute 0 to the sum.
        return F.sum(F.when(condition, 1).otherwise(0))

    counts = model_df.agg(
        _count_if((predicted == 1) & (actual == 1)).alias("TP"),
        _count_if((predicted == 1) & (actual == 0)).alias("FP"),
        _count_if((predicted == 0) & (actual == 1)).alias("FN"),
        _count_if((predicted == 0) & (actual == 0)).alias("TN"),
        F.count(F.lit(1)).alias("sum_check"),
    ).first()

    # A sum over zero rows is NULL; report 0 as a plain row count would.
    TP, FP, FN, TN = (counts[name] or 0 for name in ("TP", "FP", "FN", "TN"))
    sum_check = counts["sum_check"]

    precision = TP / (TP + FP) if TP + FP else 0
    recall = TP / (TP + FN) if TP + FN else 0

    return {
        "TP": TP,
        "FP": FP,
        "FN": FN,
        "TN": TN,
        "sum_check": sum_check,
        "precision": precision,
        "recall": recall,
    }

In [0]:
def add_prediction_type(
    model_df, target_column="DEP_DEL15", prediction_column="prediction"
):
    """
    Adds a new column 'prediction_type' to the input DataFrame indicating TP, TN, FP, or FN.

    Each label requires both columns to match explicitly (prediction and
    target equal to 0 or 1). Rows where either column is null or holds any
    other value get a null ``prediction_type`` instead of being counted as
    "TN", which keeps the labels consistent with the counts produced by
    ``evaluate_model``.

    Args:
        model_df: The DataFrame containing the model's predictions and target values.
        target_column: Column with target values.
        prediction_column: Column with model's predictions.

    Returns:
        A Spark dataframe: The input DataFrame with a new column "prediction_type" indicating TP, TN, FP, or FN.
    """
    predicted = F.col(prediction_column)
    actual = F.col(target_column)

    # No .otherwise(): unmatched rows are left null rather than mislabelled.
    prediction_type = (
        F.when((predicted == 1) & (actual == 1), "TP")
        .when((predicted == 1) & (actual == 0), "FP")
        .when((predicted == 0) & (actual == 1), "FN")
        .when((predicted == 0) & (actual == 0), "TN")
    )
    return model_df.withColumn("prediction_type", prediction_type)

In [0]:
# Print confusion-matrix metrics for every fold (train, then validation),
# followed by the metrics of the held-out test set.
print("Evaluation metrics for Ensemble model")
for year in folds:
    for model_key in models[year]:
        fold_df = models[year][model_key]
        if "_train" in model_key:
            train_metrics = evaluate_model(fold_df)
            print(f"{year} Training Metrics:")
            print(train_metrics)
            continue
        if "_val" in model_key:
            val_metrics = evaluate_model(fold_df)
            print(f"{year} Validation Metrics:")
            print(val_metrics)
            # Separator line after each fold's validation block.
            print("-" * 100)
            print()

test_metrics = evaluate_model(test_df)
print("Test Metrics:")
print(test_metrics)

In [0]:
# Training and validation: replace each fold DataFrame with a copy that
# carries the "prediction_type" column.
for year in folds:
    for model_key in models[year]:
        if "_train" in model_key or "_val" in model_key:
            models[year][model_key] = add_prediction_type(models[year][model_key])

# Test
test_df = add_prediction_type(test_df)

In [0]:
# Preview five false positives from the 2015 training fold, then from the test set.
for _preview_df in (
    models[2015]["MultilayerPerceptronClassifierEnsemble_train"],
    test_df,
):
    _false_positives = _preview_df.where(F.col("prediction_type") == "FP")
    _false_positives.select("DEP_DEL15", "prediction", "prediction_type").limit(
        5
    ).display()

# EDA

## 1. Compare percentage of TP, TN, FP, FN

In [0]:
# Stack the per-fold train / validation DataFrames into one DataFrame each.
# Start from empty frames that carry the 2015 schema.
train_df = spark.createDataFrame([], models[2015]["MultilayerPerceptronClassifierEnsemble_train"].schema)
val_df = spark.createDataFrame([], models[2015]["MultilayerPerceptronClassifierEnsemble_val"].schema)

for year, fold_data in models.items():
    train_df_year = fold_data.get("MultilayerPerceptronClassifierEnsemble_train")
    val_df_year = fold_data.get("MultilayerPerceptronClassifierEnsemble_val")

    if train_df_year is None or val_df_year is None:
        print(
            f"WARNING: DataFrames missing for year {year} and model MultilayerPerceptronClassifierEnsemble"
        )
        continue

    # union() matches columns by position. The column check earlier in the
    # notebook only compares column *sets*, so folds whose columns are ordered
    # differently would be silently misaligned; match by name instead.
    train_df = train_df.unionByName(train_df_year)
    val_df = val_df.unionByName(val_df_year)

train_df.limit(5).display()
val_df.limit(5).display()

#### CHECKPOINT

In [0]:
# Checkpoint: make sure the folder for the labelled DataFrames exists.
dbutils.fs.mkdirs(team_blob_url + "/ensemble_dfs_w_pred_type")



In [0]:
# Each DataFrame gets its own sub-folder. blob_write overwrites its target,
# so writing all three to the same path would keep only the last one, and
# the reload step reads from exactly these three sub-paths.
blob_write(train_df, "ensemble_dfs_w_pred_type/train_df")
blob_write(val_df, "ensemble_dfs_w_pred_type/val_df")
blob_write(test_df, "ensemble_dfs_w_pred_type/test_df")

In [0]:
# Reload the labelled train / validation / test DataFrames from the checkpoint folder.
TRAIN_SOURCE, VAL_SOURCE, TEST_SOURCE = (
    f"ensemble_dfs_w_pred_type/{folder}"
    for folder in ("train_df", "val_df", "test_df")
)

train_df, val_df, test_df = (
    blob_read(source) for source in (TRAIN_SOURCE, VAL_SOURCE, TEST_SOURCE)
)

In [0]:
def _prediction_type_percentages(df):
    """Return a one-row DataFrame with one column per prediction type holding its share (in %) of all rows."""
    all_rows = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
    counts = df.groupBy("prediction_type").count()
    shares = counts.withColumn(
        "Percentage", F.col("count") / F.sum("count").over(all_rows) * 100
    )
    # Pivot the per-type rows into columns; missing values become 0.
    return shares.groupBy().pivot("prediction_type").avg("Percentage").na.fill(0)


confusion_matrix_train = _prediction_type_percentages(train_df)
confusion_matrix_val = _prediction_type_percentages(val_df)
confusion_matrix_test = _prediction_type_percentages(test_df)

# Bring the (tiny) results to the driver for plotting.
confusion_matrix_train_pandas = confusion_matrix_train.toPandas()
confusion_matrix_val_pandas = confusion_matrix_val.toPandas()
confusion_matrix_test_pandas = confusion_matrix_test.toPandas()

# One heatmap per data split, side by side.
f, axes = plt.subplots(1, 3, figsize=(18, 6), sharex=True, sharey=True)

panels = (
    (confusion_matrix_train_pandas, 'Train'),
    (confusion_matrix_val_pandas, 'Validation'),
    (confusion_matrix_test_pandas, 'Test'),
)
for axis, (matrix, title) in zip(axes, panels):
    sns.heatmap(matrix, annot=True, fmt=".2f", cmap="Blues", ax=axis)
    axis.set_title(title)

plt.tight_layout()
plt.show()


In [0]:
def _daily_prediction_type_share(df):
    """Count rows per (FL_DATE, prediction_type) and add each count's share (in %) of that day's rows."""
    per_day = Window.partitionBy("FL_DATE")
    daily_counts = df.groupBy("FL_DATE", "prediction_type").count()
    return daily_counts.withColumn(
        "Percentage", F.col("count") / (F.sum("count").over(per_day)) * 100
    )


class_percentage_train_df = _daily_prediction_type_share(train_df)
class_percentage_train_df.limit(1).display()

class_percentage_val_df = _daily_prediction_type_share(val_df)
class_percentage_val_df.limit(1).display()

class_percentage_test_df = _daily_prediction_type_share(test_df)
class_percentage_test_df.limit(1).display()

## 2. Seasonalities - flags

PLAN:

- If the highest percentage for a day is FP/FN, and has a flag, add flag as annotation.

In [0]:
def _flag_daily_max(class_percentage_df):
    """Add 'max_pct_per_day': 1 on rows whose Percentage equals that day's maximum, else 0."""
    per_day = Window.partitionBy("FL_DATE")
    # Temporary column holding the highest Percentage of each FL_DATE.
    with_max = class_percentage_df.withColumn(
        "max_percentage", F.max("Percentage").over(per_day)
    )
    flagged = with_max.withColumn(
        "max_pct_per_day",
        (F.col("Percentage") == F.col("max_percentage")).cast("int"),
    )
    # The helper column is no longer needed once the flag exists.
    return flagged.drop("max_percentage")


# Training, validation, test
class_percentage_train_df = _flag_daily_max(class_percentage_train_df)
class_percentage_val_df = _flag_daily_max(class_percentage_val_df)
class_percentage_test_df = _flag_daily_max(class_percentage_test_df)

class_percentage_train_df.limit(5).display()
class_percentage_val_df.limit(5).display()
class_percentage_test_df.limit(5).display()

In [0]:
# TRAINING------------------------------------------------------------------------------------
# Daily FN / FP percentages as a time series. The rows come out of a groupBy
# with no ordering, and a line plot connects points in row order, so the
# frames are sorted by date before plotting.
fn_df = (
    class_percentage_train_df.filter(F.col("prediction_type") == "FN")
    .toPandas()
    .sort_values("FL_DATE")
)
fp_df = (
    class_percentage_train_df.filter(F.col("prediction_type") == "FP")
    .toPandas()
    .sort_values("FL_DATE")
)

fig, ax = plt.subplots(figsize=(12, 6))

# FN
ax.plot(fn_df["FL_DATE"], fn_df["Percentage"], color="orange", label="FN")
ax.set_title("Percentage of FN and FP out of all flights for that day - training")
ax.set_xlabel("Date")
ax.set_ylabel("Percentage")

# FP
ax.plot(fp_df["FL_DATE"], fp_df["Percentage"], color="blue", label="FP")

# Change date format: major ticks per year, minor ticks per quarter
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
ax.xaxis.set_minor_locator(mdates.MonthLocator(bymonth=(1, 4, 7, 10)))

# Annotations: mark 30 December of each training year
holidays1 = datetime.date(2015, 12, 30)
holidays2 = datetime.date(2016, 12, 30)
holidays3 = datetime.date(2017, 12, 30)
holidays4 = datetime.date(2018, 12, 30)

for holiday in (holidays1, holidays2, holidays3, holidays4):
    ax.axvline(holiday, color="gray", linestyle="dotted")

for holiday in (holidays1, holidays2, holidays3, holidays4):
    ax.text(holiday, 28, 'Holiday \n season')

ax.legend()

plt.tight_layout()
plt.show()

In [0]:
# VALIDATION------------------------------------------------------------------------------------
# Convert to pandas and sort by date: the rows come out of a groupBy with no
# ordering, and a line plot connects points in row order.
fn_df = (
    class_percentage_val_df.filter(F.col("prediction_type") == "FN")
    .toPandas()
    .sort_values("FL_DATE")
)
fp_df = (
    class_percentage_val_df.filter(F.col("prediction_type") == "FP")
    .toPandas()
    .sort_values("FL_DATE")
)

fig, ax = plt.subplots(figsize=(12, 6))

# FN
ax.plot(fn_df["FL_DATE"], fn_df["Percentage"], color="orange", label="FN")
ax.set_title("Percentage of FN and FP out of all flights for that day - validation")
ax.set_xlabel("Date")
ax.set_ylabel("Percentage")

# FP
ax.plot(fp_df["FL_DATE"], fp_df["Percentage"], color="blue", label="FP")

# Change date format: major ticks per year, minor ticks per quarter
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
ax.xaxis.set_minor_locator(mdates.MonthLocator(bymonth=(1, 4, 7, 10)))

ax.legend()

plt.tight_layout()
plt.show()

In [0]:
# TEST ------------------------------------------------------------------------------------
# Convert to pandas and sort by date: the rows come out of a groupBy with no
# ordering, and a line plot connects points in row order.
fn_df = (
    class_percentage_test_df.filter(F.col("prediction_type") == "FN")
    .toPandas()
    .sort_values("FL_DATE")
)
fp_df = (
    class_percentage_test_df.filter(F.col("prediction_type") == "FP")
    .toPandas()
    .sort_values("FL_DATE")
)

fig, ax = plt.subplots(figsize=(12, 6))

# FN
ax.plot(fn_df["FL_DATE"], fn_df["Percentage"], color="orange", label="FN")
ax.set_title("Percentage of FN and FP out of all flights for that day  - test")
ax.set_xlabel("Date - 2019")
ax.set_ylabel("Percentage")

# FP
ax.plot(fp_df["FL_DATE"], fp_df["Percentage"], color="blue", label="FP")

# Change date format: major ticks per month, minor ticks every 5 days
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
ax.xaxis.set_minor_locator(mdates.DayLocator(interval=5))

# Annotations: dotted vertical line plus a text label per marked date
thanksgiving = datetime.date(2019, 11, 30)
ax.axvline(thanksgiving, color="gray", linestyle="dotted")
ax.text(thanksgiving, 22.1, 'Thanksgiving season')
tornadoes = datetime.date(2019, 4, 14)
ax.axvline(tornadoes, color="gray", linestyle="dotted")
ax.text(tornadoes, 21.5, 'Southern and Eastern Tornadoes \n and Severe Weather')
# NOTE(review): the cause of this peak is still unidentified (label is "?").
flooding = datetime.date(2019, 6, 20)
ax.axvline(flooding, color="gray", linestyle="dotted")
ax.text(flooding, 22.1,'  ?')

ax.legend()

plt.tight_layout()
plt.show()

There doesn't seem to be much clustering around specific dates for the training set, but for the validation set, January 2017 seems to cluster a large number of false negatives. During this month, there was a severe tornado outbreak and western storms - look into whether delays were higher than usual in those states for that month. For the test set, there is definitely a seasonal component to the false positives. 

## 3. Other variables (Based on 5YR_OTPW_s2.1_Overall_EDA notebook)

In [0]:
# Column-name groups used by the analysis cells below. Entries that are
# commented out are kept for reference but are not part of the lists.

OUTCOME = ["log_DEP_DELAY", "DEP_DELAY", "DEP_DEL15"]  # original labels
PREDICTIONS = ["prediction", "PRED_DEP_DELAY", "PRED_DELAY15"]  # prediction labels

# Numerical columns; these are the columns fed to the correlation matrices.
NUMERICAL = [
    # "HourlyWindDirection",
    "HourlyRelativeHumidity",
    "HourlyWindSpeed",
    # "WindChill",
    # "pca_time_distance",
    "pca_elevation_station_pressure",
    # "pca_altimeter_sea_level_pressure",
    "pca_dew_windchill_wet_temp",
    "pagerank",
    "incoming_flight_delay_ratio",
    "last_delay",
    "log_average_delay",
    "trend",
    "yhat",
    # "seasonality",
    # old ones, just to try
    "CRS_ELAPSED_TIME",
    "DISTANCE",
    # "ELEVATION",
    # "HourlyStationPressure",
    "HourlyAltimeterSetting",
    # "HourlySeaLevelPressure",
    # "HourlyDewPointTemperature",
    # "HourlyDryBulbTemperature",
    # "HourlyWetBulbTemperature",
]

# Categorical columns. MONTH, DAY_OF_MONTH and DAY_OF_WEEK also appear in TIME.
CATEGORICAL = [
    "origin_type",
    "origin_region",
    # "SkyDarkness",
    # "CloudHeight",
    "CloudHeightandDarkness",
    "event_flag",
    "drought_flag",
    "flooding_flag",
    # "freeze_flag",
    "severe_storm_flag",
    "tropical_cyclone_flag",
    "wildfire_flag",
    "winter_storm_flag",
    # old ones just to try
    "OP_UNIQUE_CARRIER",
    # "TAIL_NUM",
    # "OP_CARRIER_FL_NUM",
    "ORIGIN",
    "DEST",
    # "origin_station_id",
    "MONTH",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",
    # "HourlySkyConditions",
]

# Date/time related columns.
TIME = [
    "sched_depart_date_time",
    "sched_depart_date_time_UTC",
    "two_hours_prior_depart_UTC",
    "four_hours_prior_depart_UTC",
    "YEAR",
    "QUARTER",
    "MONTH",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",
    "sched_arrive_date_time_UTC",
]

# NOTE(review): presumably identifier columns used for filtering rather than
# as features - confirm against the notebook this list was copied from.
FILTER = [
    "FL_DATE",
    # "ORIGIN",
    # "DEST",
    # "OP_UNIQUE_CARRIER",
    "TAIL_NUM",
    "OP_CARRIER_FL_NUM",
    "origin_station_id",
]

In [0]:
# TRAINING -----------------------------------------------------------------------------
# Histogram of log departure delay for each prediction type.
prediction_type_delay_df_train = train_df.select(
    "prediction_type", "log_DEP_DELAY"
).toPandas()

# Boolean row masks, one per prediction type.
prediction_type_categories = {
    "FN": prediction_type_delay_df_train["prediction_type"] == "FN",
    "FP": prediction_type_delay_df_train["prediction_type"] == "FP",
    "TN": prediction_type_delay_df_train["prediction_type"] == "TN",
    "TP": prediction_type_delay_df_train["prediction_type"] == "TP",
}

fig, axs = plt.subplots(2, 2, figsize=(12, 8), sharex=True)

# Walk the 2x2 grid in row-major order. This avoids manual row/col counters,
# whose `col` name would rebind the `col` imported from pyspark.sql.functions.
for panel, (category_name, filter_condition) in zip(
    axs.flatten(), prediction_type_categories.items()
):
    filtered_data = prediction_type_delay_df_train[filter_condition]
    panel.hist(filtered_data["log_DEP_DELAY"])
    panel.set_xlabel("Departure Delay (log mins)")
    panel.set_ylabel("Frequency")
    panel.set_title(f"{category_name} - Log Departure Delay Distribution - Training")

plt.tight_layout()
plt.show()

In [0]:
# VALIDATION -----------------------------------------------------------------------------
# Histogram of log departure delay for each prediction type.
prediction_type_delay_df_val = val_df.select(
    "prediction_type", "log_DEP_DELAY"
).toPandas()

# Boolean row masks, one per prediction type.
prediction_type_categories = {
    "FN": prediction_type_delay_df_val["prediction_type"] == "FN",
    "FP": prediction_type_delay_df_val["prediction_type"] == "FP",
    "TN": prediction_type_delay_df_val["prediction_type"] == "TN",
    "TP": prediction_type_delay_df_val["prediction_type"] == "TP",
}

fig, axs = plt.subplots(2, 2, figsize=(12, 8), sharex=True)

# Walk the 2x2 grid in row-major order. This avoids manual row/col counters,
# whose `col` name would rebind the `col` imported from pyspark.sql.functions.
for panel, (category_name, filter_condition) in zip(
    axs.flatten(), prediction_type_categories.items()
):
    filtered_data = prediction_type_delay_df_val[filter_condition]
    panel.hist(filtered_data["log_DEP_DELAY"])
    panel.set_xlabel("Departure Delay (log mins)")
    panel.set_ylabel("Frequency")
    panel.set_title(f"{category_name} - Log Departure Delay Distribution - Validation")

plt.tight_layout()
plt.show()

In [0]:
# TEST -----------------------------------------------------------------------------
# Histogram of log departure delay for each prediction type.
prediction_type_delay_df_test = test_df.select(
    "prediction_type", "log_DEP_DELAY"
).toPandas()

# Boolean row masks, one per prediction type.
prediction_type_categories = {
    "FN": prediction_type_delay_df_test["prediction_type"] == "FN",
    "FP": prediction_type_delay_df_test["prediction_type"] == "FP",
    "TN": prediction_type_delay_df_test["prediction_type"] == "TN",
    "TP": prediction_type_delay_df_test["prediction_type"] == "TP",
}

fig, axs = plt.subplots(2, 2, figsize=(12, 8), sharex=True)

# Walk the 2x2 grid in row-major order. This avoids manual row/col counters,
# whose `col` name would rebind the `col` imported from pyspark.sql.functions.
for panel, (category_name, filter_condition) in zip(
    axs.flatten(), prediction_type_categories.items()
):
    filtered_data = prediction_type_delay_df_test[filter_condition]
    panel.hist(filtered_data["log_DEP_DELAY"])
    panel.set_xlabel("Departure Delay (log mins)")
    panel.set_ylabel("Frequency")
    panel.set_title(f"{category_name} - Log Departure Delay Distribution - Test")

plt.tight_layout()
plt.show()

No apparent difference in the distributions of log delays between training, validation and test for any prediction type. 

In [0]:
# TRAINING -----------------------------------------------------------------------------
# Boxplots of log departure delay per year, one panel per prediction type.
year_delay_df = train_df.select(["YEAR", "log_DEP_DELAY", "prediction_type"]).toPandas()
prediction_type_order = ["FN", "FP", "TN", "TP"]

fig, axs = plt.subplots(2, 2, figsize=(12, 8))

# Walk the 2x2 grid in row-major order. This avoids manual row/col counters,
# whose `col` name would rebind the `col` imported from pyspark.sql.functions.
for panel, pred_type in zip(axs.flatten(), prediction_type_order):
    filtered_data = year_delay_df[year_delay_df["prediction_type"] == pred_type]
    sns.boxplot(
        x="YEAR",
        y="log_DEP_DELAY",
        showmeans=True,
        data=filtered_data,
        ax=panel,
    )

    panel.set_xlabel("Year")
    panel.set_ylabel("Departure Delay (log mins)")
    panel.set_title(f"Prediction Type: {pred_type} - Training")

plt.tight_layout()
plt.show()

In [0]:
# VALIDATION -----------------------------------------------------------------------------
# Boxplots of log departure delay per year, one panel per prediction type.
year_delay_df_val = val_df.select(["YEAR", "log_DEP_DELAY", "prediction_type"]).toPandas()
prediction_type_order = ["FN", "FP", "TN", "TP"]

fig, axs = plt.subplots(2, 2, figsize=(12, 8))

# Walk the 2x2 grid in row-major order. This avoids manual row/col counters,
# whose `col` name would rebind the `col` imported from pyspark.sql.functions.
for panel, pred_type in zip(axs.flatten(), prediction_type_order):
    filtered_data = year_delay_df_val[year_delay_df_val["prediction_type"] == pred_type]
    sns.boxplot(
        x="YEAR",
        y="log_DEP_DELAY",
        showmeans=True,
        data=filtered_data,
        ax=panel,
    )

    panel.set_xlabel("Year")
    panel.set_ylabel("Departure Delay (log mins)")
    panel.set_title(f"Prediction Type: {pred_type} - Validation")

plt.tight_layout()
plt.show()

In [0]:
# TEST -----------------------------------------------------------------------------
# Boxplot of log departure delay, one panel per prediction type (no x
# grouping is applied for the test set).
year_delay_df_test = test_df.select(["log_DEP_DELAY", "prediction_type"]).toPandas()
prediction_type_order = ["FN", "FP", "TN", "TP"]

fig, axs = plt.subplots(2, 2, figsize=(4, 4))

# Walk the 2x2 grid in row-major order. This avoids manual row/col counters,
# whose `col` name would rebind the `col` imported from pyspark.sql.functions.
for panel, pred_type in zip(axs.flatten(), prediction_type_order):
    filtered_data = year_delay_df_test[year_delay_df_test["prediction_type"] == pred_type]
    sns.boxplot(
        y="log_DEP_DELAY",
        showmeans=True,
        data=filtered_data,
        ax=panel,
    )

    panel.set_ylabel("Departure Delay (log mins)")
    panel.set_title(f"Prediction Type: {pred_type} - Test")

plt.tight_layout()
plt.show()

Visualize the correlation matrix for numerical columns


In [0]:
# TRAINING---------------------------------------------------------
# Correlation matrix of log_DEP_DELAY and the numerical columns, one heatmap
# per prediction type.
train_df.cache()

# distinct() returns the types in no particular order; sorting them keeps
# each prediction type in the same panel from run to run. Null labels are
# skipped because they cannot be sorted or filtered on with ==.
prediction_types = sorted(
    row["prediction_type"]
    for row in train_df.select("prediction_type").distinct().collect()
    if row["prediction_type"] is not None
)
numerical_columns = NUMERICAL

fig, axs = plt.subplots(2, 2, figsize=(10, 10))
axs = axs.flatten()

# zip stops at the four available panels even if more types are present.
for panel, pred_type in zip(axs, prediction_types):
    subset_data = (
        train_df.filter(train_df.prediction_type == pred_type)
        .select(["log_DEP_DELAY"] + numerical_columns)
        .toPandas()
    )

    corr_matrix = subset_data.corr().round(2)
    # Hide the upper triangle (and diagonal): the matrix is symmetric.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    heatmap = sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", mask=mask, ax=panel, square=True, cbar=False)
    heatmap.set_xticklabels(heatmap.get_xticklabels(), ha='right')
    heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation=0, ha='right')

    panel.set_title(f"Correlation Matrix for {pred_type} - Training", fontsize=7)

    for label in panel.get_xticklabels() + panel.get_yticklabels():
        label.set_fontsize(6)

plt.tight_layout()
plt.show()

Some correlations are more evident in FP (vs TP):
- `incoming_flight_delay_ratio` and `DISTANCE`


Some correlations are more evident in FN (vs TN):
- `log_average_delay` with `incoming_flight_delay_ratio` and `last_delay`


In [0]:
# VALIDATION---------------------------------------------------------
# Correlation matrix of log_DEP_DELAY and the numerical columns, one heatmap
# per prediction type.
val_df.cache()

# distinct() returns the types in no particular order; sorting them keeps
# each prediction type in the same panel from run to run. Null labels are
# skipped because they cannot be sorted or filtered on with ==.
prediction_types = sorted(
    row["prediction_type"]
    for row in val_df.select("prediction_type").distinct().collect()
    if row["prediction_type"] is not None
)
numerical_columns = NUMERICAL

fig, axs = plt.subplots(2, 2, figsize=(12, 12))
axs = axs.flatten()

# zip stops at the four available panels even if more types are present.
for panel, pred_type in zip(axs, prediction_types):
    subset_data = (
        val_df.filter(val_df.prediction_type == pred_type)
        .select(["log_DEP_DELAY"] + numerical_columns)
        .toPandas()
    )

    corr_matrix = subset_data.corr().round(2)
    # Hide the upper triangle (and diagonal): the matrix is symmetric.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    heatmap = sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", mask=mask, ax=panel, square=True, cbar=False)
    heatmap.set_xticklabels(heatmap.get_xticklabels(), ha='right')
    heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation=0, ha='right')

    panel.set_title(f"Correlation Matrix for {pred_type} - Validation", fontsize=7)

    for label in panel.get_xticklabels() + panel.get_yticklabels():
        label.set_fontsize(6)

plt.tight_layout()
plt.show()

In [0]:
# TEST---------------------------------------------------------
# Correlation matrix of log_DEP_DELAY and the numerical columns, one heatmap
# per prediction type.
test_df.cache()

# distinct() returns the types in no particular order; sorting them keeps
# each prediction type in the same panel from run to run. Null labels are
# skipped because they cannot be sorted or filtered on with ==.
prediction_types = sorted(
    row["prediction_type"]
    for row in test_df.select("prediction_type").distinct().collect()
    if row["prediction_type"] is not None
)
numerical_columns = NUMERICAL

fig, axs = plt.subplots(2, 2, figsize=(12, 12))
axs = axs.flatten()

# zip stops at the four available panels even if more types are present.
for panel, pred_type in zip(axs, prediction_types):
    subset_data = (
        test_df.filter(test_df.prediction_type == pred_type)
        .select(["log_DEP_DELAY"] + numerical_columns)
        .toPandas()
    )

    corr_matrix = subset_data.corr().round(2)
    # Hide the upper triangle (and diagonal): the matrix is symmetric.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    heatmap = sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", mask=mask, ax=panel, square=True, cbar=False)
    heatmap.set_xticklabels(heatmap.get_xticklabels(), ha='right')
    heatmap.set_yticklabels(heatmap.get_yticklabels(), rotation=0, ha='right')

    panel.set_title(f"Correlation Matrix for {pred_type} - Test", fontsize=7)

    for label in panel.get_xticklabels() + panel.get_yticklabels():
        label.set_fontsize(6)

plt.tight_layout()
plt.show()

Similar correlations in TP/FP and TN/FN in training, validation and test. 

In [0]:
# TRAINING---------------------------------------------------------
# Boxplots of log departure delay per carrier, one panel per prediction type.
nominal_df = train_df.select(
    "OP_UNIQUE_CARRIER", "log_DEP_DELAY", "prediction_type"
).toPandas()
prediction_type_order = ["TP", "FP", "TN", "FN"]
# Sort so carriers appear in the same (alphabetical) order in every panel.
nominal_df = nominal_df.sort_values(by="OP_UNIQUE_CARRIER")

fig, axs = plt.subplots(2, 2, figsize=(12, 8), sharey = True)

# Walk the 2x2 grid in row-major order. This avoids manual row/col counters,
# whose `col` name would rebind the `col` imported from pyspark.sql.functions.
for panel, pred_type in zip(axs.flatten(), prediction_type_order):
    filtered_data = nominal_df[nominal_df["prediction_type"] == pred_type]
    sns.boxplot(
        x="OP_UNIQUE_CARRIER",
        y="log_DEP_DELAY",
        showmeans=True,
        data=filtered_data,
        ax=panel,
    )

    panel.set_xlabel("Unique Carrier")
    panel.set_ylabel("Departure Delay (log mins)")
    panel.set_title(f"Prediction Type: {pred_type} - Training")

plt.tight_layout()
plt.show()

- Different distributions for `HA` and `WN` in TP vs FP 
- And also somewhat in `AS` and `B6`

In [0]:
# VALIDATION---------------------------------------------------------
# Boxplots of log departure delay per carrier, one panel per prediction type.
nominal_df_val = val_df.select("OP_UNIQUE_CARRIER", "log_DEP_DELAY", "prediction_type").toPandas()
prediction_type_order = ["TP", "FP", "TN", "FN"]
# Sort so carriers appear in the same (alphabetical) order in every panel.
nominal_df_val = nominal_df_val.sort_values(by="OP_UNIQUE_CARRIER")

fig, axs = plt.subplots(2, 2, figsize=(12, 8), sharey = True)

# Walk the 2x2 grid in row-major order. This avoids manual row/col counters,
# whose `col` name would rebind the `col` imported from pyspark.sql.functions.
for panel, pred_type in zip(axs.flatten(), prediction_type_order):
    filtered_data = nominal_df_val[nominal_df_val["prediction_type"] == pred_type]
    sns.boxplot(
        x="OP_UNIQUE_CARRIER", y="log_DEP_DELAY", showmeans=True, data=filtered_data, ax=panel
    )

    panel.set_xlabel('Unique Carrier')
    panel.set_ylabel("Departure Delay (log mins)")
    panel.set_title(f'Prediction Type: {pred_type} - Validation')

plt.tight_layout()
plt.show()

In [0]:
# TEST---------------------------------------------------------
# Boxplots of log departure delay per carrier, one panel per prediction type.
nominal_df_test = test_df.select("OP_UNIQUE_CARRIER", "log_DEP_DELAY", "prediction_type").toPandas()
prediction_type_order = ["TP", "FP", "TN", "FN"]
# Sort the *test* frame. Sorting nominal_df_val here would replace the test
# data with validation data and plot the validation set under "Test" titles.
nominal_df_test = nominal_df_test.sort_values(by="OP_UNIQUE_CARRIER")

fig, axs = plt.subplots(2, 2, figsize=(12, 8),sharey=True)

# Walk the 2x2 grid in row-major order. This avoids manual row/col counters,
# whose `col` name would rebind the `col` imported from pyspark.sql.functions.
for panel, pred_type in zip(axs.flatten(), prediction_type_order):
    filtered_data = nominal_df_test[nominal_df_test["prediction_type"] == pred_type]
    sns.boxplot(
        x="OP_UNIQUE_CARRIER", y="log_DEP_DELAY", showmeans=True, data=filtered_data, ax=panel
    )

    panel.set_xlabel('Unique Carrier')
    panel.set_ylabel("Departure Delay (log mins)")
    panel.set_title(f'Prediction Type: {pred_type} - Test')

plt.tight_layout()
plt.show()

- `A6` and `NH` rank differently in FP validation than in FP training.

In [0]:
# TRAINING ---------------------------------------------------------
# Boxplots of log departure delay per origin type, one panel per prediction type.
nominal_df = train_df.select("origin_type", "log_DEP_DELAY", "prediction_type").toPandas()
prediction_type_order = ["TP", "FP", "TN", "FN"]
# Sort so origin types appear in the same order in every panel.
nominal_df = nominal_df.sort_values(by="origin_type")

fig, axs = plt.subplots(2, 2, figsize=(12, 8),sharey=True)

# Walk the 2x2 grid in row-major order. This avoids manual row/col counters,
# whose `col` name would rebind the `col` imported from pyspark.sql.functions.
for panel, pred_type in zip(axs.flatten(), prediction_type_order):
    filtered_data = nominal_df[nominal_df["prediction_type"] == pred_type]
    sns.boxplot(
        x="origin_type", y="log_DEP_DELAY", showmeans=True, data=filtered_data, ax=panel
    )

    panel.set_xlabel('Origin Type')
    panel.set_ylabel("Departure Delay (log mins)")
    panel.set_title(f'Prediction Type: {pred_type} - Training')

plt.tight_layout()
plt.show()

In [0]:
# VALIDATION---------------------------------------------------------
# Box plots of log departure delay by origin type on the validation set, one
# panel per prediction outcome.
nominal_df_val = val_df.select("origin_type", "log_DEP_DELAY", "prediction_type").toPandas()
prediction_type_order = ["TP", "FP", "TN", "FN"]
nominal_df_val = nominal_df_val.sort_values(by="origin_type")

fig, axs = plt.subplots(2, 2, figsize=(12, 8), sharey=True)

# Pair each outcome with a panel by walking the 2x2 grid row-major through
# `axs.flat`. Keeping notebook-scope `row`/`col` counters instead would rebind
# `col`, overwriting the `col` function imported from pyspark.sql.functions
# for every cell executed afterwards.
for ax, pred_type in zip(axs.flat, prediction_type_order):
    filtered_data = nominal_df_val[nominal_df_val["prediction_type"] == pred_type]
    sns.boxplot(
        x="origin_type", y="log_DEP_DELAY", showmeans=True, data=filtered_data, ax=ax
    )

    ax.set_xlabel('Origin Type')
    ax.set_ylabel("Departure Delay (log mins)")
    ax.set_title(f'Prediction Type: {pred_type} - Validation')

plt.tight_layout()
plt.show()

In [0]:
# TEST---------------------------------------------------------
# Box plots of log departure delay by origin type on the test set, one panel
# per prediction outcome.
nominal_df_test = test_df.select("origin_type", "log_DEP_DELAY", "prediction_type").toPandas()
prediction_type_order = ["TP", "FP", "TN", "FN"]
nominal_df_test = nominal_df_test.sort_values(by="origin_type")

fig, axs = plt.subplots(2, 2, figsize=(12, 8), sharey=True)

# Pair each outcome with a panel by walking the 2x2 grid row-major through
# `axs.flat`. Keeping notebook-scope `row`/`col` counters instead would rebind
# `col`, overwriting the `col` function imported from pyspark.sql.functions
# for every cell executed afterwards.
for ax, pred_type in zip(axs.flat, prediction_type_order):
    filtered_data = nominal_df_test[nominal_df_test["prediction_type"] == pred_type]
    sns.boxplot(
        x="origin_type", y="log_DEP_DELAY", showmeans=True, data=filtered_data, ax=ax
    )

    ax.set_xlabel('Origin Type')
    ax.set_ylabel("Departure Delay (log mins)")
    ax.set_title(f'Prediction Type: {pred_type} - Test')

plt.tight_layout()
plt.show()

No apparent difference in origin type between training and validation.

In [0]:
# TRAINING---------------------------------------------------------
# Box plots of log departure delay by origin region on the training set, one
# panel per prediction outcome.
nominal_df = train_df.select("origin_region", "log_DEP_DELAY", "prediction_type").toPandas()
prediction_type_order = ["TP", "FP", "TN", "FN"]
nominal_df = nominal_df.sort_values(by="origin_region")

fig, axs = plt.subplots(2, 2, figsize=(20, 10))

# Pair each outcome with a panel by walking the 2x2 grid row-major through
# `axs.flat`. Keeping notebook-scope `row`/`col` counters instead would rebind
# `col`, overwriting the `col` function imported from pyspark.sql.functions
# for every cell executed afterwards.
for ax, pred_type in zip(axs.flat, prediction_type_order):
    filtered_data = nominal_df[nominal_df["prediction_type"] == pred_type]
    sns.boxplot(
        x="origin_region", y="log_DEP_DELAY", showmeans=True, data=filtered_data, ax=ax
    )

    ax.set_xlabel('Origin Region')
    ax.set_ylabel("Departure Delay (log mins)")
    ax.set_title(f'Prediction Type: {pred_type} - Training')
    # Tilt the region labels by 45 degrees.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()

Difference in distribution for `US-MN`, `US-CO` and `US-NJ` in TP vs FP. 

In [0]:
# VALIDATION---------------------------------------------------------
# Box plots of log departure delay by origin region on the validation set,
# one panel per prediction outcome.
nominal_df_val = val_df.select("origin_region", "log_DEP_DELAY", "prediction_type").toPandas()
prediction_type_order = ["TP", "FP", "TN", "FN"]
nominal_df_val = nominal_df_val.sort_values(by="origin_region")

fig, axs = plt.subplots(2, 2, figsize=(20, 10))

# Pair each outcome with a panel by walking the 2x2 grid row-major through
# `axs.flat`. Keeping notebook-scope `row`/`col` counters instead would rebind
# `col`, overwriting the `col` function imported from pyspark.sql.functions
# for every cell executed afterwards.
for ax, pred_type in zip(axs.flat, prediction_type_order):
    filtered_data = nominal_df_val[nominal_df_val["prediction_type"] == pred_type]
    sns.boxplot(
        x="origin_region", y="log_DEP_DELAY", showmeans=True, data=filtered_data, ax=ax
    )

    ax.set_xlabel('Origin Region')
    ax.set_ylabel("Departure Delay (log mins)")
    ax.set_title(f'Prediction Type: {pred_type} - Validation')
    # Tilt the region labels by 45 degrees.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()

In [0]:
# TEST ---------------------------------------------------------
# Box plots of log departure delay by origin region on the test set, one
# panel per prediction outcome.
# Read from `test_df`: selecting from `val_df` here would make the figure
# titled "Test" a duplicate of the validation figure.
nominal_df_test = test_df.select("origin_region", "log_DEP_DELAY", "prediction_type").toPandas()
prediction_type_order = ["TP", "FP", "TN", "FN"]
nominal_df_test = nominal_df_test.sort_values(by="origin_region")

fig, axs = plt.subplots(2, 2, figsize=(20, 10))

# Pair each outcome with a panel by walking the 2x2 grid row-major through
# `axs.flat`. Keeping notebook-scope `row`/`col` counters instead would rebind
# `col`, overwriting the `col` function imported from pyspark.sql.functions
# for every cell executed afterwards.
for ax, pred_type in zip(axs.flat, prediction_type_order):
    filtered_data = nominal_df_test[nominal_df_test["prediction_type"] == pred_type]
    sns.boxplot(
        x="origin_region", y="log_DEP_DELAY", showmeans=True, data=filtered_data, ax=ax
    )

    ax.set_xlabel('Origin Region')
    ax.set_ylabel("Departure Delay (log mins)")
    ax.set_title(f'Prediction Type: {pred_type} - Test')
    # Tilt the region labels by 45 degrees.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

plt.tight_layout()
plt.show()

Similar distribution to training

- Homogeneous distributions for all cloud layers in TP/ FP


# 5. Sample cases of FP / FN

In [0]:
def _error_rate_by_state(df, pred_type):
    """Return the per-state count and percentage of one prediction outcome.

    The state is the second "-"-separated token of `origin_region`
    (e.g. "US-MN" -> "MN").

    Args:
        df: Spark DataFrame with `origin_region` and `prediction_type` columns.
        pred_type: Outcome label to count, e.g. "FN" or "FP".

    Returns:
        Spark DataFrame with columns `state`, `total_<pred_type>` and
        `percentage_<pred_type>` (share of the state's rows, in percent).
    """
    total_col = f"total_{pred_type}"
    pct_col = f"percentage_{pred_type}"
    return (
        df.withColumn("state", F.split(F.col("origin_region"), "-").getItem(1))
        .groupBy("state")
        .agg(
            # `when` without `otherwise` is null for non-matching rows and
            # `count` skips nulls, so this counts only rows of `pred_type`.
            F.count(F.when(F.col("prediction_type") == pred_type, True)).alias(total_col),
            F.count("*").alias("total_rows"),
        )
        .withColumn(pct_col, (F.col(total_col) / F.col("total_rows")) * 100)
        .select("state", total_col, pct_col)
    )


# TRAINING------------------------------------------------------------------------------------------------------
fn_percentage_df_train = _error_rate_by_state(train_df, "FN")
fp_percentage_df_train = _error_rate_by_state(train_df, "FP")

# VALIDATION------------------------------------------------------------------------------------------------------
fn_percentage_df_val = _error_rate_by_state(val_df, "FN")
fp_percentage_df_val = _error_rate_by_state(val_df, "FP")

# TEST ------------------------------------------------------------------------------------------------------
# Built from `test_df`; deriving these from `val_df` would make every
# downstream "test" table and map a copy of the validation results.
fn_percentage_df_test = _error_rate_by_state(test_df, "FN")
fp_percentage_df_test = _error_rate_by_state(test_df, "FP")

In [0]:
# GeoJSON with US state outlines (presumably the 2010 Census 20m boundary
# file, judging by its name -- confirm).
url = "https://eric.clst.org/assets/wiki/uploads/Stuff/gz_2010_us_040_00_20m.json"

# Load the outlines into a GeoDataFrame and trim stray whitespace from the
# state names so they compare cleanly when joined on later.
geoJSON_df = gpd.read_file(url)
geoJSON_df["NAME"] = geoJSON_df["NAME"].str.strip()

# Preview the first rows.
geoJSON_df.head()

In [0]:
%sql
-- Fetch the US states/territories lookup table; the next cell picks this result up through `_sqldf`.
SELECT * FROM `hive_metastore`.`default`.`us_states_territories`;

In [0]:
# Load csv for US states and territories
# NOTE: `_sqldf` is Databricks' implicit result of the latest `%sql` cell, so
# the SQL cell above must be the most recently executed SQL cell.
states = _sqldf
states_df = states.toPandas()
states_df['Name'] = states_df['Name'].str.strip()

# Left join geoJSON_df on NAME with states_df on Name
joined_df = geoJSON_df.merge(states_df, left_on='NAME', right_on='Name', how='left')
joined_df['Abbreviation'] = joined_df['Abbreviation'].str.strip()


def _rates_with_geometry(rate_sdf, geo_lookup):
    """Collect a per-state rate table to pandas and attach state geometry.

    Args:
        rate_sdf: Spark DataFrame with a `state` column of state abbreviations.
        geo_lookup: pandas/GeoPandas frame with an `Abbreviation` column.

    Returns:
        Tuple `(rate_pdf, geo_pdf)`: the collected pandas table with `state`
        stripped of whitespace, and that table left-joined to `geo_lookup`
        on `state` == `Abbreviation` (states without a match are kept).
    """
    rate_pdf = rate_sdf.toPandas()
    rate_pdf['state'] = rate_pdf['state'].str.strip()
    geo_pdf = rate_pdf.merge(geo_lookup, left_on='state', right_on='Abbreviation', how='left')
    return rate_pdf, geo_pdf


# FN - TRAINING----------------------------------------------------------------------------------------------------------
fn_percentage_pandas, geo_fn_percentage_training = _rates_with_geometry(fn_percentage_df_train, joined_df)

# FP - TRAINING----------------------------------------------------------------------------------------------------------
fp_percentage_pandas, geo_fp_percentage_training = _rates_with_geometry(fp_percentage_df_train, joined_df)

# FN - VALIDATION----------------------------------------------------------------------------------------------------------
fn_percentage_pandas_val, geo_fn_percentage_val = _rates_with_geometry(fn_percentage_df_val, joined_df)

# FP - VALIDATION----------------------------------------------------------------------------------------------------------
fp_percentage_pandas_val, geo_fp_percentage_val = _rates_with_geometry(fp_percentage_df_val, joined_df)

# FN - TEST----------------------------------------------------------------------------------------------------------
fn_percentage_pandas_test, geo_fn_percentage_test = _rates_with_geometry(fn_percentage_df_test, joined_df)

# FP - TEST----------------------------------------------------------------------------------------------------------
fp_percentage_pandas_test, geo_fp_percentage_test = _rates_with_geometry(fp_percentage_df_test, joined_df)

# Only the last expression of a cell is displayed, so a single preview is
# kept here; intermediate `.head()` calls would be discarded.
geo_fp_percentage_test.head()


In [0]:
# Summary statistics of the per-state FN table for training; presumably the
# source of the 75th-percentile colour cut-off used in the map that follows.
fn_percentage_pandas.describe()

In [0]:
# map centered on the US
map_fn_percentage = folium.Map(location=[39.50, -98.35], zoom_start=4)


# cut-off chosen from the 75th percentile of the distribution
def get_color(percentage, threshold=23):
    """
    Produces color based on range of % FN

    Args:
        percentage: FN percentage of one state.
        threshold: Cut-off in percent; values at or above it get "#e92658",
            values below it "#ef9816". The legend below hard-codes the same
            default value, so keep the two in sync.

    Returns:
        Hex color string used as the state's fill.
    """
    return "#e92658" if percentage >= threshold else "#ef9816"


for _, state_row in geo_fn_percentage_training.iterrows():
    geometry = state_row["geometry"]
    # Rows without a usable geometry cannot be drawn on the map.
    if not geometry:
        continue

    fill = get_color(state_row["percentage_FN"])
    folium.GeoJson(
        geometry,
        # `color=fill` freezes this state's color when the lambda is created;
        # a plain closure would only see the last value of the loop.
        style_function=lambda x, color=fill: {
            "fillColor": color,
            "color": "black",
            "weight": 2,
            "fillOpacity": 0.8,
        },
    ).add_to(map_fn_percentage)


# Add a legend
template = """
{% macro html(this, kwargs) %}
<div style="position: fixed;
     bottom: 50px; left: 50px; width: 130px; height: 100px;
     border:2px solid grey; z-index:9999; font-size:12px;
     background-color: white;
     opacity: 0.9;
     ">
     <p style="margin-left: 5px;">Percentage of FN per state - training</p>
     <ul style="list-style-type:none; padding-left: 5px;">
         <li><i class="fa fa-square fa-lg" style="color:#e92658"></i> >= 23%</li>
         <li><i class="fa fa-square fa-lg" style="color:#ef9816"></i> < 23%</li>
     </ul>
</div>
{% endmacro %}
"""


# Wrap the legend HTML in a macro element and attach it to the map's root.
macro = MacroElement()
macro._template = Template(template)

map_fn_percentage.get_root().add_child(macro)

# Display the map
map_fn_percentage

In [0]:
# Summary statistics of the per-state FP table for training; presumably the
# source of the 75th-percentile colour cut-off used in the map that follows.
fp_percentage_pandas.describe()

In [0]:
# map centered on the US
map_fp_percentage = folium.Map(location=[39.50, -98.35], zoom_start=4)


# cut-off chosen from the 75th percentile of the distribution
def get_color(percentage, threshold=9):
    """
    Produces color based on range of % FP

    Args:
        percentage: FP percentage of one state.
        threshold: Cut-off in percent; values at or above it get "#008f86",
            values below it "#69dab3". The legend below hard-codes the same
            default value, so keep the two in sync.

    Returns:
        Hex color string used as the state's fill.
    """
    return "#008f86" if percentage >= threshold else "#69dab3"


for _, state_row in geo_fp_percentage_training.iterrows():
    geometry = state_row["geometry"]
    # Rows without a usable geometry cannot be drawn on the map.
    if not geometry:
        continue

    fill = get_color(state_row["percentage_FP"])
    folium.GeoJson(
        geometry,
        # `color=fill` freezes this state's color when the lambda is created;
        # a plain closure would only see the last value of the loop.
        style_function=lambda x, color=fill: {
            "fillColor": color,
            "color": "black",
            "weight": 2,
            "fillOpacity": 0.8,
        },
    ).add_to(map_fp_percentage)


# Add a legend
template = """
{% macro html(this, kwargs) %}
<div style="position: fixed;
     bottom: 50px; left: 50px; width: 130px; height: 100px;
     border:2px solid grey; z-index:9999; font-size:12px;
     background-color: white;
     opacity: 0.9;
     ">
     <p style="margin-left: 5px;">Percentage of FP per state - training</p>
     <ul style="list-style-type:none; padding-left: 5px;">
         <li><i class="fa fa-square fa-lg" style="color:#008f86"></i> >= 9%</li>
         <li><i class="fa fa-square fa-lg" style="color:#69dab3"></i> < 9%</li>
     </ul>
</div>
{% endmacro %}
"""


# Wrap the legend HTML in a macro element and attach it to the map's root.
macro = MacroElement()
macro._template = Template(template)

map_fp_percentage.get_root().add_child(macro)

# Display the map
map_fp_percentage

In [0]:
# Summary statistics of the per-state FN table for validation; presumably the
# source of the 75th-percentile colour cut-off used in the map that follows.
fn_percentage_pandas_val.describe()

In [0]:
# map centered on the US
map_fn_percentage = folium.Map(location=[39.50, -98.35], zoom_start=4)


# cut-off chosen from the 75th percentile
def get_color(percentage, threshold=15):
    """
    Produces color based on range of % FN

    Args:
        percentage: FN percentage of one state.
        threshold: Cut-off in percent; values at or above it get "#e92658",
            values below it "#ef9816". The legend below hard-codes the same
            default value, so keep the two in sync.

    Returns:
        Hex color string used as the state's fill.
    """
    return "#e92658" if percentage >= threshold else "#ef9816"


for _, state_row in geo_fn_percentage_val.iterrows():
    geometry = state_row["geometry"]
    # Rows without a usable geometry cannot be drawn on the map.
    if not geometry:
        continue

    fill = get_color(state_row["percentage_FN"])
    folium.GeoJson(
        geometry,
        # `color=fill` freezes this state's color when the lambda is created;
        # a plain closure would only see the last value of the loop.
        style_function=lambda x, color=fill: {
            "fillColor": color,
            "color": "black",
            "weight": 2,
            "fillOpacity": 0.8,
        },
    ).add_to(map_fn_percentage)


# Add a legend
template = """
{% macro html(this, kwargs) %}
<div style="position: fixed;
     bottom: 50px; left: 50px; width: 130px; height: 100px;
     border:2px solid grey; z-index:9999; font-size:12px;
     background-color: white;
     opacity: 0.9;
     ">
     <p style="margin-left: 5px;">Percentage of FN per state - validation</p>
     <ul style="list-style-type:none; padding-left: 5px;">
         <li><i class="fa fa-square fa-lg" style="color:#e92658"></i> >= 15%</li>
         <li><i class="fa fa-square fa-lg" style="color:#ef9816"></i> < 15%</li>
     </ul>
</div>
{% endmacro %}
"""


# Wrap the legend HTML in a macro element and attach it to the map's root.
macro = MacroElement()
macro._template = Template(template)

map_fn_percentage.get_root().add_child(macro)

# Display the map
map_fn_percentage

In [0]:
# Summary statistics of the per-state FP table for validation; presumably the
# source of the 75th-percentile colour cut-off used in the map that follows.
fp_percentage_pandas_val.describe()

In [0]:
# map centered on the US
map_fp_percentage = folium.Map(location=[39.50, -98.35], zoom_start=4)


# cut-off chosen from the 75th percentile of the distribution
def get_color(percentage, threshold=15):
    """
    Produces color based on range of % FP

    Args:
        percentage: FP percentage of one state.
        threshold: Cut-off in percent; values at or above it get "#008f86",
            values below it "#69dab3". The legend below hard-codes the same
            default value, so keep the two in sync.

    Returns:
        Hex color string used as the state's fill.
    """
    return "#008f86" if percentage >= threshold else "#69dab3"


for _, state_row in geo_fp_percentage_val.iterrows():
    geometry = state_row["geometry"]
    # Rows without a usable geometry cannot be drawn on the map.
    if not geometry:
        continue

    fill = get_color(state_row["percentage_FP"])
    folium.GeoJson(
        geometry,
        # `color=fill` freezes this state's color when the lambda is created;
        # a plain closure would only see the last value of the loop.
        style_function=lambda x, color=fill: {
            "fillColor": color,
            "color": "black",
            "weight": 2,
            "fillOpacity": 0.8,
        },
    ).add_to(map_fp_percentage)


# Add a legend
template = """
{% macro html(this, kwargs) %}
<div style="position: fixed;
     bottom: 50px; left: 50px; width: 130px; height: 100px;
     border:2px solid grey; z-index:9999; font-size:12px;
     background-color: white;
     opacity: 0.9;
     ">
     <p style="margin-left: 5px;">Percentage of FP per state - validation</p>
     <ul style="list-style-type:none; padding-left: 5px;">
         <li><i class="fa fa-square fa-lg" style="color:#008f86"></i> >= 15%</li>
         <li><i class="fa fa-square fa-lg" style="color:#69dab3"></i> < 15%</li>
     </ul>
</div>
{% endmacro %}
"""


# Wrap the legend HTML in a macro element and attach it to the map's root.
macro = MacroElement()
macro._template = Template(template)

map_fp_percentage.get_root().add_child(macro)

# Display the map
map_fp_percentage

In [0]:
# map centered on the US
map_fp_percentage = folium.Map(location=[39.50, -98.35], zoom_start=4)


# cut-off chosen from the 75th percentile
# NOTE(review): this is the same 15% cut-off as the validation FP map --
# confirm it matches the 75th percentile of the test table.
def get_color(percentage, threshold=15):
    """
    Produces color based on range of % FP

    Args:
        percentage: FP percentage of one state.
        threshold: Cut-off in percent; values at or above it get "#008f86",
            values below it "#69dab3". The legend below hard-codes the same
            default value, so keep the two in sync.

    Returns:
        Hex color string used as the state's fill.
    """
    return "#008f86" if percentage >= threshold else "#69dab3"


for _, state_row in geo_fp_percentage_test.iterrows():
    geometry = state_row["geometry"]
    # Rows without a usable geometry cannot be drawn on the map.
    if not geometry:
        continue

    fill = get_color(state_row["percentage_FP"])
    folium.GeoJson(
        geometry,
        # `color=fill` freezes this state's color when the lambda is created;
        # a plain closure would only see the last value of the loop.
        style_function=lambda x, color=fill: {
            "fillColor": color,
            "color": "black",
            "weight": 2,
            "fillOpacity": 0.8,
        },
    ).add_to(map_fp_percentage)


# Add a legend
template = """
{% macro html(this, kwargs) %}
<div style="position: fixed;
     bottom: 50px; left: 50px; width: 130px; height: 100px;
     border:2px solid grey; z-index:9999; font-size:12px;
     background-color: white;
     opacity: 0.9;
     ">
     <p style="margin-left: 5px;">Percentage of FP per state - test</p>
     <ul style="list-style-type:none; padding-left: 5px;">
         <li><i class="fa fa-square fa-lg" style="color:#008f86"></i> >= 15%</li>
         <li><i class="fa fa-square fa-lg" style="color:#69dab3"></i> < 15%</li>
     </ul>
</div>
{% endmacro %}
"""


# Wrap the legend HTML in a macro element and attach it to the map's root.
macro = MacroElement()
macro._template = Template(template)

map_fp_percentage.get_root().add_child(macro)

# Display the map
map_fp_percentage

In [0]:
# Summary statistics of the per-state FN table for test; presumably the
# source of the 75th-percentile colour cut-off used in the map that follows.
fn_percentage_pandas_test.describe()

In [0]:
# map centered on the US
map_fn_percentage = folium.Map(location=[39.50, -98.35], zoom_start=4)


# cut-off chosen from the 75th percentile
# NOTE(review): this is the same 15% cut-off as the validation FN map --
# confirm it matches the 75th percentile of the test table.
def get_color(percentage, threshold=15):
    """
    Produces color based on range of % FN

    Args:
        percentage: FN percentage of one state.
        threshold: Cut-off in percent; values at or above it get "#e92658",
            values below it "#ef9816". The legend below hard-codes the same
            default value, so keep the two in sync.

    Returns:
        Hex color string used as the state's fill.
    """
    return "#e92658" if percentage >= threshold else "#ef9816"


for _, state_row in geo_fn_percentage_test.iterrows():
    geometry = state_row["geometry"]
    # Rows without a usable geometry cannot be drawn on the map.
    if not geometry:
        continue

    fill = get_color(state_row["percentage_FN"])
    folium.GeoJson(
        geometry,
        # `color=fill` freezes this state's color when the lambda is created;
        # a plain closure would only see the last value of the loop.
        style_function=lambda x, color=fill: {
            "fillColor": color,
            "color": "black",
            "weight": 2,
            "fillOpacity": 0.8,
        },
    ).add_to(map_fn_percentage)


# Add a legend
template = """
{% macro html(this, kwargs) %}
<div style="position: fixed;
     bottom: 50px; left: 50px; width: 130px; height: 100px;
     border:2px solid grey; z-index:9999; font-size:12px;
     background-color: white;
     opacity: 0.9;
     ">
     <p style="margin-left: 5px;">Percentage of FN per state - test</p>
     <ul style="list-style-type:none; padding-left: 5px;">
         <li><i class="fa fa-square fa-lg" style="color:#e92658"></i> >= 15%</li>
         <li><i class="fa fa-square fa-lg" style="color:#ef9816"></i> < 15%</li>
     </ul>
</div>
{% endmacro %}
"""


# Wrap the legend HTML in a macro element and attach it to the map's root.
macro = MacroElement()
macro._template = Template(template)

map_fn_percentage.get_root().add_child(macro)

# Display the map
map_fn_percentage