# Explore and Analyze the Fuser data for KATL

In [0]:
# Names of the FUSER datasets. Every name ends in "_data_set", so the list is
# built from the base names plus that shared suffix.
fuser_data_types = [
    f"{base_name}_data_set"
    for base_name in (
        'configs',
        'runways',
        'first_position',
        'TBFM',
        'TFM_track',
        'ETD',
        'LAMP',
        'MFS',
    )
]

In [0]:
# Load the KATL training tables from Hive, one Spark DataFrame per FUSER dataset.
def _katl_table(dataset_name):
    """Return the table named "fuser_train_katl_<dataset_name>" as a Spark DataFrame."""
    return spark.table(f"fuser_train_katl_{dataset_name}")


atl_runway_data_df = _katl_table("runways_data_set")
atl_configs_data_set = _katl_table("configs_data_set")
atl_first_position_data_set = _katl_table("first_position_data_set")
atl_TBFM_data_set = _katl_table("TBFM_data_set")
atl_TFM_track_data_set = _katl_table("TFM_track_data_set")
atl_ETD_data_set = _katl_table("ETD_data_set")
atl_LAMP_data_set = _katl_table("LAMP_data_set")
atl_MFS_data_set = _katl_table("MFS_data_set")

## Explore Runway Data

In [0]:
# Summary statistics of the runway table, as computed by DataFrame.describe()
runway_summary_df = atl_runway_data_df.describe()
display(runway_summary_df)

In [0]:
# Convert the Spark runway table to a pandas DataFrame for local exploration.
# NOTE(review): toPandas() collects every row onto the driver -- confirm the
# table is small enough to fit in driver memory.
atl_runway_data_pd_df = atl_runway_data_df.toPandas()

In [0]:
# List the distinct values that appear in the arrival_runway_actual column
atl_runway_data_pd_df['arrival_runway_actual'].unique()

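- `gufi` is the Globally Unique Flight Identifier, i.e. the identifier of an individual flight
- `arrival_runway_actual_time` is the actual time of arrival at the runway
- `arrival_runway_actual` is the runway actually used for the arrival
- `departure_runway_actual_time` is the actual time of departure from the runway
- `departure_runway_actual` is the runway actually used for the departure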

In [0]:
# Look at the distribution of arrival times as a 100-bin histogram.
arrival_time_ax = atl_runway_data_pd_df['arrival_runway_actual_time'].hist(bins=100)
# Label the figure so it can be read on its own when the notebook is skimmed;
# the trailing semicolon keeps the return value of set() out of the cell output.
arrival_time_ax.set(
    title='Distribution of actual arrival times',
    xlabel='arrival_runway_actual_time',
    ylabel='Number of arrivals',
);

In [0]:
# Show the raw arrival_runway_actual_time column of the Spark runway table
display(atl_runway_data_df.select('arrival_runway_actual_time'))

In [0]:
# extract time of day and day of week from arrival_runway_actual_time
from pyspark.sql.functions import hour, minute, second, dayofweek

# All derived features below are computed from this one timestamp column.
arrival_time_col = 'arrival_runway_actual_time'

atl_runway_data_df = (
    atl_runway_data_df
    .withColumn('arrival_hour', hour(arrival_time_col))
    .withColumn('arrival_minute', minute(arrival_time_col))
    .withColumn('arrival_second', second(arrival_time_col))
    .withColumn('arrival_day_of_week', dayofweek(arrival_time_col))
    # Time of day as whole minutes since 00:00 (the seconds component is not included)
    .withColumn(
        "arrival_minutes_since_midnight",
        hour(arrival_time_col) * 60 + minute(arrival_time_col),
    )
)


In [0]:
from pyspark.sql.functions import when

# Numeric day of week -> short label. The numbering matches the
# arrival_day_of_week column produced by dayofweek() (1 = Sunday ... 7 = Saturday).
day_of_week_mapping = {1: "Sun", 2: "Mon", 3: "Tue", 4: "Wed", 5: "Thu", 6: "Fri", 7: "Sat"}

# Build a single when(...).when(...) expression from the mapping so the labels
# live in one place instead of being repeated in a hand-written chain.
# There is no otherwise() branch, so rows whose day number matches no key get null.
arrival_day_of_week_label = None
for day_number, day_label in day_of_week_mapping.items():
    is_this_day = atl_runway_data_df['arrival_day_of_week'] == day_number
    # Compare against None by identity: a Column cannot be used as a boolean.
    if arrival_day_of_week_label is None:
        arrival_day_of_week_label = when(is_this_day, day_label)
    else:
        arrival_day_of_week_label = arrival_day_of_week_label.when(is_this_day, day_label)

atl_runway_data_df = atl_runway_data_df.withColumn(
    'arrival_day_of_week_str',
    arrival_day_of_week_label,
)

In [0]:
# Display the weekday label of every arrival. The code only selects the column;
# the distribution is shown by the Databricks visualization on this cell's output.
display(atl_runway_data_df.select('arrival_day_of_week_str'))

Databricks visualization. Run in Databricks to view.

In [0]:
# Display each arrival's time of day as minutes since midnight. The code only
# selects the column; the distribution is shown by the Databricks visualization
# on this cell's output.
display(atl_runway_data_df.select('arrival_minutes_since_midnight'))

Databricks visualization. Run in Databricks to view.

In [0]:
# Show minutes since midnight next to the same value divided by 60 (hours).
arrival_hours_col = (
    atl_runway_data_df['arrival_minutes_since_midnight'] / 60
).alias('arrival_hours')

display(
    atl_runway_data_df.select('arrival_minutes_since_midnight', arrival_hours_col)
)

Databricks visualization. Run in Databricks to view.