In [40]:
import datetime as dt
import os
import re
from math import trunc
from pathlib import Path

import numpy as np
import pandas as pd
from psycopg2 import sql
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB

import assets.script as scr

### Example `script.py` Use

In [41]:
# 100 rows, each a vector of 10 standard-normal draws
x = [np.random.randn(10) for _ in range(100)]
# one integer target per row, drawn from [1, 100)
y = np.random.randint(low=1, high=100, size=100)

In [42]:
# Ten named feature columns plus the integer `target` column, joined on the row index.
df = pd.DataFrame(
    x,
    columns=['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine'],
).join(pd.Series(y, name='target'))

In [43]:
# Split `df` with the project helper; it returns four objects, unpacked here as
# train/test features and train/test targets. The recorded run logged 80
# training and 20 testing samples and reported the data as unscaled.
X_train, X_test, y_train, y_test = scr.split_data(df, target='target')


No columns dropped.

Target values: [94 29 78 96 59 49 85 87 83  9 43 95 15 73 22 27  5 70 76 54 36 48 27 73
  1 85 57 97 61 97 14 79 21 42 62 99 38  4 52 73 33 97 96 98 84 14 28 31
 49 75 19 35 34 82 51 71 65 23 14 42 14 24 36 80 36 54  6 54 61 83 95  9
 59 80 85 34  7 37 54 32 15 18 31 73 29  8 96 68 44 49 35 69 36 29 86 75
 63 58  1 36] 

Column(s) remaining: Index(['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
       'nine', 'target'],
      dtype='object')

Data is unscaled.
training sample size:  80
testing sample size:  20


In [44]:
# Pass an unfitted GaussianNB plus the split data to the project helper, which
# returns two values. NOTE(review): presumably predictions and class
# probabilities for X_test — confirm in assets/script.py.
y_pred, y_prob = scr.get_predictions(GaussianNB(), X_train, y_train, X_test)

## Midterm Project

#### Goals

The goal is to `predict arrival delays` of commercial flights. Often, there isn't much airlines can do to avoid delays, which are often costly. It is critical for airlines to estimate flight delays as accurately as possible because the results can be applied to improving customer satisfaction and the income of airline agencies.

#### SQL Query

In [45]:
# Show the table names reported by the project helper (displayed as a Series).
scr.sql_read_tables()

0             flights
1        flights_test
2    fuel_comsumption
3         pass_sample
4          passengers
5              sample
6           temptable
7          test_table
dtype: object

In [46]:
table_name = 'passengers'
limit = 200000

# Compose the SELECT with psycopg2's sql module: the table name is bound as an
# identifier and the row limit as a literal. The whitespace inside the string
# literal is part of the query text and is kept exactly as written.
query = sql.SQL(
    "SELECT * \
        FROM {table} \
        LIMIT {limit};"
).format(
    limit=sql.Literal(limit),
    table=sql.Identifier(table_name),
)

# limit/1000 is a float, so the generated name contains ".0" (e.g. "200.0k").
filename = '{}_{}k_sample.csv'.format(table_name, limit / 1000)

In [47]:
# Hand the query and target file name to the project helper. In the recorded
# run, with overwrite=False, it printed "File exists. Returning DataFrame..."
# and displayed the returned frame.
# NOTE(review): presumably it executes `query` and writes the CSV when the file
# is absent — confirm in assets/script.py.
scr.make_csv(query, filename, overwrite=False)

File exists. Returning DataFrame...


Unnamed: 0,departures_scheduled,departures_performed,payload,seats,passengers,freight,mail,distance,ramp_to_ramp,air_time,unique_carrier,airline_id,unique_carrier_name,region,carrier,carrier_name,carrier_group,carrier_group_new,origin_airport_id,origin_city_market_id,origin,origin_city_name,origin_country,origin_country_name,dest_airport_id,dest_city_market_id,dest,dest_city_name,dest_country,dest_country_name,aircraft_group,aircraft_type,aircraft_config,year,month,distance_group,class,data_source
0,15.0,15.0,944625.0,0.0,0.0,432992.0,0.0,812.0,1895.0,1590.0,FX,20107,Federal Express Corporation,D,FX,Federal Express Corporation,3,3,13244,33244,MEM,"Memphis, TN",US,United States,16271,36106,YYZ,"Toronto, Canada",CA,Canada,6,622,2,2019,9,2,G,IU
1,15.0,15.0,52500.0,0.0,0.0,19850.0,0.0,293.0,1650.0,1504.0,FX,20107,Federal Express Corporation,D,FX,Federal Express Corporation,3,3,13342,33342,MKE,"Milwaukee, WI",US,United States,11076,31076,CMX,"Hancock/Houghton, MI",US,United States,4,416,2,2019,9,1,G,DU
2,15.0,15.0,944962.0,0.0,0.0,634021.0,0.0,238.0,925.0,652.0,FX,20107,Federal Express Corporation,D,FX,Federal Express Corporation,3,3,13342,33342,MKE,"Milwaukee, WI",US,United States,12339,32337,IND,"Indianapolis, IN",US,United States,6,622,2,2019,9,1,G,DU
3,15.0,15.0,52500.0,0.0,0.0,20405.0,0.0,206.0,1262.0,1132.0,FX,20107,Federal Express Corporation,D,FX,Federal Express Corporation,3,3,13461,33461,MQY,"Smyrna, TN",US,United States,13244,33244,MEM,"Memphis, TN",US,United States,4,416,2,2019,9,1,G,DU
4,15.0,15.0,943398.0,0.0,0.0,432818.0,0.0,284.0,1021.0,756.0,FX,20107,Federal Express Corporation,D,FX,Federal Express Corporation,3,3,13485,33485,MSN,"Madison, WI",US,United States,12339,32337,IND,"Indianapolis, IN",US,United States,6,622,2,2019,9,1,G,DU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0.0,5.0,107500.0,0.0,0.0,0.0,0.0,304.0,359.0,268.0,U7,20447,USA Jet Airlines Inc.,D,U7,USA Jet Airlines Inc.,2,2,14730,33044,SDF,"Louisville, KY",US,United States,16091,31295,YIP,"Detroit, MI",US,United States,6,635,2,2018,4,1,P,DU
199996,0.0,2.0,60000.0,0.0,0.0,0.0,0.0,304.0,140.0,107.0,U7,20447,USA Jet Airlines Inc.,D,U7,USA Jet Airlines Inc.,2,2,14730,33044,SDF,"Louisville, KY",US,United States,16091,31295,YIP,"Detroit, MI",US,United States,6,640,2,2018,4,1,P,DU
199997,0.0,1.0,36000.0,0.0,0.0,0.0,0.0,304.0,64.0,55.0,U7,20447,USA Jet Airlines Inc.,D,U7,USA Jet Airlines Inc.,2,2,14730,33044,SDF,"Louisville, KY",US,United States,16091,31295,YIP,"Detroit, MI",US,United States,6,655,2,2018,4,1,P,DU
199998,0.0,5.0,28000.0,0.0,0.0,0.0,0.0,304.0,320.0,270.0,U7,20447,USA Jet Airlines Inc.,D,U7,USA Jet Airlines Inc.,2,2,14730,33044,SDF,"Louisville, KY",US,United States,16091,31295,YIP,"Detroit, MI",US,United States,6,681,2,2018,4,1,P,DU


# TABLES

### Table `flights`

In [48]:
# Read the cached `flights` extract from ./data.
table_name = 'flights'
limit = 1_000_000
# limit/1000 is a float, so the file name contains ".0" (e.g. "1000.0k").
filename = '{}_{}k_sample.csv'.format(table_name, limit / 1000)

df_flights = pd.read_csv(Path('./data').joinpath(filename))

In [49]:
# Column names on the first line, (rows, columns) on the second.
print(list(df_flights.columns), df_flights.shape, sep='\n')

['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier', 'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num', 'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name', 'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time', 'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled', 'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 'flights', 'distance', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay', 'first_dep_time', 'total_add_gtime', 'longest_add_gtime', 'no_name']
(1000000, 42)


- **`fl_date`**: Flight Date (yyyy-mm-dd)
- **`mkt_unique_carrier`**: Unique Marketing Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users, for example, PA, PA(1), PA(2). Use this field for analysis across a range of years.
- **`branded_code_share`**: Reporting Carrier Operated or Branded Code Share Partners
- **`mkt_carrier`**: Code assigned by IATA and commonly used to identify a carrier. As the same code may have been assigned to different carriers over time, the code is not always unique. For analysis, use the Unique Carrier Code.
- **`mkt_carrier_fl_num`**: Flight Number
- **`op_unique_carrier`**: Unique Scheduled Operating Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users, for example, PA, PA(1), PA(2). Use this field for analysis across a range of years.
- **`tail_num`**: Tail Number
- **`op_carrier_fl_num`**: Flight Number
- **`origin_airport_id`**: Origin Airport, Airport ID. An identification number assigned by US DOT to identify a unique airport. Use this field for airport analysis across a range of years because an airport can change its airport code and airport codes can be reused.
- **`origin`**: Origin Airport
- **`origin_city_name`**: Origin Airport, City Name
- **`dest_airport_id`**: Destination Airport, Airport ID. An identification number assigned by US DOT to identify a unique airport. Use this field for airport analysis across a range of years because an airport can change its airport code and airport codes can be reused.
- **`dest`**: Destination Airport
- **`dest_city_name`**: Destination Airport, City Name
- **`crs_dep_time`**: CRS Departure Time (local time: hhmm)
- **`dep_time`**: Actual Departure Time (local time: hhmm)
- **`dep_delay`**: Difference in minutes between scheduled and actual departure time. Early departures show negative numbers.	
- **`taxi_out`**: Taxi Out Time, in Minutes
- **`wheels_off`**: Wheels Off Time (local time: hhmm)
- **`wheels_on`**: Wheels On Time (local time: hhmm)
- **`taxi_in`**: 	Taxi In Time, in Minutes
- **`crs_arr_time`**: CRS Arrival Time (local time: hhmm)
- **`arr_time`**: Actual Arrival Time (local time: hhmm)
- **`arr_delay`**: Difference in minutes between scheduled and actual arrival time. Early arrivals show negative numbers.
- **`cancelled`**: Cancelled Flight Indicator (1=Yes)
- **`cancellation_code`**: Specifies The Reason For Cancellation
- **`diverted`**: Diverted Flight Indicator (1=Yes)
- **`dup`**: Duplicate flag marked Y if the flight is swapped based on Form-3A data
- **`crs_elapsed_time`**: CRS Elapsed Time of Flight, in Minutes
- **`actual_elapsed_time`**: Elapsed Time of Flight, in Minutes
- **`air_time`**: Flight Time, in Minutes
- **`flights`**: Number of Flights
- **`distance`**: Distance between airports (miles)
- **`carrier_delay`**: Carrier Delay, in Minutes
- **`weather_delay`**: Weather Delay, in Minutes
- **`nas_delay`**: National Air System Delay, in Minutes
- **`security_delay`**: Security Delay, in Minutes
- **`late_aircraft_delay`**: Late Aircraft Delay, in Minutes
- **`first_dep_time`**: First Gate Departure Time at Origin Airport
- **`total_add_gtime`**: Total Ground Time Away from Gate for Gate Return or Cancelled Flight
- **`longest_add_gtime`**: Longest Time Away from Gate for Gate Return or Cancelled Flight

### Table `passengers`

In [50]:
# Read the cached `passengers` extract from ./data.
table_name = 'passengers'
limit = 1_000_000
# limit/1000 is a float, so the file name contains ".0" (e.g. "1000.0k").
filename = '{}_{}k_sample.csv'.format(table_name, limit / 1000)

df_passengers = pd.read_csv(Path('./data').joinpath(filename))

In [51]:
# Column names on the first line, (rows, columns) on the second.
print(list(df_passengers.columns), df_passengers.shape, sep='\n')

['departures_scheduled', 'departures_performed', 'payload', 'seats', 'passengers', 'freight', 'mail', 'distance', 'ramp_to_ramp', 'air_time', 'unique_carrier', 'airline_id', 'unique_carrier_name', 'region', 'carrier', 'carrier_name', 'carrier_group', 'carrier_group_new', 'origin_airport_id', 'origin_city_market_id', 'origin', 'origin_city_name', 'origin_country', 'origin_country_name', 'dest_airport_id', 'dest_city_market_id', 'dest', 'dest_city_name', 'dest_country', 'dest_country_name', 'aircraft_group', 'aircraft_type', 'aircraft_config', 'year', 'month', 'distance_group', 'class', 'data_source']
(1000000, 38)


- **`departures_scheduled`**: Departures Scheduled
- **`departures_performed`**: Departures Performed
- **`payload`**: Available Payload (pounds)
  - **`seats`**: Available Seats
- **`passengers`**: Non-Stop Segment Passengers Transported
  - **`freight`**: Non-Stop Segment Freight Transported (pounds)
  - **`mail`**: Non-Stop Segment Mail Transported (pounds)
- **`distance`**: Distance between airports (miles)
- **`ramp_to_ramp`**: Ramp to Ramp Time (minutes)
  - **`air_time`**: Airborne Time (minutes)
- **`unique_carrier`**: Unique Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users, for example, PA, PA(1), PA(2). Use this field for analysis across a range of years.
- **`airline_id`**: An identification number assigned by US DOT to identify a unique airline (carrier). A unique airline (carrier) is defined as one holding and reporting under the same DOT certificate regardless of its Code, Name, or holding company/corporation.
- **`unique_carrier_name`**: Unique Carrier Name. When the same name has been used by multiple carriers, a numeric suffix is used for earlier users, for example, Air Caribbean, Air Caribbean (1).
- **`region`**: Carrier's Operation Region. Carriers Report Data by Operation Region
- **`carrier`**: Code assigned by IATA and commonly used to identify a carrier. As the same code may have been assigned to different carriers over time, the code is not always unique. For analysis, use the Unique Carrier Code.
- **`carrier_name`**: Carrier Name
- **`carrier_group`**: Carrier Group Code
- **`carrier_group_new`**: Carrier Group New
- **`origin_airport_id`**: Origin Airport, Airport ID. An identification number assigned by US DOT to identify a unique airport. Use this field for airport analysis across a range of years because an airport can change its airport code and airport codes can be reused.
- **`origin_city_market_id`**: Origin Airport, City Market ID. City Market ID is an identification number assigned by US DOT to identify a city market. Use this field to consolidate airports serving the same city market.	
- **`origin`**: Origin Airport
- **`origin_city_name`**: Origin City
- **`origin_country`**: Origin Country Code
- **`origin_country_name`**: Origin Country
- **`dest_airport_id`**: Destination Airport, Airport ID. An identification number assigned by US DOT to identify a unique airport. Use this field for airport analysis across a range of years because an airport can change its airport code and airport codes can be reused.
- **`dest_city_market_id`**: Destination Airport, City Market ID. City Market ID is an identification number assigned by US DOT to identify a city market. Use this field to consolidate airports serving the same city market.
- **`dest`**: Destination Airport
- **`dest_city_name`**: Destination City
- **`dest_country`**: Destination Country Code
- **`dest_country_name`**: Destination Country
- **`aircraft_group`**: Aircraft Group
- **`aircraft_type`**: Aircraft Type
- **`aircraft_config`**: Aircraft Configuration
- **`month`**: Month
- **`year`**: Year
- **`distance_group`**: Distance Intervals, every 500 Miles, for Flight Segment
- **`class`**: Service Class

### Table `pass_sample` (`passengers` samples)

In [52]:
# Read the cached `pass_sample` extract from ./data.
table_name = 'pass_sample'
limit = 1_000_000
# limit/1000 is a float, so the file name contains ".0" (e.g. "1000.0k").
filename = '{}_{}k_sample.csv'.format(table_name, limit / 1000)

df_pass_sample = pd.read_csv(Path('./data').joinpath(filename))

In [53]:
# Column names, then the shape of the sample, then the shape of `passengers` for comparison.
print(
    list(df_pass_sample.columns),
    df_pass_sample.shape,
    df_passengers.shape,
    sep='\n',
)

['departures_scheduled', 'departures_performed', 'payload', 'seats', 'passengers', 'freight', 'mail', 'distance', 'ramp_to_ramp', 'air_time', 'unique_carrier', 'airline_id', 'unique_carrier_name', 'region', 'carrier', 'carrier_name', 'carrier_group', 'carrier_group_new', 'origin_airport_id', 'origin_city_market_id', 'origin', 'origin_city_name', 'origin_country', 'origin_country_name', 'dest_airport_id', 'dest_city_market_id', 'dest', 'dest_city_name', 'dest_country', 'dest_country_name', 'aircraft_group', 'aircraft_type', 'aircraft_config', 'year', 'month', 'distance_group', 'class', 'data_source']
(156448, 38)
(1000000, 38)


### Table `fuel_comsumption`

In [54]:
# Read the cached `fuel_comsumption` extract from ./data (spelling follows the table name).
table_name = 'fuel_comsumption'
limit = 1_000_000
# limit/1000 is a float, so the file name contains ".0" (e.g. "1000.0k").
filename = '{}_{}k_sample.csv'.format(table_name, limit / 1000)

df_fuel_comsumption = pd.read_csv(Path('./data').joinpath(filename))

In [55]:
# Column names on the first line, (rows, columns) on the second.
print(list(df_fuel_comsumption.columns), df_fuel_comsumption.shape, sep='\n')

['month', 'airline_id', 'unique_carrier', 'carrier', 'carrier_name', 'carrier_group_new', 'sdomt_gallons', 'satl_gallons', 'spac_gallons', 'slat_gallons', 'sint_gallons', 'ts_gallons', 'tdomt_gallons', 'tint_gallons', 'total_gallons', 'sdomt_cost', 'satl_cost', 'spac_cost', 'slat_cost', 'sint_cost', 'ts_cost', 'tdomt_cost', 'tint_cost', 'total_cost', 'year']
(3035, 25)


- **`month`**: Month
- **`airline_id`**: An identification number assigned by US DOT to identify a unique airline (carrier). A unique airline (carrier) is defined as one holding and reporting under the same DOT certificate regardless of its Code, Name, or holding company/corporation.
- **`unique_carrier`**: Unique Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users, for example, PA, PA(1), PA(2). Use this field for analysis across a range of years.
- **`carrier`**: Code assigned by IATA and commonly used to identify a carrier. As the same code may have been assigned to different carriers over time, the code is not always unique. For analysis, use the Unique Carrier Code.
- **`carrier_name`**: Carrier Name
- **`carrier_group_new`**: Carrier Group New
- **`sdomt_gallons`**: Total Scheduled Domestic, Fuel Consumption (Gallons)
- **`satl_gallons`**: Scheduled Service International Atlantic - Fuel Consumption (Gallons)
- **`spac_gallons`**: Scheduled Service International Pacific - Fuel Consumption (Gallons)
- **`slat_gallons`**: Scheduled Service International Latin America - Fuel Consumption (Gallons)
- **`sint_gallons`**: Scheduled Service International Subtotal - Fuel Consumption (Gallons)
- **`ts_gallons`**: Total Scheduled Service - Fuel Consumption (Gallons)
- **`tdomt_gallons`**: Total Domestic - Fuel Consumption (Gallons)
- **`tint_gallons`**: Total International - Fuel Consumption (Gallons)
- **`total_gallons`**: Grand Total - Fuel Consumption (Gallons)
- **`sdomt_cost`**: Total Scheduled Domestic, Fuel Cost (Dollars)
- **`satl_cost`**: Scheduled Service International Atlantic - Fuel Cost (Dollars)
- **`spac_cost`**: Scheduled Service International Pacific - Fuel Cost (Dollars)
- **`slat_cost`**: Scheduled Service International Latin America - Fuel Cost (Dollars)
- **`sint_cost`**: Scheduled Service International Subtotal - Fuel Cost (Dollars)
- **`ts_cost`**: Total Scheduled Service - Fuel Cost (Dollars)
- **`tdomt_cost`**: Total Domestic - Fuel Cost (Dollars)
- **`tint_cost`**: Total International - Fuel Cost (Dollars)
- **`total_cost`**: Grand Total - Fuel Cost (Dollars)
- **`year`**: year

### Table `flights_test` (`flights` sample for prediction - January 2020)

In [56]:
# Read the cached `flights_test` extract from ./data.
table_name = 'flights_test'
limit = 1_000_000
# limit/1000 is a float, so the file name contains ".0" (e.g. "1000.0k").
filename = '{}_{}k_sample.csv'.format(table_name, limit / 1000)

df_flights_test = pd.read_csv(Path('./data').joinpath(filename))

In [57]:
# Column names, then the shape of the test table, then the shape of `flights` for comparison.
print(
    list(df_flights_test.columns),
    df_flights_test.shape,
    df_flights.shape,
    sep='\n',
)
# Last ten rows as the cell's rich output.
df_flights_test.tail(10)

['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier', 'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num', 'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name', 'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'crs_arr_time', 'dup', 'crs_elapsed_time', 'flights', 'distance']
(660556, 20)
(1000000, 42)


Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
660546,2020-01-31,DL,DL_CODESHARE,DL,4950,9E,N228PQ,4950,10397,ATL,"Atlanta, GA",15323,TRI,"Bristol/Johnson City/Kingsport, TN",2225,2331,N,66,1,227
660547,2020-01-31,DL,DL_CODESHARE,DL,4952,9E,N600LR,4952,12953,LGA,"New York, NY",13342,MKE,"Milwaukee, WI",1735,1921,N,166,1,738
660548,2020-01-31,DL,DL_CODESHARE,DL,4953,9E,N655CA,4953,12953,LGA,"New York, NY",14524,RIC,"Richmond, VA",1642,1825,N,103,1,292
660549,2020-01-31,DL,DL_CODESHARE,DL,4953,9E,N655CA,4953,14524,RIC,"Richmond, VA",12953,LGA,"New York, NY",1900,2033,N,93,1,292
660550,2020-01-31,DL,DL_CODESHARE,DL,4954,9E,N926XJ,4954,11193,CVG,"Cincinnati, OH",11278,DCA,"Washington, DC",1655,1824,N,89,1,411
660551,2020-01-31,DL,DL_CODESHARE,DL,4954,9E,N926XJ,4954,11278,DCA,"Washington, DC",11193,CVG,"Cincinnati, OH",1859,2041,N,102,1,411
660552,2020-01-31,DL,DL_CODESHARE,DL,4955,9E,N309PQ,4955,11278,DCA,"Washington, DC",11193,CVG,"Cincinnati, OH",1515,1702,N,107,1,411
660553,2020-01-31,DL,DL_CODESHARE,DL,4956,9E,N324PQ,4956,12478,JFK,"New York, NY",10785,BTV,"Burlington, VT",2205,2337,N,92,1,266
660554,2020-01-31,DL,DL_CODESHARE,DL,4957,9E,N132EV,4957,13930,ORD,"Chicago, IL",12478,JFK,"New York, NY",1035,1356,N,141,1,740
660555,2020-01-31,DL,DL_CODESHARE,DL,4958,9E,N390CA,4958,12953,LGA,"New York, NY",11057,CLT,"Charlotte, NC",2005,2222,N,137,1,544


### Table `sample` (`flights` samples)

In [58]:
# Read the cached `sample` extract from ./data.
table_name = 'sample'
limit = 1_000_000
# limit/1000 is a float, so the file name contains ".0" (e.g. "1000.0k").
filename = '{}_{}k_sample.csv'.format(table_name, limit / 1000)

df_sample = pd.read_csv(Path('./data').joinpath(filename))

  df_sample = pd.read_csv(Path('./data') / filename)


In [59]:
# Column names, then the shape of the sample, then the shape of `flights` for comparison.
print(
    list(df_sample.columns),
    df_sample.shape,
    df_flights.shape,
    sep='\n',
)

['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier', 'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num', 'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name', 'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time', 'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled', 'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 'flights', 'distance', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay', 'first_dep_time', 'total_add_gtime', 'longest_add_gtime', 'no_name']
(1000000, 42)
(1000000, 42)



The `flights_test` table consists of a subset of the columns from table `flights`. It represents flights from January 2020, which will be used for evaluation. Therefore, it is missing the features that we are not supposed to know before the flight lands.

- **`fl_date`**: Flight Date (yyyy-mm-dd)
- **`mkt_unique_carrier`**: Unique Marketing Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users, for example, PA, PA(1), PA(2). Use this field for analysis across a range of years.
- **`branded_code_share`**: Reporting Carrier Operated or Branded Code Share Partners
- **`mkt_carrier`**: Code assigned by IATA and commonly used to identify a carrier. As the same code may have been assigned to different carriers over time, the code is not always unique. For analysis, use the Unique Carrier Code.
- **`mkt_carrier_fl_num`**: Flight Number
- **`op_unique_carrier`**: Unique Scheduled Operating Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users, for example, PA, PA(1), PA(2). Use this field for analysis across a range of years.
- **`tail_num`**: Tail Number
- **`op_carrier_fl_num`**: Flight Number
- **`origin_airport_id`**: Origin Airport, Airport ID. An identification number assigned by US DOT to identify a unique airport. Use this field for airport analysis across a range of years because an airport can change its airport code and airport codes can be reused.
- **`origin`**: Origin Airport
- **`origin_city_name`**: Origin Airport, City Name
- **`dest_airport_id`**: Destination Airport, Airport ID. An identification number assigned by US DOT to identify a unique airport. Use this field for airport analysis across a range of years because an airport can change its airport code and airport codes can be reused.
- **`dest`**: Destination Airport
- **`dest_city_name`**: Destination Airport, City Name
- **`crs_dep_time`**: CRS Departure Time (local time: hhmm)
- **`crs_arr_time`**: CRS Arrival Time (local time: hhmm)
- **`dup`**: Duplicate flag marked Y if the flight is swapped based on Form-3A data
- **`crs_elapsed_time`**: CRS Elapsed Time of Flight, in Minutes
- **`flights`**: Number of Flights
- **`distance`**: Distance between airports (miles)

### Table `temptable` (`flights` but different samples)

In [60]:
# Read the cached `temptable` extract from ./data.
table_name = 'temptable'
limit = 1_000_000
# limit/1000 is a float, so the file name contains ".0" (e.g. "1000.0k").
filename = '{}_{}k_sample.csv'.format(table_name, limit / 1000)

df_temptable = pd.read_csv(Path('./data').joinpath(filename))

  df_temptable = pd.read_csv(Path('./data') / filename)


In [61]:
# Column names, then the shape of this table, then the shape of `flights` for comparison.
print(
    list(df_temptable.columns),
    df_temptable.shape,
    df_flights.shape,
    sep='\n',
)
# First ten rows as the cell's rich output.
df_temptable.head(10)

['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier', 'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num', 'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name', 'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time', 'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled', 'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time', 'actual_elapsed_time', 'air_time', 'flights', 'distance', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay', 'first_dep_time', 'total_add_gtime', 'longest_add_gtime', 'no_name']
(785850, 42)
(1000000, 42)


Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2019-12-19,AA,AA,AA,885,AA,N961UW,885,10397,ATL,"Atlanta, GA",12953,LGA,"New York, NY",1536,1529.0,-7.0,30.0,1559.0,1743.0,5.0,1744,1748.0,4.0,0.0,,0.0,N,128.0,139.0,104.0,1.0,762.0,,,,,,,,,
1,2019-12-19,AA,AA,AA,890,AA,N317PG,890,10397,ATL,"Atlanta, GA",12892,LAX,"Los Angeles, CA",1819,1834.0,15.0,11.0,1845.0,2159.0,18.0,2040,2217.0,,0.0,,1.0,N,321.0,,,1.0,1947.0,,,,,,,,,
2,2019-12-19,AA,AA,AA,1309,AA,N956AN,1309,10397,ATL,"Atlanta, GA",11298,DFW,"Dallas/Fort Worth, TX",1412,1422.0,10.0,15.0,1437.0,1535.0,12.0,1545,1547.0,2.0,0.0,,0.0,N,153.0,145.0,118.0,1.0,731.0,,,,,,,,,
3,2019-12-19,AA,AA,AA,1365,AA,N828AW,1365,10397,ATL,"Atlanta, GA",11298,DFW,"Dallas/Fort Worth, TX",1224,1226.0,2.0,11.0,1237.0,1335.0,14.0,1355,1349.0,-6.0,0.0,,0.0,N,151.0,143.0,118.0,1.0,731.0,,,,,,,,,
4,2019-12-19,AA,AA,AA,1409,AA,N979NN,1409,10397,ATL,"Atlanta, GA",12892,LAX,"Los Angeles, CA",1608,1612.0,4.0,9.0,1621.0,1748.0,5.0,1824,1753.0,-31.0,0.0,,0.0,N,316.0,281.0,267.0,1.0,1947.0,,,,,,,,,
5,2019-12-19,DL,DL_CODESHARE,DL,5435,9E,N922XJ,5435,10397,ATL,"Atlanta, GA",12197,HPN,"White Plains, NY",730,725.0,-5.0,14.0,739.0,927.0,6.0,943,933.0,-10.0,0.0,,0.0,N,133.0,128.0,108.0,1.0,780.0,,,,,,,,,
6,2019-12-19,DL,DL_CODESHARE,DL,5452,9E,N914XJ,5452,10397,ATL,"Atlanta, GA",10408,ATW,"Appleton, WI",1501,1456.0,-5.0,11.0,1507.0,1553.0,7.0,1620,1600.0,-20.0,0.0,,0.0,N,139.0,124.0,106.0,1.0,765.0,,,,,,,,,
7,2019-12-19,DL,DL_CODESHARE,DL,5502,9E,N368CA,5502,10397,ATL,"Atlanta, GA",11003,CID,"Cedar Rapids/Iowa City, IA",915,905.0,-10.0,18.0,923.0,1005.0,5.0,1033,1010.0,-23.0,0.0,,0.0,N,138.0,125.0,102.0,1.0,694.0,,,,,,,,,
8,2019-12-19,DL,DL_CODESHARE,DL,5514,9E,N295PQ,5514,10397,ATL,"Atlanta, GA",14814,SHV,"Shreveport, LA",1953,1949.0,-4.0,10.0,1959.0,2029.0,5.0,2058,2034.0,-24.0,0.0,,0.0,N,125.0,105.0,90.0,1.0,551.0,,,,,,,,,
9,2019-12-19,DL,DL_CODESHARE,DL,5015,9E,N331PQ,5015,10397,ATL,"Atlanta, GA",15412,TYS,"Knoxville, TN",1738,1806.0,28.0,13.0,1819.0,1849.0,3.0,1837,1852.0,15.0,0.0,,0.0,N,59.0,46.0,30.0,1.0,152.0,15.0,0.0,0.0,0.0,0.0,,,,


### Table `test_table` (no data)

In [62]:
# Read the cached `test_table` extract from ./data.
table_name = 'test_table'
limit = 1_000_000
# limit/1000 is a float, so the file name contains ".0" (e.g. "1000.0k").
filename = '{}_{}k_sample.csv'.format(table_name, limit / 1000)

df_test_table = pd.read_csv(Path('./data').joinpath(filename))

In [63]:
# Column names on the first line, (rows, columns) on the second.
print(list(df_test_table.columns), df_test_table.shape, sep='\n')

['testid']
(0, 1)


# Predict Flight Delay (Target: `arr_delay` in `flights` table)

In [64]:
# Consolidate service classes: relabel every 'L' row as 'G'.
# A single boolean-mask assignment replaces the former per-row loop over
# Series.iteritems(), which was removed in pandas 2.0 and performed one .loc
# write per matching row.
df_passengers.loc[df_passengers['class'] == 'L', 'class'] = 'G'

# Show the remaining distinct class codes.
df_passengers['class'].unique()

array(['G', 'P', 'F'], dtype=object)

In [65]:
# take 1000 random samples from each month and return to dataframe
# not overwriting will still return the dataframe
#
# The twelve monthly frames (each sorted by flight date) are collected first
# and concatenated once; calling pd.concat inside the loop re-copied the
# accumulated frame on every iteration.
monthly_frames = [
    scr.sql_search_date(table='flights', y=2019, m=month, overwrite=False).sort_values(by='fl_date')
    for month in range(1, 13)
]
df_flights_by_month = pd.concat(monthly_frames)
    
# reduce columns to columns in `flight_test``
# print(df_flights_by_month.shape)
# df_flights_by_month = df_flights_by_month[df_flights_test.columns]
# print(df_flights_by_month.shape)

File exists. Returning DataFrame...
File exists. Returning DataFrame...
File exists. Returning DataFrame...
File exists. Returning DataFrame...
File exists. Returning DataFrame...
File exists. Returning DataFrame...
File exists. Returning DataFrame...
File exists. Returning DataFrame...
File exists. Returning DataFrame...
File exists. Returning DataFrame...
File exists. Returning DataFrame...
File exists. Returning DataFrame...


In [66]:
# Keep only the rows whose `cancelled` flag is not 1.
df_flights_by_month = df_flights_by_month.loc[df_flights_by_month['cancelled'].ne(1)]

In [67]:
# Column names present in both df_flights_by_month and df_passengers.
print(
    np.intersect1d(
        ar1=df_flights_by_month.columns,
        ar2=df_passengers.columns,
    )
)

['air_time' 'dest' 'dest_airport_id' 'dest_city_name' 'distance' 'origin'
 'origin_airport_id' 'origin_city_name']


In [68]:
# Column names present in both df_passengers and df_fuel_comsumption.
print(
    np.intersect1d(
        ar1=df_passengers.columns,
        ar2=df_fuel_comsumption.columns,
    )
)

['airline_id' 'carrier' 'carrier_group_new' 'carrier_name' 'month'
 'unique_carrier' 'year']


In [69]:
# Column names present in both df_flights_test and df_passengers.
print(
    np.intersect1d(
        ar1=df_flights_test.columns,
        ar2=df_passengers.columns,
    )
)

['dest' 'dest_airport_id' 'dest_city_name' 'distance' 'origin'
 'origin_airport_id' 'origin_city_name']


In [70]:
# Print every column whose name contains the substring "delay".
delay_pattern = re.compile(r'delay')
for name in df_flights_by_month.columns:
    if delay_pattern.search(name) is None:
        continue
    print(name)

dep_delay
arr_delay
carrier_delay
weather_delay
nas_delay
security_delay
late_aircraft_delay


In [71]:
# Drop the cancellation fields and the scheduled/actual departure, arrival and
# elapsed-time columns. The *_delay columns are not in this list and stay.
df_flights_by_month = df_flights_by_month.drop(
    columns=[
        'cancellation_code',
        'cancelled',
        'dep_time',
        'crs_dep_time',
        'arr_time',
        'crs_arr_time',
        'actual_elapsed_time',
        'crs_elapsed_time',
    ],
)

In [72]:
# Discard the index inherited from the concatenated and filtered frames and
# renumber the rows from 0 (drop=True keeps the old index out of the columns).
df_flights_by_month = df_flights_by_month.reset_index(drop=True)

### Get `flights` table by month with random days

In [73]:
# merge tables using month and year

# Derive the `month` and `year` columns from `fl_date` (yyyy-mm-dd strings) in
# one vectorized pass. The former row-by-row loop parsed every date twice with
# strptime, wrote each value through .loc, and only worked when the index was
# exactly 0..n-1.
flight_dates = pd.to_datetime(df_flights_by_month['fl_date'], format='%Y-%m-%d')
# Cast to plain int so the columns have the dtype the rest of the notebook uses.
df_flights_by_month['month'] = flight_dates.dt.month.astype(int)
df_flights_by_month['year'] = flight_dates.dt.year.astype(int)

In [74]:
# Store month and year as integers.
df_flights_by_month = df_flights_by_month.astype({'month': int, 'year': int})

In [75]:
# Display only: the sorted copy is not assigned, so df_flights_by_month itself
# keeps its current row order.
df_flights_by_month.sort_values(by='fl_date')

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,arr_delay,diverted,dup,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name,month,year
0,2019-01-01,G4,G4,G4,2176,G4,271NV,2176,14761,SFB,"Sanford, FL",11721,FNT,"Flint, MI",23.0,14.0,1221.0,1431.0,7.0,24.0,0.0,N,130.0,1.0,988.0,0.0,0.0,24.0,0.0,0.0,,,,,1,2019
31,2019-01-01,WN,WN,WN,864,WN,N553WN,864,14893,SMF,"Sacramento, CA",12892,LAX,"Los Angeles, CA",4.0,13.0,752.0,849.0,6.0,-15.0,0.0,N,57.0,1.0,373.0,,,,,,,,,,1,2019
30,2019-01-01,AA,AA,AA,1067,AA,N141NN,1067,11292,DEN,"Denver, CO",11298,DFW,"Dallas/Fort Worth, TX",9.0,46.0,655.0,929.0,13.0,28.0,0.0,N,94.0,1.0,641.0,9.0,0.0,19.0,0.0,0.0,,,,,1,2019
29,2019-01-01,UA,UA,UA,1155,UA,N78501,1155,13930,ORD,"Chicago, IL",10397,ATL,"Atlanta, GA",-2.0,16.0,614.0,845.0,5.0,-9.0,0.0,N,91.0,1.0,606.0,,,,,,,,,,1,2019
28,2019-01-01,AA,AA,AA,1260,AA,N996AN,1260,14771,SFO,"San Francisco, CA",11298,DFW,"Dallas/Fort Worth, TX",-4.0,15.0,1208.0,1714.0,12.0,-3.0,0.0,N,186.0,1.0,1464.0,,,,,,,,,,1,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11761,2019-12-31,AA,AA,AA,2910,AA,N827AW,2910,11298,DFW,"Dallas/Fort Worth, TX",15919,XNA,"Fayetteville, AR",30.0,14.0,1322.0,1402.0,11.0,26.0,0.0,N,40.0,1.0,280.0,11.0,0.0,0.0,0.0,15.0,,,,,12,2019
11760,2019-12-31,DL,DL,DL,954,DL,N668DN,954,12892,LAX,"Los Angeles, CA",10397,ATL,"Atlanta, GA",-4.0,19.0,650.0,1327.0,5.0,-11.0,0.0,N,217.0,1.0,1947.0,,,,,,,,,,12,2019
11787,2019-12-31,DL,DL,DL,1824,DL,N862DN,1824,14057,PDX,"Portland, OR",14869,SLC,"Salt Lake City, UT",-1.0,15.0,614.0,840.0,5.0,-10.0,0.0,N,86.0,1.0,630.0,,,,,,,,,,12,2019
11773,2019-12-31,WN,WN,WN,4291,WN,N792SW,4291,11193,CVG,"Cincinnati, OH",13232,MDW,"Chicago, IL",0.0,9.0,1104.0,1052.0,5.0,-13.0,0.0,N,48.0,1.0,249.0,,,,,,,,,,12,2019


In [76]:
# Column names shared by df_flights_by_month and df_passengers, as a plain list.
# NOTE(review): the merge cells visible in this notebook do not pass this list
# as `on=`; it is only displayed here.
merge_on = list(
    np.intersect1d(
        ar1=df_flights_by_month.columns,
        ar2=df_passengers.columns,
    )
)
merge_on

['air_time',
 'dest',
 'dest_airport_id',
 'dest_city_name',
 'distance',
 'month',
 'origin',
 'origin_airport_id',
 'origin_city_name',
 'year']

In [77]:
# Passenger-table columns carried into the join. No `on=` is given, so pandas
# joins (inner, by default) on whichever of these names also exist in
# df_flights_by_month.
passenger_columns = [
    'departures_performed',
    'seats',
    'passengers',
    'freight',
    'payload',
    'air_time',
    'airline_id',
    'carrier',
    'origin',
    'dest',
    'distance',
    'aircraft_group',
    'aircraft_type',
    'aircraft_config',
    'class',
]
df_flights_pass = df_flights_by_month.merge(df_passengers[passenger_columns])

In [78]:
# Fuel-table columns carried into the join. No `on=` is given, so pandas joins
# (inner, by default) on whichever of these names also exist in df_flights_pass.
fuel_columns = [
    'month',
    'airline_id',
    'carrier',
    'carrier_name',
    'carrier_group_new',
    'total_gallons',
    'total_cost',
    'year',
]
df_flights_pass_fuel = df_flights_pass.merge(df_fuel_comsumption[fuel_columns])

In [79]:
# cleanup
# df_flights_pass_fuel.sort_values(by='fl_date').dropna(inplace=True)

### Get dummy values

In [80]:
# One-hot encode the categorical columns. drop_first omits the first level of
# each column; dummy_na adds an indicator column for missing values.
df_flights_pass_fuel = pd.get_dummies(
    df_flights_pass_fuel,
    columns=[
        'class',
        'aircraft_group',
        'aircraft_type',
        'aircraft_config',
        'carrier_group_new',
        'mkt_unique_carrier',
    ],
    dummy_na=True,
    drop_first=True,
)

### Get weather data and incorporate into existing DataFrame

In [81]:
import requests as req  # HTTP client used by the weather-API request loop further down
from calendar import Calendar  # used further down to enumerate the days of a month

# Load the weather table stored as ./data/weather.csv
df_weather = pd.read_csv(Path('./data') / 'weather.csv')

In [82]:
import os
from calendar import monthrange

# The World Weather Online key is a credential and must not be stored in the
# notebook: export WWO_API_KEY before starting the kernel (KeyError if unset).
wwo_api_key = os.environ['WWO_API_KEY']
base_url = f'https://api.worldweatheronline.com/premium/v1/past-weather.ashx?key={wwo_api_key}&q&format=json'

# locations: one request series per distinct origin city
locations = df_flights['origin_city_name'].unique()

# params: first and last day of each month of the request year, as yyyy-mm-dd
request_year = 2019
dates = [f'{request_year}-{month:02d}-01' for month in range(1, 13)]
enddates = [
    # monthrange()[1] is the number of days in the month, i.e. its last day
    f'{request_year}-{month:02d}-{monthrange(request_year, month)[1]:02d}'
    for month in range(1, 13)
]

In [83]:
import os

# The World Weather Online key is a credential and must not be stored in the
# notebook: export WWO_API_KEY before starting the kernel (KeyError if unset).
wwo_api_key = os.environ['WWO_API_KEY']
base_url = f'https://api.worldweatheronline.com/premium/v1/past-weather.ashx?key={wwo_api_key}&q&format=json'

# locations: one request series per distinct origin city
locations = df_flights['origin_city_name'].unique()

# params: a single request window covering January 2020
dates = [
    '2020-01-01',]
enddates = [
    '2020-01-31',]

In [84]:
# NOTE(review): make_csv requires `query` and `filename` (see the TypeError
# recorded below); pass both or remove this cell so Run All does not stop here.
scr.make_csv()

TypeError: make_csv() missing 2 required positional arguments: 'query' and 'filename'

In [None]:
# Read only the first three columns of the comma-separated weather_test.csv
weather_conditions = pd.read_csv(
    Path('./data') / 'weather_test.csv',
    sep=',',
    usecols=[0, 1, 2],
)

In [None]:
# One row per distinct combination, ordered by weather code
weather_conditions = (
    weather_conditions
    .drop_duplicates()
    .sort_values(by='overhead_code')
)

In [None]:
weather_conditions

# rain or like weather = > 263
# sunny = 113
# cloudy = 116, 119, 122
# snow = 227, 230

Unnamed: 0,overhead_code,daycondition,NightCondition
0,113,Sunny,Clear
40,116,Partly cloudy,Partly cloudy
80,119,Cloudy,Cloudy
120,122,Overcast,Overcast
160,143,Mist,Mist
200,176,Patchy rain possible,Patchy rain possible
240,179,Patchy snow possible,Patchy snow possible
280,182,Patchy sleet possible,Patchy sleet possible
320,185,Patchy freezing drizzle possible,Patchy freezing drizzle possible
360,200,Thundery outbreaks possible,Thundery outbreaks possible


In [None]:
import requests as req

# location -> list of decoded JSON payloads, one per (date, enddate) window
weather_loc = {}

# for each location, save the weather of every requested window
for location in locations:

    weather_req = []

    # Walk the windows pairwise instead of a hard-coded index range, so the
    # loop always matches however many windows `dates`/`enddates` define.
    for start_date, end_date in zip(dates, enddates):

        # payload using params
        payload = dict(
            date=start_date,
            enddate=end_date,
            q=location)

        # A timeout keeps one stalled request from hanging the notebook, and
        # raise_for_status surfaces HTTP errors here rather than as a KeyError
        # when the payload is unpacked later.
        response = req.get(url=base_url, params=payload, timeout=30)
        response.raise_for_status()
        weather_req.append(response.json())

    weather_loc[location] = weather_req

In [None]:
from calendar import Calendar

cal = Calendar()
weather_columns = ['location', 'date', 'time', 'tempC', 'condition']
calendar_year = 2020  # year whose month days are paired with the fetched payloads

# Collect one plain record per hourly observation and build the frame once at
# the end. Concatenating inside the loop is quadratic, and a reused placeholder
# frame would keep stale rows whenever a day has fewer hourly entries than the
# day before it.
hourly_records = []

for location in locations:

    for m in range(0, 1):  # only the first fetched window (month 1) is unpacked
        month_days = [day for day in cal.itermonthdates(calendar_year, m + 1) if day.month == m + 1]
        daily_weather = weather_loc[location][m]['data']['weather']

        for i, day in enumerate(month_days):
            for hourly in daily_weather[i]['hourly']:
                hourly_records.append({
                    'location': location,
                    'date': day,
                    'time': hourly['time'],
                    'tempC': int(hourly['tempC']),
                    'condition': int(hourly['weatherCode']),
                })

# `columns=` fixes the column order and keeps the schema when there are no records
df_weather = pd.DataFrame(hourly_records, columns=weather_columns)
        

In [None]:
# df_weather = pd.read_csv('./data/weather_locations.csv')

In [None]:
# Write the weather frame to disk; with the default settings the index is
# written as the first CSV column.
df_weather.to_csv('./data/weather_test.csv')

In [None]:
# Reshape a copy so that df_weather itself stays unchanged
df_weather_copy = df_weather.copy()

In [None]:
# add 6 hours to make it easier to separate into night/day
df_weather_copy['time'] = [time+600 for time in df_weather['time'].astype(int)]
df_weather_copy['tempC'] = [temp for temp in df_weather['tempC'].astype(int)]
df_weather_copy['condition'] = [con for con in df_weather['condition'].astype(int)]

In [None]:
# Vectorised replacement for the row-by-row loop: Series.iteritems no longer
# exists in pandas 2.x, and per-row .loc writes are slow on a large frame.

# code the time to 24 hours: shifted values of 2400 or more wrap around
clock = df_weather_copy['time']
clock = clock.where(clock < 2400, clock - 2400)
df_weather_copy['time'] = clock

# convert: 0 = day (wrapped clock >= 1200), 1 = night
daytime = (clock < 1200).astype(int).tolist()

# Plain column assignment (rather than join) lets the cell be re-run safely
df_weather_copy['day/night'] = daytime

In [None]:
# Author's weather-code notes:
#   sunny  = 113
#   cloudy = 116, 119, 122
#   snow   = 227, 230
#   rain or like weather = > 263

# Median weather code per location, date and day/night flag (result is a Series)
df_weather_copy = (
    df_weather_copy
    .drop(columns=['time'])
    .groupby(['location', 'date', 'day/night'])['condition']
    .median()
)

In [None]:
# Turn the index levels back into ordinary columns
df_weather_copy = df_weather_copy.reset_index()

In [None]:
def _label_condition(code):
    """Map a weather code to one of four coarse labels.

    Codes above 250 are 'rain', codes in (200, 250] are 'snow', codes below
    115 are 'sunny'; everything else is 'cloudy'.
    """
    if code > 250:
        return 'rain'
    if 200 < code <= 250:
        return 'snow'
    if code < 115:
        return 'sunny'
    return 'cloudy'


conditions = [_label_condition(code) for code in df_weather_copy['condition']]


In [None]:
# Attach the text labels as a `cond_desc` column; join aligns the new Series
# (default integer index) with the frame's index.
df_weather_copy = df_weather_copy.join(pd.Series(conditions, name='cond_desc'))

In [None]:
df_weather_copy

Unnamed: 0,location,date,day/night,condition,cond_desc
0,"Aberdeen, SD",2019-01-01,0,116.0,cloudy
1,"Aberdeen, SD",2019-01-01,1,117.5,cloudy
2,"Aberdeen, SD",2019-01-02,0,194.0,cloudy
3,"Aberdeen, SD",2019-01-02,1,194.0,cloudy
4,"Aberdeen, SD",2019-01-03,0,113.0,sunny
...,...,...,...,...,...
267175,"Yuma, AZ",2019-12-29,1,266.0,rain
267176,"Yuma, AZ",2019-12-30,0,230.0,snow
267177,"Yuma, AZ",2019-12-30,1,230.0,snow
267178,"Yuma, AZ",2019-12-31,0,230.0,snow


In [None]:
# One-hot encode the label, then add up the indicators per location and date
df_weather = (
    pd.get_dummies(df_weather_copy, columns=['cond_desc'])
    .groupby(['location', 'date'])
    .sum()
)

In [None]:
# Bring `location` and `date` back from the index into regular columns
df_weather = df_weather.reset_index()

In [None]:
# Parse fl_date to pandas datetimes so it can be matched against the weather dates
df_flights_pass_fuel['fl_date'] = pd.to_datetime(df_flights_pass_fuel['fl_date'])

In [None]:
type(df_flights_pass_fuel['fl_date'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [None]:
# Parse the weather date to pandas datetimes, the same type as the flights' fl_date
df_weather['date'] = pd.to_datetime(df_weather['date'])

In [None]:
# Daily weather indicators without the summed `day/night` and `condition` columns
weather_daily = df_weather.drop(columns=['day/night', 'condition'])

# Left merge keeps every flight row; weather is matched on date + origin city
df_flights_pass_weather = df_flights_pass_fuel.merge(
    weather_daily,
    how='left',
    left_on=['fl_date', 'origin_city_name'],
    right_on=['date', 'location'],
)

In [None]:
len(df_flights_pass_weather['origin_city_name'].unique())

303

In [None]:
df_flights_pass_weather

Unnamed: 0,fl_date,origin,origin_city_name,dest,taxi_out,taxi_in,arr_delay,distance,month,year,...,mkt_unique_carrier_NK,mkt_unique_carrier_UA,mkt_unique_carrier_WN,mkt_unique_carrier_nan,location,date,cond_desc_cloudy,cond_desc_rain,cond_desc_snow,cond_desc_sunny
0,2019-01-01,BUF,"Buffalo, NY",EWR,16.0,6.0,-26.0,282.0,1,2019,...,0,1,0,0,"Buffalo, NY",2019-01-01,0,0,0,2
1,2019-01-01,BUF,"Buffalo, NY",EWR,16.0,6.0,-26.0,282.0,1,2019,...,0,1,0,0,"Buffalo, NY",2019-01-01,0,0,0,2
2,2019-01-01,BUF,"Buffalo, NY",EWR,16.0,6.0,-26.0,282.0,1,2019,...,0,1,0,0,"Buffalo, NY",2019-01-01,0,0,0,2
3,2019-01-01,BUF,"Buffalo, NY",EWR,16.0,6.0,-26.0,282.0,1,2019,...,0,1,0,0,"Buffalo, NY",2019-01-01,0,0,0,2
4,2019-01-01,BUF,"Buffalo, NY",EWR,16.0,6.0,-26.0,282.0,1,2019,...,0,1,0,0,"Buffalo, NY",2019-01-01,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1652486,2019-07-19,MYR,"Myrtle Beach, SC",ISP,9.0,5.0,-19.0,586.0,7,2019,...,0,0,0,0,"Myrtle Beach, SC",2019-07-19,1,1,0,0
1652487,2019-07-19,MYR,"Myrtle Beach, SC",ISP,9.0,5.0,-19.0,586.0,7,2019,...,0,0,0,0,"Myrtle Beach, SC",2019-07-19,1,1,0,0
1652488,2019-08-13,ATL,"Atlanta, GA",MIA,14.0,59.0,47.0,594.0,8,2019,...,0,0,0,0,"Atlanta, GA",2019-08-13,0,0,0,2
1652489,2019-08-15,ATL,"Atlanta, GA",MIA,19.0,7.0,-8.0,594.0,8,2019,...,0,0,0,0,"Atlanta, GA",2019-08-15,2,0,0,0


In [None]:
(df_flights_pass_weather.index == range(0, 1652491)).sum()

1652491

### Develop weekend columns to account for date

In [None]:
# NOTE: despite its name, `weekday` is a weekend indicator
# (1 = Saturday/Sunday, 0 = Monday-Friday); the name is kept for later cells.
# It is derived from the flight's own `fl_date`: the weather-side `date` is NaT
# for flights with no weather match in the left merge, which would silently
# code those rows as 0 whatever the actual day.
flight_dates = pd.to_datetime(df_flights_pass_weather['fl_date'])
df_flights_pass_weather['weekday'] = (flight_dates.dt.dayofweek >= 5).astype(int)

In [None]:
df_flights_pass_weather['weekday'].value_counts()

0    1210052
1     442439
Name: weekday, dtype: int64

In [None]:
print(df_flights_pass_weather.columns.tolist())

['fl_date', 'origin', 'origin_city_name', 'dest', 'taxi_out', 'taxi_in', 'arr_delay', 'distance', 'month', 'year', 'departures', 'seats', 'passengers', 'freight', 'payload', 'air_time', 'airline_id', 'carrier', 'carrier_name', 'total_gallons', 'total_cost', 'class_G', 'class_P', 'class_nan', 'carrier_group_new_2.0', 'carrier_group_new_3.0', 'carrier_group_new_nan', 'carrier_AS', 'carrier_B6', 'carrier_DL', 'carrier_F9', 'carrier_G4', 'carrier_HA', 'carrier_NK', 'carrier_UA', 'carrier_WN', 'mkt_unique_carrier_nan', 'location', 'date', 'weather_cloudy', 'weather_rain', 'weather_snow', 'weather_sunny', 'weekday']


In [None]:
print(df_flights_pass_weather.dtypes[df_flights_pass_weather.dtypes == 'object'])

Index([], dtype='object')


In [None]:
# df_flights_pass_weather.drop(columns=['origin', 'origin_city_name', 'dest', 'carrier', 'carrier_name',
#        'location'], inplace=True)

In [None]:
# df_flights_pass_weather.drop(columns=[
#     'aircraft_group_2.0',
#     'aircraft_group_4.0', 
#     'aircraft_group_5.0', 
#     'aircraft_group_6.0', 
#     'aircraft_group_7.0', 
#     'aircraft_group_8.0', 
#     'aircraft_group_nan', 
#     'aircraft_type_143.0', 
#     'aircraft_type_218.0', 
#     'aircraft_type_416.0', 
#     'aircraft_type_430.0', 
#     'aircraft_type_459.0', 
#     'aircraft_type_461.0', 
#     'aircraft_type_479.0', 
#     'aircraft_type_482.0', 
#     'aircraft_type_556.0', 
#     'aircraft_type_608.0', 
#     'aircraft_type_612.0', 
#     'aircraft_type_614.0', 
#     'aircraft_type_616.0', 
#     'aircraft_type_617.0', 
#     'aircraft_type_619.0', 
#     'aircraft_type_620.0', 
#     'aircraft_type_621.0', 
#     'aircraft_type_622.0', 
#     'aircraft_type_623.0', 
#     'aircraft_type_624.0', 
#     'aircraft_type_625.0', 
#     'aircraft_type_626.0', 
#     'aircraft_type_627.0', 
#     'aircraft_type_628.0', 
#     'aircraft_type_629.0', 
#     'aircraft_type_631.0', 
#     'aircraft_type_634.0', 
#     'aircraft_type_635.0', 
#     'aircraft_type_637.0', 
#     'aircraft_type_638.0', 
#     'aircraft_type_640.0', 
#     'aircraft_type_641.0', 
#     'aircraft_type_647.0', 
#     'aircraft_type_648.0', 
#     'aircraft_type_651.0', 
#     'aircraft_type_655.0', 
#     'aircraft_type_656.0', 
#     'aircraft_type_667.0', 
#     'aircraft_type_671.0', 
#     'aircraft_type_673.0', 
#     'aircraft_type_674.0', 
#     'aircraft_type_675.0', 
#     'aircraft_type_676.0', 
#     'aircraft_type_677.0', 
#     'aircraft_type_678.0', 
#     'aircraft_type_681.0', 
#     'aircraft_type_682.0', 
#     'aircraft_type_683.0', 
#     'aircraft_type_684.0', 
#     'aircraft_type_685.0', 
#     'aircraft_type_687.0', 
#     'aircraft_type_688.0', 
#     'aircraft_type_691.0', 
#     'aircraft_type_692.0', 
#     'aircraft_type_694.0', 
#     'aircraft_type_696.0', 
#     'aircraft_type_698.0', 
#     'aircraft_type_699.0', 
#     'aircraft_type_715.0', 
#     'aircraft_type_721.0', 
#     'aircraft_type_722.0', 
#     'aircraft_type_730.0', 
#     'aircraft_type_732.0', 
#     'aircraft_type_740.0', 
#     'aircraft_type_750.0', 
#     'aircraft_type_770.0', 
#     'aircraft_type_771.0', 
#     'aircraft_type_775.0', 
#     'aircraft_type_817.0', 
#     'aircraft_type_819.0', 
#     'aircraft_type_820.0', 
#     'aircraft_type_821.0', 
#     'aircraft_type_822.0', 
#     'aircraft_type_838.0', 
#     'aircraft_type_839.0', 
#     'aircraft_type_887.0', 
#     'aircraft_type_888.0', 
#     'aircraft_type_889.0', 
#     'aircraft_type_nan', 
#     'aircraft_config_2.0', 
#     'aircraft_config_3.0', 
#     'aircraft_config_nan'],
#                              inplace=True)

In [None]:
# Shorter, consistent names: carrier dummies -> carrier_*, weather counts -> weather_*
column_renames = {
    'departures_performed': 'departures',
    # marketing-carrier dummies
    'mkt_unique_carrier_AS': 'carrier_AS',
    'mkt_unique_carrier_B6': 'carrier_B6',
    'mkt_unique_carrier_DL': 'carrier_DL',
    'mkt_unique_carrier_F9': 'carrier_F9',
    'mkt_unique_carrier_G4': 'carrier_G4',
    'mkt_unique_carrier_HA': 'carrier_HA',
    'mkt_unique_carrier_NK': 'carrier_NK',
    'mkt_unique_carrier_UA': 'carrier_UA',
    'mkt_unique_carrier_WN': 'carrier_WN',
    # daily weather-label counts
    'cond_desc_cloudy': 'weather_cloudy',
    'cond_desc_rain': 'weather_rain',
    'cond_desc_snow': 'weather_snow',
    'cond_desc_sunny': 'weather_sunny',
}
df_flights_pass_weather = df_flights_pass_weather.rename(columns=column_renames)

In [None]:
print(df_flights_pass_weather.columns.to_list())

['fl_date', 'taxi_out', 'taxi_in', 'arr_delay', 'distance', 'month', 'year', 'departures', 'seats', 'passengers', 'freight', 'payload', 'air_time', 'airline_id', 'total_gallons', 'total_cost', 'class_G', 'class_P', 'class_nan', 'carrier_group_new_2.0', 'carrier_group_new_3.0', 'carrier_group_new_nan', 'carrier_AS', 'carrier_B6', 'carrier_DL', 'carrier_F9', 'carrier_G4', 'carrier_HA', 'carrier_NK', 'carrier_UA', 'carrier_WN', 'mkt_unique_carrier_nan', 'date', 'weather_cloudy', 'weather_rain', 'weather_snow', 'weather_sunny', 'weekday']


In [None]:
# df_flights_pass_weather.drop(columns=[
#     'fl_date', 
#     'year', 
#     'seats', 
#     'airline_id', 
#     'class_G', 
#     'class_P', 
#     'class_nan', 
#     'carrier_AS', 
#     'carrier_B6', 
#     'carrier_DL', 
#     'carrier_F9', 
#     'carrier_G4', 
#     'carrier_HA', 
#     'carrier_NK', 
#     'carrier_UA', 
#     'carrier_WN', 
#     'mkt_unique_carrier_nan'],
#     inplace=True)

In [None]:
# df_flights_pass_weather.drop(columns=['date'], inplace=True)

In [None]:
df_flights_pass_weather

Unnamed: 0,taxi_out,taxi_in,arr_delay,distance,month,departures,passengers,freight,payload,air_time,total_gallons,total_cost,carrier_group_new_2.0,carrier_group_new_3.0,carrier_group_new_nan,weather_cloudy,weather_rain,weather_snow,weather_sunny,weekday
0,16.0,6.0,-26.0,282.0,1,3.0,47.0,0.0,149280.0,151.0,273706863.0,562392438.0,0,1,0,0,0,0,2,0
1,16.0,6.0,-26.0,282.0,1,1.0,74.0,0.0,32870.0,58.0,273706863.0,562392438.0,0,1,0,0,0,0,2,0
2,16.0,6.0,-26.0,282.0,1,1.0,87.0,232.0,29900.0,54.0,273706863.0,562392438.0,0,1,0,0,0,0,2,0
3,16.0,6.0,-26.0,282.0,1,1.0,47.0,0.0,53160.0,46.0,273706863.0,562392438.0,0,1,0,0,0,0,2,0
4,16.0,6.0,-26.0,282.0,1,0.0,0.0,0.0,0.0,0.0,273706863.0,562392438.0,0,1,0,0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1652486,9.0,5.0,-19.0,586.0,7,7.0,204.0,0.0,91700.0,567.0,157065.0,500082.0,0,0,0,1,1,0,0,0
1652487,9.0,5.0,-19.0,586.0,7,10.0,359.0,0.0,185000.0,810.0,157065.0,500082.0,0,0,0,1,1,0,0,0
1652488,14.0,59.0,47.0,594.0,8,1.0,0.0,650.0,37000.0,85.0,459018.0,1084816.0,0,0,0,0,0,0,2,0
1652489,19.0,7.0,-8.0,594.0,8,1.0,0.0,650.0,37000.0,85.0,459018.0,1084816.0,0,0,0,2,0,0,0,0


In [None]:
# Save the merged table as train_test_data.csv (relative path, index included)
df_flights_pass_weather.to_csv('train_test_data.csv')

In [None]:
# Every column except the target is a candidate feature. The name has to be
# compared with the target explicitly: a bare `not 'arr_delay'` is always False
# (a non-empty string is truthy) and would leave the list empty.
target_column = 'arr_delay'
features_list = [
    feature
    for feature in df_flights_pass_weather.columns.to_list()
    if feature != target_column
]

In [None]:
df_flights_pass_weather

Unnamed: 0,taxi_out,taxi_in,arr_delay,distance,month,departures,seats,passengers,freight,payload,air_time,airline_id,total_gallons,total_cost,class_G,class_P,class_nan,aircraft_group_2.0,aircraft_group_4.0,aircraft_group_5.0,aircraft_group_6.0,aircraft_group_7.0,aircraft_group_8.0,aircraft_group_nan,aircraft_type_143.0,aircraft_type_218.0,aircraft_type_416.0,aircraft_type_430.0,aircraft_type_459.0,aircraft_type_461.0,aircraft_type_479.0,aircraft_type_482.0,aircraft_type_556.0,aircraft_type_608.0,aircraft_type_612.0,aircraft_type_614.0,aircraft_type_616.0,aircraft_type_617.0,aircraft_type_619.0,aircraft_type_620.0,aircraft_type_621.0,aircraft_type_622.0,aircraft_type_623.0,aircraft_type_624.0,aircraft_type_625.0,aircraft_type_626.0,aircraft_type_627.0,aircraft_type_628.0,aircraft_type_629.0,aircraft_type_631.0,aircraft_type_634.0,aircraft_type_635.0,aircraft_type_637.0,aircraft_type_638.0,aircraft_type_640.0,aircraft_type_641.0,aircraft_type_647.0,aircraft_type_648.0,aircraft_type_651.0,aircraft_type_655.0,aircraft_type_656.0,aircraft_type_667.0,aircraft_type_671.0,aircraft_type_673.0,aircraft_type_674.0,aircraft_type_675.0,aircraft_type_676.0,aircraft_type_677.0,aircraft_type_678.0,aircraft_type_681.0,aircraft_type_682.0,aircraft_type_683.0,aircraft_type_684.0,aircraft_type_685.0,aircraft_type_687.0,aircraft_type_688.0,aircraft_type_691.0,aircraft_type_692.0,aircraft_type_694.0,aircraft_type_696.0,aircraft_type_698.0,aircraft_type_699.0,aircraft_type_715.0,aircraft_type_721.0,aircraft_type_722.0,aircraft_type_730.0,aircraft_type_732.0,aircraft_type_740.0,aircraft_type_750.0,aircraft_type_770.0,aircraft_type_771.0,aircraft_type_775.0,aircraft_type_817.0,aircraft_type_819.0,aircraft_type_820.0,aircraft_type_821.0,aircraft_type_822.0,aircraft_type_838.0,aircraft_type_839.0,aircraft_type_887.0,aircraft_type_888.0,aircraft_type_889.0,aircraft_type_nan,aircraft_config_2.0,aircraft_config_3.0,aircraft_config_nan,carrier_group_new_2.0,carrier_group_new_3.0,carrier_group_new
_nan,carrier_AS,carrier_B6,carrier_DL,carrier_F9,carrier_G4,carrier_HA,carrier_NK,carrier_UA,carrier_WN,condition,weather_cloudy,weather_rain,weather_snow,weather_sunny,weekday


In [None]:
# Hyper-parameter grid: each key maps to the list of values to try
parameter_candidates = {
    'objective': ['reg:squarederror'],
    'colsample_bytree': [0.6],
    'learning_rate': [0.2, 0.3, 0.4],
    'max_depth': [3, 4],
    'lambda': [0.1, 1],
    'alpha': [0.1, 1],
    'n_estimators': [3, 4],
}

# Cross-validation settings.
# NOTE(review): early_stopping_rounds (10) is larger than num_boost_round (6) - confirm this is intended.
cv_parameters = dict(
    nfold=5,
    num_boost_round=6,
    early_stopping_rounds=10,
)

In [None]:
# Arguments for the project's XGBoost helper, gathered in one place
xgb_settings = dict(
    df=df_flights_pass_weather,
    target='arr_delay',
    params=parameter_candidates,
    cv_params=cv_parameters,
    features=features_list,
    gridsearch=True,
    scaler='minmax',
)
y_pred, cv_results = scr.xgboost_det(**xgb_settings)

  from pandas import MultiIndex, Int64Index



No columns dropped.

Target values: [] 

Column(s) remaining: Index(['taxi_out', 'taxi_in', 'arr_delay', 'distance', 'month', 'departures',
       'seats', 'passengers', 'freight', 'payload',
       ...
       'carrier_HA', 'carrier_NK', 'carrier_UA', 'carrier_WN', 'condition',
       'weather_cloudy', 'weather_rain', 'weather_snow', 'weather_sunny',
       'weekday'],
      dtype='object', length=124)



ValueError: Found array with 0 sample(s) (shape=(0, 123)) while a minimum of 1 is required by MinMaxScaler.

In [None]:
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,44.646682,1.242404,44.892328,4.7515
1,43.568203,0.994601,44.651096,4.932266
2,42.965443,1.242391,44.54879,4.730438
3,42.562669,1.157294,44.360941,4.872215
4,42.057714,0.991851,44.241037,4.922517
5,41.716217,1.0344,44.238607,4.832155


In [None]:
y_pred

array([ 1.26013842e+01,  5.07120991e+00, -4.71883237e-01,  1.04455917e+02,
       -1.42280941e+01, -1.45246716e+01, -6.74107313e+00, -5.66258192e+00,
        3.27011566e+01, -4.56860209e+00, -1.34129548e+00,  1.41810250e+00,
        2.68281441e+01, -9.09787846e+00,  8.33067894e+00,  3.28774858e+00,
       -9.13692951e+00,  9.10815964e+01, -1.47170043e+00, -1.55744143e+01,
        1.18837938e+01,  4.83523321e+00,  1.98346367e+01, -9.25419033e-01,
       -1.05111294e+01, -1.00624437e+01, -4.60996151e+00, -9.00304031e+00,
        2.92688155e+00,  3.00041842e+00, -4.45204401e+00,  3.95995474e+00,
        4.15616760e+01,  4.33590591e-01,  3.18110967e+00, -1.02698679e+01,
       -2.24194789e+00, -5.73876238e+00,  2.99268866e+00, -1.16821079e+01,
        2.55504322e+00, -8.48759651e+00, -1.47029877e+01, -3.66870117e+00,
       -1.05293703e+01,  4.41503477e+00, -1.41947818e+00,  2.48892593e+01,
       -1.13278618e+01,  9.57273769e+00,  1.94667065e+00,  1.40725574e+01,
        4.14559841e-01,  

##### Best parameters:
```python
{'alpha': 0,
'colsample_bytree': 0.6,
'lambda': 0,
'learning_rate': 0.1,
'max_depth': 3,
'n_estimators': 6,
'objective': 'reg:squarederror'}
```

### `flights`

As a matter of timing, dates are important. Also gather different times during the day.
- **`fl_date`**: Flight Date (yyyy-mm-dd). Analyse monthly changes. One-hot encode months and holidays.
- **`unique_carrier`**: Unique Marketing Carrier Code. <-- analyse for trends first.
- **`branded_code_share`**: Reporting Carrier Operated or Branded Code Share Partners. One-hot encode as binary: code-shared vs. not.
- **`carrier_fl_num`**: Flight Number, monthly count may reveal flight density.
- **`origin`**: Origin Airport. Less than dest. Check for null values.
- **`dest`**: Destination Airport
- **`air_time`**: Airborne Time (minutes)
- **`arr_time`**: Actual Arrival Time (local time: hhmm). Might reveal docking delays due to time of day.
- **`dep_time`**: Actual Departure Time (local time: hhmm). Might reveal passenger delays due to time of day.
- **`distance`**: Distance between airports (miles)
- **`taxi_out`**: from gate to wheels off, in Minutes
- **`taxi_in`**: from wheels down to gate, in Minutes
- **`diverted`**: Diverted Flight Indicator (1=Yes)

### `passengers`

- **`departures_performed`**: Departures Performed
- **`passengers`**: Non-Stop Segment Passengers Transported
- **`payload`**: Available Payload (pounds)
- **`distance_group`**: Distance Intervals, every 500 Miles, for Flight Segment
- **`class`**: Service Class

### ```fuel_consumption```

### `flights_test`

In [None]:
# Mean arrival delay per aircraft (tail number), with tail_num kept as a column
tail_delay = df_flights.groupby('tail_num', as_index=False)['arr_delay'].mean()

# Attach the per-aircraft mean to every flight. Both frames carry `arr_delay`,
# so pandas suffixes them: arr_delay_x (per flight) and arr_delay_y (aircraft mean).
df_flights = df_flights.merge(tail_delay, on='tail_num', how='left')

# Spot-check the merged mean for one tail number
df_flights.loc[df_flights['tail_num'] == 'N798SW', ['tail_num', 'arr_delay_y']]

In [None]:
df_flights['dep_time'][0]

545.0

In [None]:
df_flights.columns.to_list()

['fl_date',
 'mkt_unique_carrier',
 'branded_code_share',
 'mkt_carrier',
 'mkt_carrier_fl_num',
 'op_unique_carrier',
 'tail_num',
 'op_carrier_fl_num',
 'origin_airport_id',
 'origin',
 'origin_city_name',
 'dest_airport_id',
 'dest',
 'dest_city_name',
 'crs_dep_time',
 'dep_time',
 'dep_delay',
 'taxi_out',
 'wheels_off',
 'wheels_on',
 'taxi_in',
 'crs_arr_time',
 'arr_time',
 'arr_delay_x',
 'cancelled',
 'cancellation_code',
 'diverted',
 'dup',
 'crs_elapsed_time',
 'actual_elapsed_time',
 'air_time',
 'flights',
 'distance',
 'carrier_delay',
 'weather_delay',
 'nas_delay',
 'security_delay',
 'late_aircraft_delay',
 'first_dep_time',
 'total_add_gtime',
 'longest_add_gtime',
 'no_name',
 'arr_delay_y']