In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import os

In [2]:
# Never truncate wide frames: None removes the limit on displayed columns
pd.options.display.max_columns = None

In [10]:
# Anchor all paths on the notebook's current working directory (os.getcwd())
current_script_dir = os.getcwd()

# The data folder is a sibling of the working directory: <parent>/data
parent_dir = os.path.split(current_script_dir)[0]
data_dir = os.path.join(parent_dir, 'data')

# Viewing the attached feature documentation of the dataset

In [11]:
# Read the dataset's feature documentation (documentation.csv in the data
# directory) into a DataFrame so the field names and descriptions can be inspected
doc_filename = 'documentation.csv'
doc_file_path = os.path.join(data_dir, doc_filename)
doc_df = pd.read_csv(doc_file_path)

In [12]:
# Show the whole documentation table: every row and the full text of each cell.
# option_context applies the display overrides only inside the `with` block and
# restores the previous settings afterwards, even if display() raises, so the
# rest of the notebook keeps its normal (truncated) display behaviour.
with pd.option_context('display.max_colwidth', None, 'display.max_rows', None):
    display(doc_df)

Unnamed: 0,SYS_FIELD_NAME,FIELD_DESC
0,YEAR,Year
1,QUARTER,Quarter (1-4)
2,MONTH,Month
3,DAY_OF_MONTH,Day of Month
4,DAY_OF_WEEK,Day of Week
5,FL_DATE,Flight Date (yyyymmdd)
6,OP_UNIQUE_CARRIER,"Unique Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users, for example, PA, PA(1), PA(2). Use this field for analysis across a range of years."
7,OP_CARRIER_AIRLINE_ID,"An identification number assigned by US DOT to identify a unique airline (carrier). A unique airline (carrier) is defined as one holding and reporting under the same DOT certificate regardless of its Code, Name, or holding company/corporation."
8,OP_CARRIER,"Code assigned by IATA and commonly used to identify a carrier. As the same code may have been assigned to different carriers over time, the code is not always unique. For analysis, use the Unique Carrier Code."
9,TAIL_NUM,Tail Number


## What is useful for the goal of predicting departure delays?
In short, we need information that is known before the actual departure of a future flight, so that it can be used to make predictions. Thus we need information about:

- Time but not year since we are only using 2022
- Carrier ID, Flight Number, and tail number
- Scheduled departure and arrival times
- Origin Airport, city and state
- Destination Airport, city and state
- Distance and Scheduled air time between origin and destination
- Historic delay information, which is our target to predict
- Number of scheduled flights between origin and destination
  
## Useful Features
1. QUARTER
2. MONTH
3. DAY_OF_MONTH
4. DAY_OF_WEEK
5. OP_UNIQUE_CARRIER
6. OP_CARRIER_AIRLINE_ID
7. TAIL_NUM
8. OP_CARRIER_FL_NUM
9. ORIGIN_AIRPORT_ID 
10. ORIGIN_CITY_MARKET_ID
11. ORIGIN_STATE_FIPS
12. ORIGIN_WAC
13. DEST_AIRPORT_ID
14. DEST_CITY_MARKET_ID
15. DEST_STATE_FIPS
16. DEST_WAC
17. CRS_DEP_TIME
18. DEP_DELAY
19. DEP_DELAY_NEW
20. DEP_DEL15
21. DEP_DELAY_GROUP
22. DEP_TIME_BLK
23. CRS_ARR_TIME
24. ARR_TIME_BLK
25. CRS_ELAPSED_TIME
26. DISTANCE
27. DISTANCE_GROUP
28. CARRIER_DELAY
29. WEATHER_DELAY
30. NAS_DELAY
31. SECURITY_DELAY
32. LATE_AIRCRAFT_DELAY
33. FLIGHTS

## Non-useful Features
1. Year
2. FL_DATE (Month and day are already separated in different features)
3. OP_CARRIER (The same code may have been assigned to different carriers over time, we need to be able to 
distinguish between carriers thus better to use OP_UNIQUE_CARRIER)
4. ORIGIN_AIRPORT_SEQ_ID (It is only unique at a given point of time and can change for an airport in the future)
5. ORIGIN (No need since ORIGIN_AIRPORT_ID is available)
6. ORIGIN_CITY_NAME (No need since ORIGIN_CITY_MARKET_ID is available)
7. ORIGIN_STATE_ABR (It consists of 2 letters and thus categorical, better to have a numerical value for training of ML models)
8. ORIGIN_STATE_NM (No need since ORIGIN_STATE_FIPS is available)
9. DEST_AIRPORT_SEQ_ID (It is only unique at a given point of time and can change for an airport in the future)
10. DEST (No need since DEST_AIRPORT_ID is available)
11. DEST_CITY_NAME (No need since DEST_CITY_MARKET_ID is available)
12. DEST_STATE_ABR (It consists of 2 letters and thus categorical, better to have a numerical value for training of ML models)
13. DEST_STATE_NM (No need since DEST_STATE_FIPS is available)
14. DEP_TIME (No need for this field, as we are already given the delay and its groups in other columns such as DEP_DELAY)
15. TAXI_OUT (Actual and not scheduled; we don't know this when we need to make a prediction)
16. WHEELS_OFF (Actual and not scheduled; we don't know this when we need to make a prediction)
17. WHEELS_ON (Actual and not scheduled; we don't know this when we need to make a prediction)
18. TAXI_IN (Actual and not scheduled; we don't know this when we need to make a prediction)
19. ARR_TIME (Actual and not scheduled; we don't know this when we need to make a prediction)
20. ARR_DELAY (We are interested in departure delay only)
21. ARR_DELAY_NEW (We are interested in departure delay only)
22. ARR_DEL15 (We are interested in departure delay only)
23. ARR_DELAY_GROUP (We are interested in departure delay only)
24. ACTUAL_ELAPSED_TIME (Actual and not scheduled; we don't know this when we need to make a prediction)
25. AIR_TIME (Actual and not scheduled; we don't know this when we need to make a prediction)
26. CANCELLED
27. CANCELLATION_CODE
28. DIVERTED

In [13]:
# Columns kept from each monthly file (the "Useful Features" listed above).
# The order here is the column order of the combined DataFrame.
columns = [
    # Calendar fields
    "QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    # Carrier, aircraft and flight identifiers
    "OP_UNIQUE_CARRIER", "OP_CARRIER_AIRLINE_ID", "TAIL_NUM", "OP_CARRIER_FL_NUM",
    # Origin identifiers
    "ORIGIN_AIRPORT_ID", "ORIGIN_CITY_MARKET_ID", "ORIGIN_STATE_FIPS", "ORIGIN_WAC",
    # Destination identifiers
    "DEST_AIRPORT_ID", "DEST_CITY_MARKET_ID", "DEST_STATE_FIPS", "DEST_WAC",
    # Scheduled departure time and the departure-delay columns
    "CRS_DEP_TIME", "DEP_DELAY", "DEP_DELAY_NEW", "DEP_DEL15", "DEP_DELAY_GROUP",
    # Departure block, scheduled arrival time and arrival block
    "DEP_TIME_BLK", "CRS_ARR_TIME", "ARR_TIME_BLK",
    # Scheduled elapsed time and distance
    "CRS_ELAPSED_TIME", "DISTANCE", "DISTANCE_GROUP",
    # Per-category delay columns
    "CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",
    # Flight count
    "FLIGHTS",
]

In [15]:
# Load the twelve monthly CSV files of every requested year and stack them into
# a single DataFrame `df` holding only the useful columns.
#
# Frames are collected in a list and concatenated once at the end; calling
# pd.concat inside the loop would re-copy the accumulated data on every pass.
monthly_frames = []

for year in ['2022']:
    # Each year's files live in a sub-directory of the data directory named after the year
    year_dir = os.path.join(data_dir, year)

    for month in range(1, 13):
        # Files are named "<MM>_<YYYY>.csv"; derive the year from the loop
        # variable so that adding another year reads that year's files
        month_filename = f'{month:02d}_{year}.csv'
        file_path = os.path.join(year_dir, month_filename)

        # usecols skips parsing of the unused columns (raises ValueError if a
        # listed column is missing); the trailing [columns] enforces the
        # column order of the `columns` list rather than the file's order
        month_df = pd.read_csv(file_path, usecols=columns)[columns]
        monthly_frames.append(month_df)

# ignore_index=True gives the combined frame one continuous 0..n-1 row index
df = pd.concat(monthly_frames, ignore_index=True)

In [16]:
# Preview the combined DataFrame; with the default row limit a large frame is
# shown truncated (the first and last rows with an ellipsis row in between)
display(df)

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_STATE_FIPS,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_CITY_MARKET_ID,DEST_STATE_FIPS,DEST_WAC,CRS_DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,FLIGHTS
0,1,1,1,6,9E,20363,N138EV,4732,10135,30135,42,23,11433,31295,26,43,1015,-1.0,0.0,0.0,-1.0,1000-1059,1209,1200-1259,114.0,425.0,2,,,,,,1.0
1,1,1,1,6,9E,20363,N138EV,4979,11433,31295,26,43,14524,34524,51,38,2130,,,,,2100-2159,2305,2300-2359,95.0,456.0,2,,,,,,1.0
2,1,1,1,6,9E,20363,N138EV,5430,10135,30135,42,23,11433,31295,26,43,1700,,,,,1700-1759,1853,1800-1859,113.0,425.0,2,,,,,,1.0
3,1,1,1,6,9E,20363,N138EV,5430,11433,31295,26,43,10135,30135,42,23,1422,-3.0,0.0,0.0,-1.0,1400-1459,1548,1500-1559,86.0,425.0,2,,,,,,1.0
4,1,1,1,6,9E,20363,N146PQ,4722,11433,31295,26,43,11423,31423,19,61,1255,,,,,1200-1259,1348,1300-1359,113.0,534.0,3,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6729120,4,12,31,6,YX,20452,N882RW,5741,12953,31703,36,22,11066,31066,39,44,830,-7.0,0.0,0.0,-1.0,0800-0859,1038,1000-1059,128.0,479.0,2,,,,,,1.0
6729121,4,12,31,6,YX,20452,N979RP,3607,14321,34321,23,12,13930,30977,17,41,635,-7.0,0.0,0.0,-1.0,0600-0659,758,0700-0759,143.0,900.0,4,0.0,0.0,25.0,0.0,0.0,1.0
6729122,4,12,31,6,YX,20452,N979RP,3686,13930,30977,17,41,14321,34321,23,12,1800,-7.0,0.0,0.0,-1.0,1800-1859,2129,2100-2159,149.0,900.0,4,,,,,,1.0
6729123,4,12,31,6,YX,20452,N979RP,3699,11003,31003,19,61,13930,30977,17,41,1508,-10.0,0.0,0.0,-1.0,1500-1559,1627,1600-1659,79.0,196.0,1,,,,,,1.0


In [17]:
# View the data type pandas inferred for each column of the combined DataFrame
display(df.dtypes)

QUARTER                    int64
MONTH                      int64
DAY_OF_MONTH               int64
DAY_OF_WEEK                int64
OP_UNIQUE_CARRIER         object
OP_CARRIER_AIRLINE_ID      int64
TAIL_NUM                  object
OP_CARRIER_FL_NUM          int64
ORIGIN_AIRPORT_ID          int64
ORIGIN_CITY_MARKET_ID      int64
ORIGIN_STATE_FIPS          int64
ORIGIN_WAC                 int64
DEST_AIRPORT_ID            int64
DEST_CITY_MARKET_ID        int64
DEST_STATE_FIPS            int64
DEST_WAC                   int64
CRS_DEP_TIME               int64
DEP_DELAY                float64
DEP_DELAY_NEW            float64
DEP_DEL15                float64
DEP_DELAY_GROUP          float64
DEP_TIME_BLK              object
CRS_ARR_TIME               int64
ARR_TIME_BLK              object
CRS_ELAPSED_TIME         float64
DISTANCE                 float64
DISTANCE_GROUP             int64
CARRIER_DELAY            float64
WEATHER_DELAY            float64
NAS_DELAY                float64
SECURITY_D

In [None]:
# Summary statistics of the combined DataFrame, rounded to two decimals
display(df.describe().round(2))

In [None]:
# Preview the DEP_DELAY column (departure delay is the notebook's stated prediction goal)
df['DEP_DELAY']