In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import os

In [2]:
# Never truncate wide DataFrames: show every column when a frame is displayed
pd.options.display.max_columns = None

In [3]:
# Working directory of the running kernel (os.getcwd(), not a script location)
current_script_dir = os.getcwd()

# Parent of the working directory: the head part of the (head, tail) path split
parent_dir = os.path.split(current_script_dir)[0]

# The dataset is read from the 'data' folder that sits next to the working directory
data_dir = os.path.join(parent_dir, 'data')

# Viewing the attached feature documentation of the dataset

In [4]:
# Load the field documentation shipped with the dataset from the data folder
doc_filename = 'documentation.csv'
doc_file_path = os.path.join(data_dir, doc_filename)
doc_df = pd.read_csv(filepath_or_buffer=doc_file_path)

In [5]:
# Show the documentation table in full: no row limit and no truncation of long cell text.
# option_context restores both display options when the block exits, even if display()
# raises, so the wide settings cannot leak into later cells.
with pd.option_context('display.max_colwidth', None, 'display.max_rows', None):
    display(doc_df)

Unnamed: 0,SYS_FIELD_NAME,FIELD_DESC
0,YEAR,Year
1,QUARTER,Quarter (1-4)
2,MONTH,Month
3,DAY_OF_MONTH,Day of Month
4,DAY_OF_WEEK,Day of Week
5,FL_DATE,Flight Date (yyyymmdd)
6,OP_UNIQUE_CARRIER,"Unique Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users, for example, PA, PA(1), PA(2). Use this field for analysis across a range of years."
7,OP_CARRIER_AIRLINE_ID,"An identification number assigned by US DOT to identify a unique airline (carrier). A unique airline (carrier) is defined as one holding and reporting under the same DOT certificate regardless of its Code, Name, or holding company/corporation."
8,OP_CARRIER,"Code assigned by IATA and commonly used to identify a carrier. As the same code may have been assigned to different carriers over time, the code is not always unique. For analysis, use the Unique Carrier Code."
9,TAIL_NUM,Tail Number


## What is useful for the goal of predicting departure delays?
In short, we need information that is known before the actual departure of a future flight, so that we can use it to make predictions. Thus we need information about:

- Time but not year since we are only using 2022
- Carrier ID, Flight Number, and tail number
- Scheduled departure and arrival times
- Origin Airport, city and state
- Destination Airport, city and state
- Distance and Scheduled air time between origin and destination
- Historic delay information, which is our target to predict
- Number of scheduled flights between origin and destination
  
## Useful Features
1. QUARTER
2. MONTH
3. DAY_OF_MONTH
4. DAY_OF_WEEK
5. OP_UNIQUE_CARRIER
6. OP_CARRIER_AIRLINE_ID
7. TAIL_NUM
8. OP_CARRIER_FL_NUM
9. ORIGIN_AIRPORT_ID 
10. ORIGIN_CITY_MARKET_ID
11. ORIGIN_STATE_FIPS
12. ORIGIN_WAC
13. DEST_AIRPORT_ID
14. DEST_CITY_MARKET_ID
15. DEST_STATE_FIPS
16. DEST_WAC
17. CRS_DEP_TIME
18. DEP_DELAY
19. DEP_DELAY_NEW
20. DEP_DEL15
21. DEP_DELAY_GROUP
22. DEP_TIME_BLK
23. CRS_ARR_TIME
24. ARR_TIME_BLK
25. CRS_ELAPSED_TIME
26. DISTANCE
27. DISTANCE_GROUP
28. CARRIER_DELAY
29. WEATHER_DELAY
30. NAS_DELAY
31. SECURITY_DELAY
32. LATE_AIRCRAFT_DELAY
33. FLIGHTS

## Non-useful Features
1. Year
2. FL_DATE (Month and day are already separated in different features)
3. OP_CARRIER (The same code may have been assigned to different carriers over time, we need to be able to 
distinguish between carriers thus better to use OP_UNIQUE_CARRIER)
4. ORIGIN_AIRPORT_SEQ_ID (It is only unique at a given point of time and can change for an airport in the future)
5. ORIGIN (No need since ORIGIN_AIRPORT_ID is available)
6. ORIGIN_CITY_NAME (No need since ORIGIN_CITY_MARKET_ID is available)
7. ORIGIN_STATE_ABR (It consists of 2 letters and thus categorical, better to have a numerical value for training of ML models)
8. ORIGIN_STATE_NM (No need since ORIGIN_STATE_FIPS is available)
9. DEST_AIRPORT_SEQ_ID (It is only unique at a given point of time and can change for an airport in the future)
10. DEST (No need since DEST_AIRPORT_ID is available)
11. DEST_CITY_NAME (No need since DEST_CITY_MARKET_ID is available)
12. DEST_STATE_ABR (It consists of 2 letters and thus categorical, better to have a numerical value for training of ML models)
13. DEST_STATE_NM (No need since DEST_STATE_FIPS is available)
14. DEP_TIME (No need for this field as we are already given the delay and its groups in other columns such as DEP_DELAY)
15. TAXI_OUT (Actual and not scheduled; we don't know this when we need to make a prediction)
16. WHEELS_OFF (Actual and not scheduled; we don't know this when we need to make a prediction)
17. WHEELS_ON (Actual and not scheduled; we don't know this when we need to make a prediction)
18. TAXI_IN (Actual and not scheduled; we don't know this when we need to make a prediction)
19. ARR_TIME (Actual and not scheduled; we don't know this when we need to make a prediction)
20. ARR_DELAY (We are interested in departure delay only)
21. ARR_DELAY_NEW (We are interested in departure delay only)
22. ARR_DEL15 (We are interested in departure delay only)
23. ARR_DELAY_GROUP (We are interested in departure delay only)
24. ACTUAL_ELAPSED_TIME (Actual and not scheduled; we don't know this when we need to make a prediction)
25. AIR_TIME (Actual and not scheduled; we don't know this when we need to make a prediction)
26. CANCELLED
27. CANCELLATION_CODE
28. DIVERTED

In [6]:
## Useful features, grouped by theme and flattened below into the `columns` list
column_groups = {
    "calendar": [
        "QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    ],
    "carrier_and_flight": [
        "OP_UNIQUE_CARRIER", "OP_CARRIER_AIRLINE_ID", "TAIL_NUM", "OP_CARRIER_FL_NUM",
    ],
    "origin_and_destination": [
        "ORIGIN_AIRPORT_ID", "ORIGIN_CITY_MARKET_ID", "ORIGIN_STATE_FIPS", "ORIGIN_WAC",
        "DEST_AIRPORT_ID", "DEST_CITY_MARKET_ID", "DEST_STATE_FIPS", "DEST_WAC",
    ],
    "schedule_and_distance": [
        "CRS_DEP_TIME", "DEP_TIME_BLK", "CRS_ARR_TIME", "ARR_TIME_BLK",
        "CRS_ELAPSED_TIME", "DISTANCE", "DISTANCE_GROUP", "FLIGHTS",
    ],
    "delay_targets": [
        "DEP_DELAY", "DEP_DELAY_NEW", "DEP_DEL15", "DEP_DELAY_GROUP",
        "CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY",
        "LATE_AIRCRAFT_DELAY",
    ],
}

# Flatten in insertion order so `columns` follows the ordering of the groups above
columns = [name for group in column_groups.values() for name in group]

In [7]:
# Load every monthly CSV, keep only the selected columns, and stack the months
# into one DataFrame with a fresh 0..n-1 index.
# The frames are collected in a list and concatenated once at the end, so the
# accumulated data is not re-copied on every iteration.
monthly_frames = []

for year in ['2022']:
    # Each year has its own sub-directory inside the data directory
    year_dir = os.path.join(data_dir, year)

    for month in range(1, 13):
        # Monthly files are named MM_YYYY.csv; the year comes from the loop variable
        # (assumes any additional year follows the same naming - TODO confirm)
        month_filename = f'{month:02d}_{year}.csv'
        file_path = os.path.join(year_dir, month_filename)

        # Parse only the useful columns (usecols returns them in file order),
        # then re-select with `columns` to enforce the intended column order
        month_df = pd.read_csv(file_path, usecols=columns)[columns]
        monthly_frames.append(month_df)

# Single concatenation of all months
df = pd.concat(monthly_frames, ignore_index=True)

In [8]:
# Display the combined DataFrame (the rendered output is truncated to the first and last rows)
display(df)

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_STATE_FIPS,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_CITY_MARKET_ID,DEST_STATE_FIPS,DEST_WAC,CRS_DEP_TIME,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,FLIGHTS,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,1,1,1,6,9E,20363,N138EV,4732,10135,30135,42,23,11433,31295,26,43,1015,1000-1059,1209,1200-1259,114.0,425.0,2,1.0,-1.0,0.0,0.0,-1.0,,,,,
1,1,1,1,6,9E,20363,N138EV,4979,11433,31295,26,43,14524,34524,51,38,2130,2100-2159,2305,2300-2359,95.0,456.0,2,1.0,,,,,,,,,
2,1,1,1,6,9E,20363,N138EV,5430,10135,30135,42,23,11433,31295,26,43,1700,1700-1759,1853,1800-1859,113.0,425.0,2,1.0,,,,,,,,,
3,1,1,1,6,9E,20363,N138EV,5430,11433,31295,26,43,10135,30135,42,23,1422,1400-1459,1548,1500-1559,86.0,425.0,2,1.0,-3.0,0.0,0.0,-1.0,,,,,
4,1,1,1,6,9E,20363,N146PQ,4722,11433,31295,26,43,11423,31423,19,61,1255,1200-1259,1348,1300-1359,113.0,534.0,3,1.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6729120,4,12,31,6,YX,20452,N882RW,5741,12953,31703,36,22,11066,31066,39,44,830,0800-0859,1038,1000-1059,128.0,479.0,2,1.0,-7.0,0.0,0.0,-1.0,,,,,
6729121,4,12,31,6,YX,20452,N979RP,3607,14321,34321,23,12,13930,30977,17,41,635,0600-0659,758,0700-0759,143.0,900.0,4,1.0,-7.0,0.0,0.0,-1.0,0.0,0.0,25.0,0.0,0.0
6729122,4,12,31,6,YX,20452,N979RP,3686,13930,30977,17,41,14321,34321,23,12,1800,1800-1859,2129,2100-2159,149.0,900.0,4,1.0,-7.0,0.0,0.0,-1.0,,,,,
6729123,4,12,31,6,YX,20452,N979RP,3699,11003,31003,19,61,13930,30977,17,41,1508,1500-1559,1627,1600-1659,79.0,196.0,1,1.0,-10.0,0.0,0.0,-1.0,,,,,


## Target Data Quality Requirement
- The table below shows what requirement should be met by each target column for a satisfying data quality

|                         Target                           |                 Data Quality Requirement                        |
|----------------------------------------------------------|-----------------------------------------------------------------|
| DEP_DELAY                                                | Value is required; If Nan, delete row                           |
|                                                          | Value type is int                                               |
| DEP_DELAY_NEW                                            | Value is required; If Nan, delete row                           |
|                                                          | Value type is int                                               |
|                                                          | Negative values are invalid                                     |
| DEP_DEL15                                                | Value must be 1 or 0                                            |
|                                                          | Value is required; If Nan, delete row                           |
|                                                          | Value type is int                                               |
| DEP_DELAY_GROUP                                          | Values must range from -2 to 12, by 1 step                      |
|                                                          | Value is required; If Nan, delete row                           |
|                                                          | Value type is int                                               |
| CARRIER_DELAY                                            | Value is not required; If Nan, set to 0                         |
|                                                          | Negative values are invalid                                     |
|                                                          | Value type is int                                               |
| WEATHER_DELAY                                            | Value is not required; If Nan, set to 0                         |
|                                                          | Negative values are invalid                                     |
|                                                          | Value type is int                                               |
| NAS_DELAY                                                | Value is not required; If Nan, set to 0                         |
|                                                          | Negative values are invalid                                     |
|                                                          | Value type is int                                               |
| SECURITY_DELAY                                           | Value is not required; If Nan, set to 0                         |
|                                                          | Negative values are invalid                                     |
|                                                          | Value type is int                                               |
| LATE_AIRCRAFT_DELAY                                      | Value is not required; If Nan, set to 0                         |
|                                                          | Negative values are invalid                                     |
|                                                          | Value type is int                                               |

In [9]:
# Start by viewing the descriptive statistics of each target column
targets = [
    # Departure-delay columns
    "DEP_DELAY", "DEP_DELAY_NEW", "DEP_DEL15", "DEP_DELAY_GROUP",
    # Per-cause delay columns
    "CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",
]

# Summary statistics of the raw targets, rounded to whole numbers for readability
raw_target_stats = df[targets].describe().round()
display(raw_target_stats)

Unnamed: 0,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
count,6551777.0,6551777.0,6551777.0,6551777.0,1376798.0,1376798.0,1376798.0,1376798.0,1376798.0
mean,13.0,16.0,0.0,0.0,27.0,4.0,11.0,0.0,25.0
std,53.0,52.0,0.0,2.0,74.0,31.0,29.0,4.0,55.0
min,-96.0,0.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0
25%,-5.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
50%,-2.0,0.0,0.0,-1.0,7.0,0.0,0.0,0.0,0.0
75%,10.0,10.0,0.0,0.0,26.0,0.0,15.0,0.0,30.0
max,3433.0,3433.0,1.0,12.0,3423.0,2363.0,1740.0,1245.0,2302.0


In [10]:
# Work on a copy so the cleaning steps below leave the loaded `df` unchanged
prosessed_df = df.copy()

In [11]:
## DEP_DELAY
# Requirements: value is required (rows with NaN are dropped) and dtype is int

print("Removing NaN from 'DEP_DELAY'")
# Boolean mask of the rows whose DEP_DELAY is missing
nan_values = prosessed_df['DEP_DELAY'].isna()

# Count missing and present values before anything is dropped
nan_invalid_count = nan_values.sum()
valid_count = prosessed_df['DEP_DELAY'].notna().sum()
print(f"{nan_invalid_count = }")
print(f"{valid_count = }")

# Keep only the rows that have a DEP_DELAY value
prosessed_df = prosessed_df.dropna(subset=['DEP_DELAY'])
print(f"{len(prosessed_df) = }")
print()

# With the NaN rows gone, the float column can be cast to int
print("Converting to dtype int")
print(f"Before {prosessed_df['DEP_DELAY'].dtype = }")
prosessed_df['DEP_DELAY'] = prosessed_df['DEP_DELAY'].astype(int)
print(f"After {prosessed_df['DEP_DELAY'].dtype = }")

Removing NaN from 'DEP_DELAY'
nan_invalid_count = np.int64(177348)
valid_count = np.int64(6551777)
len(prosessed_df) = 6551777

Converting to dtype int
Before prosessed_df['DEP_DELAY'].dtype = dtype('float64')
After prosessed_df['DEP_DELAY'].dtype = dtype('int64')


In [12]:
## DEP_DELAY_NEW
# Requirements: value is required (rows with NaN are dropped), dtype is int,
# and negative values are invalid (rows holding them are dropped)

print("Removing NaN from 'DEP_DELAY_NEW'")
# Boolean mask of the rows whose DEP_DELAY_NEW is missing
nan_values = prosessed_df['DEP_DELAY_NEW'].isna()

# Count missing and present values before anything is dropped
nan_invalid_count = nan_values.sum()
valid_count = prosessed_df['DEP_DELAY_NEW'].notna().sum()
print(f"{nan_invalid_count = }")
print(f"{valid_count = }")

# Keep only the rows that have a DEP_DELAY_NEW value
prosessed_df = prosessed_df.dropna(subset=['DEP_DELAY_NEW'])
print(f"{len(prosessed_df) = }")
print()

# With the NaN rows gone, the float column can be cast to int
print("Converting to dtype int")
print(f"Before {prosessed_df['DEP_DELAY_NEW'].dtype = }")
prosessed_df['DEP_DELAY_NEW'] = prosessed_df['DEP_DELAY_NEW'].astype(int)
print(f"After {prosessed_df['DEP_DELAY_NEW'].dtype = }")
print()

# Drop the rows that hold an invalid negative value in this column
print("Remove rows with invalid negative column at this column")
negative_count = prosessed_df['DEP_DELAY_NEW'].lt(0).sum()
print(f"{negative_count = }")
non_negative = prosessed_df['DEP_DELAY_NEW'].ge(0)
prosessed_df = prosessed_df[non_negative]
print(f"{len(prosessed_df) = }")
print()

Removing NaN from 'DEP_DELAY_NEW'
nan_invalid_count = np.int64(0)
valid_count = np.int64(6551777)
len(prosessed_df) = 6551777

Converting to dtype int
Before prosessed_df['DEP_DELAY_NEW'].dtype = dtype('float64')
After prosessed_df['DEP_DELAY_NEW'].dtype = dtype('int64')

Remove rows with invalid negative column at this column
negative_count = np.int64(0)
len(prosessed_df) = 6551777



In [13]:
## DEP_DEL15
# Requirements: value is required (rows with NaN are dropped), dtype is int,
# and the only valid values are 0 and 1

# The message names the real column, DEP_DEL15 (there is no DEP_DELAY15 column)
print("Removing NaN from 'DEP_DEL15'")
# Boolean mask of the rows whose DEP_DEL15 is missing
nan_values = prosessed_df['DEP_DEL15'].isna()

# Number of missing values in the target
nan_invalid_count = nan_values.sum()
print(f"{nan_invalid_count = }")

# Number of present values in the target
valid_count = (~nan_values).sum()
print(f"{valid_count = }")

# Keep only the rows that have a DEP_DEL15 value
prosessed_df = prosessed_df.dropna(subset=['DEP_DEL15'])
print(f"{len(prosessed_df) = }")
print()

# With the NaN rows gone, the float column can be cast to int
print("Converting to dtype int")
print(f"Before {prosessed_df['DEP_DEL15'].dtype = }")
prosessed_df['DEP_DEL15'] = prosessed_df['DEP_DEL15'].astype(int)
print(f"After {prosessed_df['DEP_DEL15'].dtype = }")
print()

# Drop rows where DEP_DEL15 is neither 0 nor 1; the mask is computed once and
# reused for both the count and the filter
print("Dropping rows where values in column 'DEP_DEL15' are not in [0, 1]")
is_valid_flag = prosessed_df['DEP_DEL15'].isin([0, 1])
non_valid_count = (~is_valid_flag).sum()
print(f"{non_valid_count = }")
prosessed_df = prosessed_df[is_valid_flag]
print(f"After {len(prosessed_df) = }")

Removing NaN from 'DEP_DELAY15'
nan_invalid_count = np.int64(0)
valid_count = np.int64(6551777)
len(prosessed_df) = 6551777

Converting to dtype int
Before prosessed_df['DEP_DEL15'].dtype = dtype('float64')
After prosessed_df['DEP_DEL15'].dtype = dtype('int64')

Dropping rows where values in column 'DEP_DEL15' are not in [0, 1]
non_valid_count = np.int64(0)
After len(prosessed_df) = 6551777


In [14]:
## DEP_DELAY_GROUP
# Requirements: value is required (rows with NaN are dropped), dtype is int,
# and values must be one of the integers from -2 to 12

print("Removing NaN from 'DEP_DELAY_GROUP'")
# Boolean mask of the rows whose DEP_DELAY_GROUP is missing
nan_values = prosessed_df['DEP_DELAY_GROUP'].isna()

# Count missing and present values before anything is dropped
nan_invalid_count = nan_values.sum()
valid_count = prosessed_df['DEP_DELAY_GROUP'].notna().sum()
print(f"{nan_invalid_count = }")
print(f"{valid_count = }")

# Keep only the rows that have a DEP_DELAY_GROUP value
prosessed_df = prosessed_df.dropna(subset=['DEP_DELAY_GROUP'])
print(f"{len(prosessed_df) = }")
print()

# With the NaN rows gone, the float column can be cast to int
print("Converting to dtype int")
print(f"Before {prosessed_df['DEP_DELAY_GROUP'].dtype = }")
prosessed_df['DEP_DELAY_GROUP'] = prosessed_df['DEP_DELAY_GROUP'].astype(int)
print(f"After {prosessed_df['DEP_DELAY_GROUP'].dtype = }")
print()

# Drop rows where DEP_DELAY_GROUP is not one of -2, -1, 0, ..., 11, 12
print("Dropping rows where values in column 'DEP_DELAY_GROUP' are not in [-2, -1, 0, ..., 10, 11, 12]")
valid_groups = list(range(-2, 13))
is_valid_group = prosessed_df['DEP_DELAY_GROUP'].isin(valid_groups)
non_valid_count = (~is_valid_group).sum()
print(f"{non_valid_count = }")
prosessed_df = prosessed_df[is_valid_group]
print(f"After {len(prosessed_df) = }")

Removing NaN from 'DEP_DELAY_GROUP'
nan_invalid_count = np.int64(0)
valid_count = np.int64(6551777)
len(prosessed_df) = 6551777

Converting to dtype int
Before prosessed_df['DEP_DELAY_GROUP'].dtype = dtype('float64')
After prosessed_df['DEP_DELAY_GROUP'].dtype = dtype('int64')

Dropping rows where values in column 'DEP_DELAY_GROUP' are not in [-2, -1, 0, ..., 10, 11, 12]
non_valid_count = np.int64(0)
After len(prosessed_df) = 6551777


In [15]:
## CARRIER_DELAY, WEATHER_DELAY, NAS_DELAY, SECURITY_DELAY, LATE_AIRCRAFT_DELAY
# The five delay-cause columns share the same requirements:
# - value is not required; NaN is replaced by 0
# - dtype is int
# - negative values are invalid; rows holding them are dropped

delay_cause_columns = (
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
)

for column in delay_cause_columns:
    print(f"Filling NaN with 0 from '{column}'")
    # Zeros already present before filling; used in the sanity check printed below
    before_zero_count = prosessed_df[column].eq(0).sum()
    print(f"{before_zero_count = }")
    # Mask and count of the missing values in this column
    nan_values = prosessed_df[column].isna()
    nan_count = nan_values.sum()
    print(f"{nan_count = }")
    # Replace every missing value with 0
    prosessed_df[column] = prosessed_df[column].fillna(0)
    # After filling, the zero count should equal the old zeros plus the filled NaNs
    zero_count = prosessed_df[column].eq(0).sum()
    print(f"{zero_count = } should be equal to {before_zero_count+nan_count}")
    print()

    # No NaN is left, so the float column can be cast to int
    print("Converting to dtype int")
    print(f"Before {prosessed_df[column].dtype = }")
    prosessed_df[column] = prosessed_df[column].astype(int)
    print(f"After {prosessed_df[column].dtype = }")
    print()

    # Drop the rows that hold an invalid negative value in this column
    print(f"Dropping rows with invalid negative values in {column}")
    negative_count = prosessed_df[column].lt(0).sum()
    print(f"{negative_count = }")
    prosessed_df = prosessed_df[prosessed_df[column].ge(0)]
    print(f"{len(prosessed_df) = }")
    print("----------------------------------------------------------------------------------------")
    print()


Filling NaN with 0 from 'CARRIER_DELAY'
before_zero_count = np.int64(538461)
nan_count = np.int64(5174979)
zero_count = np.int64(5713440) should be equal to 5713440

Converting to dtype int
Before prosessed_df[column].dtype = dtype('float64')
After prosessed_df[column].dtype = dtype('int64')

Dropping rows with invalid negative values in CARRIER_DELAY
negative_count = np.int64(0)
len(prosessed_df) = 6551777
----------------------------------------------------------------------------------------

Filling NaN with 0 from 'WEATHER_DELAY'
before_zero_count = np.int64(1300954)
nan_count = np.int64(5174979)
zero_count = np.int64(6475933) should be equal to 6475933

Converting to dtype int
Before prosessed_df[column].dtype = dtype('float64')
After prosessed_df[column].dtype = dtype('int64')

Dropping rows with invalid negative values in WEATHER_DELAY
negative_count = np.int64(0)
len(prosessed_df) = 6551777
---------------------------------------------------------------------------------------

In [17]:
# Preview the target columns after the cleaning steps
display(prosessed_df.loc[:, targets])

Unnamed: 0,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,-1,0,0,-1,0,0,0,0,0
3,-3,0,0,-1,0,0,0,0,0
6,-5,0,0,-1,0,0,0,0,0
7,-3,0,0,-1,0,0,0,0,0
8,-3,0,0,-1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
6729120,-7,0,0,-1,0,0,0,0,0
6729121,-7,0,0,-1,0,0,25,0,0
6729122,-7,0,0,-1,0,0,0,0,0
6729123,-10,0,0,-1,0,0,0,0,0


In [18]:
# Summary statistics of the cleaned target columns, rounded to whole numbers
cleaned_target_stats = prosessed_df[targets].describe().round()
display(cleaned_target_stats)

Unnamed: 0,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
count,6551777.0,6551777.0,6551777.0,6551777.0,6551777.0,6551777.0,6551777.0,6551777.0,6551777.0
mean,13.0,16.0,0.0,0.0,6.0,1.0,2.0,0.0,5.0
std,53.0,52.0,0.0,2.0,36.0,14.0,14.0,2.0,27.0
min,-96.0,0.0,0.0,-2.0,0.0,0.0,0.0,0.0,0.0
25%,-5.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
50%,-2.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0
75%,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3433.0,3433.0,1.0,12.0,3423.0,2363.0,1740.0,1245.0,2302.0


|                         Feature                           |                 Data Quality Requirement                        |
|-----------------------------------------------------------|-----------------------------------------------------------------|
| QUARTER                                                   | Values must range from 1 to 4, by 1 step                        |
|                                                           | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| MONTH                                                     | Values must range from 1 to 12, by 1 step                       |
|                                                           | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| DAY_OF_MONTH                                              | Values must range from 1 to 28 for month (2)                    |
|                                                           | Values must range from 1 to 30 for month (4, 6, 9, 11)          |
|                                                           | Values must range from 1 to 31 for month (1, 3, 5, 7, 8, 10, 12)|
|                                                           | Value step by 1                                                 |
|                                                           | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| DAY_OF_WEEK                                               | Values must range from 1 to 7, by 1 step                        |
|                                                           | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| OP_UNIQUE_CARRIER                                         | Value is required; If Nan, delete row                           |
|                                                           | Value type is str                                               |
| OP_CARRIER_AIRLINE_ID                                     | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| TAIL_NUM                                                  | Value is required; If Nan, delete row                           |
|                                                           | Value type is str                                               |
| OP_CARRIER_FL_NUM                                         | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| ORIGIN_AIRPORT_ID                                         | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| ORIGIN_CITY_MARKET_ID                                     | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| ORIGIN_STATE_FIPS                                         | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| ORIGIN_WAC                                                | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| DEST_AIRPORT_ID                                           | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| DEST_CITY_MARKET_ID                                       | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| DEST_STATE_FIPS                                           | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| DEST_WAC                                                  | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| CRS_DEP_TIME                                              | Values must range from 1 to 2400, by 1 step                     |
|                                                           | Values with left 2 digits higher than 24 are invalid            |
|                                                           | Values with right 2 digits higher than 59 are invalid           |
|                                                           | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| DEP_TIME_BLK                                              | value split by "-" should contain 2 valid times of format HHMM  |
|                                                           | Value is required; If Nan, delete row                           |
|                                                           | Value type is str                                               |
| CRS_ARR_TIME                                              | Values must range from 1 to 2400, by 1 step                     |
|                                                           | Values with left 2 digits higher than 24 are invalid            |
|                                                           | Values with right 2 digits higher than 59 are invalid           |
|                                                           | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| ARR_TIME_BLK                                              | value split by "-" should contain 2 valid times of format HHMM  |
|                                                           | Value is required; If Nan, delete row                           |
|                                                           | Value type is str                                               |
| CRS_ELAPSED_TIME                                          | Value is required; If Nan, delete row                           |
|                                                           | Negative values are invalid                                     |
|                                                           | Value type is int                                               |
| DISTANCE                                                  | Value is required; If Nan, delete row                           |
|                                                           | Negative values are invalid                                     |
|                                                           | Value type is int                                               |
| DISTANCE_GROUP                                            | Values must range from 1 to 11, by 1 step                       |
|                                                           | Negative values are invalid                                     |
|                                                           | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |
| FLIGHTS                                                   | Negative values are invalid                                     |
|                                                           | Value is required; If Nan, delete row                           |
|                                                           | Value type is int                                               |