# This notebook performs label encoding on the useful categorical/non-numerical features in the processed dataset

## Importing the needed packages

In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import json
import os

In [2]:
# Never truncate wide frames: show every column when a DataFrame is rendered
pd.options.display.max_columns = None

## Defining the columns with useful features

In [3]:
# Columns kept from the datasets, in the order they should appear in the frames
columns = [
    # Date-related columns
    "QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    # Carrier, aircraft and flight-number columns
    "OP_CARRIER_AIRLINE_ID", "TAIL_NUM", "OP_CARRIER_FL_NUM",
    # Origin columns
    "ORIGIN_AIRPORT_ID", "ORIGIN_CITY_MARKET_ID", "ORIGIN_STATE_FIPS", "ORIGIN_WAC",
    # Destination columns
    "DEST_AIRPORT_ID", "DEST_CITY_MARKET_ID", "DEST_STATE_FIPS", "DEST_WAC",
    # Schedule and distance columns
    "CRS_DEP_TIME", "DEP_TIME_BLK", "CRS_ARR_TIME", "ARR_TIME_BLK",
    "CRS_ELAPSED_TIME", "DISTANCE", "DISTANCE_GROUP",
    # Departure-delay columns
    "DEP_DELAY", "DEP_DELAY_NEW", "DEP_DEL15", "DEP_DELAY_GROUP",
    # Delay columns named after a cause (carrier, weather, NAS, security, late aircraft)
    "CARRIER_DELAY", "WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY",
]

## Importing the training and evaluation datasets

In [4]:
# Locate the `data` folder that is a sibling of the current working directory.
# NOTE: os.getcwd() returns the process's working directory, which is not
# necessarily the folder that contains this notebook.
current_script_dir = os.getcwd()
# One level up from the working directory
parent_dir = os.path.split(current_script_dir)[0]
# <parent>/data
data_dir = os.path.join(parent_dir, "data")

In [5]:
# Load the 2022 training dataset from <data_dir>/prosessed_dataset_2022.csv
training_dataset_filename = "prosessed_dataset_2022.csv"
file_path = os.path.join(data_dir, training_dataset_filename)

# Parse only the columns that are kept (usecols) instead of reading the whole
# file and discarding the rest afterwards. usecols keeps the file's own column
# order, so the frame is still re-indexed to follow the order of `columns`.
training_df = pd.read_csv(file_path, usecols=columns)
training_df = training_df[columns]

# Display the DataFrame (pandas truncates the rendering to the head/tail rows)
display(training_df)

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_AIRLINE_ID,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_STATE_FIPS,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_CITY_MARKET_ID,DEST_STATE_FIPS,DEST_WAC,CRS_DEP_TIME,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,1,1,1,6,20363,N138EV,4732,10135,30135,42,23,11433,31295,26,43,1015,1000-1059,1209,1200-1259,114,425,2,-1,0,0,-1,0,0,0,0,0
1,1,1,1,6,20363,N138EV,5430,11433,31295,26,43,10135,30135,42,23,1422,1400-1459,1548,1500-1559,86,425,2,-3,0,0,-1,0,0,0,0,0
2,1,1,1,6,20363,N147PQ,4671,10397,30397,13,34,14783,34783,29,64,2057,2000-2059,2149,2100-2159,112,563,3,-5,0,0,-1,0,0,0,0,0
3,1,1,1,6,20363,N147PQ,5009,13487,31650,27,63,11423,31423,19,61,1041,1000-1059,1153,1100-1159,72,232,1,-3,0,0,-1,0,0,0,0,0
4,1,1,1,6,20363,N147PQ,5083,11423,31423,19,61,10397,30397,13,34,1300,1300-1359,1610,1600-1659,130,743,3,-3,0,0,-1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6551768,4,12,31,6,20452,N882RW,5741,12953,31703,36,22,11066,31066,39,44,830,0800-0859,1038,1000-1059,128,479,2,-7,0,0,-1,0,0,0,0,0
6551769,4,12,31,6,20452,N979RP,3607,14321,34321,23,12,13930,30977,17,41,635,0600-0659,758,0700-0759,143,900,4,-7,0,0,-1,0,0,25,0,0
6551770,4,12,31,6,20452,N979RP,3686,13930,30977,17,41,14321,34321,23,12,1800,1800-1859,2129,2100-2159,149,900,4,-7,0,0,-1,0,0,0,0,0
6551771,4,12,31,6,20452,N979RP,3699,11003,31003,19,61,13930,30977,17,41,1508,1500-1559,1627,1600-1659,79,196,1,-10,0,0,-1,0,0,0,0,0


In [6]:
# Load the 2023 evaluation dataset from <data_dir>/prosessed_dataset_2023.csv
evaluation_dataset_filename = "prosessed_dataset_2023.csv"
file_path = os.path.join(data_dir, evaluation_dataset_filename)

# Parse only the columns that are kept (usecols) instead of reading the whole
# file and discarding the rest afterwards. usecols keeps the file's own column
# order, so the frame is still re-indexed to follow the order of `columns`.
evaluation_df = pd.read_csv(file_path, usecols=columns)
evaluation_df = evaluation_df[columns]

# Display the DataFrame (pandas truncates the rendering to the head/tail rows)
display(evaluation_df)

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_AIRLINE_ID,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_STATE_FIPS,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_CITY_MARKET_ID,DEST_STATE_FIPS,DEST_WAC,CRS_DEP_TIME,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,1,1,1,7,20363,N131EV,5244,13930,30977,17,41,12478,31703,36,22,1520,1500-1559,1841,1800-1859,141,740,3,4,4,0,0,0,0,0,0,0
1,1,1,1,7,20363,N131EV,5317,12478,31703,36,22,13930,30977,17,41,945,0900-0959,1144,1100-1159,179,740,3,-4,0,0,-1,0,0,0,0,0
2,1,1,1,7,20363,N131EV,5397,12478,31703,36,22,10581,30581,23,12,2100,2100-2159,2236,2200-2259,96,382,2,-4,0,0,-1,0,0,0,0,0
3,1,1,1,7,20363,N133EV,5076,10397,30397,13,34,14783,34783,29,64,1130,1100-1159,1225,1200-1259,115,563,3,-5,0,0,-1,0,0,0,0,0
4,1,1,1,7,20363,N133EV,5076,14783,34783,29,64,10397,30397,13,34,1400,1400-1459,1637,1600-1659,97,563,3,-6,0,0,-1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6763361,4,12,31,7,20452,N879RW,5680,10721,30721,25,13,11066,31066,39,44,1315,1300-1359,1538,1500-1559,143,640,3,-5,0,0,-1,0,0,0,0,0
6763362,4,12,31,7,20452,N879RW,5701,14122,30198,42,23,10721,30721,25,13,1038,1000-1059,1222,1200-1259,104,496,2,-7,0,0,-1,0,0,0,0,0
6763363,4,12,31,7,20452,N979RP,3428,10431,30431,37,36,13930,30977,17,41,1515,1500-1559,1627,1600-1659,132,536,3,-7,0,0,-1,0,0,0,0,0
6763364,4,12,31,7,20452,N979RP,3517,11193,33105,21,52,13930,30977,17,41,900,0900-0959,946,0900-0959,106,264,2,-7,0,0,-1,0,0,0,0,0


## Define non-numerical columns to encode

In [7]:
# String-valued columns that will be replaced by integer codes
non_numerical_useful_features = ["TAIL_NUM", "DEP_TIME_BLK", "ARR_TIME_BLK"]

## Label encoding of the non-numerical columns

In [8]:
# Stack the training rows on top of the evaluation rows (with a fresh 0..n-1
# index) so that one encoder per column sees the values of both datasets
df = pd.concat((training_df, evaluation_df), axis=0, ignore_index=True)

display(df)

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_AIRLINE_ID,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_STATE_FIPS,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_CITY_MARKET_ID,DEST_STATE_FIPS,DEST_WAC,CRS_DEP_TIME,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,1,1,1,6,20363,N138EV,4732,10135,30135,42,23,11433,31295,26,43,1015,1000-1059,1209,1200-1259,114,425,2,-1,0,0,-1,0,0,0,0,0
1,1,1,1,6,20363,N138EV,5430,11433,31295,26,43,10135,30135,42,23,1422,1400-1459,1548,1500-1559,86,425,2,-3,0,0,-1,0,0,0,0,0
2,1,1,1,6,20363,N147PQ,4671,10397,30397,13,34,14783,34783,29,64,2057,2000-2059,2149,2100-2159,112,563,3,-5,0,0,-1,0,0,0,0,0
3,1,1,1,6,20363,N147PQ,5009,13487,31650,27,63,11423,31423,19,61,1041,1000-1059,1153,1100-1159,72,232,1,-3,0,0,-1,0,0,0,0,0
4,1,1,1,6,20363,N147PQ,5083,11423,31423,19,61,10397,30397,13,34,1300,1300-1359,1610,1600-1659,130,743,3,-3,0,0,-1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13315134,4,12,31,7,20452,N879RW,5680,10721,30721,25,13,11066,31066,39,44,1315,1300-1359,1538,1500-1559,143,640,3,-5,0,0,-1,0,0,0,0,0
13315135,4,12,31,7,20452,N879RW,5701,14122,30198,42,23,10721,30721,25,13,1038,1000-1059,1222,1200-1259,104,496,2,-7,0,0,-1,0,0,0,0,0
13315136,4,12,31,7,20452,N979RP,3428,10431,30431,37,36,13930,30977,17,41,1515,1500-1559,1627,1600-1659,132,536,3,-7,0,0,-1,0,0,0,0,0
13315137,4,12,31,7,20452,N979RP,3517,11193,33105,21,52,13930,30977,17,41,900,0900-0959,946,0900-0959,106,264,2,-7,0,0,-1,0,0,0,0,0


In [9]:
# Report how many distinct values (a missing value counts as one) each column
# holds before it is encoded
for col in non_numerical_useful_features:
    n_unique = df[col].nunique(dropna=False)
    print(f"Number of unique values in column {col} before encoding: {n_unique}")

Number of unique values in column TAIL_NUM before encoding: 6350
Number of unique values in column DEP_TIME_BLK before encoding: 19
Number of unique values in column ARR_TIME_BLK before encoding: 19


In [10]:
# Dictionary to hold the encoding mappings:
# {column name: {original label: integer code}}
encoding_dict = {}

# Label encoding
for col in non_numerical_useful_features:
    le = LabelEncoder()
    # Always fit on the untouched source frames rather than on df[col]: df[col]
    # is overwritten just below, so fitting on it would re-encode the integer
    # codes (and store an int -> int mapping) if this cell is run a second time.
    # df is the row-wise concatenation of training_df and evaluation_df, so the
    # encoded array lines up with df positionally.
    raw_values = pd.concat([training_df[col], evaluation_df[col]], ignore_index=True)
    df[col] = le.fit_transform(raw_values)
    # Store the mapping: the code of a label is its position in le.classes_,
    # so it can be read off directly without another transform() call. The
    # codes are stored as built-in ints.
    encoding_dict[col] = {label: code for code, label in enumerate(le.classes_)}

In [11]:
# View the encoded dataframe: the columns listed in non_numerical_useful_features
# now hold integer codes instead of their original string labels
display(df)

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_AIRLINE_ID,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_STATE_FIPS,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_CITY_MARKET_ID,DEST_STATE_FIPS,DEST_WAC,CRS_DEP_TIME,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,1,1,1,6,20363,360,4732,10135,30135,42,23,11433,31295,26,43,1015,5,1209,7,114,425,2,-1,0,0,-1,0,0,0,0,0
1,1,1,1,6,20363,360,5430,11433,31295,26,43,10135,30135,42,23,1422,9,1548,10,86,425,2,-3,0,0,-1,0,0,0,0,0
2,1,1,1,6,20363,414,4671,10397,30397,13,34,14783,34783,29,64,2057,15,2149,16,112,563,3,-5,0,0,-1,0,0,0,0,0
3,1,1,1,6,20363,414,5009,13487,31650,27,63,11423,31423,19,61,1041,5,1153,6,72,232,1,-3,0,0,-1,0,0,0,0,0
4,1,1,1,6,20363,414,5083,11423,31423,19,61,10397,30397,13,34,1300,8,1610,11,130,743,3,-3,0,0,-1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13315134,4,12,31,7,20452,5247,5680,10721,30721,25,13,11066,31066,39,44,1315,8,1538,10,143,640,3,-5,0,0,-1,0,0,0,0,0
13315135,4,12,31,7,20452,5247,5701,14122,30198,42,23,10721,30721,25,13,1038,5,1222,7,104,496,2,-7,0,0,-1,0,0,0,0,0
13315136,4,12,31,7,20452,6245,3428,10431,30431,37,36,13930,30977,17,41,1515,10,1627,11,132,536,3,-7,0,0,-1,0,0,0,0,0
13315137,4,12,31,7,20452,6245,3517,11193,33105,21,52,13930,30977,17,41,900,4,946,4,106,264,2,-7,0,0,-1,0,0,0,0,0


In [12]:
# Report how many distinct values each column holds after encoding; the counts
# should match the ones printed before encoding
for col in non_numerical_useful_features:
    n_unique = df[col].nunique(dropna=False)
    print(f"Number of unique values in column {col} after encoding: {n_unique}")

Number of unique values in column TAIL_NUM after encoding: 6350
Number of unique values in column DEP_TIME_BLK after encoding: 19
Number of unique values in column ARR_TIME_BLK after encoding: 19


In [13]:
# The encoded frame still has the training (2022) rows first, followed by the
# evaluation (2023) rows, so cutting it at len(training_df) recovers both parts
split_at = len(training_df)
encoded_training_df, encoded_evaluation_df = df.iloc[:split_at], df.iloc[split_at:]

print("Training Dataset")
display(encoded_training_df)
print()

print("Evaluation Dataset")
display(encoded_evaluation_df)

Training Dataset


Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_AIRLINE_ID,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_STATE_FIPS,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_CITY_MARKET_ID,DEST_STATE_FIPS,DEST_WAC,CRS_DEP_TIME,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,1,1,1,6,20363,360,4732,10135,30135,42,23,11433,31295,26,43,1015,5,1209,7,114,425,2,-1,0,0,-1,0,0,0,0,0
1,1,1,1,6,20363,360,5430,11433,31295,26,43,10135,30135,42,23,1422,9,1548,10,86,425,2,-3,0,0,-1,0,0,0,0,0
2,1,1,1,6,20363,414,4671,10397,30397,13,34,14783,34783,29,64,2057,15,2149,16,112,563,3,-5,0,0,-1,0,0,0,0,0
3,1,1,1,6,20363,414,5009,13487,31650,27,63,11423,31423,19,61,1041,5,1153,6,72,232,1,-3,0,0,-1,0,0,0,0,0
4,1,1,1,6,20363,414,5083,11423,31423,19,61,10397,30397,13,34,1300,8,1610,11,130,743,3,-3,0,0,-1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6551768,4,12,31,6,20452,5292,5741,12953,31703,36,22,11066,31066,39,44,830,3,1038,5,128,479,2,-7,0,0,-1,0,0,0,0,0
6551769,4,12,31,6,20452,6245,3607,14321,34321,23,12,13930,30977,17,41,635,1,758,2,143,900,4,-7,0,0,-1,0,0,25,0,0
6551770,4,12,31,6,20452,6245,3686,13930,30977,17,41,14321,34321,23,12,1800,13,2129,16,149,900,4,-7,0,0,-1,0,0,0,0,0
6551771,4,12,31,6,20452,6245,3699,11003,31003,19,61,13930,30977,17,41,1508,10,1627,11,79,196,1,-10,0,0,-1,0,0,0,0,0



Evaluation Dataset


Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_AIRLINE_ID,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_STATE_FIPS,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_CITY_MARKET_ID,DEST_STATE_FIPS,DEST_WAC,CRS_DEP_TIME,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
6551773,1,1,1,7,20363,319,5244,13930,30977,17,41,12478,31703,36,22,1520,10,1841,13,141,740,3,4,4,0,0,0,0,0,0,0
6551774,1,1,1,7,20363,319,5317,12478,31703,36,22,13930,30977,17,41,945,4,1144,6,179,740,3,-4,0,0,-1,0,0,0,0,0
6551775,1,1,1,7,20363,319,5397,12478,31703,36,22,10581,30581,23,12,2100,16,2236,17,96,382,2,-4,0,0,-1,0,0,0,0,0
6551776,1,1,1,7,20363,332,5076,10397,30397,13,34,14783,34783,29,64,1130,6,1225,7,115,563,3,-5,0,0,-1,0,0,0,0,0
6551777,1,1,1,7,20363,332,5076,14783,34783,29,64,10397,30397,13,34,1400,9,1637,11,97,563,3,-6,0,0,-1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13315134,4,12,31,7,20452,5247,5680,10721,30721,25,13,11066,31066,39,44,1315,8,1538,10,143,640,3,-5,0,0,-1,0,0,0,0,0
13315135,4,12,31,7,20452,5247,5701,14122,30198,42,23,10721,30721,25,13,1038,5,1222,7,104,496,2,-7,0,0,-1,0,0,0,0,0
13315136,4,12,31,7,20452,6245,3428,10431,30431,37,36,13930,30977,17,41,1515,10,1627,11,132,536,3,-7,0,0,-1,0,0,0,0,0
13315137,4,12,31,7,20452,6245,3517,11193,33105,21,52,13930,30977,17,41,900,4,946,4,106,264,2,-7,0,0,-1,0,0,0,0,0


## Store the mapping of label encoded columns

In [14]:
# The json module cannot serialise numpy.int64, so cast every code to a built-in int
converted_dict = {
    column: {label: int(code) for label, code in mapping.items()}
    for column, mapping in encoding_dict.items()
}

# Persist the label -> code mapping of every encoded column for documentation
encoding_file_path = os.path.join(data_dir, 'encoding.json')
with open(encoding_file_path, 'w') as f:
    f.write(json.dumps(converted_dict, indent=4))
print(f"mapping of encoding is saved to: {encoding_file_path}")

mapping of encoding is saved to: C:\Users\mghun\projects\Flight-Delay-Prediction\data\encoding.json


## Store each encoded dataset in its own CSV file

In [15]:
# Write the encoded training rows (2022) to a CSV file inside data_dir;
# index=False keeps the row index out of the file
encoded_training_dataset_path = os.path.join(data_dir, 'encoded_training_dataset_2022.csv')
encoded_training_df.to_csv(path_or_buf=encoded_training_dataset_path, index=False)
print("prosessed dataset is saved to: {}".format(encoded_training_dataset_path))

prosessed dataset is saved to: C:\Users\mghun\projects\Flight-Delay-Prediction\data\encoded_training_dataset_2022.csv


In [16]:
# Write the encoded evaluation rows (2023) to a CSV file inside data_dir;
# index=False keeps the row index out of the file
encoded_evaluation_dataset_path = os.path.join(data_dir, 'encoded_evaluation_dataset_2023.csv')
encoded_evaluation_df.to_csv(path_or_buf=encoded_evaluation_dataset_path, index=False)
print("prosessed dataset is saved to: {}".format(encoded_evaluation_dataset_path))

prosessed dataset is saved to: C:\Users\mghun\projects\Flight-Delay-Prediction\data\encoded_evaluation_dataset_2023.csv
