<h1>Process Dataset</h1>

In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

In [2]:
# Load the challenge set and keep only the columns used in the rest of the notebook.
data_df = pd.read_csv('../data/challenge_set.csv')

challenge_set_df = data_df.loc[:, ['adep', 'country_code_adep', 'ades', 'country_code_ades', 'aircraft_type', 'wtc', 'airline', 'flight_duration', 'taxiout_time', 'flown_distance', 'tow']]
print("\nRow DataFrame (challenge_set.csv):")
display(challenge_set_df)

# Reference table: aircraft name ('Type') and maximum take-off weight ('MTOW [kg]').
aircraft_max_weights = pd.read_csv("../data/aircraft_max_weights.csv")[['Type', 'MTOW [kg]']]
print("\nRow DataFrame (aircraft_max_weights.csv):")
display(aircraft_max_weights)


def _parse_mtow_kg(raw_mtow):
    """Convert one 'MTOW [kg]' cell to a float.

    The column mixes plain numbers with annotated strings such as
    '589,670 [1]', so footnote markers and thousands separators are
    stripped before conversion.

    Raises:
        ValueError: if the cleaned text is still not a number.
    """
    if isinstance(raw_mtow, str):
        raw_mtow = raw_mtow.split('[')[0].replace(',', '').strip()
    return float(raw_mtow)


def _lookup_max_weight(aircraft_type, reference_rows):
    """Return the MTOW of the first reference row whose name contains ``aircraft_type``.

    Args:
        aircraft_type: value from the 'aircraft_type' column (a string).
        reference_rows: iterable of ``(type_name, raw_mtow)`` pairs in table order.

    Returns:
        The parsed MTOW as a float, or ``np.inf`` when no row matches.
    """
    for type_name, raw_mtow in reference_rows:
        if aircraft_type in type_name:
            return _parse_mtow_kg(raw_mtow)
    return np.inf


# Rows with a missing 'Type' cannot be substring-matched, so they are skipped.
_reference = aircraft_max_weights.dropna(subset=['Type'])
_reference_rows = list(zip(_reference['Type'].astype(str), _reference['MTOW [kg]']))

# The match depends only on the aircraft type, so resolve each distinct type
# once and broadcast the result with ``map`` instead of scanning the whole
# reference table for every flight row.
# NOTE(review): the saved output shows inf for B772 / A333 / B788 / A21N, i.e.
# substring matching of 'aircraft_type' against 'Type' misses most types; a
# dedicated designator -> MTOW table is probably needed here.
max_weight_by_type = {
    aircraft_type: _lookup_max_weight(aircraft_type, _reference_rows)
    for aircraft_type in challenge_set_df['aircraft_type'].dropna().unique()
}
# Unmatched (or missing) aircraft types keep the np.inf sentinel.
challenge_set_df['aircraft_max_weight'] = (
    challenge_set_df['aircraft_type']
    .map(max_weight_by_type)
    .fillna(np.inf)
    .astype('float64')
)
print("\nCombined DataFrame (challenge_set.csv & aircraft_max_weights.csv):")
display(challenge_set_df)

path = "../data/processed_challenge_set_df.csv"
challenge_set_df.to_csv(path, index=False)
print("Saved Combined DataFrame (challenge_set.csv & aircraft_max_weights.csv) to path:", path)


Row DataFrame (challenge_set.csv):


Unnamed: 0,adep,country_code_adep,ades,country_code_ades,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow
0,EGLL,GB,EICK,IE,A320,M,a73f82288988b79be490c6322f4c32ed,61,18,321,54748.000000
1,LEBL,ES,KMIA,US,B772,H,5543e4dc327359ffaf5b9c0e6faaf0e1,570,13,4193,185441.000000
2,ESSA,SE,KORD,US,A333,H,8be5c854fd664bcb97fb543339f74770,554,15,3770,230396.000000
3,LSZH,CH,KPHL,US,B788,H,5543e4dc327359ffaf5b9c0e6faaf0e1,497,11,3607,157615.000000
4,EIDW,IE,EGLL,GB,A21N,M,a73f82288988b79be490c6322f4c32ed,55,14,305,70318.447226
...,...,...,...,...,...,...,...,...,...,...,...
369008,LFPG,FR,KMIA,US,B788,H,5543e4dc327359ffaf5b9c0e6faaf0e1,545,21,4046,163438.000000
369009,LTFM,TR,EDDB,DE,A21N,M,6351ec1b849adacc0cbb3b1313d8d39b,158,25,946,78707.000000
369010,EDDL,DE,EIDW,IE,A320,M,a73f82288988b79be490c6322f4c32ed,99,11,522,62942.750000
369011,LFPG,FR,EIDW,IE,A21N,M,a73f82288988b79be490c6322f4c32ed,84,12,466,72611.161024



Row DataFrame (aircraft_max_weights.csv):


Unnamed: 0,Type,MTOW [kg]
0,Antonov An-225,640000
1,Scaled Composites Model 351 Stratolaunch,"589,670 [1]"
2,Airbus A380-800[2][3][4],575000
3,Boeing 747-8F,447700
4,Boeing 747-8,443613
...,...,...
113,Embraer Phenom 300[26],8150
114,Beechcraft 1900D,7765
115,Cessna Citation CJ4[27],7761
116,de Havilland Hercules,7000



Combined DataFrame (challenge_set.csv & aircraft_max_weights.csv):


Unnamed: 0,adep,country_code_adep,ades,country_code_ades,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow,aircraft_max_weight
0,EGLL,GB,EICK,IE,A320,M,a73f82288988b79be490c6322f4c32ed,61,18,321,54748.000000,68000.0
1,LEBL,ES,KMIA,US,B772,H,5543e4dc327359ffaf5b9c0e6faaf0e1,570,13,4193,185441.000000,inf
2,ESSA,SE,KORD,US,A333,H,8be5c854fd664bcb97fb543339f74770,554,15,3770,230396.000000,inf
3,LSZH,CH,KPHL,US,B788,H,5543e4dc327359ffaf5b9c0e6faaf0e1,497,11,3607,157615.000000,inf
4,EIDW,IE,EGLL,GB,A21N,M,a73f82288988b79be490c6322f4c32ed,55,14,305,70318.447226,inf
...,...,...,...,...,...,...,...,...,...,...,...,...
369008,LFPG,FR,KMIA,US,B788,H,5543e4dc327359ffaf5b9c0e6faaf0e1,545,21,4046,163438.000000,inf
369009,LTFM,TR,EDDB,DE,A21N,M,6351ec1b849adacc0cbb3b1313d8d39b,158,25,946,78707.000000,inf
369010,EDDL,DE,EIDW,IE,A320,M,a73f82288988b79be490c6322f4c32ed,99,11,522,62942.750000,68000.0
369011,LFPG,FR,EIDW,IE,A21N,M,a73f82288988b79be490c6322f4c32ed,84,12,466,72611.161024,inf


Saved Combined DataFrame (challenge_set.csv & aircraft_max_weights.csv) tp path: ../data/processed_challenge_set_df.csv


<h3>Label Encoding</h3>

In [3]:
# Text columns are the ones pandas stores with object dtype.
string_columns = challenge_set_df.select_dtypes(include=['object']).columns

# Build ONE code table shared by all text columns: every distinct value gets
# the position at which it first appears in the flattened array, so a value
# that occurs in several columns receives the same code in each of them.
unique_values = pd.unique(challenge_set_df[string_columns].values.ravel('K'))
value_to_int = dict(zip(unique_values, range(len(unique_values))))

# Element-wise encoder backed by the shared ``value_to_int`` table
def encode_value(value):
    """Return the integer code of ``value``, or ``value`` itself if it has no code.

    Looks ``value`` up in the module-level ``value_to_int`` mapping; anything
    that is not a key of that mapping is passed through unchanged.
    """
    if value in value_to_int:
        return value_to_int[value]
    return value

# Encode each text column on a fresh frame; the non-text columns are carried
# over untouched and ``challenge_set_df`` itself is not modified.
df_encoded = challenge_set_df.assign(
    **{column: challenge_set_df[column].map(encode_value) for column in string_columns}
)

# Show the frame before and after encoding for a visual comparison.
print("Original Combined DataFrame (challenge_set.csv & aircraft_max_weights.csv):")
display(challenge_set_df)

print("\nEncoded DataFrame (challenge_set.csv & aircraft_max_weights.csv):")
display(df_encoded)

# Persist the encoded frame next to the other processed files.
path = "../data/processed_encoded_challenge_set_df.csv"
df_encoded.to_csv(path, index=False)
print("Saved Encoded DataFrame (challenge_set.csv & aircraft_max_weights.csv) to path:", path)

Original Combined DataFrame (challenge_set.csv & aircraft_max_weights.csv):


Unnamed: 0,adep,country_code_adep,ades,country_code_ades,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow,aircraft_max_weight
0,EGLL,GB,EICK,IE,A320,M,a73f82288988b79be490c6322f4c32ed,61,18,321,54748.000000,68000.0
1,LEBL,ES,KMIA,US,B772,H,5543e4dc327359ffaf5b9c0e6faaf0e1,570,13,4193,185441.000000,inf
2,ESSA,SE,KORD,US,A333,H,8be5c854fd664bcb97fb543339f74770,554,15,3770,230396.000000,inf
3,LSZH,CH,KPHL,US,B788,H,5543e4dc327359ffaf5b9c0e6faaf0e1,497,11,3607,157615.000000,inf
4,EIDW,IE,EGLL,GB,A21N,M,a73f82288988b79be490c6322f4c32ed,55,14,305,70318.447226,inf
...,...,...,...,...,...,...,...,...,...,...,...,...
369008,LFPG,FR,KMIA,US,B788,H,5543e4dc327359ffaf5b9c0e6faaf0e1,545,21,4046,163438.000000,inf
369009,LTFM,TR,EDDB,DE,A21N,M,6351ec1b849adacc0cbb3b1313d8d39b,158,25,946,78707.000000,inf
369010,EDDL,DE,EIDW,IE,A320,M,a73f82288988b79be490c6322f4c32ed,99,11,522,62942.750000,68000.0
369011,LFPG,FR,EIDW,IE,A21N,M,a73f82288988b79be490c6322f4c32ed,84,12,466,72611.161024,inf



Encoded DataFrame (challenge_set.csv & aircraft_max_weights.csv):


Unnamed: 0,adep,country_code_adep,ades,country_code_ades,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow,aircraft_max_weight
0,0,460,31,464,594,624,626,61,18,321,54748.000000,68000.0
1,1,461,14,467,595,625,627,570,13,4193,185441.000000,inf
2,2,462,64,467,596,625,628,554,15,3770,230396.000000,inf
3,3,463,136,467,597,625,627,497,11,3607,157615.000000,inf
4,4,464,0,460,598,624,626,55,14,305,70318.447226,inf
...,...,...,...,...,...,...,...,...,...,...,...,...
369008,23,474,14,467,597,625,627,545,21,4046,163438.000000,inf
369009,7,466,25,470,598,624,629,158,25,946,78707.000000,inf
369010,27,470,4,464,594,624,626,99,11,522,62942.750000,68000.0
369011,23,474,4,464,598,624,626,84,12,466,72611.161024,inf


Saved Encoded DataFrame (challenge_set.csv & aircraft_max_weights.csv) to path: ../data/processed_encoded_challenge_set_df.csv
