<h1>Prediction Using Current Best Random Forest Estimator</h1>

In [1]:
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import warnings
# Suppress every warning category for the rest of the notebook session.
# NOTE(review): this also hides deprecation / dtype warnings from pandas and
# scikit-learn — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

<h3>A - Import Training Data</h3>

In [2]:
# Training frame, read from the label-encoded challenge-set CSV.
label_encoded_challenge_set_df = pd.read_csv('../data/label_encoded_with_value_recognition_challenge_set.csv')

# Model inputs: the columns the regressor is trained on (everything else in
# the frame, e.g. flight_id, is left out).
feature_columns = [
    'adep', 'country_code_adep',
    'ades', 'country_code_ades',
    'aircraft_type', 'wtc', 'airline',
    'flight_duration', 'taxiout_time', 'flown_distance',
    'month-day', 'month', 'day_in_month',
]
features = label_encoded_challenge_set_df[feature_columns]

# Regression target: the 'tow' column.
target = label_encoded_challenge_set_df['tow']

<h3>B - Train a RandomForestRegressor</h3>

In [3]:
# Fit the random forest on the full training frame.
# A fixed random_state pins the bootstrap samples and split choices, so that
# "Restart Kernel -> Run All" reproduces the same model and the same
# submission predictions on every run.
rf = RandomForestRegressor(
    n_estimators=12, # From elbow graph in CV Grid Search 
    random_state=42, # Fixed seed for reproducible training
)

# Left as the cell's last expression so the fitted estimator is displayed.
rf.fit(features, target)

<h3>C - Predict for Submission</h3>

<h3>E - Encode submission data before predicting</h3>

In [4]:
submission_set_df = pd.read_csv('../data/submission_set.csv')

# Split the date into calendar features. The year is not kept: per the
# original note it is the same (2022) across the entire dataset.
flight_dates = pd.to_datetime(submission_set_df['date'])

submission_set_df = (
    submission_set_df
    .assign(**{
        # 'MM-DD' string, e.g. '01-01'
        'month-day': flight_dates.dt.strftime('%m-%d'),
        # Read the integer components straight from the datetime accessor
        # instead of formatting to text and parsing back.
        'month': flight_dates.dt.month.astype(int),
        'day_in_month': flight_dates.dt.day.astype(int),
    })
    # Drop the original date column now that its parts are extracted
    .drop(columns=['date'])
)

# Add Maximum Aircraft Weights
aircraft_max_weights = pd.read_csv("../data/aircraft_max_weights.csv")[['Type', 'MTOW [kg]']]
print("Row (aircraft_max_weights.csv) DataFrame:")
display(aircraft_max_weights)


def _first_matching_mtow(aircraft_type, weights_table):
    """Return the MTOW of the first table row whose 'Type' contains aircraft_type.

    The match is a plain substring test, scanned in table order. When no row
    matches, np.inf is returned as the "unknown weight" placeholder.
    """
    for type_name, mtow in zip(weights_table['Type'], weights_table['MTOW [kg]']):
        if aircraft_type in type_name:
            return np.int64(mtow)
    return np.inf


combined_submission_set_df = submission_set_df.copy()

# The lookup result depends only on the aircraft type, so resolve it once per
# distinct type instead of re-scanning the weight table for every flight row.
mtow_by_aircraft_type = {
    aircraft_type: _first_matching_mtow(aircraft_type, aircraft_max_weights)
    for aircraft_type in combined_submission_set_df['aircraft_type'].dropna().unique()
}
combined_submission_set_df['aircraft_max_weight'] = (
    combined_submission_set_df['aircraft_type']
    .map(mtow_by_aircraft_type)
    .fillna(np.inf)   # rows with a missing aircraft_type get the same "unknown" placeholder
    .astype(float)    # float column, since np.inf is used for unmatched types
)
print("Combined (submission_set.csv & aircraft_max_weights.csv) DataFrame:")

# Remove columns that are not carried forward into the encoded submission frame.
combined_submission_set_df = combined_submission_set_df.drop(columns=['callsign', 'name_adep', 'name_ades', 'actual_offblock_time', 'arrival_time'])

display(combined_submission_set_df)

Row (aircraft_max_weights.csv) DataFrame:


Unnamed: 0,Type,MTOW [kg]
0,Antonov An-225,640000
1,Scaled Composites Model 351 Stratolaunch,"589,670 [1]"
2,Airbus A380-800[2][3][4],575000
3,Boeing 747-8F,447700
4,Boeing 747-8,443613
...,...,...
113,Embraer Phenom 300[26],8150
114,Beechcraft 1900D,7765
115,Cessna Citation CJ4[27],7761
116,de Havilland Hercules,7000


Combined (submission_set.csv & aircraft_max_weights.csv) DataFrame:


Unnamed: 0,flight_id,adep,country_code_adep,ades,country_code_ades,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow,month-day,month,day_in_month,aircraft_max_weight
0,248753821,LTFJ,TR,LFLL,FR,B738,M,6351ec1b849adacc0cbb3b1313d8d39b,170,15,1122,,01-01,1,1,inf
1,248753822,EBBR,BE,KJFK,US,A333,H,bdeeef3a675587d530de70a25d7118d2,470,15,3205,,01-01,1,1,inf
2,248754498,KMIA,US,EGLL,GB,B77W,H,5543e4dc327359ffaf5b9c0e6faaf0e1,473,10,3965,,01-01,1,1,inf
3,248757623,EGCN,GB,LEAL,ES,B38M,M,3922524069809ac4326134429751e26f,156,10,986,,01-01,1,1,inf
4,248763603,EIDW,IE,LFLL,FR,A320,M,a73f82288988b79be490c6322f4c32ed,105,15,686,,01-01,1,1,68000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105954,258066302,LTFJ,TR,EKCH,DK,B38M,M,6351ec1b849adacc0cbb3b1313d8d39b,201,15,1199,,12-31,12,31,inf
105955,258068609,LOWW,AT,KIAD,US,B763,H,5d407cb11cc29578cc3e292e743f5393,575,14,3937,,12-31,12,31,inf
105956,258068876,LTFM,TR,LSZH,CH,A321,M,6351ec1b849adacc0cbb3b1313d8d39b,154,25,988,,12-31,12,31,83000.0
105957,258064675,EHAM,NL,EDDF,DE,A320,M,f502877cab405652cf0dd70c2213e730,42,9,240,,12-31,12,31,68000.0


In [8]:
# Challenge set with its categorical columns still stored as strings; the
# encoding cell uses this frame to build the string -> integer mapping.
combined_challenge_set_df = pd.read_csv('../data/combined_challenge_set.csv')

# Label-encoded challenge set, loaded for side-by-side inspection.
label_encoded_with_value_recognition_challenge_set_df = pd.read_csv('../data/label_encoded_with_value_recognition_challenge_set.csv')

# Show both frames, string version first.
display(
    combined_challenge_set_df,
    label_encoded_with_value_recognition_challenge_set_df,
)

Unnamed: 0,flight_id,adep,country_code_adep,ades,country_code_ades,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow,month-day,month,day_in_month,aircraft_max_weight
0,248763780,EGLL,GB,EICK,IE,A320,M,a73f82288988b79be490c6322f4c32ed,61,18,321,54748.000000,01-01,1,1,68000.0
1,248760618,LEBL,ES,KMIA,US,B772,H,5543e4dc327359ffaf5b9c0e6faaf0e1,570,13,4193,185441.000000,01-01,1,1,inf
2,248753824,ESSA,SE,KORD,US,A333,H,8be5c854fd664bcb97fb543339f74770,554,15,3770,230396.000000,01-01,1,1,inf
3,248753852,LSZH,CH,KPHL,US,B788,H,5543e4dc327359ffaf5b9c0e6faaf0e1,497,11,3607,157615.000000,01-01,1,1,inf
4,248755934,EIDW,IE,EGLL,GB,A21N,M,a73f82288988b79be490c6322f4c32ed,55,14,305,70318.447226,01-01,1,1,inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369008,258058361,LFPG,FR,KMIA,US,B788,H,5543e4dc327359ffaf5b9c0e6faaf0e1,545,21,4046,163438.000000,12-31,12,31,inf
369009,258071247,LTFM,TR,EDDB,DE,A21N,M,6351ec1b849adacc0cbb3b1313d8d39b,158,25,946,78707.000000,12-31,12,31,inf
369010,258059152,EDDL,DE,EIDW,IE,A320,M,a73f82288988b79be490c6322f4c32ed,99,11,522,62942.750000,12-31,12,31,68000.0
369011,258072276,LFPG,FR,EIDW,IE,A21N,M,a73f82288988b79be490c6322f4c32ed,84,12,466,72611.161024,12-31,12,31,inf


Unnamed: 0,flight_id,adep,country_code_adep,ades,country_code_ades,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow,month-day,month,day_in_month,aircraft_max_weight
0,248763780,0,460,31,464,594,624,626,61,18,321,54748.000000,655,1,1,68000.0
1,248760618,1,461,14,467,595,625,627,570,13,4193,185441.000000,655,1,1,inf
2,248753824,2,462,64,467,596,625,628,554,15,3770,230396.000000,655,1,1,inf
3,248753852,3,463,136,467,597,625,627,497,11,3607,157615.000000,655,1,1,inf
4,248755934,4,464,0,460,598,624,626,55,14,305,70318.447226,655,1,1,inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369008,258058361,23,474,14,467,597,625,627,545,21,4046,163438.000000,1019,12,31,inf
369009,258071247,7,466,25,470,598,624,629,158,25,946,78707.000000,1019,12,31,inf
369010,258059152,27,470,4,464,594,624,626,99,11,522,62942.750000,1019,12,31,68000.0
369011,258072276,23,474,4,464,598,624,626,84,12,466,72611.161024,1019,12,31,inf


In [6]:
# Identify string columns (object dtype) in the challenge set
string_columns = combined_challenge_set_df.select_dtypes(include=['object']).columns

# Create a global mapping for encoding: one shared value -> integer table
# across all string columns, numbered in order of first appearance.
unique_values = pd.unique(combined_challenge_set_df[string_columns].values.ravel('K'))
value_to_int = {value: idx for idx, value in enumerate(unique_values)}

# Define a function to encode values
def encode_value(value):
    """Return the integer code for ``value``, or ``value`` itself if it has no code.

    Values that never occurred in the challenge set are passed through
    unchanged rather than raising.
    """
    return value_to_int.get(value, value)  # Encode string values, keep other values unchanged

# Apply encoding only to string columns
label_encoded_with_value_recognition_submission_set_df = combined_submission_set_df.copy()  # Create a copy of the original DataFrame
label_encoded_with_value_recognition_submission_set_df[string_columns] = label_encoded_with_value_recognition_submission_set_df[string_columns].map(encode_value)

# Surface categories that exist only in the submission set: encode_value leaves
# them as raw strings, which would otherwise go unnoticed until prediction.
unseen_value_counts = (
    label_encoded_with_value_recognition_submission_set_df[string_columns]
    .map(lambda cell: isinstance(cell, str))
    .sum()
)
unseen_value_counts = unseen_value_counts[unseen_value_counts > 0]
if not unseen_value_counts.empty:
    print("Values missing from the challenge-set encoding (left unencoded), per column:")
    display(unseen_value_counts)

display(label_encoded_with_value_recognition_submission_set_df)

Unnamed: 0,flight_id,adep,country_code_adep,ades,country_code_ades,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow,month-day,month,day_in_month,aircraft_max_weight
0,248753821,18,466,59,474,600,624,629,170,15,1122,,655,1,1,inf
1,248753822,20,471,46,467,596,625,633,470,15,3205,,655,1,1,inf
2,248754498,14,467,0,460,605,625,627,473,10,3965,,655,1,1,inf
3,248757623,97,460,12,461,612,624,632,156,10,986,,655,1,1,inf
4,248763603,4,464,59,474,594,624,626,105,15,686,,655,1,1,68000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105954,258066302,18,466,17,469,612,624,629,201,15,1199,,1019,12,31,inf
105955,258068609,9,468,66,467,609,625,630,575,14,3937,,1019,12,31,inf
105956,258068876,7,466,3,463,602,624,629,154,25,988,,1019,12,31,83000.0
105957,258064675,28,475,19,470,594,624,654,42,9,240,,1019,12,31,68000.0
