In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb

In [2]:
# --- Notebook configuration -------------------------------------------------

# Column holding the per-flight identifier (not referenced by the cells
# visible in this notebook).
identifier_name = "flight_id"

# Model input columns. The order matters: it fixes the column order of every
# feature frame that is sliced with this list.
features_names = [
    "wtc",
    "aircraft_type",
    "flown_distance",
    "groundspeed_max",
    "latitude_min",
    "altitude_25percentile",
    "airline",
    "flight_duration",
    "longitude_max",
    "vertical_rate_75percentile",
    "altitude_median",
    "ades",
    "longitude_mean",
    "altitude_75percentile",
    "latitude_75percentile",
    "vertical_rate_std",
    "adep",
    "latitude_std",
    "vertical_rate_max",
    "latitude_max",
    "longitude_std",
    "vertical_rate_25percentile",
    "longitude_min",
    "longitude_75percentile",
    "altitude_mean",
    "groundspeed_75percentile",
    "country_code_adep",
    "latitude_median",
    "longitude_25percentile",
    "groundspeed_min",
    "country_code_ades",
    "latitude_25percentile",
    "actual_offblock_time_hour",
    "longitude_median",
    "month",
    "altitude_std",
    "latitude_count",
    "taxiout_time",
    "vertical_rate_median",
    "month_day",
]

# Regression target column.
target_name = "tow"

# Seed shared by every model in this notebook so runs are repeatable.
global_random_state = 123

In [3]:
# Load the challenge set and replace every missing value with 0.
# An alternative input, data/challenge_set_with_trajectories.csv, was used at
# some point but is disabled in favour of the file below.
# NOTE(review): the blanket zero-fill makes missing values indistinguishable
# from genuine zeros — confirm this is acceptable for every column.
challenge_set_df = pd.read_csv('data/encoded_challenge_set.csv').fillna(0)

# Keep only the model inputs plus the target for the rest of the analysis.
challenge_features = challenge_set_df[[*features_names, target_name]]

display(challenge_set_df)

Unnamed: 0,flight_id,month_day,month,day,actual_offblock_time_hour_minute,actual_offblock_time_hour,actual_offblock_time_minute,adep,country_code_adep,arrival_time_hour_minute,...,temperature_75percentile,temperature_max,specific_humidity_count,specific_humidity_mean,specific_humidity_std,specific_humidity_min,specific_humidity_25percentile,specific_humidity_median,specific_humidity_75percentile,specific_humidity_max
0,248763780,0,1,1,826,13,46,69,37,904,...,268.945607,288.078244,3614.0,0.001569,0.002304,0.000003,0.000022,0.000114,0.002915,0.008067
1,248760618,0,1,1,595,9,55,216,33,1177,...,221.850708,298.691536,10358.0,0.001025,0.003064,0.000002,0.000028,0.000044,0.000085,0.013049
2,248753824,0,1,1,579,9,39,137,88,1148,...,241.223833,272.897641,12932.0,0.000512,0.000983,0.000003,0.000016,0.000022,0.000190,0.003214
3,248753852,0,1,1,664,11,4,338,19,1172,...,231.613519,286.518770,17534.0,0.000886,0.002158,0.000002,0.000006,0.000018,0.000154,0.008750
4,248755934,0,1,1,756,12,36,92,44,824,...,275.224544,288.014789,3265.0,0.002256,0.002640,0.000014,0.000029,0.000719,0.004477,0.008008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369008,258058361,364,12,31,578,9,38,253,36,1143,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
369009,258071247,364,12,31,567,9,27,385,96,749,...,221.953594,290.137029,9519.0,0.000698,0.001855,0.000003,0.000024,0.000040,0.000053,0.008280
369010,258059152,364,12,31,592,9,52,30,26,701,...,248.321020,289.537153,5952.0,0.001027,0.001972,0.000003,0.000020,0.000026,0.000880,0.008802
369011,258072276,364,12,31,577,9,37,253,36,673,...,258.227824,286.721784,5068.0,0.001184,0.001979,0.000005,0.000022,0.000029,0.001657,0.007904


In [4]:
# Columns that are one-hot encoded before the similarity computation.
# NOTE(review): actual_offblock_time_hour, month, taxiout_time and month_day
# look like numeric/ordinal quantities but are treated as categories here —
# confirm this is intended.
categorical_columns = [
    "wtc",
    "aircraft_type",
    "airline",
    "ades",
    "adep",
    "country_code_adep",
    "country_code_ades",
    "actual_offblock_time_hour",
    "month",
    "taxiout_time",
    "month_day",
]

# Columns that are handed to the similarity computation as plain numbers.
numerical_columns = [
    "flown_distance",
    "groundspeed_max",
    "latitude_min",
    "altitude_25percentile",
    "flight_duration",
    "longitude_max",
    "vertical_rate_75percentile",
    "altitude_median",
    "longitude_mean",
    "altitude_75percentile",
    "latitude_75percentile",
    "vertical_rate_std",
    "latitude_std",
    "vertical_rate_max",
    "latitude_max",
    "longitude_std",
    "vertical_rate_25percentile",
    "longitude_min",
    "longitude_75percentile",
    "altitude_mean",
    "groundspeed_75percentile",
    "latitude_median",
    "longitude_25percentile",
    "groundspeed_min",
    "latitude_25percentile",
    "longitude_median",
    "altitude_std",
    "latitude_count",
    "vertical_rate_median",
]

In [5]:
# Rank every flight by its cosine similarity to the first flight in the frame.
#
# NOTE(review): the numerical columns are passed through unscaled, so columns
# with large magnitudes presumably dominate the cosine similarity while the
# 0/1 one-hot columns contribute very little — confirm this is intended.

# Encode: numerical columns are forwarded untouched, categorical columns are
# one-hot encoded. Only the columns listed in the two groups are used, so the
# target column does not enter the similarity (ColumnTransformer drops
# unlisted columns by default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns),
    ]
)
processed_data = preprocessor.fit_transform(challenge_features)

# Reference vector: the encoded first row, kept two-dimensional (1 x n_features)
# as required by cosine_similarity.
first_data_point = processed_data[0].reshape(1, -1)

# One similarity score per row of the encoded data, in the original row order.
similarity_values = cosine_similarity(first_data_point, processed_data).ravel()

# Attach the scores to a copy of the feature frame and order it from most to
# least similar.
challenge_features_with_similarity = (
    challenge_features
    .assign(similarity=similarity_values)
    .sort_values(by='similarity', ascending=False)
)

In [6]:
# Show the flights whose similarity to the reference flight is at least 0.99
# (the reference flight itself is included).
display(challenge_features_with_similarity.loc[lambda frame: frame['similarity'] >= 0.99])

Unnamed: 0,wtc,aircraft_type,flown_distance,groundspeed_max,latitude_min,altitude_25percentile,airline,flight_duration,longitude_max,vertical_rate_75percentile,...,actual_offblock_time_hour,longitude_median,month,altitude_std,latitude_count,taxiout_time,vertical_rate_median,month_day,tow,similarity
0,1,4,321,390.0,51.453232,9550.00,20,61,-0.443802,368.0,...,13,-4.895513,1,11779.780447,3614.0,18,0.0,0,54748.000000,1.000000
117432,1,4,323,403.0,51.458518,9650.00,20,60,-0.478363,64.0,...,21,-5.177678,5,11751.493553,3601.0,12,0.0,140,56800.000000,0.999950
22707,1,4,425,507.0,52.299923,11181.25,20,65,4.771284,64.0,...,5,0.857988,2,13873.535953,4254.0,20,-64.0,35,61046.219978,0.999949
70147,1,3,365,513.0,45.830704,9700.00,22,59,5.080105,128.0,...,19,3.732828,4,11793.551843,3695.0,13,0.0,97,57274.344940,0.999929
337597,1,0,292,385.0,51.477046,9975.00,20,56,-0.463356,64.0,...,20,-2.832534,11,12657.006244,3415.0,16,0.0,328,54805.000000,0.999907
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146018,1,4,323,477.0,50.866187,8000.00,20,68,0.020711,832.0,...,7,-1.383084,6,10155.997629,4234.0,22,0.0,164,50126.000000,0.990000
368861,1,23,125,311.0,46.231602,4781.25,3,32,8.564032,0.0,...,12,7.150561,12,5857.921109,2010.0,9,-64.0,364,47873.000000,0.990000
199805,1,28,537,450.0,47.792902,6843.75,13,83,16.555763,-576.0,...,10,15.028910,8,11991.533475,1832.0,12,-1024.0,214,47838.000000,0.990000
28243,1,4,178,457.0,49.016434,5425.00,22,37,4.834281,192.0,...,7,3.729248,2,9873.781234,2365.0,15,-448.0,44,59470.900340,0.990000


In [7]:
# The held-out test point is the top row of the similarity-sorted frame.
# NOTE(review): this relies on the reference flight sorting first; if another
# row ties at the same similarity the top row may be a different flight —
# confirm.
test_point = challenge_features_with_similarity.iloc[:1]
display(test_point)

# Ground-truth target value of the test point, as a scalar.
true_tow = test_point[target_name].iloc[0]
print(true_tow)

Unnamed: 0,wtc,aircraft_type,flown_distance,groundspeed_max,latitude_min,altitude_25percentile,airline,flight_duration,longitude_max,vertical_rate_75percentile,...,actual_offblock_time_hour,longitude_median,month,altitude_std,latitude_count,taxiout_time,vertical_rate_median,month_day,tow,similarity
0,1,4,321,390.0,51.453232,9550.0,20,61,-0.443802,368.0,...,13,-4.895513,1,11779.780447,3614.0,18,0.0,0,54748.0,1.0


54748.0


<h3>Build a general XGBoost model on all data points except the test data point, and use it to predict the test data point's TOW</h3>

In [8]:
# Training set for the general model: every row of the similarity-sorted frame
# except the top one, which is the held-out test point.
general_training_data = challenge_features_with_similarity.iloc[1:]
general_training_data_features = general_training_data[features_names]
general_training_data_target = general_training_data[target_name]

# Hyper-parameters of the general regressor, collected in one place.
general_model_params = {
    'colsample_bytree': 1.0,
    'learning_rate': 0.1,
    'max_depth': 10,
    'n_estimators': 91,
    'subsample': 1.0,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'random_state': global_random_state,
}
general_model = xgb.XGBRegressor(**general_model_params)

general_model.fit(general_training_data_features, general_training_data_target)

In [9]:
# Predict the held-out test point with the general model and report the
# absolute error against the true target value.
general_y_pred = general_model.predict(test_point[features_names])[0]
print("predicted tow with general model: ", general_y_pred)
print("True tow: ", true_tow)  # label fixed: previously misspelled as "True two"

# Absolute error of the general model; compared with the similarity-based
# model's error further down.
diff_general = abs(general_y_pred-true_tow)
print("Error: ", diff_general)

predicted tow with general model:  55499.2
True two:  54748.0
Error:  751.19921875


<h3>Build a specific XGBoost model for this data point, trained only on the data points most similar to it, and use it to predict the data point's TOW</h3>

In [10]:
# Training set for the similarity-based model: rows with similarity >= 0.99,
# minus the top row of that selection (the held-out test point).
is_similar = challenge_features_with_similarity['similarity'] >= 0.99
similarity_based_training_data = challenge_features_with_similarity[is_similar].iloc[1:]
similarity_based_training_data_features = similarity_based_training_data[features_names]
similarity_based_training_data_target = similarity_based_training_data[target_name]

# Same hyper-parameters as the general model so that only the training set
# differs between the two.
similarity_based_model_params = {
    'colsample_bytree': 1.0,
    'learning_rate': 0.1,
    'max_depth': 10,
    'n_estimators': 91,
    'subsample': 1.0,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'random_state': global_random_state,
}
similarity_based_model = xgb.XGBRegressor(**similarity_based_model_params)

similarity_based_model.fit(similarity_based_training_data_features, similarity_based_training_data_target)

In [11]:
# Predict the held-out test point with the similarity-based model and report
# the absolute error against the true target value.
similarity_based_y_pred = similarity_based_model.predict(test_point[features_names])[0]
# Label fixed: this line used to say "general model" (copy-paste from the
# general-model cell), which mislabelled the similarity-based prediction.
print("predicted tow with similarity-based model: ", similarity_based_y_pred)
print("True tow: ", true_tow)  # label fixed: previously misspelled as "True two"

# Absolute error of the similarity-based model; compared with the general
# model's error in the comparison cell.
diff_similarity_based = abs(similarity_based_y_pred-true_tow)
print("Error: ", diff_similarity_based)

predicted tow with general model:  54284.953
True two:  54748.0
Error:  463.046875


<h3>Compare the two results above</h3>

In [12]:
# Tally which model has the smaller absolute error on the test point. Each
# list receives one entry per "win", so its length is the win count.
general_is_best = []
similarity_based_is_best = []
equal = []

# Strictly smaller error wins; anything that is neither smaller nor larger
# is recorded as a tie.
(
    general_is_best if diff_general < diff_similarity_based
    else similarity_based_is_best if diff_general > diff_similarity_based
    else equal
).append(1)

print(f"{len(general_is_best) = }")
print(f"{len(similarity_based_is_best) = }")
print(f"{len(equal) = }")

len(general_is_best) = 0
len(similarity_based_is_best) = 1
len(equal) = 0
