In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
import pandas as pd
import numpy as np
import random

In [2]:
identifier_name = 'flight_id'

features_names = [
    'wtc', 
    'aircraft_type', 
    'flown_distance', 
    'groundspeed_max', 
    'latitude_min', 
    'altitude_25percentile', 
    'airline', 
    'flight_duration', 
    'longitude_max', 
    'vertical_rate_75percentile', 
    'altitude_median', 
    'ades', 
    'longitude_mean', 
    'altitude_75percentile', 
    'latitude_75percentile', 
    'vertical_rate_std', 
    'adep', 
    'latitude_std', 
    'vertical_rate_max', 
    'latitude_max', 
    'longitude_std', 
    'vertical_rate_25percentile', 
    'longitude_min', 
    'longitude_75percentile', 
    'altitude_mean', 
    'groundspeed_75percentile', 
    'country_code_adep', 
    'latitude_median', 
    'longitude_25percentile', 
    'groundspeed_min', 
    'country_code_ades', 
    'latitude_25percentile', 
    'actual_offblock_time_hour', 
    'longitude_median', 
    'month', 
    'altitude_std', 
    'latitude_count', 
    'taxiout_time', 
    'vertical_rate_median', 
    'month_day'
]

target_name = 'tow'

global_random_state = 123

In [3]:
challenge_set_df = pd.read_csv('data/encoded_challenge_set.csv')
challenge_set_df = challenge_set_df.fillna(0)
challenge_features = challenge_set_df[features_names+[target_name]]
display(challenge_set_df)

Unnamed: 0,flight_id,month_day,month,day,actual_offblock_time_hour_minute,actual_offblock_time_hour,actual_offblock_time_minute,adep,country_code_adep,arrival_time_hour_minute,...,temperature_75percentile,temperature_max,specific_humidity_count,specific_humidity_mean,specific_humidity_std,specific_humidity_min,specific_humidity_25percentile,specific_humidity_median,specific_humidity_75percentile,specific_humidity_max
0,248763780,0,1,1,826,13,46,69,37,904,...,268.945607,288.078244,3614.0,0.001569,0.002304,0.000003,0.000022,0.000114,0.002915,0.008067
1,248760618,0,1,1,595,9,55,216,33,1177,...,221.850708,298.691536,10358.0,0.001025,0.003064,0.000002,0.000028,0.000044,0.000085,0.013049
2,248753824,0,1,1,579,9,39,137,88,1148,...,241.223833,272.897641,12932.0,0.000512,0.000983,0.000003,0.000016,0.000022,0.000190,0.003214
3,248753852,0,1,1,664,11,4,338,19,1172,...,231.613519,286.518770,17534.0,0.000886,0.002158,0.000002,0.000006,0.000018,0.000154,0.008750
4,248755934,0,1,1,756,12,36,92,44,824,...,275.224544,288.014789,3265.0,0.002256,0.002640,0.000014,0.000029,0.000719,0.004477,0.008008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369008,258058361,364,12,31,578,9,38,253,36,1143,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
369009,258071247,364,12,31,567,9,27,385,96,749,...,221.953594,290.137029,9519.0,0.000698,0.001855,0.000003,0.000024,0.000040,0.000053,0.008280
369010,258059152,364,12,31,592,9,52,30,26,701,...,248.321020,289.537153,5952.0,0.001027,0.001972,0.000003,0.000020,0.000026,0.000880,0.008802
369011,258072276,364,12,31,577,9,37,253,36,673,...,258.227824,286.721784,5068.0,0.001184,0.001979,0.000005,0.000022,0.000029,0.001657,0.007904


In [4]:
categorical_columns = [
    'wtc', 
    'aircraft_type', 
    'airline', 
    'ades', 
    'adep', 
    'country_code_adep', 
    'country_code_ades',
    'actual_offblock_time_hour', 
    'month',
    'taxiout_time',
    'month_day'
]

numerical_columns = [
    'flown_distance', 
    'groundspeed_max', 
    'latitude_min', 
    'altitude_25percentile', 
    'flight_duration', 
    'longitude_max', 
    'vertical_rate_75percentile', 
    'altitude_median', 
    'longitude_mean', 
    'altitude_75percentile', 
    'latitude_75percentile', 
    'vertical_rate_std', 
    'latitude_std', 
    'vertical_rate_max', 
    'latitude_max', 
    'longitude_std', 
    'vertical_rate_25percentile', 
    'longitude_min', 
    'longitude_75percentile', 
    'altitude_mean', 
    'groundspeed_75percentile', 
    'latitude_median', 
    'longitude_25percentile', 
    'groundspeed_min', 
    'latitude_25percentile', 
    'longitude_median', 
    'altitude_std', 
    'latitude_count', 
    'vertical_rate_median'
]

In [5]:
general_is_best,  similarity_based_is_best, equal = [], [], []

# Preprocess the DataFrame
# Step 1: Define the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ]
)

# Step 2: Fit and transform the data
processed_data = preprocessor.fit_transform(challenge_features)


# Generate a list of numbers from 0 to 369011
numbers = list(range(len(challenge_set_df)-1))
random.shuffle(numbers)
unique_random_numbers = numbers[:1000]

for loop_index in range(1000):
    if loop_index % 10 == 0:
        print(f"Running for betweem {loop_index} and {loop_index+10}")

    index = unique_random_numbers[loop_index]
    
    # Step 3: Calculate similarity for the first data point
    first_data_point = processed_data[index].reshape(1, -1)  # Reshape for similarity calculation

    # Step 4: Calculate cosine similarity
    similarity_values = cosine_similarity(first_data_point, processed_data).flatten()

    # Step 5: Add similarity values as a new column
    challenge_features_with_similarity = challenge_features.copy()
    challenge_features_with_similarity['similarity'] = similarity_values

    # Step 6: Sort DataFrame by the new similarity column in descending order
    challenge_features_with_similarity = challenge_features_with_similarity.sort_values(by='similarity', ascending=False)

    # Get true tow target value
    test_point = challenge_features_with_similarity.head(1)
    true_tow = test_point[target_name].values[0]

    # Build General XGBoost model with all other data points than the test datapoint, and predict the datapoint tow with it
    general_training_data = challenge_features_with_similarity.iloc[1:]
    general_training_data_features = general_training_data[features_names]
    general_training_data_target = general_training_data[target_name]
    
    general_model = xgb.XGBRegressor(
        colsample_bytree=1.0, 
        learning_rate=0.1,
        max_depth=10,
        n_estimators=91, 
        subsample= 1.0,
        objective='reg:squarederror', 
        eval_metric='rmse',
        random_state=global_random_state
    )
    
    general_model.fit(general_training_data_features, general_training_data_target)
    general_y_pred = general_model.predict(test_point[features_names])[0]
    diff_general = abs(general_y_pred-true_tow)

    # Build similarity based XGBoost model for this datapoint, and predict the datapoint tow with it.
    similarity_based_training_data = challenge_features_with_similarity[challenge_features_with_similarity['similarity'] >= 0.99].iloc[1:]
    similarity_based_training_data_features = similarity_based_training_data[features_names]
    similarity_based_training_data_target = similarity_based_training_data[target_name]
    
    similarity_based_model = xgb.XGBRegressor(
        colsample_bytree=1.0, 
        learning_rate=0.1,
        max_depth=10,
        n_estimators=91, 
        subsample= 1.0,
        objective='reg:squarederror', 
        eval_metric='rmse',
        random_state=global_random_state
    )
    
    similarity_based_model.fit(similarity_based_training_data_features, similarity_based_training_data_target)
    similarity_based_y_pred = similarity_based_model.predict(test_point[features_names])[0]
    diff_similarity_based = abs(similarity_based_y_pred-true_tow)

    if diff_general < diff_similarity_based:
        general_is_best.append(diff_general)
    elif diff_general > diff_similarity_based:
        similarity_based_is_best.append(diff_similarity_based)
    else:
        equal.append(general_is_best)

print(f"{len(general_is_best) = }")
print(f"{len(similarity_based_is_best) = }")
print(f"{len(equal) = }")

Running for betweem 0 and 10
Running for betweem 10 and 20
Running for betweem 20 and 30
Running for betweem 30 and 40
Running for betweem 40 and 50
Running for betweem 50 and 60
Running for betweem 60 and 70
Running for betweem 70 and 80
Running for betweem 80 and 90
Running for betweem 90 and 100
Running for betweem 100 and 110
Running for betweem 110 and 120
Running for betweem 120 and 130
Running for betweem 130 and 140
Running for betweem 140 and 150
Running for betweem 150 and 160
Running for betweem 160 and 170
Running for betweem 170 and 180
Running for betweem 180 and 190
Running for betweem 190 and 200
Running for betweem 200 and 210
Running for betweem 210 and 220
Running for betweem 220 and 230
Running for betweem 230 and 240
Running for betweem 240 and 250
Running for betweem 250 and 260
Running for betweem 260 and 270
Running for betweem 270 and 280
Running for betweem 280 and 290
Running for betweem 290 and 300
Running for betweem 300 and 310
Running for betweem 310 and 



Running for betweem 620 and 630
Running for betweem 630 and 640
Running for betweem 640 and 650
Running for betweem 650 and 660
Running for betweem 660 and 670
Running for betweem 670 and 680
Running for betweem 680 and 690
Running for betweem 690 and 700
Running for betweem 700 and 710
Running for betweem 710 and 720
Running for betweem 720 and 730
Running for betweem 730 and 740
Running for betweem 740 and 750
Running for betweem 750 and 760
Running for betweem 760 and 770
Running for betweem 770 and 780
Running for betweem 780 and 790
Running for betweem 790 and 800
Running for betweem 800 and 810
Running for betweem 810 and 820
Running for betweem 820 and 830
Running for betweem 830 and 840
Running for betweem 840 and 850
Running for betweem 850 and 860
Running for betweem 860 and 870
Running for betweem 870 and 880
Running for betweem 880 and 890
Running for betweem 890 and 900
Running for betweem 900 and 910
Running for betweem 910 and 920
Running for betweem 920 and 930
Running 

In [6]:
print(f">>> {len(general_is_best) = }")
if len(general_is_best) > 0:
    print(f"{np.mean(general_is_best) = }")
    print(f"{np.median(general_is_best) = }")
    print(f"{np.std(general_is_best) = }")
    print(f"{np.sum(general_is_best) = }")
    print(f"{np.max(general_is_best) = }")
    print(f"{np.min(general_is_best) = }")
print()
print(f">>> {len(similarity_based_is_best) = }") # similarity is best
if len(similarity_based_is_best) > 0:
    print(f"{np.mean(similarity_based_is_best) = }") # similarity is best
    print(f"{np.median(similarity_based_is_best) = }") # similarity is best
    print(f"{np.std(similarity_based_is_best) = }") # similarity is best
    print(f"{np.sum(similarity_based_is_best) = }") # general is best
    print(f"{np.max(similarity_based_is_best) = }") # similarity is best
    print(f"{np.min(similarity_based_is_best) = }") # similarity is best
print()
print(f">>> {len(equal) = }")
if len(equal) > 0:
    print(f"{np.mean(equal) = }")
    print(f"{np.median(equal) = }")
    print(f"{np.std(equal) = }")
    print(f"{np.sum(equal) = }")
    print(f"{np.max(equal) = }")
    print(f"{np.min(equal) = }")

>>> len(general_is_best) = 457
np.mean(general_is_best) = 1669.1967916504038
np.median(general_is_best) = 1109.5390625
np.std(general_is_best) = 1973.6551976444189
np.sum(general_is_best) = 762822.9337842346
np.max(general_is_best) = 15090.3125
np.min(general_is_best) = 8.4609375

>>> len(similarity_based_is_best) = 543
np.mean(similarity_based_is_best) = 1481.087418711181
np.median(similarity_based_is_best) = 857.72265625
np.std(similarity_based_is_best) = 1799.2352577217805
np.sum(similarity_based_is_best) = 804230.4683601713
np.max(similarity_based_is_best) = 11606.78125
np.min(similarity_based_is_best) = 4.088372261627228

>>> len(equal) = 0
