In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler

In [2]:
# --- Notebook configuration -------------------------------------------------

# Column holding the per-flight identifier.
identifier_name = "flight_id"

# Column the regressors are trained to predict.
target_name = "tow"

# Seed shared by every stochastic component (KMeans, KFold, XGBoost).
global_random_state = 123

# Feature columns used both for clustering and for regression.
# The order is significant: cluster centroids are reported as plain lists
# whose positions follow this list, so do not reorder it.
top_features_names = [
    "wtc", "flown_distance", "aircraft_type", "groundspeed_max", "airline",
    "altitude_max", "flight_duration", "latitude_max", "ades", "adep",
    "altitude_75percentile", "altitude_median", "longitude_max",
    "vertical_rate_max", "country_code_ades",
    "longitude_min", "latitude_min", "vertical_rate_std",
    "country_code_adep", "longitude_std",
    "altitude_25percentile", "vertical_rate_75percentile", "month",
    "vertical_rate_25percentile", "groundspeed_min",
    "longitude_25percentile", "latitude_25percentile",
    "latitude_75percentile", "taxiout_time", "longitude_mean",
    "longitude_median", "month_day", "latitude_mean", "track_median",
    "latitude_count",
    "latitude_std", "arrival_time_hour_minute", "latitude_median",
    "longitude_75percentile", "track_75percentile",
    "altitude_mean", "temperature_min", "actual_offblock_time_hour",
    "vertical_rate_median", "track_25percentile",
    "vertical_rate_mean", "arrival_time_hour",
]

In [3]:
# Read the encoded challenge set and replace every missing value with 0 in
# a single step, so the name is bound once to the cleaned frame.
encoded_challenge_set = pd.read_csv('data/encoded_challenge_set.csv').fillna(0)

# Working frame: the selected feature columns followed by the target column.
df = encoded_challenge_set[[*top_features_names, target_name]]

In [4]:
# Recursive bisecting cluster search.
#
# Starting from the full data set, every pending frame is split into two
# KMeans clusters. For each half an XGBoost regressor is scored with 3-fold
# cross-validation:
#   * RMSE below `threshold_rmse`  -> the cluster is accepted and stored in
#     `below_threshold_data` together with its RMSE and centroid;
#   * RMSE at or above the threshold -> the cluster is queued to be split again;
#   * fewer than `min_cluster_size` rows -> the cluster is set aside in
#     `below_min_datapoints` and is neither modelled nor split further.
#
# NOTE(review): KMeans is fitted on the raw feature values; StandardScaler is
# imported by the notebook but never applied here - confirm this is intended.
threshold_rmse = 2500
min_cluster_size = 90  # clusters smaller than this are not evaluated

below_threshold_data = []
above_threshold_data = [df]

below_min_datapoints = []

# Loop-invariant model settings and CV splitter. cross_val_score clones the
# estimator for every fold and KFold re-seeds from the integer random_state on
# each split, so sharing them across iterations does not change any result.
xgb_params = dict(
    colsample_bytree=1.0,
    learning_rate=0.1,
    max_depth=10,
    n_estimators=91,
    subsample=1.0,
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=global_random_state,
)
kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)

while len(above_threshold_data) > 0:
    # Copy so that adding the helper 'cluster' column never touches `df` or a
    # slice that is still referenced elsewhere.
    data_df = above_threshold_data.pop().copy()
    kmeans = KMeans(n_clusters=2, random_state=global_random_state)
    data_df['cluster'] = kmeans.fit_predict(data_df[top_features_names])

    for cluster_nr in range(2):
        cluster_df = data_df[data_df['cluster'] == cluster_nr]
        X = cluster_df[top_features_names]
        y = cluster_df[target_name]

        if len(X) < min_cluster_size:
            # Store only the undersized cluster itself. Storing the parent
            # frame (as before) also captured the sibling cluster, which is
            # still evaluated below, so its rows were recorded twice.
            below_min_datapoints.append(cluster_df)
            continue

        model = xgb.XGBRegressor(**xgb_params)
        scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
        # The scorer returns negated MSE; flip the sign, average over the
        # folds, then take the square root to obtain the RMSE.
        mse_scores = -scores
        mean_mse = np.mean(mse_scores)
        rmse = np.sqrt(mean_mse)

        if rmse < threshold_rmse:
            below_threshold_cluster = {
                "cluster_data": cluster_df,
                "cluster_RMSE": rmse,
                "cluster_centroid": kmeans.cluster_centers_[cluster_nr].tolist()
            }
            below_threshold_data.append(below_threshold_cluster)
            print(f"Below Threshold: Cluster {cluster_nr} has RMSE {rmse:.2f}")
        else:
            # Still too inaccurate: split this cluster again in a later pass.
            above_threshold_data.append(cluster_df)

print("Finished searching for clusters.")

Below Threshold: Cluster 0 has RMSE 2276.18
Below Threshold: Cluster 1 has RMSE 1258.14
Below Threshold: Cluster 0 has RMSE 2321.91
Below Threshold: Cluster 1 has RMSE 1313.75
Below Threshold: Cluster 0 has RMSE 1356.93
Below Threshold: Cluster 1 has RMSE 1549.01
Below Threshold: Cluster 0 has RMSE 1296.86
Below Threshold: Cluster 1 has RMSE 2066.05
Below Threshold: Cluster 0 has RMSE 1292.02
Below Threshold: Cluster 0 has RMSE 2428.25
Below Threshold: Cluster 0 has RMSE 2440.20
Below Threshold: Cluster 1 has RMSE 1898.51
Below Threshold: Cluster 1 has RMSE 2221.84
Below Threshold: Cluster 1 has RMSE 2481.48
Below Threshold: Cluster 0 has RMSE 2406.96
Finished searching for clusters.


In [5]:
# Report every accepted cluster, write it to its own CSV file and summarise
# how much of the original data ended up in an accepted cluster.
original_count_of_datapoints = len(df)
datapoint_count_in_clusters = 0

# DataFrame.to_csv does not create missing folders, so make sure the
# threshold-specific output directory exists before writing into it.
cluster_output_dir = Path("./data/clusters") / f"clusters_threshold_{threshold_rmse}"
cluster_output_dir.mkdir(parents=True, exist_ok=True)

# Clusters are numbered from 1 in both the printed report and the file names.
for cluster_num, cluster_dict in enumerate(below_threshold_data, start=1):
    print(f"Cluster nr. {cluster_num} got RMSE: {cluster_dict['cluster_RMSE']:.2f}")

    print("Cluster Centroid")
    print(cluster_dict['cluster_centroid'])

    cluster_df = cluster_dict["cluster_data"]
    cluster_path = cluster_output_dir / f"cluster_{cluster_num}.csv"
    cluster_df.to_csv(cluster_path, index=False)
    # Report the path that was actually written (the previous message omitted
    # the threshold sub-directory).
    print(f"{cluster_path} is saved!")

    datapoint_count_in_clusters += len(cluster_df)
    print()

print("Datapoints in Clusters: ", datapoint_count_in_clusters)
print("original count of datapoints: ", original_count_of_datapoints)
print(f"Percentage of clustered datapoints: {(datapoint_count_in_clusters/original_count_of_datapoints)*100:.2f}")

Cluster nr. 1 got RMSE: 2276.18
Cluster Centroid
[1.0, 380.93195266272187, 24.789940828402365, 462.1065088757396, 16.639053254437872, 34859.097633136094, 64.32544378698225, 55.27217456751919, 74.60946745562129, 81.78698224852076, 33045.8720016186, 21560.059374701825, 12.852626420990255, 113555.10355029585, 23.653846153846153, 6.675162134011576, 50.35273346485952, 27493.81008720132, 32.33431952662722, 2.0164500327006136, 7056.2702680325165, 1203.5680473372786, 2.7100591715976323, -2183.0207100591674, 102.64201183431953, 7.678083920163713, 51.259113137366725, 54.28314015263868, 11.378698224852071, 9.669413089523678, 9.611971774008822, 64.07692307692304, 52.78994143595276, 158.43504572491028, 4437.378698224852, 1.5858629395983808, 886.8017751479288, 52.66530478322247, 11.547798206869153, 186.86444917381044, 19726.688056642848, 214.19449348554176, 13.065088757396449, -52.16568047337277, 145.45939503337976, -3382.5260172338603, 14.278106508875737]
./data/clusters/cluster_1.csv is saved!

Cl