In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler

In [2]:
# --- Notebook configuration -------------------------------------------------

# Name of the per-row identifier column.
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
identifier_name = "flight_id"

# Feature columns used both for KMeans clustering and as XGBoost inputs.
# The order is significant: it is reused as the column order of the
# centroid table built from `kmeans.cluster_centers_`.
top_features_names = [
    "wtc", "flown_distance", "aircraft_type", "groundspeed_max",
    "airline", "altitude_max", "flight_duration", "latitude_max",
    "ades", "adep", "altitude_75percentile", "altitude_median",
    "longitude_max", "vertical_rate_max", "country_code_ades", "longitude_min",
    "latitude_min", "vertical_rate_std", "country_code_adep", "longitude_std",
    "altitude_25percentile", "vertical_rate_75percentile", "month",
    "vertical_rate_25percentile", "groundspeed_min", "longitude_25percentile",
    "latitude_25percentile", "latitude_75percentile", "taxiout_time",
    "longitude_mean", "longitude_median", "month_day", "latitude_mean",
    "track_median", "latitude_count", "latitude_std",
    "arrival_time_hour_minute", "latitude_median", "longitude_75percentile",
    "track_75percentile", "altitude_mean", "temperature_min",
    "actual_offblock_time_hour", "vertical_rate_median", "track_25percentile",
    "vertical_rate_mean", "arrival_time_hour",
]

# Regression target column.
target_name = "tow"

# Single seed shared by KMeans, KFold and XGBoost so runs are repeatable.
global_random_state = 123

In [3]:
# Load the encoded challenge set and replace every missing value with 0.
# The fill is applied to the whole frame, so it also covers the target column
# should it contain any gaps.
encoded_challenge_set = pd.read_csv('data/encoded_challenge_set.csv').fillna(0)

# Working frame: the selected feature columns followed by the target column.
selected_columns = top_features_names + [target_name]
df = encoded_challenge_set.loc[:, selected_columns]

In [4]:
# Recursive bisecting search for clusters an XGBoost model can predict well.
#
# A frame is taken from the work stack, split in two with KMeans, and each half
# is then either
#   * set aside as "too small" when it has fewer than `min_cluster_datapoints` rows,
#   * accepted when its 3-fold cross-validated RMSE is below `threshold_rmse`, or
#   * pushed back on the stack to be split again.
#
# NOTE(review): KMeans is fitted on the raw feature values. StandardScaler is
# imported by this notebook but never applied, so large-magnitude columns will
# dominate the Euclidean distance - confirm that this is intended.
threshold_rmse = 3000
# Halves with fewer rows than this are not modelled and not split further.
min_cluster_datapoints = 90
below_threshold_data = []
above_threshold_data = [df]

below_min_datapoints = []

remaining_datapoints = len(df)

while len(above_threshold_data) > 0:
    data_df = above_threshold_data.pop().copy()
    kmeans = KMeans(n_clusters=2, random_state=global_random_state)
    data_df['cluster'] = kmeans.fit_predict(data_df[top_features_names])

    # Termination guard: when every row receives the same label (e.g. all rows
    # share identical feature values) the "split" returns the input unchanged.
    # Re-queuing it would loop forever, so it is set aside with the rows that
    # could not be clustered.
    if data_df['cluster'].nunique() < 2:
        unsplittable_df = data_df.drop('cluster', axis=1)
        below_min_datapoints.append(unsplittable_df)
        remaining_datapoints -= len(unsplittable_df)
        print(f"Cluster Could Not Be Split Further, Remaining Data Points: {remaining_datapoints}")
        continue

    for cluster_nr in range(0, 2, 1):
        cluster_df = data_df[data_df['cluster'] == cluster_nr]
        cluster_df = cluster_df.drop('cluster', axis=1)
        cluster_df_length = len(cluster_df)

        if cluster_df_length < min_cluster_datapoints:
            below_min_datapoints.append(cluster_df)
            remaining_datapoints -= cluster_df_length
            print(f"Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: {remaining_datapoints}")
            continue

        X = cluster_df[top_features_names]
        y = cluster_df[target_name]

        model = xgb.XGBRegressor(
            colsample_bytree=1.0,
            learning_rate=0.1,
            max_depth=10,
            n_estimators=91,
            subsample=1.0,
            objective='reg:squarederror',
            eval_metric='rmse',
            random_state=global_random_state
        )
        kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
        # cross_val_score returns negated MSE per fold; flip the sign, average
        # the fold MSEs, then take the square root to get one RMSE figure.
        scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
        mse_scores = -scores
        mean_mse = np.mean(mse_scores)
        rmse = np.sqrt(mean_mse)

        if rmse < threshold_rmse:
            below_threshold_cluster = {
                "cluster_data": cluster_df,
                "cluster_RMSE": rmse,
                # Centroid of this half, in the same column order as top_features_names.
                "cluster_centroid": kmeans.cluster_centers_[cluster_nr].tolist()
            }
            below_threshold_data.append(below_threshold_cluster)
            remaining_datapoints -= cluster_df_length
            print(f"Below Threshold Cluster Found -> RMSE: {rmse:.2f}, Remaining Data Points: {remaining_datapoints}")
        else:
            # Still too hard to predict as a whole: queue it for another split.
            above_threshold_data.append(cluster_df)

print("Finished searching for clusters.")

found 0 physical cores < 1
  File "C:\Users\MOHAMMEDG\Anaconda3\envs\prc\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Below Threshold Cluster Found -> RMSE: 2677.06, Remaining Data Points: 228379
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 228376
Below Threshold Cluster Found -> RMSE: 2784.71, Remaining Data Points: 227918
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 227833
Below Threshold Cluster Found -> RMSE: 2987.28, Remaining Data Points: 107495
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 107434
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 107358
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 107338
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 107267
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 107236
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 107175
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Dat

In [14]:
# Persist the search results: one CSV per accepted cluster, a centroid/RMSE
# summary table, and one CSV holding every row that ended up outside a cluster.
original_count_of_datapoints = len(df)
datapoint_count_in_clusters = 0

cluster_centroids_info = []

# All artefacts for a given threshold live in their own folder. Create it up
# front so to_csv does not fail on a fresh checkout.
output_dir = f"./data/clusters/clusters_threshold_{str(threshold_rmse)}"
os.makedirs(output_dir, exist_ok=True)

for cluster_number, cluster_dict in enumerate(below_threshold_data):
    cluster_num = cluster_number+1  # file names are 1-based
    cluster_df = cluster_dict["cluster_data"]
    cluster_df_length = len(cluster_df)
    datapoint_count_in_clusters += cluster_df_length
    # One summary row: centroid coordinates, then the cluster RMSE and its size.
    cluster_centroids_info.append(
        cluster_dict['cluster_centroid']+[cluster_dict['cluster_RMSE'], cluster_df_length]
    )
    cluster_path = f"{output_dir}/cluster_{str(cluster_num)}.csv"
    cluster_df.to_csv(cluster_path, index=False)
    # Report the path that was actually written.
    print(f"{cluster_path} is saved!")
print()

print("Cluster centroid information")
cluster_centroids_info_df = pd.DataFrame(cluster_centroids_info, columns=top_features_names+['rmse', 'datapoint_count'])
display(cluster_centroids_info_df)
cluster_centroids_info_df.to_csv(f"{output_dir}/cluster_centroids_info.csv", index=False)
print(f"{output_dir}/cluster_centroids_info.csv is saved!")

# Concatenate once instead of growing the frame inside a loop. pd.concat
# rejects an empty list, so fall back to an empty frame in that case.
if below_min_datapoints:
    below_min_datapoints_df = pd.concat(below_min_datapoints, ignore_index=True)
else:
    below_min_datapoints_df = pd.DataFrame([])
below_min_datapoints_df.to_csv(f"{output_dir}/below_min_datapoints.csv", index=False)
print(f"{output_dir}/below_min_datapoints.csv is saved!")
datapoint_count_not_in_clusters = len(below_min_datapoints_df)

print("Original count of datapoints: ", original_count_of_datapoints)
print("Datapoints in clusters: ", datapoint_count_in_clusters)
print("Datapoints not in clusters: ", datapoint_count_not_in_clusters)
print(f"Percentage of clustered datapoints: {(datapoint_count_in_clusters/original_count_of_datapoints)*100:.2f}", )
print(f"Percentage of non-clustered datapoints: {(datapoint_count_not_in_clusters/original_count_of_datapoints)*100:.2f}", )
print()

./data/clusters/cluster_1.csv is saved!
./data/clusters/cluster_2.csv is saved!
./data/clusters/cluster_3.csv is saved!
./data/clusters/cluster_4.csv is saved!
./data/clusters/cluster_5.csv is saved!
./data/clusters/cluster_6.csv is saved!
./data/clusters/cluster_7.csv is saved!
./data/clusters/cluster_8.csv is saved!
./data/clusters/cluster_9.csv is saved!
./data/clusters/cluster_10.csv is saved!
./data/clusters/cluster_11.csv is saved!
./data/clusters/cluster_12.csv is saved!
./data/clusters/cluster_13.csv is saved!
./data/clusters/cluster_14.csv is saved!
./data/clusters/cluster_15.csv is saved!
./data/clusters/cluster_16.csv is saved!
./data/clusters/cluster_17.csv is saved!
./data/clusters/cluster_18.csv is saved!

Cluster centroid information


Unnamed: 0,wtc,flown_distance,aircraft_type,groundspeed_max,airline,altitude_max,flight_duration,latitude_max,ades,adep,...,track_75percentile,altitude_mean,temperature_min,actual_offblock_time_hour,vertical_rate_median,track_25percentile,vertical_rate_mean,arrival_time_hour,rmse,datapoint_count
0,0.970199,398.334209,11.578702,417.946669,14.814895,29224.611318,66.435363,52.431771,157.612396,179.035466,...,215.401864,15692.239619,229.938614,11.757457,-203.8273,156.913576,-117.808457,12.754276,2677.064469,140634
1,0.982533,655.358079,22.796943,465.218341,15.063319,37682.20524,100.268559,55.911555,176.858079,88.941048,...,175.791165,26314.300266,211.486057,12.39738,-6.999749,151.410941,1521.577273,14.126638,2784.706771,458
2,0.897978,911.345115,9.32197,470.777162,14.393622,37237.937588,132.797592,52.143106,172.76096,194.827629,...,203.169545,25928.94222,216.48621,11.501723,-5.344185,166.40503,-55.167865,13.286027,2987.281562,120338
3,1.0,840.117021,9.010638,480.882979,16.489362,37166.489362,121.659574,54.436921,179.393617,136.468085,...,153.458373,29773.265986,214.94736,11.308511,2.723404,113.195541,24.423325,13.574468,2984.017778,94
4,0.950658,1077.319079,11.210526,485.125,15.358553,37081.661184,154.730263,51.17768,272.325658,124.292763,...,159.04117,29640.415821,215.249568,10.118421,-0.2105263,138.749848,-10.196015,12.601974,2848.013783,304
5,1.0,965.93007,11.601399,468.125874,17.055944,36105.244755,143.636364,51.708038,97.608392,278.076923,...,235.343222,29404.037301,217.17193,11.27972,0.0,171.934997,-1.006182,13.839161,2769.966729,143
6,1.0,834.281915,10.351064,481.329787,15.553191,37042.154255,120.930851,54.348552,190.691489,97.914894,...,145.901437,29467.094685,214.390353,9.324468,1.361702,123.861718,-0.471659,11.569149,2906.413555,188
7,1.0,1257.390476,3.533333,478.32381,14.057143,33534.761905,183.438095,50.197088,245.190476,212.847619,...,208.25638,28626.532194,219.482036,13.171429,1.387779e-17,192.320956,-221.764138,15.438095,2861.291171,105
8,1.0,1277.010989,3.175824,464.802198,13.802198,34086.813187,190.769231,50.872119,136.945055,361.087912,...,293.972231,28825.739609,216.361828,8.230769,0.0,272.243215,-12.785108,11.714286,2581.317374,91
9,0.992218,1491.326848,2.622568,462.7393,13.634241,34395.817121,223.447471,49.927103,143.097276,350.70428,...,274.969843,29807.090177,217.956019,9.14786,0.0,246.290473,1.362907,13.22179,2815.799959,257


./data/clusters/clusters_threshold_3000/cluster_centroids_info.csv is saved!
./data/clusters/clusters_threshold_3000/below_min_datapoints.csv is saved!
Original count of datapoints:  369013
Datapoints in clusters:  263642
Datapoints not in clusters:  105371
Percentage of clustered datapoints: 71.45
Percentage of non-clustered datapoints: 28.55



In [15]:
# Summary statistics (count, mean, std, quartiles, min/max) of the per-cluster RMSE.
rmse_summary = cluster_centroids_info_df['rmse'].describe()
display(rmse_summary)

count      18.000000
mean     2757.834180
std       186.603762
min      2406.960060
25%      2605.254148
50%      2800.253365
75%      2895.132959
max      2991.543246
Name: rmse, dtype: float64

In [None]:
# Baseline: cross-validated RMSE of ONE model trained on all rows that ended up
# in an accepted cluster, using the same hyperparameters and folds as the
# per-cluster models.

# Stack the accepted clusters with a single concat rather than growing a frame
# inside a loop, which would copy the accumulated data on every iteration.
below_threshold_data_df = pd.concat(
    [cluster_dict["cluster_data"] for cluster_dict in below_threshold_data],
    ignore_index=True,
)

X = below_threshold_data_df[top_features_names]
y = below_threshold_data_df[target_name]

model = xgb.XGBRegressor(
    colsample_bytree=1.0,
    learning_rate=0.1,
    max_depth=10,
    n_estimators=91,
    subsample=1.0,
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=global_random_state
)
kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
# cross_val_score returns negated MSE per fold; flip the sign, average the
# fold MSEs, then take the square root.
scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
mse_scores = -scores
mean_mse = np.mean(mse_scores)
rmse = np.sqrt(mean_mse)

print("RMSE of clustered datapoints all together: ", rmse)

In [None]:
# Cross-validated RMSE of a single model fitted on the rows that were left
# outside every accepted cluster (collected in below_min_datapoints_df).
X = below_min_datapoints_df.loc[:, top_features_names]
y = below_min_datapoints_df.loc[:, target_name]

# Same hyperparameters as the per-cluster models.
xgb_params = {
    "colsample_bytree": 1.0,
    "learning_rate": 0.1,
    "max_depth": 10,
    "n_estimators": 91,
    "subsample": 1.0,
    "objective": 'reg:squarederror',
    "eval_metric": 'rmse',
    "random_state": global_random_state,
}
model = xgb.XGBRegressor(**xgb_params)

kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kf)

# Scores are negated MSE values: undo the sign, average over folds, take the root.
mse_scores = np.negative(scores)
mean_mse = mse_scores.mean()
rmse = np.sqrt(mean_mse)

print("RMSE of non-clustered datapoints: ", rmse)