In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler

In [2]:
# Name of the identifier column; it is deliberately not part of the feature list below.
identifier_name = 'flight_id'

# Column holding the regression target.
target_name = 'tow'

# Seed shared by KMeans, KFold and XGBoost so that repeated runs give the same splits.
global_random_state = 123

# Feature columns fed to both the clustering and the regression model.
# The order is significant: cluster centroids are stored as plain lists and are
# later labelled with this list, so reordering it would mislabel the centroids.
top_features_names = [
    'wtc', 'aircraft_type', 'flown_distance', 'groundspeed_max',
    'latitude_min', 'altitude_25percentile', 'airline', 'flight_duration',
    'longitude_max', 'vertical_rate_75percentile', 'altitude_median', 'ades',
    'longitude_mean', 'altitude_75percentile', 'latitude_75percentile', 'vertical_rate_std',
    'adep', 'latitude_std', 'vertical_rate_max', 'latitude_max',
    'longitude_std', 'vertical_rate_25percentile', 'longitude_min', 'longitude_75percentile',
    'altitude_mean', 'groundspeed_75percentile', 'country_code_adep', 'latitude_median',
    'longitude_25percentile', 'groundspeed_min', 'country_code_ades', 'latitude_25percentile',
    'actual_offblock_time_hour', 'longitude_median', 'month', 'altitude_std',
    'latitude_count', 'taxiout_time', 'vertical_rate_median', 'month_day',
]

In [3]:
# Load the encoded challenge set and replace every missing value with 0
# (this applies to all columns of the file, not only the selected features).
encoded_challenge_set = pd.read_csv('data/encoded_challenge_set.csv').fillna(0)

# Working frame: the selected feature columns followed by the target column.
df = encoded_challenge_set.loc[:, top_features_names + [target_name]]

In [4]:
# Bisecting cluster search.
# Every frame on the work stack is split in two with KMeans. Each half is then
#   * set aside when it has fewer than `min_cluster_size` rows,
#   * accepted when an XGBoost regressor reaches a 3-fold CV RMSE below
#     `threshold_rmse` on it, or
#   * pushed back on the stack to be split again otherwise.
threshold_rmse = 3000    # accept a cluster once its cross-validated RMSE is below this
min_cluster_size = 90    # halves with fewer rows than this are not modelled

below_threshold_data = []      # accepted clusters: dicts with data, RMSE and centroid
above_threshold_data = [df]    # work stack of frames that still have to be split
below_min_datapoints = []      # frames set aside (too small, or impossible to split)

remaining_datapoints = len(df)

while len(above_threshold_data) > 0:
    data_df = above_threshold_data.pop()
    kmeans = KMeans(n_clusters=2, random_state=global_random_state)
    # Keep the labels in a separate array instead of adding (and later dropping)
    # a temporary 'cluster' column, so the popped frame is never modified.
    cluster_labels = kmeans.fit_predict(data_df[top_features_names])

    for cluster_nr in range(2):
        cluster_df = data_df[cluster_labels == cluster_nr]
        cluster_df_length = len(cluster_df)

        if cluster_df_length < min_cluster_size:
            below_min_datapoints.append(cluster_df)
            remaining_datapoints -= cluster_df_length
            print(f"Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: {remaining_datapoints}")
            continue

        X = cluster_df[top_features_names]
        y = cluster_df[target_name]

        model = xgb.XGBRegressor(
            colsample_bytree=1.0,
            learning_rate=0.1,
            max_depth=10,
            n_estimators=91,
            subsample=1.0,
            objective='reg:squarederror',
            eval_metric='rmse',
            random_state=global_random_state
        )
        kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
        scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
        mse_scores = -scores  # the scorer returns negated MSE
        mean_mse = np.mean(mse_scores)
        rmse = np.sqrt(mean_mse)

        if rmse < threshold_rmse:
            below_threshold_cluster = {
                "cluster_data": cluster_df,
                "cluster_RMSE": rmse,
                # Centroid comes from the KMeans fit on the parent frame.
                "cluster_centroid": kmeans.cluster_centers_[cluster_nr].tolist()
            }
            below_threshold_data.append(below_threshold_cluster)
            remaining_datapoints -= cluster_df_length
            print(f"Below Threshold Cluster Found -> RMSE: {rmse:.2f}, Remaining Data Points: {remaining_datapoints}")
        elif cluster_df_length == len(data_df):
            # KMeans assigned every row to this label (e.g. all feature rows are
            # identical), so the frame cannot be split any further. Pushing it back
            # would reproduce the same split forever because the seed is fixed;
            # set it aside with the other non-clustered rows instead.
            below_min_datapoints.append(cluster_df)
            remaining_datapoints -= cluster_df_length
            print(f"Cluster Could Not Be Split Further, Remaining Data Points: {remaining_datapoints}")
        else:
            above_threshold_data.append(cluster_df)

print("Finished searching for clusters.")

found 0 physical cores < 1
  File "C:\Users\MOHAMMEDG\Anaconda3\envs\prc\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Below Threshold Cluster Found -> RMSE: 2611.73, Remaining Data Points: 232445
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 232442
Below Threshold Cluster Found -> RMSE: 2866.65, Remaining Data Points: 231873
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 231803
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 231752
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 231689
Below Threshold Cluster Found -> RMSE: 2685.82, Remaining Data Points: 116013
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 115935
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 115898
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 115838
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Data Points: 115764
Cluster Reached Below Minimum Requirement of Datapoints, Remaining Dat

In [5]:
# Persist the accepted clusters, their centroid summary and the set-aside rows,
# then report which share of the data ended up inside an accepted cluster.
output_dir = f"./data/clusters/clusters_threshold_{threshold_rmse}"
ext_dir = f"{output_dir}/ext"
# DataFrame.to_csv does not create missing folders; creating ext_dir also
# creates output_dir because it is its parent.
os.makedirs(ext_dir, exist_ok=True)

original_count_of_datapoints = len(df)
datapoint_count_in_clusters = 0

# One row per accepted cluster: centroid coordinates, then RMSE and row count.
cluster_centroids_info = []

for cluster_number, cluster_dict in enumerate(below_threshold_data):
    cluster_num = cluster_number + 1  # file names are 1-based
    cluster_df = cluster_dict["cluster_data"]
    cluster_df_length = len(cluster_df)
    datapoint_count_in_clusters += cluster_df_length
    cluster_centroids_info.append(
        cluster_dict['cluster_centroid'] + [cluster_dict['cluster_RMSE'], cluster_df_length]
    )
    cluster_path = f"{output_dir}/cluster_{cluster_num}.csv"
    cluster_df.to_csv(cluster_path, index=False)
    # Report the path that was actually written.
    print(f"{cluster_path} is saved!")
print()

print("Cluster centroid information")
cluster_centroids_info_df = pd.DataFrame(
    cluster_centroids_info,
    columns=top_features_names + ['rmse', 'datapoint_count'],
)
display(cluster_centroids_info_df)
centroids_path = f"{ext_dir}/cluster_centroids_info.csv"
cluster_centroids_info_df.to_csv(centroids_path, index=False)
print(f"{centroids_path} is saved!")

# Concatenate once instead of growing a frame inside a loop; pd.concat rejects
# an empty list, so fall back to an empty frame when nothing was set aside.
if below_min_datapoints:
    below_min_datapoints_df = pd.concat(below_min_datapoints, ignore_index=True)
else:
    below_min_datapoints_df = pd.DataFrame([])
below_min_path = f"{ext_dir}/below_min_datapoints.csv"
below_min_datapoints_df.to_csv(below_min_path, index=False)
print(f"{below_min_path} is saved!")
datapoint_count_not_in_clusters = len(below_min_datapoints_df)

print("Original count of datapoints: ", original_count_of_datapoints)
print("Datapoints in clusters: ", datapoint_count_in_clusters)
print("Datapoints not in clusters: ", datapoint_count_not_in_clusters)
print(f"Percentage of clustered datapoints: {(datapoint_count_in_clusters/original_count_of_datapoints)*100:.2f}")
print(f"Percentage of non-clustered datapoints: {(datapoint_count_not_in_clusters/original_count_of_datapoints)*100:.2f}")
print()

./data/clusters/cluster_1.csv is saved!
./data/clusters/cluster_2.csv is saved!
./data/clusters/cluster_3.csv is saved!
./data/clusters/cluster_4.csv is saved!
./data/clusters/cluster_5.csv is saved!
./data/clusters/cluster_6.csv is saved!
./data/clusters/cluster_7.csv is saved!
./data/clusters/cluster_8.csv is saved!
./data/clusters/cluster_9.csv is saved!
./data/clusters/cluster_10.csv is saved!
./data/clusters/cluster_11.csv is saved!
./data/clusters/cluster_12.csv is saved!
./data/clusters/cluster_13.csv is saved!
./data/clusters/cluster_14.csv is saved!
./data/clusters/cluster_15.csv is saved!
./data/clusters/cluster_16.csv is saved!
./data/clusters/cluster_17.csv is saved!
./data/clusters/cluster_18.csv is saved!
./data/clusters/cluster_19.csv is saved!
./data/clusters/cluster_20.csv is saved!
./data/clusters/cluster_21.csv is saved!
./data/clusters/cluster_22.csv is saved!
./data/clusters/cluster_23.csv is saved!
./data/clusters/cluster_24.csv is saved!
./data/clusters/cluster_2

Unnamed: 0,wtc,aircraft_type,flown_distance,groundspeed_max,latitude_min,altitude_25percentile,airline,flight_duration,longitude_max,vertical_rate_75percentile,...,actual_offblock_time_hour,longitude_median,month,altitude_std,latitude_count,taxiout_time,vertical_rate_median,month_day,rmse,datapoint_count
0,0.956569,11.829648,442.565542,397.954462,47.3887,6239.860295,14.857402,71.757262,12.565465,649.656486,...,11.624189,10.362145,7.156395,9587.032713,2846.484832,12.198955,-225.4603,200.683677,2611.7296,136568
1,0.97891,23.02109,632.699473,471.441125,48.263135,16470.353691,15.405975,97.634446,15.031651,842.601054,...,12.297012,10.13009,2.734622,12422.222112,5926.87522,11.836555,-23.95782,65.511424,2866.646089,569
2,0.923519,9.166623,804.438642,474.154883,45.147658,16201.72672,14.496756,118.879421,14.438153,262.623062,...,11.637407,7.377306,6.947493,12517.683572,6114.926928,13.146309,-9.238649,194.413292,2685.822137,115676
3,0.972997,10.423942,1809.534653,497.956796,35.765966,35735.064131,9.983798,258.851485,29.467212,5.357336,...,12.954095,15.205721,6.970297,9412.699866,15427.30423,13.163816,-0.1728173,195.191719,2264.499689,1111
4,0.96347,5.890411,1628.636986,487.76484,36.736309,35801.555365,14.454338,232.084475,22.179947,9.205479,...,12.406393,8.993067,7.057078,8991.254742,13753.760274,12.527397,-0.5844749,197.232877,2515.93011,438
5,0.928571,7.276786,1626.642857,479.348214,36.611201,35955.803571,12.473214,236.321429,26.040361,13.142857,...,13.607143,14.256698,6.571429,9345.924181,14051.928571,12.767857,0.0,183.125,2858.17336,112
6,0.96875,10.0625,1850.117188,508.351562,35.460236,36891.601562,8.25,247.75,29.83948,3.5,...,12.585938,15.710225,7.0,9420.218215,14565.601562,13.421875,0.0,196.492187,2253.161181,128
7,0.974026,8.675325,1464.363636,483.402597,36.756106,36502.069805,18.12987,207.188312,19.931791,15.792208,...,10.175325,9.782133,6.818182,9390.29578,11810.448052,11.519481,-1.246753,191.292208,2305.345777,154
8,0.885417,9.90625,1513.833333,471.520833,37.687869,35913.606771,14.447917,219.614583,25.34383,12.666667,...,13.104167,14.542698,7.375,9809.210257,13043.375,12.5625,-2.0,208.166667,2752.62754,96
9,0.9819,6.552036,1475.624434,491.927602,38.396275,35667.16629,16.918552,207.031674,18.066763,15.348416,...,10.918552,7.777898,7.040724,9586.061507,12025.678733,13.352941,-2.027149,196.945701,2649.046425,221


./data/clusters/clusters_threshold_3000/cluster_centroids_info.csv is saved!
./data/clusters/clusters_threshold_3000/below_min_datapoints.csv is saved!
Original count of datapoints:  369013
Datapoints in clusters:  273190
Datapoints not in clusters:  95823
Percentage of clustered datapoints: 74.03
Percentage of non-clustered datapoints: 25.97



In [6]:
# Summary statistics of the per-cluster cross-validated RMSE values.
display(cluster_centroids_info_df['rmse'].describe())

count      35.000000
mean     2705.233430
std       264.621436
min      2070.385234
25%      2570.500360
50%      2800.733657
75%      2921.165505
max      2998.353794
Name: rmse, dtype: float64

In [7]:
# Baseline: one model cross-validated on the rows of all accepted clusters pooled
# together, for comparison with the per-cluster RMSE values.
# Concatenate in a single call; growing a frame with pd.concat inside a loop
# re-copies the accumulated rows on every iteration, and seeding it with an empty
# frame makes the result dtypes depend on how pandas treats empty inputs.
below_threshold_data_df = pd.concat(
    [cluster_dict["cluster_data"] for cluster_dict in below_threshold_data],
    ignore_index=True,
)

X = below_threshold_data_df[top_features_names]
y = below_threshold_data_df[target_name]

model = xgb.XGBRegressor(
    colsample_bytree=1.0,
    learning_rate=0.1,
    max_depth=10,
    n_estimators=91,
    subsample=1.0,
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=global_random_state
)
kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
mse_scores = -scores  # the scorer returns negated MSE
mean_mse = np.mean(mse_scores)
rmse = np.sqrt(mean_mse)  # root of the mean fold MSE

print("RMSE of clustered datapoints all together: ", rmse)

RMSE of clustered datapoints all together:  2572.6835380508105


In [8]:
# Cross-validate the same regressor on the rows that were set aside during the
# cluster search, to compare against the clustered rows.
kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    n_estimators=91,
    max_depth=10,
    learning_rate=0.1,
    subsample=1.0,
    colsample_bytree=1.0,
    random_state=global_random_state,
)

X, y = below_min_datapoints_df[top_features_names], below_min_datapoints_df[target_name]

scores = cross_val_score(estimator=model, X=X, y=y, scoring='neg_mean_squared_error', cv=kf)
mse_scores = np.negative(scores)  # scorer yields negated MSE per fold
mean_mse = mse_scores.mean()
rmse = np.sqrt(mean_mse)

print("RMSE of non-clustered datapoints: ", rmse)

RMSE of non-clustered datapoints:  4145.756114093148
