In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler

In [2]:
# Name of the identifier column (not referenced by the cells shown here).
identifier_name = "flight_id"

# Feature columns used both for KMeans clustering and as regressor inputs.
# The order is significant: it fixes the column order of every frame built
# from this list, so entries must not be re-sorted.
top_features_names = [
    "wtc", "flown_distance", "aircraft_type", "groundspeed_max", "airline",
    "altitude_max", "flight_duration", "latitude_max", "ades", "adep",
    "altitude_75percentile", "altitude_median", "longitude_max",
    "vertical_rate_max", "country_code_ades", "longitude_min", "latitude_min",
    "vertical_rate_std", "country_code_adep", "longitude_std",
    "altitude_25percentile", "vertical_rate_75percentile", "month",
    "vertical_rate_25percentile", "groundspeed_min", "longitude_25percentile",
    "latitude_25percentile", "latitude_75percentile", "taxiout_time",
    "longitude_mean", "longitude_median", "month_day", "latitude_mean",
    "track_median", "latitude_count", "latitude_std",
    "arrival_time_hour_minute", "latitude_median", "longitude_75percentile",
    "track_75percentile", "altitude_mean", "temperature_min",
    "actual_offblock_time_hour", "vertical_rate_median", "track_25percentile",
    "vertical_rate_mean", "arrival_time_hour",
]

# Column the regressor is trained to predict.
target_name = "tow"

# Single seed shared by KMeans, XGBRegressor and KFold for reproducible runs.
global_random_state = 123

In [3]:
# Read the encoded challenge set and replace every missing value with 0.
encoded_challenge_set = pd.read_csv('data/encoded_challenge_set.csv').fillna(0)

# Keep only the selected feature columns followed by the target column.
feature_and_target_columns = [*top_features_names, target_name]
df = encoded_challenge_set[feature_and_target_columns]

In [4]:
# Recursively bisect the data with 2-means until every piece is either
#   * accurate enough: 3-fold cross-validated XGBoost RMSE < threshold_rmse, or
#   * too small to evaluate: fewer than min_cluster_size rows.
# Pieces that are large enough but still too inaccurate are queued and split
# again on a later iteration.
threshold_rmse = 3000
min_cluster_size = 90  # clusters with fewer rows are not cross-validated

below_min_datapoints = []    # clusters too small to evaluate
below_threshold_data = []    # accepted clusters (RMSE under the threshold)
above_threshold_data = [df]  # work stack of frames that still need splitting

while above_threshold_data:
    # Copy so that adding the helper 'cluster' column never touches the
    # frame it was sliced from.
    data_df = above_threshold_data.pop().copy()
    kmeans = KMeans(n_clusters=2, random_state=global_random_state)
    # Clustering uses the feature columns only; the target is excluded.
    data_df['cluster'] = kmeans.fit_predict(data_df[top_features_names])

    for cluster_nr in range(2):
        cluster_df = data_df[data_df['cluster'] == cluster_nr]
        X = cluster_df[top_features_names]
        y = cluster_df[target_name]

        if len(X) < min_cluster_size:
            # Store only the undersized cluster itself, not its parent frame:
            # the parent also contains the sibling cluster, which is handled
            # on its own, so storing the parent would double-count rows.
            below_min_datapoints.append(cluster_df)
            continue

        model = xgb.XGBRegressor(
            colsample_bytree=1.0,
            learning_rate=0.1,
            max_depth=10,
            n_estimators=91,
            subsample=1.0,
            objective='reg:squarederror',
            eval_metric='rmse',
            random_state=global_random_state
        )
        kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
        # The scorer returns negated MSE per fold; flip the sign back.
        scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
        mse_scores = -scores
        mean_mse = np.mean(mse_scores)
        # RMSE of the fold-averaged MSE (not the average of per-fold RMSEs).
        rmse = np.sqrt(mean_mse)

        if rmse < threshold_rmse:
            print(f"Below Threshold: Cluster {cluster_nr} has RMSE {rmse}")
            below_threshold_data.append(cluster_df)
        else:
            # Still too inaccurate: queue this cluster for another split.
            above_threshold_data.append(cluster_df)

        print(f"{len(above_threshold_data) = }")

print("Done")

found 0 physical cores < 1
  File "C:\Users\MOHAMMEDG\Anaconda3\envs\prc\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


len(above_threshold_data) = 1
Below Threshold: Cluster 1 has RMSE 2677.0644686090363
len(above_threshold_data) = 1
len(above_threshold_data) = 1
len(above_threshold_data) = 2
len(above_threshold_data) = 2
Below Threshold: Cluster 0 has RMSE 2784.7067705936793
len(above_threshold_data) = 1
len(above_threshold_data) = 1
Below Threshold: Cluster 1 has RMSE 2987.2815619868875
len(above_threshold_data) = 1
len(above_threshold_data) = 1
len(above_threshold_data) = 2
len(above_threshold_data) = 2
len(above_threshold_data) = 3
len(above_threshold_data) = 3
len(above_threshold_data) = 4
len(above_threshold_data) = 4
len(above_threshold_data) = 5
len(above_threshold_data) = 5
len(above_threshold_data) = 6
len(above_threshold_data) = 6
len(above_threshold_data) = 6
len(above_threshold_data) = 7
len(above_threshold_data) = 7
len(above_threshold_data) = 8
len(above_threshold_data) = 8
len(above_threshold_data) = 9
len(above_threshold_data) = 9
len(above_threshold_data) = 10
len(above_threshold_data

In [5]:
# Number of clusters collected in below_threshold_data.
accepted_cluster_count = len(below_threshold_data)
print(accepted_cluster_count)

18


In [6]:
# Total number of rows across all accepted clusters, shown next to the size
# of the full data set for comparison.
datapoint_count_in_clusters = sum(
    len(accepted_cluster) for accepted_cluster in below_threshold_data
)
print(datapoint_count_in_clusters)
print("original count of datapoints: ", len(df))

263642
original count of datapoints:  369013


In [7]:
# Share of the original rows that ended up in an accepted cluster.
# NOTE: despite the label, the printed value is a ratio (not multiplied by 100).
covered_fraction = datapoint_count_in_clusters / len(df)
print("Percentage: ", covered_fraction)

Percentage:  0.7144517943812277


In [11]:
# Write every accepted cluster to its own CSV file inside a directory named
# after the RMSE threshold that produced it. Files are numbered from 1.
clusters_output_dir = f"./data/clusters/clusters_threshold_{threshold_rmse}"
# Create the directory first: to_csv does not create missing parent folders.
os.makedirs(clusters_output_dir, exist_ok=True)

for cluster_number, cluster_df in enumerate(below_threshold_data):
    cluster_path = f"{clusters_output_dir}/cluster_{cluster_number + 1}.csv"
    cluster_df.to_csv(cluster_path, index=False)
    # Report the path that was actually written.
    print(f"{cluster_path} is saved!")

./data/clusters/cluster_1.csv is saved!
./data/clusters/cluster_2.csv is saved!
./data/clusters/cluster_3.csv is saved!
./data/clusters/cluster_4.csv is saved!
./data/clusters/cluster_5.csv is saved!
./data/clusters/cluster_6.csv is saved!
./data/clusters/cluster_7.csv is saved!
./data/clusters/cluster_8.csv is saved!
./data/clusters/cluster_9.csv is saved!
./data/clusters/cluster_10.csv is saved!
./data/clusters/cluster_11.csv is saved!
./data/clusters/cluster_12.csv is saved!
./data/clusters/cluster_13.csv is saved!
./data/clusters/cluster_14.csv is saved!
./data/clusters/cluster_15.csv is saved!
./data/clusters/cluster_16.csv is saved!
./data/clusters/cluster_17.csv is saved!
./data/clusters/cluster_18.csv is saved!
