In [1]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
import xgboost as xgb
import pandas as pd
import numpy as np
import os

In [2]:
identifier_name = 'flight_id'

top_features_names = [
    'wtc', 
    'aircraft_type', 
    'flown_distance', 
    'groundspeed_max', 
    'latitude_min', 
    'altitude_25percentile', 
    'airline', 
    'flight_duration', 
    'longitude_max', 
    'vertical_rate_75percentile', 
    'altitude_median', 
    'ades', 
    'longitude_mean', 
    'altitude_75percentile', 
    'latitude_75percentile', 
    'vertical_rate_std', 
    'adep', 
    'latitude_std', 
    'vertical_rate_max', 
    'latitude_max', 
    'longitude_std', 
    'vertical_rate_25percentile', 
    'longitude_min', 
    'longitude_75percentile', 
    'altitude_mean', 
    'groundspeed_75percentile', 
    'country_code_adep', 
    'latitude_median', 
    'longitude_25percentile', 
    'groundspeed_min', 
    'country_code_ades', 
    'latitude_25percentile', 
    'actual_offblock_time_hour', 
    'longitude_median', 
    'month', 
    'altitude_std', 
    'latitude_count', 
    'taxiout_time', 
    'vertical_rate_median', 
    'month_day'
]

target_name = 'tow'

global_random_state = 123

In [3]:
folder_path = 'data/clusters/clusters_threshold_3000'
all_files = os.listdir(folder_path)
csv_files = [f for f in all_files if f.endswith('.csv')]
dataframes = []
for csv_file in csv_files:
    file_path = os.path.join(folder_path, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)
combined_clusters_df = pd.concat(dataframes, ignore_index=True)
display(combined_clusters_df)

Unnamed: 0,wtc,aircraft_type,flown_distance,groundspeed_max,latitude_min,altitude_25percentile,airline,flight_duration,longitude_max,vertical_rate_75percentile,...,latitude_25percentile,actual_offblock_time_hour,longitude_median,month,altitude_std,latitude_count,taxiout_time,vertical_rate_median,month_day,tow
0,1,4,321,390.0,51.453232,9550.00,20,61,-0.443802,368.0,...,51.774945,13,-4.895513,1,11779.780447,3614.0,18,0.0,0,54748.000000
1,1,1,305,498.0,51.464942,7025.00,20,55,0.029984,768.0,...,51.682892,12,-1.853394,1,11661.560716,3265.0,14,-256.0,0,70318.447226
2,1,4,295,425.0,51.464905,7737.50,20,57,-0.061019,128.0,...,51.774136,7,-2.093523,1,12049.746662,3431.0,10,-64.0,0,56818.000000
3,1,5,257,560.0,40.158600,3450.00,14,44,33.130022,-640.0,...,40.158600,15,33.025635,1,1763.518004,809.0,25,-768.0,0,73571.000000
4,1,4,200,486.0,48.033809,7681.25,13,41,16.572653,0.0,...,48.068222,5,13.388322,1,11137.449462,2774.0,13,-640.0,0,56791.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273185,0,7,1497,492.0,40.276840,36000.00,14,212,28.724816,0.0,...,40.664886,11,12.210510,11,10115.546824,12277.0,16,0.0,313,192373.000000
273186,1,0,1562,500.0,38.273941,35975.00,18,217,17.930232,0.0,...,43.891314,9,3.075774,11,9302.054626,12966.0,11,0.0,330,72844.000000
273187,0,7,1613,474.0,41.315902,36000.00,14,223,28.725378,0.0,...,45.876113,5,13.929699,12,9753.898123,13309.0,16,0.0,335,195373.000000
273188,1,14,1348,440.0,40.144695,36000.00,14,222,33.027222,64.0,...,43.125663,11,20.065859,12,9646.272656,13282.0,10,0.0,357,62816.000000


In [4]:
X = combined_clusters_df[top_features_names]
y = combined_clusters_df[target_name]
        
model = xgb.XGBRegressor(
colsample_bytree=1.0,
    learning_rate=0.1, 
    max_depth=10, 
    n_estimators=91, 
    subsample=1.0,
    objective='reg:squarederror', 
    eval_metric='rmse',
    random_state=global_random_state
)
kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
mse_scores = -scores
mean_mse = np.mean(mse_scores)
rmse = np.sqrt(mean_mse)

print("RMSE of clustered datapoints all together: ", rmse)

RMSE of clustered datapoints all together:  2591.978655686011


In [5]:
non_clustered_df = pd.read_csv('data/clusters/clusters_threshold_3000/ext/below_min_datapoints.csv')

In [6]:
X = non_clustered_df[top_features_names]
y = non_clustered_df[target_name]

model = xgb.XGBRegressor(
colsample_bytree=1.0,
    learning_rate=0.1, 
    max_depth=10, 
    n_estimators=91, 
    subsample=1.0,
    objective='reg:squarederror', 
    eval_metric='rmse',
    random_state=global_random_state
)
kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
mse_scores = -scores
mean_mse = np.mean(mse_scores)
rmse = np.sqrt(mean_mse)

print("RMSE of clustered datapoints all together: ", rmse)

RMSE of clustered datapoints all together:  4145.756114093148


In [7]:
X = combined_clusters_df[top_features_names]
y = combined_clusters_df[target_name]

model = xgb.XGBRegressor(
colsample_bytree=1.0,
    learning_rate=0.1, 
    max_depth=10, 
    n_estimators=91, 
    subsample=1.0,
    objective='reg:squarederror', 
    eval_metric='rmse',
    random_state=global_random_state
)

model.fit(X, y)

y_pred = model.predict(non_clustered_df[top_features_names])

y_true = non_clustered_df[target_name].values

rmse = np.sqrt(mean_squared_error(y_true, y_pred))

print("Training on clusters datapoints and predicting non-clustered datapoints gives an RMSE:", rmse)

Training on clusters datapoints and predicting non-clustered datapoints gives an RMSE: 6402.611236263201


In [8]:
cluster_centroids_info_df = pd.read_csv('data/clusters/clusters_threshold_3000/ext/cluster_centroids_info.csv')

In [9]:
initial_centroids = np.array(cluster_centroids_info_df[top_features_names].values)
kmeans = KMeans(n_clusters=len(initial_centroids), init=initial_centroids, n_init=1)
non_clustered_df['cluster'] = kmeans.fit_predict(non_clustered_df[top_features_names])

In [10]:
cluster_models_dict = {}
folder_path = 'data/clusters/clusters_threshold_3000'

for cluster_nr in range(1, 36, 1):
    cluster_file_name = f"cluster_{str(cluster_nr)}.csv"
    file_path = os.path.join(folder_path, cluster_file_name)
    cluster_df = pd.read_csv(file_path)
    
    X = cluster_df[top_features_names]
    y = cluster_df[target_name]
    
    model = xgb.XGBRegressor(
    colsample_bytree=1.0,
        learning_rate=0.1, 
        max_depth=10, 
        n_estimators=91, 
        subsample=1.0,
        objective='reg:squarederror', 
        eval_metric='rmse',
        random_state=global_random_state
    )
    
    model.fit(X, y)

    cluster_models_dict[str(cluster_nr-1)] = model

print(f"Built {len(cluster_models_dict.keys())} clusters models.")

Built 35 clusters models.


In [11]:
# Function to score model using Root Mean Square Error
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [12]:
pred_datapoints = []
for cluster_nr in range(0, 18, 1):
    this_cluster_df = non_clustered_df[non_clustered_df['cluster'] == cluster_nr].copy()
    model = cluster_models_dict[str(cluster_nr)]
    y_pred = model.predict(this_cluster_df[top_features_names])
    this_cluster_df['y_pred'] = y_pred
    pred_datapoints.append(this_cluster_df)
    
pred_datapoints_df = pd.concat(pred_datapoints, ignore_index=True)
y_true = pred_datapoints_df['tow'].values
y_pred = pred_datapoints_df['y_pred'].values
rmse_score = rmse(y_true, y_pred)
print("RMSE: ", rmse_score)

RMSE:  49978.36285332956


In [13]:
clustered_df = combined_clusters_df.copy()
clustered_df['cluster'] = kmeans.fit_predict(clustered_df[top_features_names])

In [14]:
pred_datapoints = []
for cluster_nr in range(0, 35, 1):
    this_cluster_df = clustered_df[clustered_df['cluster'] == cluster_nr].copy()
    model = cluster_models_dict[str(cluster_nr)]
    y_pred = model.predict(this_cluster_df[top_features_names])
    this_cluster_df['y_pred'] = y_pred
    pred_datapoints.append(this_cluster_df)
    
pred_datapoints_df = pd.concat(pred_datapoints, ignore_index=True)
y_true = pred_datapoints_df['tow'].values
y_pred = pred_datapoints_df['y_pred'].values
rmse_score = rmse(y_true, y_pred)
print("RMSE: ", rmse_score)

RMSE:  20030.195557908806


In [15]:
X = combined_clusters_df[top_features_names]
y = combined_clusters_df[target_name]

model = xgb.XGBRegressor(
colsample_bytree=1.0,
    learning_rate=0.1, 
    max_depth=10, 
    n_estimators=91, 
    subsample=1.0,
    objective='reg:squarederror', 
    eval_metric='rmse',
    random_state=global_random_state
)
kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
mse_scores = -scores
mean_mse = np.mean(mse_scores)
rmse_score = np.sqrt(mean_mse)

print("RMSE: ", rmse_score)

RMSE:  2591.978655686011


In [16]:
print((len(combined_clusters_df)/369013) * 100)

74.03262215694298


In [17]:
submission_df = pd.read_csv('data/encoded_submission_set.csv')

In [18]:
X = combined_clusters_df[top_features_names]
y = combined_clusters_df[target_name]

model = xgb.XGBRegressor(
colsample_bytree=1.0,
    learning_rate=0.1, 
    max_depth=10, 
    n_estimators=91, 
    subsample=1.0,
    objective='reg:squarederror', 
    eval_metric='rmse',
    random_state=global_random_state
)
model.fit(X, y)
y_pred = model.predict(submission_df[top_features_names].values)
submission_df['tow'] = y_pred
ready_submission_data = submission_df[['flight_id', 'tow']]
display(ready_submission_data)

ready_submission_data.to_csv(f"my_submission_v11.csv", index=False)
print(f"my_submission_v11.csv is saved!")

Unnamed: 0,flight_id,tow
0,248753821,68448.312500
1,248753822,212330.640625
2,248754498,219202.625000
3,248757623,59680.140625
4,248763603,64110.597656
...,...,...
105954,258066302,68846.945312
105955,258068609,167911.531250
105956,258068876,73787.648438
105957,258064675,61375.765625


my_submission_v11.csv is saved!
