In [1]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.cluster import KMeans
import xgboost as xgb
import pandas as pd
import numpy as np
import os

In [2]:
identifier_name = 'flight_id'

top_features_names = [
    'wtc', 
    'flown_distance', 
    'aircraft_type', 
    'groundspeed_max', 
    'airline', 
    'altitude_max', 
    'flight_duration', 
    'latitude_max', 
    'ades', 
    'adep', 
    'altitude_75percentile', 
    'altitude_median', 
    'longitude_max', 
    'vertical_rate_max', 
    'country_code_ades', 
    'longitude_min', 
    'latitude_min', 
    'vertical_rate_std', 
    'country_code_adep', 
    'longitude_std', 
    'altitude_25percentile', 
    'vertical_rate_75percentile', 
    'month', 
    'vertical_rate_25percentile', 
    'groundspeed_min', 
    'longitude_25percentile', 
    'latitude_25percentile', 
    'latitude_75percentile', 
    'taxiout_time', 
    'longitude_mean', 
    'longitude_median', 
    'month_day', 
    'latitude_mean', 
    'track_median', 
    'latitude_count', 
    'latitude_std', 
    'arrival_time_hour_minute', 
    'latitude_median', 
    'longitude_75percentile', 
    'track_75percentile', 
    'altitude_mean', 
    'temperature_min', 
    'actual_offblock_time_hour', 
    'vertical_rate_median', 
    'track_25percentile', 
    'vertical_rate_mean', 
    'arrival_time_hour'
]

target_name = 'tow'

global_random_state = 123

In [3]:
folder_path = 'data/clusters/clusters_threshold_3000'
all_files = os.listdir(folder_path)
csv_files = [f for f in all_files if f.endswith('.csv')]
dataframes = []
for csv_file in csv_files:
    file_path = os.path.join(folder_path, csv_file)
    df = pd.read_csv(file_path)
    dataframes.append(df)
combined_clusters_df = pd.concat(dataframes, ignore_index=True)
display(combined_clusters_df)

Unnamed: 0,wtc,flown_distance,aircraft_type,groundspeed_max,airline,altitude_max,flight_duration,latitude_max,ades,adep,...,longitude_75percentile,track_75percentile,altitude_mean,temperature_min,actual_offblock_time_hour,vertical_rate_median,track_25percentile,vertical_rate_mean,arrival_time_hour,tow
0,1,321,4,390.0,20,32025.0,61,52.028503,79,69,...,-2.418875,275.911467,20996.873271,222.613105,13,0.0,271.147685,12.768124,15,54748.000000
1,1,305,1,471.0,20,26100.0,55,53.426607,58,92,...,-5.538644,140.634945,14895.710572,229.947333,12,2752.0,94.522454,2754.662045,13,70318.447226
2,1,295,4,425.0,20,33050.0,57,54.596283,58,58,...,-0.601063,147.962436,19231.113033,220.891564,7,-64.0,125.063610,-58.076125,8,56818.000000
3,1,257,5,260.0,14,8200.0,44,40.394714,310,385,...,33.099854,216.469234,5252.163164,271.093667,15,-768.0,98.642803,-654.951792,16,73571.000000
4,1,200,4,390.0,13,41000.0,41,48.400681,23,321,...,15.850800,291.056706,15297.266497,203.858789,5,0.0,263.774171,-46.740198,6,56791.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263637,1,974,5,469.0,14,34050.0,142,48.694038,26,385,...,21.511431,302.524890,27723.342398,211.678693,9,0.0,293.369113,-34.959809,12,81682.000000
263638,1,956,5,439.0,14,34050.0,147,48.751226,26,385,...,21.757619,299.822273,28764.347382,217.179675,9,0.0,294.949763,-26.366533,12,81682.000000
263639,1,797,5,417.0,14,34050.0,137,42.100204,277,385,...,22.907776,282.062593,28934.034581,214.457463,10,0.0,264.079379,-61.015155,12,75682.000000
263640,1,1027,1,508.0,14,33050.0,146,48.714310,337,34,...,19.563177,125.855228,28525.880210,215.439067,10,0.0,108.737298,89.543019,13,81848.000000


In [4]:
X = combined_clusters_df[top_features_names]
y = combined_clusters_df[target_name]
        
model = xgb.XGBRegressor(
colsample_bytree=1.0,
    learning_rate=0.1, 
    max_depth=10, 
    n_estimators=91, 
    subsample=1.0,
    objective='reg:squarederror', 
    eval_metric='rmse',
    random_state=global_random_state
)
kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
mse_scores = -scores
mean_mse = np.mean(mse_scores)
rmse = np.sqrt(mean_mse)

print("RMSE of clustered datapoints all together: ", rmse)

RMSE of clustered datapoints all together:  2785.215563252263


In [5]:
non_clustered_df = pd.read_csv('data/clusters/clusters_threshold_3000/ext/below_min_datapoints.csv')

In [6]:
X = non_clustered_df[top_features_names]
y = non_clustered_df[target_name]

model = xgb.XGBRegressor(
colsample_bytree=1.0,
    learning_rate=0.1, 
    max_depth=10, 
    n_estimators=91, 
    subsample=1.0,
    objective='reg:squarederror', 
    eval_metric='rmse',
    random_state=global_random_state
)
kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
mse_scores = -scores
mean_mse = np.mean(mse_scores)
rmse = np.sqrt(mean_mse)

print("RMSE of clustered datapoints all together: ", rmse)

RMSE of clustered datapoints all together:  4056.916395619632


In [7]:
X = combined_clusters_df[top_features_names]
y = combined_clusters_df[target_name]

model = xgb.XGBRegressor(
colsample_bytree=1.0,
    learning_rate=0.1, 
    max_depth=10, 
    n_estimators=91, 
    subsample=1.0,
    objective='reg:squarederror', 
    eval_metric='rmse',
    random_state=global_random_state
)

model.fit(X, y)

y_pred = model.predict(non_clustered_df[top_features_names])

y_true = non_clustered_df[target_name].values

rmse = np.sqrt(mean_squared_error(y_true, y_pred))

print("Training on clusters datapoints and predicting non-clustered datapoints gives an RMSE:", rmse)

Training on clusters datapoints and predicting non-clustered datapoints gives an RMSE: 8581.66316846423


In [8]:
cluster_centroids_info_df = pd.read_csv('data/clusters/clusters_threshold_3000/ext/cluster_centroids_info.csv')

In [9]:
initial_centroids = np.array(cluster_centroids_info_df[top_features_names].values)
kmeans = KMeans(n_clusters=len(initial_centroids), init=initial_centroids, n_init=1)
non_clustered_df['cluster'] = kmeans.fit_predict(non_clustered_df[top_features_names])

found 0 physical cores < 1
  File "C:\Users\MOHAMMEDG\Anaconda3\envs\prc\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


In [10]:
cluster_models_dict = {}
folder_path = 'data/clusters/clusters_threshold_3000'

for cluster_nr in range(1, 19, 1):
    cluster_file_name = f"cluster_{str(cluster_nr)}.csv"
    file_path = os.path.join(folder_path, cluster_file_name)
    cluster_df = pd.read_csv(file_path)
    
    X = cluster_df[top_features_names]
    y = cluster_df[target_name]
    
    model = xgb.XGBRegressor(
    colsample_bytree=1.0,
        learning_rate=0.1, 
        max_depth=10, 
        n_estimators=91, 
        subsample=1.0,
        objective='reg:squarederror', 
        eval_metric='rmse',
        random_state=global_random_state
    )
    
    model.fit(X, y)

    cluster_models_dict[str(cluster_nr-1)] = model

print(f"Built {len(cluster_models_dict.keys())} clusters models.")

Built 18 clusters models.


In [11]:
# Function to score model using Root Mean Square Error
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [12]:
pred_datapoints = []
for cluster_nr in range(0, 18, 1):
    this_cluster_df = non_clustered_df[non_clustered_df['cluster'] == cluster_nr].copy()
    model = cluster_models_dict[str(cluster_nr)]
    y_pred = model.predict(this_cluster_df[top_features_names])
    this_cluster_df['y_pred'] = y_pred
    pred_datapoints.append(this_cluster_df)
    
pred_datapoints_df = pd.concat(pred_datapoints, ignore_index=True)
y_true = pred_datapoints_df['tow'].values
y_pred = pred_datapoints_df['y_pred'].values
rmse_score = rmse(y_true, y_pred)
print("RMSE: ", rmse_score)

RMSE:  55474.33083303552


In [13]:
clustered_df = combined_clusters_df.copy()
clustered_df['cluster'] = kmeans.fit_predict(clustered_df[top_features_names])

In [14]:
pred_datapoints = []
for cluster_nr in range(0, 18, 1):
    this_cluster_df = clustered_df[clustered_df['cluster'] == cluster_nr].copy()
    model = cluster_models_dict[str(cluster_nr)]
    y_pred = model.predict(this_cluster_df[top_features_names])
    this_cluster_df['y_pred'] = y_pred
    pred_datapoints.append(this_cluster_df)
    
pred_datapoints_df = pd.concat(pred_datapoints, ignore_index=True)
y_true = pred_datapoints_df['tow'].values
y_pred = pred_datapoints_df['y_pred'].values
rmse_score = rmse(y_true, y_pred)
print("RMSE: ", rmse_score)

RMSE:  30824.543258005768


In [15]:
X = combined_clusters_df[top_features_names]
y = combined_clusters_df[target_name]

model = xgb.XGBRegressor(
colsample_bytree=1.0,
    learning_rate=0.1, 
    max_depth=10, 
    n_estimators=91, 
    subsample=1.0,
    objective='reg:squarederror', 
    eval_metric='rmse',
    random_state=global_random_state
)
kf = KFold(n_splits=3, shuffle=True, random_state=global_random_state)
scores = cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
mse_scores = -scores
mean_mse = np.mean(mse_scores)
rmse_score = np.sqrt(mean_mse)

print("RMSE: ", rmse_score)

RMSE:  2785.215563252263


In [16]:
print((len(combined_clusters_df)/369013) * 100)

71.44517943812278


In [17]:
submission_df = pd.read_csv('data/encoded_submission_set.csv')

In [18]:
X = combined_clusters_df[top_features_names]
y = combined_clusters_df[target_name]

model = xgb.XGBRegressor(
colsample_bytree=1.0,
    learning_rate=0.1, 
    max_depth=10, 
    n_estimators=91, 
    subsample=1.0,
    objective='reg:squarederror', 
    eval_metric='rmse',
    random_state=global_random_state
)
model.fit(X, y)
y_pred = model.predict(submission_df[top_features_names].values)
submission_df['tow'] = y_pred
ready_submission_data = submission_df[['flight_id', 'tow']]
display(ready_submission_data)

ready_submission_data.to_csv(f"my_submission_v11.csv", index=False)
print(f"my_submission_v11.csv is saved!")

Unnamed: 0,flight_id,tow
0,248753821,69040.085938
1,248753822,213377.578125
2,248754498,217194.343750
3,248757623,59094.429688
4,248763603,63539.832031
...,...,...
105954,258066302,70999.257812
105955,258068609,177699.968750
105956,258068876,74393.820312
105957,258064675,60084.917969


my_submission_v11.csv is saved!
