In [1]:
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from xgboost import plot_importance
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import pandas as pd
import numpy as np
import itertools
import time

In [2]:
# Read the encoded challenge set from its CSV file (path is relative to this notebook).
challenge_set_df = pd.read_csv(filepath_or_buffer='../../data/encoded_challenge_set.csv')

In [3]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The value is the square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mean_sq_err = mean_squared_error(y_true, y_pred)
    return np.sqrt(mean_sq_err)

def scale_data(dataframe, features, scale_type):
    """Return a copy of ``dataframe`` whose ``features`` columns are rescaled.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame. It is copied; the caller's frame is not modified.
    features : list
        Labels of the columns to rescale. Other columns are carried over as-is.
    scale_type : str
        ``"Normalization"`` uses ``MinMaxScaler(feature_range=(0, 1))``;
        ``"Standardization"`` uses ``StandardScaler()``.

    Returns
    -------
    pandas.DataFrame
        The copied frame with the selected columns replaced by the scaler's
        ``fit_transform`` output.

    Raises
    ------
    ValueError
        If ``scale_type`` is not one of the two supported names.
    """
    # Factories are lazy so an unsupported name is rejected before any scaler
    # object is constructed (previously this fell through and failed later with
    # an UnboundLocalError on ``scaler``).
    scaler_factories = {
        "Normalization": lambda: MinMaxScaler(feature_range=(0, 1)),
        "Standardization": lambda: StandardScaler(),
    }
    if scale_type not in scaler_factories:
        supported = ", ".join(repr(name) for name in scaler_factories)
        raise ValueError(
            f"Unsupported scale_type {scale_type!r}; expected one of: {supported}"
        )

    scaler = scaler_factories[scale_type]()
    result_df = dataframe.copy()
    result_df[features] = scaler.fit_transform(result_df[features])
    return result_df

In [4]:
# Columns used as model inputs in the cells below.
flight_plan_feature_columns = [
    'month-day', 'month', 'day_in_month', 'adep', 'country_code_adep', 'ades', 'country_code_ades', 'aircraft_type',
    'wtc', 'airline', 'flight_duration', 'taxiout_time', 'flown_distance', 'actual_offblock_time_hour', 'actual_offblock_time_minute',
    'actual_offblock_time_hour_minute', 'arrival_time_hour', 'arrival_time_minute', 'arrival_time_hour_minute'
]

# Extra columns that were listed in an earlier, immediately-overwritten
# definition of `feature_columns`. Judging by their names they are presumably
# trajectory / weather aggregates -- TODO confirm against the encoding step.
# They are kept here under their own name so the wider feature set can be
# enabled explicitly instead of relying on which assignment happens to run last.
trajectory_weather_feature_columns = [
    'latitude_median', 'longitude_median', 'altitude_mean', 'groundspeed_mean', 'track_mean',
    'vertical_rate_mean', 'track_unwrapped_mean', 'u_component_of_wind_mean',
    'v_component_of_wind_mean', 'temperature_mean', 'specific_humidity_mean'
]

# Active feature set: same 19 columns that were effectively in use before.
# To experiment with the wider set, use:
#   feature_columns = flight_plan_feature_columns + trajectory_weather_feature_columns
feature_columns = list(flight_plan_feature_columns)

target_column = 'tow'

# Working frame restricted to the model inputs plus the target.
df = challenge_set_df[feature_columns + [target_column]]
#display(df)
#display(df.describe())
#display(df.info())

In [5]:
# Rescale only the feature columns with StandardScaler; the target column is left as-is.
standardized_df = scale_data(df, feature_columns, "Standardization")

In [6]:
# Iterative KNN experiment.
# Iteration 1 fits on the original target column; every later iteration fits on
# the out-of-fold predictions written by the previous iteration (KNN_tow_{i-1}).
# Each iteration stores its own out-of-fold predictions in KNN_tow_{i} and
# reports (a) the mean per-fold RMSE against the target it was trained on and
# (b) the RMSE of the new column against the original target column.
data_df = standardized_df.copy()
current_training_target = target_column

# Loop-invariant: the feature columns are never modified below, and KFold with a
# fixed integer random_state produces the same folds on every split() call.
features = data_df[feature_columns]
kf = KFold(n_splits=3, shuffle=True, random_state=42)

for KNN_iteration in range(1, 5, 1):
    print(f"{KNN_iteration = }")

    if KNN_iteration > 1:
        current_training_target = f'KNN_tow_{KNN_iteration-1}'

    target = data_df[current_training_target]
    print(f"Training for target {current_training_target}")

    mse_scores = []  # NOTE: despite the name, this collects per-fold RMSE values
    new_target_column = f'KNN_tow_{KNN_iteration}'
    data_df[new_target_column] = np.nan
    # KFold.split yields *positional* indices, so predictions are written back
    # by position (.iloc). Writing them with label-based .loc is only correct
    # while the frame has a default 0..n-1 index.
    new_target_position = data_df.columns.get_loc(new_target_column)

    for train_index, test_index in kf.split(features):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]

        model = KNeighborsRegressor(n_neighbors=4)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse_scores.append(rmse(y_test, y_pred))

        data_df.iloc[test_index, new_target_position] = y_pred

    print(f"KNN RMSE Score for iteration {KNN_iteration}: {np.mean(mse_scores)}")
    print(f"KNN RMSE Score with original tow column for iteration {KNN_iteration}: {rmse(data_df[target_column], data_df[new_target_column])}")
    print()

display(data_df)

KNN_iteration = 1
Training for target tow


found 0 physical cores < 1
  File "C:\Users\MOHAMMEDG\Anaconda3\envs\prc\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


KNN RMSE Score for iteration 1: 7727.740588677475
KNN RMSE Score with original tow column for iteration 1: 7727.750243924688

KNN_iteration = 2
Training for target KNN_tow_1
KNN RMSE Score for iteration 2: 3499.4405794686604
KNN RMSE Score with original tow column for iteration 2: 6995.6351452360195

KNN_iteration = 3
Training for target KNN_tow_2
KNN RMSE Score for iteration 3: 2064.632783530687
KNN RMSE Score with original tow column for iteration 3: 7823.385900283956

KNN_iteration = 4
Training for target KNN_tow_3
KNN RMSE Score for iteration 4: 1421.1465507668781
KNN RMSE Score with original tow column for iteration 4: 7756.329174360374



Unnamed: 0,month-day,month,day_in_month,adep,country_code_adep,ades,country_code_ades,aircraft_type,wtc,airline,...,actual_offblock_time_minute,actual_offblock_time_hour_minute,arrival_time_hour,arrival_time_minute,arrival_time_hour_minute,tow,KNN_tow_1,KNN_tow_2,KNN_tow_3,KNN_tow_4
0,-2.000759,-1.866859,-1.661343,-0.988474,-0.379690,-0.788094,-0.121588,-0.695025,0.404228,0.870696,...,0.957106,0.351027,0.381109,-1.423818,0.301709,54748.000000,53484.500000,55674.750000,55117.343750,55970.968750
1,-2.000759,-1.866859,-1.661343,0.171745,-0.505965,-0.033128,1.691462,0.815732,-2.473854,-0.704598,...,1.468655,-0.383524,1.152709,0.477167,1.179590,185441.000000,156791.000000,163963.687500,161559.562500,161193.406250
2,-2.000759,-1.866859,-1.661343,-0.451774,1.230319,-0.006792,1.691462,-0.371292,-2.473854,0.555638,...,0.559234,-0.434402,1.152709,-1.193396,1.086336,230396.000000,221396.500000,227646.625000,223458.296875,225544.648438
3,-2.000759,-1.866859,-1.661343,1.134648,-0.947928,0.001987,1.691462,1.139466,-2.473854,-0.704598,...,-1.430123,-0.164113,1.152709,0.189139,1.163512,157615.000000,181525.250000,167077.250000,179641.531250,175614.000000
4,-2.000759,-1.866859,-1.661343,-0.806943,-0.158708,-0.972446,-0.410028,-1.018759,0.404228,0.870696,...,0.388718,0.128436,-0.004691,0.880407,0.044455,70318.447226,55294.000000,59783.867536,56167.263046,58096.769681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369008,1.747757,1.588205,1.810492,0.463773,-0.411258,-0.033128,1.691462,1.139466,-2.473854,-0.704598,...,0.502396,-0.437582,1.152709,-1.481424,1.070257,163438.000000,159504.250000,166881.437500,167697.984375,168391.671875
369009,1.747757,1.588205,1.810492,1.505602,1.482869,-1.349930,-0.739673,-1.018759,0.404228,-0.074480,...,-0.122831,-0.472561,-0.197591,0.016322,-0.196721,78707.000000,80149.250000,77996.625000,78527.500000,77904.464844
369010,1.747757,1.588205,1.810492,-1.296287,-0.726946,-0.779315,-0.121588,-0.695025,0.404228,0.870696,...,1.298138,-0.393064,-0.390491,0.707590,-0.351074,62942.750000,63875.445924,63264.948219,63222.084199,62977.584277
369011,1.747757,1.588205,1.810492,0.463773,-0.411258,-0.779315,-0.121588,-1.018759,0.404228,0.870696,...,0.445557,-0.440762,-0.390491,-0.905368,-0.441113,72611.161024,72611.161024,68187.393071,69010.594496,67112.420477


In [10]:
# Iterative XGBoost experiment (same procedure as the KNN cell, different model).
# Iteration 1 fits on the original target column; every later iteration fits on
# the out-of-fold predictions written by the previous iteration
# (XGBoost_tow_{i-1}). Each iteration stores its own out-of-fold predictions in
# XGBoost_tow_{i} and reports (a) the mean per-fold RMSE against the target it
# was trained on and (b) the RMSE of the new column against the original target.
data_df = standardized_df.copy()
current_training_target = target_column

# Loop-invariant: the feature columns are never modified below, and KFold with a
# fixed integer random_state produces the same folds on every split() call.
features = data_df[feature_columns]
kf = KFold(n_splits=3, shuffle=True, random_state=42)

for XGBoost_iteration in range(1, 11, 1):
    print(f"{XGBoost_iteration = }")

    if XGBoost_iteration > 1:
        current_training_target = f'XGBoost_tow_{XGBoost_iteration-1}'

    target = data_df[current_training_target]
    print(f"Training for target {current_training_target}")

    mse_scores = []  # NOTE: despite the name, this collects per-fold RMSE values
    new_target_column = f'XGBoost_tow_{XGBoost_iteration}'
    data_df[new_target_column] = np.nan
    # KFold.split yields *positional* indices, so predictions are written back
    # by position (.iloc). Writing them with label-based .loc is only correct
    # while the frame has a default 0..n-1 index.
    new_target_position = data_df.columns.get_loc(new_target_column)

    for train_index, test_index in kf.split(features):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]

        model = xgb.XGBRegressor(objective='reg:squarederror')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse_scores.append(rmse(y_test, y_pred))

        data_df.iloc[test_index, new_target_position] = y_pred

    print(f"XGBoost RMSE Score for iteration {XGBoost_iteration}: {np.mean(mse_scores)}")
    print(f"XGBoost RMSE Score with original tow column for iteration {XGBoost_iteration}: {rmse(data_df[target_column], data_df[new_target_column])}")
    print()

display(data_df)

XGBoost_iteration = 1
Training for target tow
XGBoost RMSE Score for iteration 1: 3820.9302449483152
XGBoost RMSE Score with original tow column for iteration 1: 3820.9873522707676

XGBoost_iteration = 2
Training for target XGBoost_tow_1
XGBoost RMSE Score for iteration 2: 1269.8731150315127
XGBoost RMSE Score with original tow column for iteration 2: 3985.287693691827

XGBoost_iteration = 3
Training for target XGBoost_tow_2
XGBoost RMSE Score for iteration 3: 911.4246832385048
XGBoost RMSE Score with original tow column for iteration 3: 4148.086612005061

XGBoost_iteration = 4
Training for target XGBoost_tow_3
XGBoost RMSE Score for iteration 4: 769.4153286217673
XGBoost RMSE Score with original tow column for iteration 4: 4228.730703539551

XGBoost_iteration = 5
Training for target XGBoost_tow_4
XGBoost RMSE Score for iteration 5: 646.7055007325421
XGBoost RMSE Score with original tow column for iteration 5: 4310.876776284008

XGBoost_iteration = 6
Training for target XGBoost_tow_5
X

Unnamed: 0,month-day,month,day_in_month,adep,country_code_adep,ades,country_code_ades,aircraft_type,wtc,airline,...,XGBoost_tow_1,XGBoost_tow_2,XGBoost_tow_3,XGBoost_tow_4,XGBoost_tow_5,XGBoost_tow_6,XGBoost_tow_7,XGBoost_tow_8,XGBoost_tow_9,XGBoost_tow_10
0,-2.000759,-1.866859,-1.661343,-0.988474,-0.379690,-0.788094,-0.121588,-0.695025,0.404228,0.870696,...,56693.832031,58644.128906,58306.429688,59372.156250,59117.695312,59359.695312,59216.437500,59240.011719,59052.046875,59502.503906
1,-2.000759,-1.866859,-1.661343,0.171745,-0.505965,-0.033128,1.691462,0.815732,-2.473854,-0.704598,...,184737.578125,186203.531250,181523.421875,183738.765625,182383.687500,182739.078125,182446.187500,182463.953125,181965.312500,181828.656250
2,-2.000759,-1.866859,-1.661343,-0.451774,1.230319,-0.006792,1.691462,-0.371292,-2.473854,0.555638,...,217756.515625,218797.640625,217407.968750,218369.609375,219437.625000,219439.515625,219376.921875,219050.687500,217080.843750,217864.671875
3,-2.000759,-1.866859,-1.661343,1.134648,-0.947928,0.001987,1.691462,1.139466,-2.473854,-0.704598,...,153833.000000,153008.734375,149617.968750,153051.250000,151552.296875,151235.125000,156143.609375,153517.703125,156195.609375,151319.000000
4,-2.000759,-1.866859,-1.661343,-0.806943,-0.158708,-0.972446,-0.410028,-1.018759,0.404228,0.870696,...,68723.882812,68757.625000,69318.335938,70275.054688,69823.382812,70762.609375,70841.960938,70960.953125,70957.031250,70555.492188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369008,1.747757,1.588205,1.810492,0.463773,-0.411258,-0.033128,1.691462,1.139466,-2.473854,-0.704598,...,159540.843750,161855.468750,163784.406250,163566.359375,165111.781250,164592.656250,164678.390625,164992.062500,165312.390625,165869.437500
369009,1.747757,1.588205,1.810492,1.505602,1.482869,-1.349930,-0.739673,-1.018759,0.404228,-0.074480,...,77181.054688,77559.359375,75979.726562,76757.843750,76553.062500,76680.046875,77351.765625,78137.039062,78022.945312,78310.578125
369010,1.747757,1.588205,1.810492,-1.296287,-0.726946,-0.779315,-0.121588,-0.695025,0.404228,0.870696,...,61758.300781,61411.187500,61806.359375,60830.843750,61488.683594,61548.589844,61696.632812,61392.957031,61471.863281,61356.582031
369011,1.747757,1.588205,1.810492,0.463773,-0.411258,-0.779315,-0.121588,-1.018759,0.404228,0.870696,...,72053.320312,70897.000000,73667.367188,71563.531250,71037.171875,70763.328125,70994.054688,71669.898438,71242.937500,70243.976562
