In [1]:
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import StandardScaler
from xgboost import plot_importance
import matplotlib.pyplot as plt
import xgboost as xgb
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)

In [2]:
identifier_name = 'flight_id'

features_names = [
    'wtc', 
    'flown_distance', 
    'aircraft_type', 
    'groundspeed_max', 
    'groundspeed_75percentile', 
    'altitude_25percentile', 
    'airline', 
    'longitude_max', 
    'latitude_min', 
    'flight_duration', 
    'vertical_rate_std', 
    'altitude_median', 
    'longitude_mean', 
    'altitude_75percentile', 
    'longitude_std', 
    'latitude_max', 
    'latitude_std', 
    'vertical_rate_max', 
    'vertical_rate_25percentile', 
    'latitude_mean', 
    'longitude_min', 
    'longitude_25percentile', 
    'ades', 
    'adep', 
    'country_code_adep', 
    'vertical_rate_75percentile', 
    'latitude_median', 
    'altitude_mean', 
    'latitude_25percentile', 
    'country_code_ades', 
    'month_day', 
    'actual_offblock_time_hour', 
    'groundspeed_min', 
    'longitude_75percentile', 
    'vertical_rate_median'
]

target_name = 'tow'

global_random_state = 123

In [3]:
encoded_challenge_set = pd.read_csv('data/encoded_challenge_set.csv')
challenge_features = encoded_challenge_set[features_names]
challenge_target = encoded_challenge_set[target_name]

In [4]:
# Function to score model using Root Mean Square Error
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))
    
# Create a scorer
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Make cross validation strategy
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=global_random_state)

In [5]:
accurate_challenge_data = pd.DataFrame({})

number_of_training_data = 0

kfold_num = 0
for train_index, test_index in cv_strategy.split(challenge_features):
    kfold_num += 1
    print("k-fold number:", kfold_num)
    X_train, X_test = challenge_features.iloc[train_index], challenge_features.iloc[test_index]
    y_train, y_test = challenge_target.iloc[train_index], challenge_target.iloc[test_index]

    model = xgb.XGBRegressor(
        objective='reg:squarederror', 
        eval_metric='rmse',
        random_state=global_random_state,
        n_estimators=3000,
        colsample_bytree=1.0,
        learning_rate=0.1, 
        max_depth=10, 
        subsample=1.0
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    diff_df = X_test.copy()
    diff_df['tow'] = y_test
    diff_df['y_pred'] = y_pred
    diff_df['difference'] = (diff_df['y_pred'] - diff_df['tow']).abs()
    diff_df['percentage_difference'] = (diff_df['difference'] / diff_df['tow']) * 100

    max_percentage_difference = np.ceil(diff_df['percentage_difference'].max()) 

    for per_diff in range(0, int(max_percentage_difference), 1):
        
        per_diff_df = diff_df[(diff_df['percentage_difference'] <= per_diff+1) & (diff_df['percentage_difference'] >= per_diff)]
        
        if len(per_diff_df) > 0:
            per_diff_df = per_diff_df.drop(columns=['y_pred', 'difference', 'percentage_difference'])
            
            duplicated_dfs = []
            for duplicate in range(per_diff+1):
                duplicated_dfs.append(per_diff_df)
                number_of_training_data += len(per_diff_df)
            duplicated_df = pd.concat(duplicated_dfs, ignore_index=True)
            
            accurate_challenge_data = pd.concat([accurate_challenge_data, duplicated_df], ignore_index=True)

print("Total number of training data: ", number_of_training_data)

k-fold number: 1
k-fold number: 2
k-fold number: 3
k-fold number: 4
k-fold number: 5
Total number of training data:  1037416


In [6]:
display(accurate_challenge_data)

Unnamed: 0,wtc,flown_distance,aircraft_type,groundspeed_max,groundspeed_75percentile,altitude_25percentile,airline,longitude_max,latitude_min,flight_duration,vertical_rate_std,altitude_median,longitude_mean,altitude_75percentile,longitude_std,latitude_max,latitude_std,vertical_rate_max,vertical_rate_25percentile,latitude_mean,longitude_min,longitude_25percentile,ades,adep,country_code_adep,vertical_rate_75percentile,latitude_median,altitude_mean,latitude_25percentile,country_code_ades,month_day,actual_offblock_time_hour,groundspeed_min,longitude_75percentile,vertical_rate_median,tow
0,1,995,14,444.0,402.0,26750.00,14,29.280562,40.827849,160,814.471365,36000.0,19.438811,36000.0,6.271532,48.755788,2.439118,3328.0,-64.0,45.646331,9.191997,13.853554,26,384,96,64.0,46.279358,29682.508852,43.592726,19,0,8,102.0,24.867815,0.0,70770.000000
1,1,1499,1,475.0,410.0,35950.00,14,28.944397,41.182065,232,774.839658,36000.0,13.565320,36000.0,9.890095,53.458681,3.564781,3200.0,-64.0,47.912058,-2.282107,4.802232,47,385,96,64.0,48.336136,31733.006823,45.065358,27,0,7,110.0,22.327211,0.0,75707.000000
2,1,635,3,455.0,438.0,20775.00,22,11.113554,50.908630,99,1326.652532,36025.0,7.320624,38000.0,2.201079,60.319457,3.015543,4608.0,-128.0,55.259812,4.489325,5.066405,9,108,74,64.0,55.172362,28922.746835,52.474939,9,0,6,102.0,9.284637,0.0,59008.560186
3,1,998,4,494.0,477.0,28300.00,22,4.714842,36.577909,137,989.952264,36000.0,-0.945566,36000.0,2.715903,51.024465,4.652258,4672.0,-64.0,44.153781,-4.507141,-3.362427,9,223,33,64.0,44.245202,29878.621779,40.096207,9,0,19,99.0,1.005655,0.0,68318.356522
4,1,978,4,438.0,420.0,26350.00,22,4.492220,36.673828,150,947.295753,37000.0,-0.761313,37000.0,2.771787,50.909500,4.514980,3520.0,0.0,43.499961,-4.835387,-3.169434,197,17,11,0.0,43.500320,30549.035069,39.158253,24,0,10,89.0,1.722725,0.0,68318.356522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1037411,1,735,13,455.0,450.0,20843.75,18,22.362448,50.105026,109,1202.443819,38975.0,17.112983,39000.0,3.460967,60.182361,3.227393,4096.0,-768.0,54.698700,11.092072,14.129728,108,108,74,64.0,54.621094,29591.091954,51.668753,60,162,7,142.0,20.142458,0.0,39788.000000
1037412,1,1489,4,448.0,412.0,29975.00,18,18.268275,40.394028,234,794.267479,30000.0,6.010203,30000.0,6.739345,59.805496,6.024300,4736.0,0.0,51.462263,-3.555263,0.253489,123,222,33,0.0,52.096377,27455.521504,46.316574,67,69,11,93.0,11.665177,0.0,50791.000000
1037413,1,735,13,455.0,450.0,20843.75,18,22.362448,50.105026,109,1202.443819,38975.0,17.112983,39000.0,3.460967,60.182361,3.227393,4096.0,-768.0,54.698700,11.092072,14.129728,108,108,74,64.0,54.621094,29591.091954,51.668753,60,162,7,142.0,20.142458,0.0,39788.000000
1037414,1,1489,4,448.0,412.0,29975.00,18,18.268275,40.394028,234,794.267479,30000.0,6.010203,30000.0,6.739345,59.805496,6.024300,4736.0,0.0,51.462263,-3.555263,0.253489,123,222,33,0.0,52.096377,27455.521504,46.316574,67,69,11,93.0,11.665177,0.0,50791.000000


In [7]:
accurate_challenge_features = accurate_challenge_data[features_names]
accurate_challenge_target = accurate_challenge_data[target_name]

In [8]:
rmse_scores = []
for train_index, test_index in cv_strategy.split(accurate_challenge_features):
    X_train, X_test = accurate_challenge_features.iloc[train_index], accurate_challenge_features.iloc[test_index]
    y_train, y_test = accurate_challenge_target.iloc[train_index], accurate_challenge_target.iloc[test_index]

    model = xgb.XGBRegressor(
        objective='reg:squarederror', 
        eval_metric='rmse',
        random_state=global_random_state,
        n_estimators=3000,
        colsample_bytree=1.0,
        learning_rate=0.1, 
        max_depth=10, 
        subsample=1.0
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse_score = rmse(y_test, y_pred)
    rmse_scores.append(rmse_score)

print(f"RMSE Score: ", np.mean(rmse_scores))

RMSE Score:  676.244128549343


In [9]:
model = xgb.XGBRegressor(
    objective='reg:squarederror', 
    eval_metric='rmse',
    random_state=global_random_state,
    n_estimators=3000,
    colsample_bytree=1.0,
    learning_rate=0.1, 
    max_depth=10, 
    subsample=1.0
)

model.fit(accurate_challenge_features, accurate_challenge_target)

In [10]:
encoded_submission_set = pd.read_csv('data/encoded_submission_set.csv')
submission_features = encoded_submission_set[features_names]

In [11]:
y_pred = model.predict(submission_features)

In [12]:
submission_df = submission_features.copy()
submission_df[identifier_name] = encoded_submission_set[identifier_name]
submission_df[target_name] = y_pred

In [13]:
display(submission_df[[identifier_name, target_name]])

Unnamed: 0,flight_id,tow
0,248753821,66262.671875
1,248753822,213131.093750
2,248754498,221374.484375
3,248757623,60014.468750
4,248763603,63396.765625
...,...,...
105954,258066302,69477.101562
105955,258068609,178342.703125
105956,258068876,74065.468750
105957,258064675,61196.664062


In [14]:
submission_df[[identifier_name, target_name]].to_csv('./submissions/my_submission_v17.csv', index=False)