In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import plot_importance

In [2]:
# --- Notebook configuration -------------------------------------------------

# Column that identifies a row in the submission file.
identifier_name = "flight_id"

# Column the regressor is trained to predict.
target_name = "tow"

# Seed handed to the model as its random_state.
global_random_state = 123

# Columns selected from the CSV files as model inputs. The order of this list
# is the column order of the feature frames, so entries are kept in their
# original sequence rather than sorted or grouped.
features_names = [
    "wtc",
    "flown_distance",
    "aircraft_type",
    "longitude_max",
    "altitude_median",
    "groundspeed_max",
    "airline",
    "groundspeed_75percentile",
    "altitude_25percentile",
    "flight_duration",
    "latitude_min",
    "vertical_rate_std",
    "altitude_75percentile",
    "longitude_median",
    "longitude_std",
    "vertical_rate_25percentile",
    "longitude_min",
    "longitude_mean",
    "adep",
    "vertical_rate_max",
    "ades",
    "latitude_std",
    "latitude_max",
    "longitude_25percentile",
    "altitude_mean",
    "latitude_mean",
    "vertical_rate_75percentile",
    "latitude_median",
    "groundspeed_min",
    "country_code_adep",
    "country_code_ades",
    "latitude_25percentile",
    "longitude_count",
    "groundspeed_25percentile",
    "vertical_rate_min",
    "longitude_75percentile",
    "track_75percentile",
    "taxiout_time",
    "track_median",
    "vertical_rate_median",
    "latitude_75percentile",
    "track_25percentile",
    "month_day",
    "latitude_count",
    "altitude_std",
    "arrival_time_hour",
    "track_mean",
    "arrival_time_hour_minute",
    "vertical_rate_mean",
]

In [3]:
# Read the challenge-set CSV, then pull out the target column and the
# configured feature columns as separate objects for training.
encoded_challenge_set = pd.read_csv('data/encoded_challenge_set.csv')
challenge_target = encoded_challenge_set[target_name]
challenge_features = encoded_challenge_set.loc[:, features_names]

In [4]:
# Hyper-parameters for the regressor, gathered in one mapping so they can be
# read (and logged) separately from the estimator construction.
xgb_params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=global_random_state,
    n_estimators=3000,
    learning_rate=0.1,
    max_depth=9,
    subsample=1.0,
    colsample_bytree=0.8,
)
model = xgb.XGBRegressor(**xgb_params)

# Fit on the full challenge set; no validation split is made in this cell.
model.fit(challenge_features, challenge_target)

In [5]:
# Read the final-submission CSV and keep the same feature columns, in the same
# order, that the model was trained on.
encoded_final_submission_set = pd.read_csv(
    'data/encoded_final_submission_set.csv'
)
submission_features = encoded_final_submission_set.loc[:, features_names]

In [6]:
# Run the fitted model over the submission feature frame; y_pred holds one
# predicted target value per row and is attached to the submission table later.
y_pred = model.predict(submission_features)

In [7]:
# Build the submission table as a new frame: the feature columns followed by
# the identifier column and then the predicted target column.
# Keys are passed through a dict because the column names live in variables.
submission_df = submission_features.assign(
    **{
        identifier_name: encoded_final_submission_set[identifier_name],
        target_name: y_pred,
    }
)

In [8]:
# Preview only the two columns that are written to the submission file.
submission_preview = submission_df.loc[:, [identifier_name, target_name]]
display(submission_preview)

Unnamed: 0,flight_id,tow
0,248753821,65550.617188
1,248753822,213582.781250
2,248754498,222410.593750
3,248763650,64629.542969
4,248763651,51358.574219
...,...,...
158144,258068876,75053.195312
158145,258064675,60217.363281
158146,258065436,190485.015625
158147,258058138,40803.511719


In [9]:
# Single source of truth for the output location, so the file that is written
# and the path that is reported cannot drift apart when the version is bumped.
submission_path = './submissions/my_submission_v19.csv'

# to_csv does not create missing parent directories; make sure the target
# folder exists so the export does not fail after the long training run.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

# Write only the identifier and predicted target columns, without the index.
submission_df[[identifier_name, target_name]].to_csv(submission_path, index=False)
print(f"{submission_path} is saved!")

./submissions/my_submission_v19.csv is saved!
