In [10]:
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import StandardScaler
from xgboost import plot_importance
import matplotlib.pyplot as plt
import xgboost as xgb
import pandas as pd
import numpy as np

# Lift pandas' column display limit (None = no limit) so wide frames show every column.
pd.set_option("display.max_columns", None)

In [11]:
# Identifier column written next to the prediction in the submission file.
identifier_name = 'flight_id'

# Model input columns, in the exact order they are selected from the CSV files.
# The order is significant: keep it unchanged.
features_names = [
    'wtc', 'flown_distance', 'aircraft_type',
    'groundspeed_max', 'groundspeed_75percentile', 'altitude_25percentile',
    'airline', 'longitude_max', 'latitude_min',
    'flight_duration', 'vertical_rate_std', 'altitude_median',
    'longitude_mean', 'altitude_75percentile', 'longitude_std',
    'latitude_max', 'latitude_std', 'vertical_rate_max',
    'vertical_rate_25percentile', 'latitude_mean', 'longitude_min',
    'longitude_25percentile', 'ades', 'adep',
    'country_code_adep', 'vertical_rate_75percentile', 'latitude_median',
    'altitude_mean', 'latitude_25percentile', 'country_code_ades',
    'month_day', 'actual_offblock_time_hour', 'groundspeed_min',
    'longitude_75percentile', 'vertical_rate_median',
]

# Column the regressor is trained to predict.
target_name = 'tow'

# Single seed shared by the K-fold splitter and every XGBoost model.
global_random_state = 123

In [12]:
# Read the encoded challenge set and separate the model inputs from the label column.
encoded_challenge_set = pd.read_csv('data/encoded_challenge_set.csv')
challenge_features = encoded_challenge_set.loc[:, features_names]
challenge_target = encoded_challenge_set.loc[:, target_name]

In [13]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Both arguments are forwarded unchanged to ``mean_squared_error``; the
    square root of that value is returned.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)
    
# Expose rmse as a scikit-learn scorer. greater_is_better=False marks it as a
# loss, i.e. lower values are better.
rmse_scorer = make_scorer(
    rmse,
    greater_is_better=False,
)

# Shuffled 5-fold split, seeded so the same folds are produced on every run.
cv_strategy = KFold(
    n_splits=5,
    shuffle=True,
    random_state=global_random_state,
)

In [None]:
# Build an error-weighted copy of the challenge set.
#
# For every CV fold an XGBoost model is trained on the remaining folds and used
# to predict the held-out rows. Each held-out row is then replicated according
# to its out-of-fold absolute percentage error: a row whose error lies in
# [k, k+1) percent is kept k+1 times, so poorly predicted flights appear more
# often in `accurate_challenge_data`.
#
# Fixes relative to the previous version of this cell:
#   * the append to the result ran once per fold, *after* the bucket loop, so
#     only the highest-error bucket of each fold was kept and all other rows
#     were silently discarded;
#   * that append read `per_diff_df` / `duplicated_df`, which were undefined
#     (or stale from the previous fold) when the bucket range was empty;
#   * buckets were closed on both ends, so a row at an exact integer
#     percentage was counted in two neighbouring buckets;
#   * the result frame was grown with pd.concat inside the loop; per-fold
#     frames are now collected and concatenated once.
fold_frames = []

for kfold_num, (train_index, test_index) in enumerate(cv_strategy.split(challenge_features), start=1):
    print("k-fold number:", kfold_num)
    X_train, X_test = challenge_features.iloc[train_index], challenge_features.iloc[test_index]
    y_train, y_test = challenge_target.iloc[train_index], challenge_target.iloc[test_index]

    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        eval_metric='rmse',
        random_state=global_random_state,
        n_estimators=3000,
        colsample_bytree=1.0,
        learning_rate=0.1,
        max_depth=10,
        subsample=1.0
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Out-of-fold error of every held-out row, absolute and as a percentage of the target.
    diff_df = X_test.copy()
    diff_df[target_name] = y_test
    diff_df['y_pred'] = y_pred
    diff_df['difference'] = (diff_df['y_pred'] - diff_df[target_name]).abs()
    diff_df['percentage_difference'] = (diff_df['difference'] / diff_df[target_name]) * 100

    # Rows whose percentage error is NaN, infinite or negative (target <= 0 or
    # missing) cannot be assigned a bucket and are left out, as before.
    percentage = diff_df['percentage_difference']
    scorable = np.isfinite(percentage) & (percentage >= 0)

    # Error in [k, k+1) percent -> k+1 copies of the row.
    copies = np.floor(percentage[scorable]).astype(int) + 1

    weighted_rows = diff_df.loc[scorable].drop(columns=['y_pred', 'difference', 'percentage_difference'])
    # Positional repeat, so the result does not depend on index labels being unique.
    row_positions = np.repeat(np.arange(len(weighted_rows)), copies.to_numpy())
    fold_frames.append(weighted_rows.iloc[row_positions])

# KFold always yields n_splits folds, so fold_frames is never empty here.
accurate_challenge_data = pd.concat(fold_frames, ignore_index=True)

k-fold number:  1


In [None]:
# Show the resampled challenge data assembled by the fold loop.
display(accurate_challenge_data)

In [None]:
# Separate the resampled frame into model inputs and the label column.
accurate_challenge_features = accurate_challenge_data.loc[:, features_names]
accurate_challenge_target = accurate_challenge_data.loc[:, target_name]

In [None]:
# Cross-validate the model on the resampled data and print the mean fold RMSE.
#
# NOTE(review): accurate_challenge_data contains repeated rows (rows are
# duplicated when it is built), so a shuffled K-fold split can place copies of
# the same flight in both the training and the test part of a fold. The score
# printed here is therefore likely optimistic — confirm before comparing it
# with scores measured on un-duplicated data.
cv_model_params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=global_random_state,
    n_estimators=3000,
    colsample_bytree=1.0,
    learning_rate=0.1,
    max_depth=10,
    subsample=1.0,
)

rmse_scores = []
for train_index, test_index in cv_strategy.split(accurate_challenge_features):
    X_train, X_test = accurate_challenge_features.iloc[train_index], accurate_challenge_features.iloc[test_index]
    y_train, y_test = accurate_challenge_target.iloc[train_index], accurate_challenge_target.iloc[test_index]

    # A fresh model per fold so no state carries over between folds.
    model = xgb.XGBRegressor(**cv_model_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse_scores.append(rmse(y_test, y_pred))

# Plain string literal: the previous f-string had no placeholders. Output is unchanged.
print("RMSE Score: ", np.mean(rmse_scores))

In [None]:
# Train the final regressor on the complete resampled challenge set.
final_model_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'random_state': global_random_state,
    'n_estimators': 3000,
    'colsample_bytree': 1.0,
    'learning_rate': 0.1,
    'max_depth': 10,
    'subsample': 1.0,
}
model = xgb.XGBRegressor(**final_model_params)

# Kept as the cell's last expression so the fitted estimator is displayed.
model.fit(accurate_challenge_features, accurate_challenge_target)

In [None]:
# Read the encoded submission set and keep the same input columns used for training.
encoded_submission_set = pd.read_csv('data/encoded_submission_set.csv')
submission_features = encoded_submission_set.loc[:, features_names]

In [None]:
# Predict the target for every submission row with the model fitted on the full resampled set.
y_pred = model.predict(submission_features)

In [None]:
# New frame: the submission inputs plus the identifier column and the predicted target.
# assign() returns a copy, so submission_features itself is left untouched.
submission_df = submission_features.assign(
    **{
        identifier_name: encoded_submission_set[identifier_name],
        target_name: y_pred,
    }
)

In [None]:
# Preview only the two columns that go into the submission file.
display(submission_df[[identifier_name, target_name]])

In [None]:
# Write identifier + prediction without the pandas index column.
# NOTE(review): the ./submissions directory presumably has to exist already — to_csv
# is not expected to create missing parent directories; confirm.
submission_df[[identifier_name, target_name]].to_csv('./submissions/my_submission_v17.csv', index=False)