In [1]:
import sys

# Append the repository root (one level above this notebook's directory) to the
# module search path; the project-local `preprocessing` imports below rely on it.
repo_root = "../"
sys.path.append(repo_root)

In [2]:
import numpy as numpy
import pandas as pd
import xgboost as xgb
import hyperopt as hpt
from preprocessing.country_and_airports_codes import compute_lon_lat,group_and_rename_countries, group_and_rename_airports, group_and_rename_aircraft_types
from preprocessing.encoding import one_hot_encoding,string_to_value_count, string_to_int_hashing
from preprocessing.local_time import add_localtime_to_train_and_test

In [3]:
# Hyperopt search space for the XGBoost regressor.
# `quniform` samples come back as floats (e.g. 10.0), which is why the objective
# casts the integer-valued ones with int() before handing them to XGBoost.
space = dict(
    # tree shape / split regularisation
    max_depth=hpt.hp.quniform('max_depth', 2, 10, 1),
    gamma=hpt.hp.uniform('gamma', 0.1, 5),
    reg_alpha=hpt.hp.uniform('reg_alpha', 0.1, 5),
    reg_lambda=hpt.hp.uniform('reg_lambda', 0.1, 5),
    # boosting schedule
    learning_rate=hpt.hp.uniform('learning_rate', 0.01, 0.2),
    n_estimators=hpt.hp.quniform('n_estimators', 100, 1000, 1),
    min_child_weight=hpt.hp.quniform('min_child_weight', 1, 10, 1),
    # row / column subsampling
    subsample=hpt.hp.uniform('subsample', 0.5, 1),
    colsample_bytree=hpt.hp.uniform('colsample_bytree', 0.5, 1),
    # constant entry; the objective hard-codes seed=0 and does not read this key
    seed=0,
)

In [4]:
import os

# Show the kernel's working directory: the "../data/..." paths used below are
# resolved relative to it.
working_dir = os.getcwd()
print(working_dir)

/Users/leo/Desktop/Computer_fun/OpenSkyNet/repo3/ATOW_ML/notebooks


In [5]:
# Load the challenge set (used for training/validation) and the submission set
# (predicted on at the end of the notebook).
train_df, test_df = map(
    pd.read_csv,
    ("../data/challenge_set.csv", "../data/submission_set.csv"),
)

In [6]:
# Feature engineering. No return value is captured from these helpers, and the frame
# displayed afterwards carries the new columns, so they modify train_df / test_df in
# place. The call order is kept as written (later helpers may depend on earlier ones).
add_localtime_to_train_and_test(train_df,test_df) # add local-time features (departure & arrival hours, day of year, day of week, month)
compute_lon_lat(train_df, test_df) # computes lon, lat for each airport
group_and_rename_countries(train_df, test_df) # simplify country_codes by group and rename countries
group_and_rename_airports(train_df, test_df) # simplify airport codes by group and rename airport
group_and_rename_aircraft_types(train_df, test_df) # NOTE(review): the previous comment said "airlines" but the helper is named for aircraft types; presumably regroups rarely used types and adds an "XXXX" category for unknown ones - confirm


----------------------------------------------------------------------------------------------------
Columns for lon & lat: ['lon_adep', 'lat_adep', 'lon_ades', 'lat_ades'] successfully created !
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Country codes successfully grouped ! Different codes left : 47
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Airports codes successfully grouped ! Different codes left : 155
----------------------------------------------------------------------------------------------------


In [7]:
# Inspect the training frame after feature engineering (pandas truncates the rendering).
train_df

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,...,local_departure_time,local_arrival_hour,local_departure_hour,travel_day_of_week,travel_day_of_year,departure_month,lon_adep,lat_adep,lon_ades,lat_ades
0,248763780,2022-01-01,3840d84f25d3f5fcc0a1be3076bb4039,EGLL,London Heathrow,GB,EICK,Cork,IE,2022-01-01T13:46:00Z,...,2022-01-01 13:46:00,15,13,5,1,1,-0.46194,51.47060,-8.49111,51.84130
1,248760618,2022-01-01,f6f610e73002b8892a239a81321f7f1d,LEBL,Barcelona,ES,KMIA,Miami,US,2022-01-01T09:55:00Z,...,2022-01-01 10:55:00,14,10,5,1,1,2.07846,41.29710,-80.29012,25.79536
2,248753824,2022-01-01,139670936660762c230ca92556ba842b,ESSA,Stockholm Arlanda,SE,KORD,Chicago O'Hare,US,2022-01-01T09:39:00Z,...,2022-01-01 10:39:00,13,10,5,1,1,17.91860,59.65190,-87.90815,41.97694
3,248753852,2022-01-01,509dc61bb54fbab0e5406067c95603e2,LSZH,Zurich,CH,KPHL,Philadelphia,US,2022-01-01T11:04:00Z,...,2022-01-01 12:04:00,14,12,5,1,1,8.54917,47.46470,-75.24066,39.87208
4,248755934,2022-01-01,d0610d000dcf26b1d7bba8103ecc393d,EIDW,Dublin,IE,EGLL,London Heathrow,GB,2022-01-01T12:36:00Z,...,2022-01-01 12:36:00,13,12,5,1,1,-6.27007,53.42130,-0.46194,51.47060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369008,258058361,2022-12-31,85ee68e5b7b5acf24ba00d1318eca1e8,LFPG,Paris Charles de Gaulle,FR,KMIA,Miami,US,2022-12-31T09:38:00Z,...,2022-12-31 10:38:00,14,10,5,365,12,2.55000,49.01280,-80.29012,25.79536
369009,258071247,2022-12-31,570cf7d5ebbd691bcba63e7466607da7,LTFM,iGA Istanbul,TR,EDDB,Berlin Brandenburg,DE,2022-12-31T09:27:00Z,...,2022-12-31 12:27:00,13,12,5,365,12,28.75200,41.27533,13.50067,52.36217
369010,258059152,2022-12-31,5a7e43e4f981539ae3d3b1cb31591b7c,EDDL,Dusseldorf,DE,EIDW,Dublin,IE,2022-12-31T09:52:00Z,...,2022-12-31 10:52:00,11,10,5,365,12,6.76678,51.28950,-6.27007,53.42130
369011,258072276,2022-12-31,a1c078516f9f9e90cacec61854cad45b,LFPG,Paris Charles de Gaulle,FR,EIDW,Dublin,IE,2022-12-31T09:37:00Z,...,2022-12-31 10:37:00,11,10,5,365,12,2.55000,49.01280,-6.27007,53.42130


In [8]:
# Encoding of the categorical columns. The helpers are called for their effect on
# train_df / test_df (nothing is returned into a variable here).

columns_to_ohe = ['aircraft_type'] # to be changed (tunable column list)
one_hot_encoding(train_df, test_df, columns_to_ohe)

columns_to_hash = ['callsign','country_code_ades', 'country_code_adep', 'adep', 'ades', 'airline','wtc'] # to be changed (tunable column list)
string_to_int_hashing(train_df, test_df, columns_to_hash)

columns_to_vc = [] # to be changed (currently empty: no column is value-count encoded)
string_to_value_count(train_df, test_df, columns_to_vc)


----------------------------------------------------------------------------------------------------
Columns ['aircraft_type'] sucessfully one hot encoded !
----------------------------------------------------------------------------------------------------
        flight_id        date  callsign  adep               name_adep  \
0       248753821  2022-01-01      2495    92  Istanbul Sabiha Gokcen   
1       248753822  2022-01-01      9518     1                Brussels   
2       248754498  2022-01-01      1902    50                   Miami   
3       248763650  2022-01-01      2259     1                Brussels   
4       248763651  2022-01-01     10029    89                  Zurich   
...           ...         ...       ...   ...                     ...   
158144  258068876  2022-12-31      8596    93            iGA Istanbul   
158145  258064675  2022-12-31        35    23               Amsterdam   
158146  258065436  2022-12-31      5684    58               Barcelona   
158147  2580

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,...,local_departure_time,local_arrival_hour,local_departure_hour,travel_day_of_week,travel_day_of_year,departure_month,lon_adep,lat_adep,lon_ades,lat_ades
0,248763780,2022-01-01,2356,19,London Heathrow,11,28,Cork,18,2022-01-01T13:46:00Z,...,2022-01-01 13:46:00,15,13,5,1,1,-0.46194,51.47060,-8.49111,51.84130
1,248760618,2022-01-01,10522,58,Barcelona,8,62,Miami,52,2022-01-01T09:55:00Z,...,2022-01-01 10:55:00,14,10,5,1,1,2.07846,41.29710,-80.29012,25.79536
2,248753824,2022-01-01,830,41,Stockholm Arlanda,34,64,Chicago O'Hare,52,2022-01-01T09:39:00Z,...,2022-01-01 10:39:00,13,10,5,1,1,17.91860,59.65190,-87.90815,41.97694
3,248753852,2022-01-01,3379,89,Zurich,3,65,Philadelphia,52,2022-01-01T11:04:00Z,...,2022-01-01 12:04:00,14,12,5,1,1,8.54917,47.46470,-75.24066,39.87208
4,248755934,2022-01-01,8863,25,Dublin,15,22,London Heathrow,13,2022-01-01T12:36:00Z,...,2022-01-01 12:36:00,13,12,5,1,1,-6.27007,53.42130,-0.46194,51.47060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369008,258058361,2022-12-31,5605,67,Paris Charles de Gaulle,10,62,Miami,52,2022-12-31T09:38:00Z,...,2022-12-31 10:38:00,14,10,5,365,12,2.55000,49.01280,-80.29012,25.79536
369009,258071247,2022-12-31,3642,93,iGA Istanbul,45,4,Berlin Brandenburg,8,2022-12-31T09:27:00Z,...,2022-12-31 12:27:00,13,12,5,365,12,28.75200,41.27533,13.50067,52.36217
369010,258059152,2022-12-31,3803,7,Dusseldorf,6,29,Dublin,18,2022-12-31T09:52:00Z,...,2022-12-31 10:52:00,11,10,5,365,12,6.76678,51.28950,-6.27007,53.42130
369011,258072276,2022-12-31,6817,67,Paris Charles de Gaulle,10,29,Dublin,18,2022-12-31T09:37:00Z,...,2022-12-31 10:37:00,11,10,5,365,12,2.55000,49.01280,-6.27007,53.42130


In [9]:
# Drop columns that are not fed to the model: the flight identifier, the date,
# the free-text airport names and the raw / local timestamp columns.
# ('name_adep' used to be listed twice; each label is now listed once.)
to_drop = [
    'flight_id',
    'date',
    'name_adep',
    'name_ades',
    'actual_offblock_time',
    'arrival_time',
    'local_departure_time',
    'local_arrival_time',
]
train_df = train_df.drop(columns=to_drop)
test_df = test_df.drop(columns=to_drop)

In [None]:
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split

# Separate the regression target ('tow') from the feature columns.
y = train_df['tow']
X = train_df.drop(columns=['tow'])

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

print("Train shape: ", X_train.shape)
print("Test shape: ", X_test.shape)

print(y_test)

display(X_train)

In [None]:
def objective(params):
    """Hyperopt objective: fit one XGBoost regressor and return its hold-out RMSE.

    Parameters
    ----------
    params : dict
        One sample drawn from `space`. The `quniform` entries arrive as floats,
        so `n_estimators` and `max_depth` are cast to int.

    Returns
    -------
    dict
        ``{'loss': <RMSE on (X_test, y_test)>, 'status': hpt.STATUS_OK}``.

    Notes
    -----
    (X_test, y_test) is the last `eval_set` entry, i.e. the one XGBoost uses for
    early stopping, and it is also the split the loss is computed on. The loss is
    therefore optimistic as an estimate of performance on unseen data.
    """
    clf = xgb.XGBRegressor(n_estimators=int(params['n_estimators']),
                            max_depth=int(params['max_depth']),
                            gamma=params['gamma'],
                            reg_alpha=params['reg_alpha'],
                            reg_lambda=params['reg_lambda'],
                            learning_rate=params['learning_rate'],
                            min_child_weight=params['min_child_weight'],
                            subsample=params['subsample'],
                            colsample_bytree=params['colsample_bytree'],
                            seed=0,
                            objective="reg:squarederror",
                            eval_metric="rmse",
                            early_stopping_rounds=10,
                            n_jobs=-1)

    eval_set = [(X_train, y_train), (X_test, y_test)]

    clf.fit(X_train, y_train, eval_set=eval_set, verbose=False)
    y_pred = clf.predict(X_test)

    score = root_mean_squared_error(y_test, y_pred)
    print("SCORE: ", score)
    return {'loss': score, 'status': hpt.STATUS_OK}

trials = hpt.Trials()
# Seed the TPE sampler: without `rstate` every run explores different candidates and
# `best` is not reproducible. (hyperopt >= 0.2.7 expects a numpy Generator here;
# older releases expect numpy.random.RandomState(0) instead.)
best = hpt.fmin(fn=objective,
                space=space,
                algo=hpt.tpe.suggest,
                max_evals=100,
                trials=trials,
                rstate=numpy.random.default_rng(0))

In [None]:
# Show the hyperparameter values returned by the search above.
display(best)

In [None]:
# Train XGBoost with the best hyperparameters returned by the search.
# The quniform-sampled values are floats, hence the int() casts.

clf = xgb.XGBRegressor(n_estimators=int(best['n_estimators']),
                        max_depth=int(best['max_depth']),
                        gamma=best['gamma'],
                        reg_alpha=best['reg_alpha'],
                        reg_lambda=best['reg_lambda'],
                        learning_rate=best['learning_rate'],
                        min_child_weight=best['min_child_weight'],
                        subsample=best['subsample'],
                        colsample_bytree=best['colsample_bytree'],
                        seed=0,
                        objective="reg:squarederror",
                        eval_metric="rmse",
                        early_stopping_rounds=10,
                        n_jobs=-1)

# The last eval_set entry (the hold-out split) is the one monitored for early stopping.
eval_set = [(X_train, y_train), (X_test, y_test)]

clf.fit(X_train, y_train, eval_set=eval_set, verbose=True)

# Predict on the hold-out split.

y_pred = clf.predict(X_test)

# sklearn metrics take (y_true, y_pred); the arguments used to be transposed, which
# is harmless for RMSE but inconsistent with the objective above and would silently
# give wrong numbers if an asymmetric metric were swapped in.
score = root_mean_squared_error(y_test, y_pred)

display(y_pred)
print("SCORE: ", score)

In [None]:
# Report the root mean squared error; `score` is not recomputed here, it is the value
# currently bound by an earlier cell.

print("RMSE: ", score)

In [None]:
import matplotlib.pyplot as plt

# Signed residuals of the general model on the hold-out split.
residuals = y_test - y_pred

# Average absolute and relative errors (relative to the true value).
absolute_error = abs(residuals)
relative_error = absolute_error / y_test

average_relative_error = relative_error.mean()
print(f"Average relative error: {average_relative_error}")

average_absolute_error = absolute_error.mean()
print(f"Average absolute error: {average_absolute_error}")

# Residuals against the true values.
fig, ax = plt.subplots()
ax.scatter(y_test, residuals)
ax.set_xlabel("True values")
ax.set_ylabel("Residuals")
ax.set_title("Residuals vs true values")
plt.show()

# Relative residuals against the predicted values.
fig, ax = plt.subplots()
ax.scatter(y_pred, residuals / y_test)
ax.set_xlabel("Predicted values")
ax.set_ylabel("Relative residuals")
ax.set_title("Relative residuals vs predicted values")
plt.show()



In [None]:
# Histogram of the relative errors (100 bins).
fig, ax = plt.subplots()
ax.hist(relative_error, bins=100)
ax.set_xlabel("Relative error")
ax.set_ylabel("Count")
ax.set_title("Relative error distribution")
plt.show()

# TODO: plot the absolute error distribution (announced here but never implemented).

In [None]:
# Mean of the `relative_error` Series currently in scope.
print(relative_error.mean())

In [None]:
# Same optimisation on the subset restricted to wtc == 1.

# Build each boolean mask once and reuse it for the features and the target.
train_is_wtc1 = X_train['wtc'] == 1
test_is_wtc1 = X_test['wtc'] == 1

x_train_wtc1 = X_train[train_is_wtc1]
y_train_wtc1 = y_train[train_is_wtc1]

x_test_wtc1 = X_test[test_is_wtc1]
y_test_wtc1 = y_test[test_is_wtc1]

# Same search space as for the general model.
space = {'max_depth': hpt.hp.quniform('max_depth', 2, 10, 1),
            'gamma': hpt.hp.uniform('gamma', 0.1, 5),
            'reg_alpha': hpt.hp.uniform('reg_alpha', 0.1, 5),
            'reg_lambda': hpt.hp.uniform('reg_lambda', 0.1, 5),
            'learning_rate': hpt.hp.uniform('learning_rate', 0.01, 0.2),
            'n_estimators': hpt.hp.quniform('n_estimators', 100, 1000, 1),
            'min_child_weight': hpt.hp.quniform('min_child_weight', 1, 10, 1),
            'subsample': hpt.hp.uniform('subsample', 0.5, 1),
            'colsample_bytree': hpt.hp.uniform('colsample_bytree', 0.5, 1),
            'seed': 0}

def objective(params):
    """Hyperopt objective on the wtc == 1 subset: fit one regressor, return its RMSE.

    Parameters
    ----------
    params : dict
        One sample drawn from `space`; `n_estimators` and `max_depth` arrive as
        floats and are cast to int.

    Returns
    -------
    dict
        ``{'loss': <RMSE on (x_test_wtc1, y_test_wtc1)>, 'status': hpt.STATUS_OK}``.
        The same subset is the last `eval_set` entry and so drives early stopping.
    """
    clf = xgb.XGBRegressor(n_estimators=int(params['n_estimators']),
                            max_depth=int(params['max_depth']),
                            gamma=params['gamma'],
                            reg_alpha=params['reg_alpha'],
                            reg_lambda=params['reg_lambda'],
                            learning_rate=params['learning_rate'],
                            min_child_weight=params['min_child_weight'],
                            subsample=params['subsample'],
                            colsample_bytree=params['colsample_bytree'],
                            seed=0,
                            objective="reg:squarederror",
                            eval_metric="rmse",
                            early_stopping_rounds=10,
                            n_jobs=-1)

    eval_set = [(x_train_wtc1, y_train_wtc1), (x_test_wtc1, y_test_wtc1)]

    clf.fit(x_train_wtc1, y_train_wtc1, eval_set=eval_set, verbose=False)
    y_pred = clf.predict(x_test_wtc1)

    score = root_mean_squared_error(y_test_wtc1, y_pred)
    print("SCORE: ", score)
    return {'loss': score, 'status': hpt.STATUS_OK}


trials = hpt.Trials()

# Seed the TPE sampler so the search is reproducible. (hyperopt >= 0.2.7 expects a
# numpy Generator; older releases expect numpy.random.RandomState(0) instead.)
# Note: this search runs 20 evaluations, not 100 as the other searches do.
best = hpt.fmin(fn=objective,
                space=space,
                algo=hpt.tpe.suggest,
                max_evals=20,
                trials=trials,
                rstate=numpy.random.default_rng(0))

display(best)

In [None]:
# Train the wtc == 1 model with the best hyperparameters from the search above.
# The quniform-sampled values are floats, hence the int() casts.

clf_wtc1 = xgb.XGBRegressor(n_estimators=int(best['n_estimators']),
                            max_depth=int(best['max_depth']),
                            gamma=best['gamma'],
                            reg_alpha=best['reg_alpha'],
                            reg_lambda=best['reg_lambda'],
                            learning_rate=best['learning_rate'],
                            min_child_weight=best['min_child_weight'],
                            subsample=best['subsample'],
                            colsample_bytree=best['colsample_bytree'],
                            seed=0,
                            objective="reg:squarederror",
                            eval_metric="rmse",
                            early_stopping_rounds=10,
                            n_jobs=-1)

# The last eval_set entry (the wtc == 1 hold-out rows) is monitored for early stopping.
eval_set = [(x_train_wtc1, y_train_wtc1), (x_test_wtc1, y_test_wtc1)]

clf_wtc1.fit(x_train_wtc1, y_train_wtc1, eval_set=eval_set, verbose=True)

# Predict on the wtc == 1 hold-out rows.

y_pred_wtc1 = clf_wtc1.predict(x_test_wtc1)

# sklearn metrics take (y_true, y_pred); the arguments used to be transposed.
score = root_mean_squared_error(y_test_wtc1, y_pred_wtc1)

# Report the score, as the wtc == 0 cell does; it used to be computed and discarded.
print("RMSE: ", score)
                       

In [None]:
# Same optimisation on the subset restricted to wtc == 0.

# Build each boolean mask once and reuse it for the features and the target.
train_is_wtc0 = X_train['wtc'] == 0
test_is_wtc0 = X_test['wtc'] == 0

x_train_wtc0 = X_train[train_is_wtc0]
y_train_wtc0 = y_train[train_is_wtc0]

x_test_wtc0 = X_test[test_is_wtc0]
y_test_wtc0 = y_test[test_is_wtc0]

# Same search space as for the general model.
space = {'max_depth': hpt.hp.quniform('max_depth', 2, 10, 1),
            'gamma': hpt.hp.uniform('gamma', 0.1, 5),
            'reg_alpha': hpt.hp.uniform('reg_alpha', 0.1, 5),
            'reg_lambda': hpt.hp.uniform('reg_lambda', 0.1, 5),
            'learning_rate': hpt.hp.uniform('learning_rate', 0.01, 0.2),
            'n_estimators': hpt.hp.quniform('n_estimators', 100, 1000, 1),
            'min_child_weight': hpt.hp.quniform('min_child_weight', 1, 10, 1),
            'subsample': hpt.hp.uniform('subsample', 0.5, 1),
            'colsample_bytree': hpt.hp.uniform('colsample_bytree', 0.5, 1),
            'seed': 0}

def objective(params):
    """Hyperopt objective on the wtc == 0 subset: fit one regressor, return its RMSE.

    Parameters
    ----------
    params : dict
        One sample drawn from `space`; `n_estimators` and `max_depth` arrive as
        floats and are cast to int.

    Returns
    -------
    dict
        ``{'loss': <RMSE on (x_test_wtc0, y_test_wtc0)>, 'status': hpt.STATUS_OK}``.
        The same subset is the last `eval_set` entry and so drives early stopping.
    """
    clf = xgb.XGBRegressor(n_estimators=int(params['n_estimators']),
                            max_depth=int(params['max_depth']),
                            gamma=params['gamma'],
                            reg_alpha=params['reg_alpha'],
                            reg_lambda=params['reg_lambda'],
                            learning_rate=params['learning_rate'],
                            min_child_weight=params['min_child_weight'],
                            subsample=params['subsample'],
                            colsample_bytree=params['colsample_bytree'],
                            seed=0,
                            objective="reg:squarederror",
                            eval_metric="rmse",
                            early_stopping_rounds=10,
                            n_jobs=-1)

    eval_set = [(x_train_wtc0, y_train_wtc0), (x_test_wtc0, y_test_wtc0)]

    clf.fit(x_train_wtc0, y_train_wtc0, eval_set=eval_set, verbose=False)
    y_pred = clf.predict(x_test_wtc0)

    score = root_mean_squared_error(y_test_wtc0, y_pred)
    print("SCORE: ", score)
    return {'loss': score, 'status': hpt.STATUS_OK}


trials = hpt.Trials()

# Seed the TPE sampler so the search is reproducible. (hyperopt >= 0.2.7 expects a
# numpy Generator; older releases expect numpy.random.RandomState(0) instead.)
best = hpt.fmin(fn=objective,
                space=space,
                algo=hpt.tpe.suggest,
                max_evals=100,
                trials=trials,
                rstate=numpy.random.default_rng(0))

display(best)

In [None]:
# Train the wtc == 0 model with the best hyperparameters from the search above.
# The quniform-sampled values are floats, hence the int() casts.

clf_wtc0 = xgb.XGBRegressor(n_estimators=int(best['n_estimators']),
                            max_depth=int(best['max_depth']),
                            gamma=best['gamma'],
                            reg_alpha=best['reg_alpha'],
                            reg_lambda=best['reg_lambda'],
                            learning_rate=best['learning_rate'],
                            min_child_weight=best['min_child_weight'],
                            subsample=best['subsample'],
                            colsample_bytree=best['colsample_bytree'],
                            seed=0,
                            objective="reg:squarederror",
                            eval_metric="rmse",
                            early_stopping_rounds=10,
                            n_jobs=-1)

# The last eval_set entry (the wtc == 0 hold-out rows) is monitored for early stopping.
eval_set = [(x_train_wtc0, y_train_wtc0), (x_test_wtc0, y_test_wtc0)]

clf_wtc0.fit(x_train_wtc0, y_train_wtc0, eval_set=eval_set, verbose=True)

# Predict on the wtc == 0 hold-out rows.

y_pred_wtc0 = clf_wtc0.predict(x_test_wtc0)

# sklearn metrics take (y_true, y_pred); the arguments used to be transposed.
score = root_mean_squared_error(y_test_wtc0, y_pred_wtc0)

# Report the root mean squared error on the wtc == 0 hold-out rows.

print("RMSE: ", score)

In [None]:
# Predictions of the two wtc-specific models and of the general model on the whole
# hold-out split.
y_pred_wtc0 = clf_wtc0.predict(X_test)

y_pred_wtc1 = clf_wtc1.predict(X_test)

y_pred = clf.predict(X_test)

# For each row pick the prediction of the model that matches its wtc value. Rows
# whose wtc is neither 0 nor 1 fall back to the general model: the previous
# mask-multiplication formula contributed 0 for them, so their blended prediction
# was silently halved.
wtc_values = X_test['wtc'].to_numpy()
y_pred_specialised = numpy.where(
    wtc_values == 0,
    y_pred_wtc0,
    numpy.where(wtc_values == 1, y_pred_wtc1, y_pred),
)

# Blend: plain average of the wtc-specific and the general prediction. Kept as a
# Series on X_test's index so that arithmetic with y_test aligns row by row.
y_pred_aggregate = pd.Series((y_pred_specialised + y_pred) / 2, index=X_test.index)

In [None]:
# Root mean squared error of the blended prediction on the hold-out split.
# sklearn metrics take (y_true, y_pred); the arguments used to be transposed, which is
# harmless for RMSE but would give wrong numbers with an asymmetric metric.

score = root_mean_squared_error(y_test, y_pred_aggregate)
print("RMSE: ", score)

In [None]:
# Signed residuals of the blended prediction on the hold-out split.
aggregate_residuals = y_test - y_pred_aggregate

# Residuals against the true values.
fig, ax = plt.subplots()
ax.scatter(y_test, aggregate_residuals)
ax.set_xlabel("True values")
ax.set_ylabel("Residuals")
ax.set_title("Residuals vs true values")
plt.show()

# Relative residuals (residual divided by the true value) against the true values.
fig, ax = plt.subplots()
ax.scatter(y_test, aggregate_residuals / y_test)
ax.set_xlabel("True values")
ax.set_ylabel("Relative residuals")
ax.set_title("Relative residuals vs true values")
plt.show()



In [None]:
# Histogram (100 bins) of the blended prediction's relative error.
aggregate_relative_error = abs(y_test - y_pred_aggregate) / y_test

fig, ax = plt.subplots()
ax.hist(aggregate_relative_error, bins=100)
ax.set_xlabel("Relative error")
ax.set_ylabel("Count")
ax.set_title("Relative error distribution")
plt.show()



In [None]:
# Summary statistics for the blended prediction on the hold-out split.

# Absolute error first, then the same error relative to the true value.
absolute_error = abs(y_test - y_pred_aggregate)
relative_error = absolute_error / y_test

average_relative_error = relative_error.mean()
print(f"Average relative error: {average_relative_error}")

average_absolute_error = absolute_error.mean()
print(f"Average absolute error: {average_absolute_error}")

# `seuil` (French for "threshold") is the base relative-error threshold; the share of
# predictions below 1x, 2x and 3x that threshold is reported.
seuil = 0.05

for multiple in (1, 2, 3):
    limit = multiple * seuil
    print(f"Percentage of predictions with relative error below {limit}: {100 * (relative_error < limit).mean()}")

In [None]:
# Retrain the three models on the whole labelled dataset, without early stopping,
# then predict the submission set.

def _build_regressor(params):
    """Return an unfitted XGBRegressor configured from a hyperopt-style parameter dict.

    `params` must provide the nine tuned hyperparameters used below; `n_estimators`
    and `max_depth` are stored as floats in the dicts and are cast to int here.
    No `early_stopping_rounds` is set, so all `n_estimators` rounds are trained.
    """
    return xgb.XGBRegressor(n_estimators=int(params['n_estimators']),
                            max_depth=int(params['max_depth']),
                            gamma=params['gamma'],
                            reg_alpha=params['reg_alpha'],
                            reg_lambda=params['reg_lambda'],
                            learning_rate=params['learning_rate'],
                            min_child_weight=params['min_child_weight'],
                            subsample=params['subsample'],
                            colsample_bytree=params['colsample_bytree'],
                            seed=0,
                            objective="reg:squarederror",
                            eval_metric="rmse",
                            n_jobs=-1)


# --- General model, trained on every row ------------------------------------------
# Hard-coded hyperparameters (presumably copied from an earlier search run - confirm).
basic_param = {'colsample_bytree': 0.9362150768343058,
 'gamma': 2.022211195429398,
 'learning_rate': 0.07900609044575752,
 'max_depth': 10.0,
 'min_child_weight': 1.0,
 'n_estimators': 862.0,
 'reg_alpha': 1.6659492680583492,
 'reg_lambda': 4.4589665080717085,
 'subsample': 0.7470405034939882}

clf = _build_regressor(basic_param)

# The eval_set is the training data itself: it only records the training RMSE per
# round and has no influence on the fitted model.
eval_set = [(X, y)]

clf.fit(X, y, eval_set=eval_set, verbose=False)

# --- Model restricted to wtc == 0 ---------------------------------------------------
wtc0_param = {'colsample_bytree': 0.8497066984064998,
 'gamma': 1.8567356656935454,
 'learning_rate': 0.041891010160377044,
 'max_depth': 9.0,
 'min_child_weight': 8.0,
 'n_estimators': 859.0,
 'reg_alpha': 3.709715865940738,
 'reg_lambda': 1.6089032820385571,
 'subsample': 0.8005286414941816}

clf_wtc0 = _build_regressor(wtc0_param)

is_wtc0 = X['wtc'] == 0
x_train_wtc0 = X[is_wtc0]
y_train_wtc0 = y[is_wtc0]

eval_set = [(x_train_wtc0, y_train_wtc0)]

clf_wtc0.fit(x_train_wtc0, y_train_wtc0, eval_set=eval_set, verbose=False)

# --- Model restricted to wtc == 1 ---------------------------------------------------
wtc1_param = {'colsample_bytree': 0.9865346417369687,
 'gamma': 2.311730004603916,
 'learning_rate': 0.1355408626893648,
 'max_depth': 9.0,
 'min_child_weight': 2.0,
 'n_estimators': 890.0,
 'reg_alpha': 2.7683944404030436,
 'reg_lambda': 0.10259397496527922,
 'subsample': 0.6971029355383804}

clf_wtc1 = _build_regressor(wtc1_param)

is_wtc1 = X['wtc'] == 1
x_train_wtc1 = X[is_wtc1]
y_train_wtc1 = y[is_wtc1]

eval_set = [(x_train_wtc1, y_train_wtc1)]

clf_wtc1.fit(x_train_wtc1, y_train_wtc1, eval_set=eval_set, verbose=False)

# --- Predict the submission set -----------------------------------------------------
# Feed the models exactly the training feature columns, in training order. XGBoost
# validates DataFrame feature names, so any extra column in test_df (for instance a
# 'tow' column, which a later cell adds) would otherwise make predict() fail.
X_submission = test_df[X.columns]

y_pred_wtc0 = clf_wtc0.predict(X_submission)

y_pred_wtc1 = clf_wtc1.predict(X_submission)

y_pred = clf.predict(X_submission)

# Per row, take the prediction of the model matching its wtc value; rows whose wtc is
# neither 0 nor 1 fall back to the general model (the previous mask-multiplication
# formula contributed 0 for them and therefore halved their prediction).
wtc_submission = X_submission['wtc'].to_numpy()
y_pred_specialised = numpy.where(
    wtc_submission == 0,
    y_pred_wtc0,
    numpy.where(wtc_submission == 1, y_pred_wtc1, y_pred),
)

# Blend: plain average of the wtc-specific and the general prediction, kept as a
# Series on test_df's index so that assigning it back to test_df aligns row by row.
y_pred_aggregate = pd.Series((y_pred_specialised + y_pred) / 2, index=test_df.index)





In [None]:
import matplotlib.pyplot as plt

# Feature-importance chart of the general model (`clf`), drawn with XGBoost's
# built-in plotting helper.
importance_ax = xgb.plot_importance(clf)
plt.show()

In [None]:
# Store the blended predictions in the 'tow' column of the submission frame.
# This mutates test_df in place (the column is added or overwritten).

test_df['tow'] = y_pred_aggregate

display(test_df)