
Ваша задача – побить единственный бенчмарк в [соревновании](https://www.kaggle.com/c/flight-delays-2017) на Kaggle InClass. Подробных инструкций не будет, будет только тезисно описано, как получен этот бенчмарк. Конечно, с помощью XGBoost. Надеюсь, на данном этапе курса вам достаточно бросить полтора взгляда на данные, чтобы понять, что это тот тип задачи, в котором затащит XGBoost. Но проверьте ещё CatBoost.

<img src='../../img/xgboost_meme.jpg' width=40% />

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict
import optuna
from optuna.samplers import TPESampler

In [2]:
# Load the competition's labelled training table and the unlabelled test table.
train_path = "data/flight_delays_train.csv"
test_path = "data/flight_delays_test.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [3]:
# Preview the first rows of the training frame to inspect its columns and raw values.
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [4]:
# Preview the first rows of the test frame (same feature columns, no target column).
test.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


In [5]:
# Split the training frame into the binary target (Y -> 1, N -> 0) and the
# feature matrix, i.e. every column except the target.
y = train["dep_delayed_15min"].map({"Y": 1, "N": 0})
X = train.drop(columns="dep_delayed_15min")

In [6]:
# Positional indices of the object-dtype columns of X, as a plain Python list.
cat_features_idx = [
    position for position, dtype in enumerate(X.dtypes) if dtype == "object"
]

In [7]:
# Hold out 30% of the labelled rows for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [20]:
# CatBoost hyperparameters. The tuned values are the ones reported for the best
# Optuna trial printed further down in this notebook; the seed and verbosity
# are fixed by hand.
params = dict(
    iterations=719,
    learning_rate=0.09987617880274217,
    depth=7,
    l2_leaf_reg=6.636164403632722e-07,
    bootstrap_type="Bayesian",
    random_strength=7.69504806253415e-08,
    bagging_temperature=0.6834815725008856,
    od_type="Iter",
    od_wait=37,
    random_state=17,
    verbose=False,
)

In [23]:
# Build the classifier from the hyperparameter dictionary defined above.
model = CatBoostClassifier(**params)

In [24]:
# Wrap the data in CatBoost pools. The categorical columns come from
# `cat_features_idx` (positions of the object-dtype columns of X) rather than
# a hard-coded index list, so both pools follow the frame's actual layout.
# NOTE(review): assumes `test` has the same column order as X — confirm.
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_features_idx,
)

# Competition test set: features only, no label.
test_pool = Pool(
    data=test,
    cat_features=cat_features_idx,
)

In [10]:
# def objective(trial):
#     model = CatBoostClassifier(
#         iterations=trial.suggest_int("iterations", 100, 1000),
#         learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
#         depth=trial.suggest_int("depth", 4, 10),
#         l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
#         bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
#         random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
#         bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
#         od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
#         od_wait=trial.suggest_int("od_wait", 10, 50),
#         verbose=False
#     )
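#     model.fit(train_pool)
#     # NOTE(review): ROC AUC is computed here from hard class labels returned by
#     # predict(); scoring model.predict_proba(X_test)[:, 1] instead would match
#     # the later evaluation cell (0.7425 there vs. the 0.5807 recorded below).
#     y_pred = model.predict(X_test)
#     return roc_auc_score(y_test, y_pred)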

In [11]:
# optuna.logging.set_verbosity(optuna.logging.WARNING)
# sampler = TPESampler(seed=1)
# study = optuna.create_study(study_name="catboost", direction="maximize", sampler=sampler)
# study.optimize(objective, n_trials=100)

In [13]:
# print("Number of finished trials: ", len(study.trials))
# print("Best trial:") 
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")
# for key, value in trial.params.items():
#     print("    {}: {}".format(key, value))

Number of finished trials:  100
Best trial:
  Value:  0.5807246272471757
  Params: 
    iterations: 719
    learning_rate: 0.09987617880274217
    depth: 7
    l2_leaf_reg: 6.636164403632722e-07
    bootstrap_type: Bayesian
    random_strength: 7.69504806253415e-08
    bagging_temperature: 0.6834815725008856
    od_type: Iter
    od_wait: 37


In [25]:
# Train the classifier on the training pool.
# NOTE(review): no eval_set is passed, so the od_type/od_wait settings in
# `params` presumably never trigger early stopping — confirm against the
# CatBoost overfitting-detector docs.
model.fit(train_pool)

<catboost.core.CatBoostClassifier at 0x1ac57f24ca0>

In [26]:
# Hold-out ROC AUC, scored on the predicted probability of class 1.
holdout_scores = model.predict_proba(X_test)[:, 1]
roc_auc_score(y_test, holdout_scores)

0.742502542928754

In [27]:
# Predicted probability of class 1 (delayed, per the Y -> 1 mapping) for every row of the test pool.
predict_value = model.predict_proba(test_pool)[:,1]

In [28]:
# Write the submission file: an "id" index column followed by the predicted
# probabilities under the target's column name.
submission = pd.Series(predict_value, name="dep_delayed_15min")
submission.to_csv("CatBoost_predict.csv", index_label="id", header=True)