In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, mutual_info_classif, VarianceThreshold

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm_notebook as tqdm

import pandas as pd
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

# I. Data preparation

In [None]:
# Read the raw workbook, skipping the first sheet row so the following row is used as the header.
df = pd.read_excel(
    io='default of credit card clients.xls',
    skiprows=1,
)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# List the column labels to locate the identifier and target columns used below.
df.columns

In [None]:
# Promote the ID column to the index. The guard keeps the cell idempotent:
# once ID has been moved into the index, a re-run is a no-op instead of a KeyError.
if 'ID' in df.columns:
    df = df.set_index('ID')

In [None]:
# Preview only the non-object columns.
df.select_dtypes(exclude=['object']).head(n=5)

In [None]:
# Separate the target column from the feature matrix.
y = df['default payment next month']
X = df.drop(columns=['default payment next month'])

# II. Filter methods

## 1. Mutual information

In [None]:
# Estimate mutual information between every feature and the target.
# The estimator is randomized, so the seed is fixed: otherwise the features kept by the
# threshold filter below can change from one run to the next.
mutual_information = mutual_info_classif(X, y, random_state=42)

In [None]:
# Draw the mutual-information scores as a single-row annotated heatmap, one cell per feature.
fig, ax = plt.subplots(1, figsize=(26, 1))
sns.heatmap(
    mutual_information.reshape(1, -1),
    cmap='Blues',
    cbar=False,
    linewidths=1,
    annot=True,
    ax=ax,
)
# The single row carries no information, so hide the y axis ticks and labels.
ax.set_yticks([])
ax.set_yticklabels([])
# Label each cell with its feature name.
ax.set_xticklabels(X.columns, rotation=45, ha='right', fontsize=12)
fig.suptitle("mutual information", fontsize=18, y=1.2)
plt.show()

In [None]:
# Keep only the features whose mutual information with the target exceeds the cut-off.
mutual_information_threshold = 0.01
X_new = X.loc[:, mutual_information > mutual_information_threshold]

In [None]:
# Show (rows, columns) of the filtered feature matrix, i.e. how many features passed the threshold.
X_new.shape

In [None]:
def gini(target, feature):
    """Return the Gini coefficient of `feature` against `target`, computed as 2 * ROC AUC - 1."""
    auc = roc_auc_score(target, feature)
    return 2 * auc - 1

In [None]:
# Hold out 20% of the rows for validation, stratified on the target and seeded for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X_new,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# III. Modeling

In [None]:
# Base XGBoost settings: many trees with a small learning rate, binary logistic objective.
# 'gpu_hist' replaces the retired 'gpu_exact' tree method and matches the parameter dict
# defined later in this notebook.
xgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.01,
    'n_jobs': -1,
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist'
}

In [None]:
# Build an (unfitted) classifier from the parameter dict defined above.
model = xgb.XGBClassifier(**xgb_params)

In [None]:
# Echo the estimator so the notebook shows its configuration.
model

In [None]:
# %%time
# model.fit(X_new, y, early_stopping_rounds=10, eval_metric=['logloss', 'auc'], eval_set=[(X_train, y_train), (X_val, y_val)], 
#           verbose=True)

In [None]:
# Three-fold stratified splitter, shuffled with a fixed seed so folds are reproducible.
skf = StratifiedKFold(
    n_splits=3,
    random_state=42,
    shuffle=True,
)

In [None]:
# Disabled wider grid, kept for reference:
#   max_depth:         [7, 9, 12, 15]
#   subsample:         [0.7, 0.8, 0.9, 1]
#   colsample_bytree:  [0.5, 0.8, 1]
#   colsample_bylevel: [0.5, 0.8, 1]
#   min_child_weight:  [0.5, 1, 2, 5]

# Active grid: 3 depths x 2 subsample ratios = 6 candidates per search.
param_grid = dict(
    max_depth=[4, 7, 9],
    subsample=[0.8, 1],
)

In [None]:
# fit_params = {
#                 "early_stopping_rounds": 10, 
#                 "eval_metric": ['logloss', 'auc'], 
#                 "eval_set": [(X_train, y_train), (X_val, y_val)],
#                 'verbose': True
#     }

In [None]:
# grid_search = GridSearchCV(model, param_grid=param_grid, scoring='roc_auc', fit_params=fit_params, cv=skf.split(X_train, y_train), verbose=2)

In [None]:
# Parameter set used for the grid searches below (no n_jobs entry here;
# parallelism is requested on GridSearchCV instead).
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.01,
    objective='binary:logistic',
    tree_method='gpu_hist',
)

In [None]:
# Rebuild the estimator so it picks up the parameter dict redefined in the preceding cell.
model = xgb.XGBClassifier(**xgb_params)

In [None]:
# Cross-validated search over param_grid, scored by ROC AUC.
# The splitter object is passed instead of skf.split(...): a generator can be consumed only
# once, so re-running the fit cell would fail on exhausted splits. The seeded splitter
# yields the same folds on every fit.
grid_search = GridSearchCV(
    model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    verbose=3,
)

In [None]:
# Run the search on the training portion only; the validation split stays untouched.
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Test score of the best candidate on the first CV fold only.
# NOTE(review): this is a single fold, not the mean across folds — if the mean was intended,
# grid_search.best_score_ holds it; confirm which one is wanted.
grid_search.cv_results_['split0_test_score'][grid_search.best_index_]

In [None]:
# Backward feature elimination.
# At every stage, each remaining column is left out in turn and the grid search is re-run on
# the rest. The column whose removal gives the highest mean CV ROC AUC is then dropped for good.
# results maps 'stage_<n>' -> {left-out column: best mean CV score without it}.
col_list = X_new.columns.tolist()
i = 1
results = dict()
results['stage_0'] = dict()
# Placeholder baseline for the full feature set; the real value would be the
# best_score_ of the search fitted on all columns.
# results['stage_0']['all'] = grid_search.best_score_
results['stage_0']['all'] = -1

# Stop once a single column is left: leaving it out would produce an empty feature matrix.
while len(col_list) > 1:
    stage_key = 'stage_' + str(i)
    print('------------------------------------------------------------------------stage_' + str(i))
    results[stage_key] = dict()
    for col in tqdm(col_list):
        print(col, '---------------------------------')
        X_tmp = X_train[col_list].drop(col, axis=1)
        candidate_model = xgb.XGBClassifier(**xgb_params)
        # Pass the splitter itself; it yields the same seeded folds for every candidate set.
        grid_search = GridSearchCV(candidate_model,
                                   param_grid=param_grid,
                                   scoring='roc_auc',
                                   cv=skf,
                                   n_jobs=-1,
                                   verbose=3)
        grid_search.fit(X_tmp, y_train)
        results[stage_key][col] = grid_search.best_score_

    # Choose within the current stage only. Taking max over `results` itself would compare
    # the per-stage dicts and return a stage name rather than a column.
    stage_scores = results[stage_key]
    col_to_be_dropped = max(stage_scores, key=stage_scores.get)
    col_list.remove(col_to_be_dropped)
    i += 1