In [12]:
!pip install xgboost --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl (192.9 MB)
[K     |████████████████████████████████| 192.9 MB 56 kB/s 
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 0.90
    Uninstalling xgboost-0.90:
      Successfully uninstalled xgboost-0.90
Successfully installed xgboost-1.6.1


# Imports

In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

import xgboost as xgb

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score,balanced_accuracy_score, roc_auc_score
from sklearn.mixture import BayesianGaussianMixture
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from scipy.optimize import dual_annealing
from numpy.random import rand
 
# Show up to 100 rows / 100 columns when a DataFrame is displayed.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 100)

In [2]:
class CFG:
    """Notebook-wide configuration constants."""

    # Root folder holding `src/data.csv` and the `submissions/` directory.
    PATH = 'drive/MyDrive/Kaggle/Clustering_072022/'

    # Seeds: one for the Bayesian Gaussian mixture, one for the classifiers / CV split.
    seed_bgm = 1
    seed = 42

    # Number of mixture components (also the number of classes for the classifiers).
    n_clusters = 7
    # Number of stratified CV folds.
    n_splits = 10
    # Minimum assigned-cluster probability for a row to be used as training data.
    threshold = 0.7

In [3]:
def seed_everything(seed=42):
    """Seed the global random number generators used by the notebook.

    Seeds Python's ``random`` module and NumPy's global RNG with ``seed`` and
    stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

In [4]:
# Load the raw dataset and discard the `id` column so only feature columns remain.
df = (
    pd.read_csv(CFG.PATH + 'src/data.csv')
    .drop(columns='id')
)
df

Unnamed: 0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28
0,-0.389420,-0.912791,0.648951,0.589045,-0.830817,0.733624,2.258560,2,13,14,5,13,6,6,-0.469819,0.358126,1.068105,-0.559650,-0.366905,-0.478412,-0.757002,-0.763635,-1.090369,1.142641,-0.884274,1.137896,1.309073,1.463002,0.813527
1,-0.689249,-0.453954,0.654175,0.995248,-1.653020,0.863810,-0.090651,2,3,6,4,6,16,9,0.591035,-0.396915,0.145834,-0.030798,0.471167,-0.428791,-0.089908,-1.784204,-0.839474,0.459685,1.759412,-0.275422,-0.852168,0.562457,-2.680541
2,0.809079,0.324568,-1.170602,-0.624491,0.105448,0.783948,1.988301,5,11,5,8,9,3,11,-0.679875,0.469326,0.349843,-0.288042,0.291470,-0.413534,-1.602377,1.190984,3.267116,-0.088322,-2.168635,-0.974989,1.335763,-1.110655,-3.630723
3,-0.500923,0.229049,0.264109,0.231520,0.415012,-1.221269,0.138850,6,2,13,8,9,6,4,-0.389456,0.626762,-1.074543,-1.521753,-1.150806,0.619283,1.287801,0.532837,1.036631,-2.041828,1.440490,-1.900191,-0.630771,-0.050641,0.238333
4,-0.671268,-1.039533,-0.270155,-1.830264,-0.290108,-1.852809,0.781898,8,7,5,3,1,13,11,-0.120743,-0.615578,-1.064359,0.444142,0.428327,-1.628830,-0.434948,0.322505,0.284326,-2.438365,1.473930,-1.044684,1.602686,-0.405263,-1.987263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97995,0.237591,1.657034,-0.689282,0.313710,-0.299039,0.329139,1.607378,5,7,8,5,7,6,7,0.362517,1.010965,-1.001519,0.409882,-0.504114,-0.290116,-0.258141,-0.973640,1.369508,0.391055,2.152426,-0.208944,-1.475403,0.298448,0.445039
97996,0.322696,0.710411,0.562625,-1.321713,-0.357708,0.182024,0.178558,3,9,2,5,3,11,12,0.683558,-1.238120,0.863433,1.318554,-1.125758,0.117687,1.388242,0.342400,1.680537,-0.860409,0.579165,1.162692,0.134994,0.994666,0.727642
97997,-0.249364,-0.459545,1.886122,-1.340310,0.195029,-0.559520,-0.379767,8,9,10,7,5,4,3,-1.337303,0.064310,0.612507,0.398968,-0.409608,-0.850223,-1.787648,-1.268115,-1.508330,1.945622,1.503645,0.194968,2.142693,1.646042,0.641466
97998,0.311408,2.185237,0.761367,0.436723,0.464967,0.062321,-0.334025,1,8,11,11,3,9,5,-0.612314,-0.416691,-0.750458,0.165038,0.333685,-0.010839,1.118906,1.565765,0.358480,0.547615,1.224439,-0.537998,-1.610954,-0.616227,-0.066211


In [5]:
# Collected clustering scores (one tuple per evaluated model).
all_scores = []
# Every column of the loaded frame.
features = df.columns
# Subset of columns used for clustering and classification: f_07..f_13 and f_22..f_28.
best_features = [f"f_{i:02d}" for i in (*range(7, 14), *range(22, 29))]

def scores(preds, lib, df=df[best_features], verbose=True, compute_silhouette=None):
    """Compute internal clustering metrics for a labelling.

    Parameters
    ----------
    preds : cluster label per row of ``df``.
    lib : name of the model, used as the first element of the result and in the
        printed line.
    df : data the metrics are computed on. The default is bound once, when this
        function is defined, to the ``best_features`` columns of the raw frame.
    verbose : print a one-line summary when true.
    compute_silhouette : when truthy, also compute the (very slow) silhouette
        score; otherwise the silhouette entry is reported as 0.

    Returns
    -------
    tuple ``(lib, silhouette, calinski_harabasz, davies_bouldin)``.
    """
    silhouette = silhouette_score(df, preds, metric='euclidean') if compute_silhouette else 0
    calinski = calinski_harabasz_score(df, preds)
    davies = davies_bouldin_score(df, preds)

    if verbose:
        print(f"{lib} : Silhouette : {silhouette:.1%} | Calinski Harabasz : {calinski:.1f} | Davis Bouldin : {davies:.3f}")

    return (lib, silhouette, calinski, davies)

In [6]:
seed_everything(CFG.seed_bgm)

# Power-transform every column; keep the original column names.
df_scaled = pd.DataFrame(
    PowerTransformer().fit_transform(df[features]),
    columns=features,
)

# Fit the mixture model on the selected feature subset only.
BGM = BayesianGaussianMixture(
    n_components=CFG.n_clusters,
    covariance_type='full',
    max_iter=300,
    n_init=1,
    tol=1e-3,
    random_state=CFG.seed_bgm,
).fit(df_scaled[best_features])

# Per-component membership probabilities and the hard (most probable) assignment.
BGM_predict_proba = BGM.predict_proba(df_scaled[best_features])
BGM_predict = BGM_predict_proba.argmax(axis=1)

all_scores.append(
    scores(BGM_predict, lib="BayesianGaussianMixture after powertransformer")
)

BayesianGaussianMixture after powertransformer : Silhouette : 0.0% | Calinski Harabasz : 8156.3 | Davis Bouldin : 2.656


In [7]:
# Select the "trusted" rows used to train the supervised classifiers below:
# rows whose assigned-cluster probability from the mixture model exceeds
# CFG.threshold.
proba_threshold = CFG.threshold

df_scaled['predict'] = BGM_predict
# Probability of the assigned cluster. `BGM_predict` is the row-wise argmax of
# `BGM_predict_proba`, so this is simply the row-wise maximum. Creating the
# column as float right away avoids writing floats into an int column.
df_scaled['predict_proba'] = BGM_predict_proba.max(axis=1)
for n in range(CFG.n_clusters):
    df_scaled[f'predict_proba_{n}'] = BGM_predict_proba[:, n]


# Collect the trusted row labels class by class (so X / y stay grouped by class,
# in class order). Labels are kept as integers instead of being pushed through
# a float array.
trusted_idx_per_class = []
for n in range(CFG.n_clusters):
    class_probas = df_scaled.loc[df_scaled['predict'] == n, 'predict_proba']
    median = class_probas.median()
    idx = class_probas.index[class_probas > proba_threshold]
    trusted_idx_per_class.append(idx.to_numpy())
    # Guard against a component that received no rows at all.
    share = len(idx) / len(class_probas) if len(class_probas) else 0.0
    print(f'Class n{n}  |  Median : {median:.4f}  |  Training data : {share:.1%}')

idxs = np.concatenate(trusted_idx_per_class)

X = df_scaled.loc[idxs, best_features].reset_index(drop=True)
y = df_scaled.loc[idxs, 'predict'].reset_index(drop=True)

Class n0  |  Median : 0.9116  |  Training data : 75.2%
Class n1  |  Median : 0.8669  |  Training data : 72.0%
Class n2  |  Median : 0.8685  |  Training data : 71.8%
Class n3  |  Median : 0.9074  |  Training data : 77.1%
Class n4  |  Median : 0.7312  |  Training data : 54.3%
Class n5  |  Median : 0.9833  |  Training data : 87.9%
Class n6  |  Median : 0.9376  |  Training data : 78.6%


In [8]:
def get_score(labels, preds, probas):
    """Score a multi-class classifier on one validation fold.

    Returns a tuple ``(balanced_accuracy, roc_auc)`` where the ROC AUC is the
    weighted one-vs-one multi-class AUC computed from ``probas``.
    """
    balanced_acc = balanced_accuracy_score(labels, preds)
    auc = roc_auc_score(labels, probas, multi_class="ovo", average="weighted")
    return balanced_acc, auc

In [9]:
# XGBoost training parameters: multi-class soft-probability output with one
# class per mixture component, trained and predicted on GPU 0.
params_xgb = dict(
    booster='gbtree',
    objective='multi:softprob',
    learning_rate=4e-2,
    num_class=CFG.n_clusters,
    seed=CFG.seed,
    gpu_id=0,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
)

In [10]:
seed_everything(CFG.seed)


def _fit_eval_predict(model, label, X_train, y_train, X_valid, y_valid, X_full):
    """Fit one scikit-learn classifier on a fold and predict the full dataset.

    The model is fitted on the fold's training rows (trusted data only), scored
    on the fold's validation rows with ``get_score`` and the result is printed.

    Returns ``(score, full_proba)``: the ``get_score`` tuple for the validation
    rows and ``model.predict_proba(X_full)``.
    """
    model.fit(X_train, y_train)

    y_pred = model.predict(X_valid)
    y_pred_proba = model.predict_proba(X_valid)

    s = get_score(y_valid, y_pred, y_pred_proba)
    print(f"{label:<10} AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}")

    return s, model.predict_proba(X_full)


# Fold-averaged class probabilities for EVERY row of the dataset, one per model.
xgb_predict_proba = 0
etc_predict_proba = 0
qda_predict_proba = 0
svc_predict_proba = 0
knc_predict_proba = 0

classif_scores = []

# QDA expects `priors` to be an array with one probability per class, not an
# integer. Uniform priors give the same posteriors as the constant that was
# passed before, because a class-independent offset cancels when the
# probabilities are normalised.
qda_priors = np.full(CFG.n_clusters, 1.0 / CFG.n_clusters)

# Full dataset (trusted and untrusted rows) that every fold model predicts on.
X_full = df_scaled[best_features]
xgb_full = xgb.DMatrix(X_full)

skf = StratifiedKFold(CFG.n_splits, shuffle=True, random_state=CFG.seed)

for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"===== fold{fold} =====")
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]
    fold_data = (X_train, y_train, X_valid, y_valid, X_full)

    # XGBoost
    xgb_train = xgb.DMatrix(X_train, label=y_train)
    xgb_valid = xgb.DMatrix(X_valid, label=y_valid)

    model = xgb.train(params_xgb,
                      dtrain=xgb_train,
                      evals=[(xgb_train, 'train'), (xgb_valid, 'eval')],
                      verbose_eval=False,
                      num_boost_round=20000,
                      early_stopping_rounds=200,
                      )

    # Predict with the boosting rounds up to (and including) the best one found
    # by early stopping. `best_iteration` is used instead of `best_ntree_limit`,
    # which newer xgboost releases no longer provide.
    best_range = (0, model.best_iteration + 1)

    y_pred_proba = model.predict(xgb_valid, iteration_range=best_range)
    y_pred = np.argmax(y_pred_proba, axis=1)

    s = get_score(y_valid, y_pred, y_pred_proba)
    print(f"XGBoost    AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}")
    classif_scores.append(s)

    xgb_predict_proba += model.predict(xgb_full, iteration_range=best_range) / CFG.n_splits

    del xgb_train, xgb_valid, model, s, y_pred, y_pred_proba
    gc.collect()

    # ExtraTreesClassifier
    s, full_proba = _fit_eval_predict(
        ExtraTreesClassifier(n_estimators=1000, random_state=CFG.seed),
        "ExtraTree", *fold_data)
    classif_scores.append(s)
    etc_predict_proba += full_proba / CFG.n_splits
    del s, full_proba
    gc.collect()

    # QuadraticDiscriminantAnalysis
    s, full_proba = _fit_eval_predict(
        QuadraticDiscriminantAnalysis(priors=qda_priors),
        "QDA", *fold_data)
    classif_scores.append(s)
    qda_predict_proba += full_proba / CFG.n_splits
    del s, full_proba
    gc.collect()

    # SVC
    s, full_proba = _fit_eval_predict(
        SVC(probability=True),
        "SVC", *fold_data)
    classif_scores.append(s)
    svc_predict_proba += full_proba / CFG.n_splits
    del s, full_proba
    gc.collect()

    # KNeighborsClassifier
    s, full_proba = _fit_eval_predict(
        KNeighborsClassifier(n_neighbors=20),
        "KNeighbors", *fold_data)
    classif_scores.append(s)
    knc_predict_proba += full_proba / CFG.n_splits
    del s, full_proba
    gc.collect()

===== fold0 =====
XGBoost    AUC : 1.000 | Accuracy : 99.1%
ExtraTree  AUC : 0.999 | Accuracy : 96.4%
QDA        AUC : 1.000 | Accuracy : 99.9%
SVC        AUC : 1.000 | Accuracy : 99.7%
KNeighbors AUC : 0.999 | Accuracy : 97.4%
===== fold1 =====
XGBoost    AUC : 1.000 | Accuracy : 99.2%
ExtraTree  AUC : 0.999 | Accuracy : 97.2%
QDA        AUC : 1.000 | Accuracy : 100.0%
SVC        AUC : 1.000 | Accuracy : 99.8%
KNeighbors AUC : 1.000 | Accuracy : 97.6%
===== fold2 =====
XGBoost    AUC : 1.000 | Accuracy : 99.2%
ExtraTree  AUC : 0.999 | Accuracy : 96.5%
QDA        AUC : 1.000 | Accuracy : 100.0%
SVC        AUC : 1.000 | Accuracy : 99.8%
KNeighbors AUC : 1.000 | Accuracy : 97.2%
===== fold3 =====
XGBoost    AUC : 1.000 | Accuracy : 99.2%
ExtraTree  AUC : 0.999 | Accuracy : 96.5%
QDA        AUC : 1.000 | Accuracy : 99.9%
SVC        AUC : 1.000 | Accuracy : 99.8%
KNeighbors AUC : 1.000 | Accuracy : 97.4%
===== fold4 =====
XGBoost    AUC : 1.000 | Accuracy : 99.1%
ExtraTree  AUC : 0.999 | A

In [13]:
def soft_voting(preds_probas, weights):
    """Blend several class-probability matrices with a weighted soft vote.

    Parameters
    ----------
    preds_probas : sequence of array-likes, each of shape (n_rows, n_classes).
        Column ``k`` must refer to the same class in every matrix. That holds
        for the classifiers in this notebook, which are all trained on the
        same label vector ``y``.
    weights : sequence of numbers, one weight per matrix.

    Returns
    -------
    numpy array of shape (n_rows,) holding, for every row, the class (column
    index) with the largest weighted probability sum.

    Raises
    ------
    ValueError
        If no matrix is given, if the number of weights differs from the
        number of matrices, or if the matrices do not share one shape.

    Notes
    -----
    The columns are summed as they are. Re-ordering each matrix by how often
    the model predicts each class would add up probabilities of different
    classes whenever two models rank the class sizes differently, and fails
    when a model never predicts some class. The output shape is taken from the
    inputs rather than from notebook globals.
    """
    probas = [np.asarray(p, dtype=float) for p in preds_probas]
    weights = np.asarray(weights, dtype=float).ravel()

    if not probas:
        raise ValueError("soft_voting needs at least one probability matrix")
    if len(probas) != len(weights):
        raise ValueError(
            f"got {len(probas)} probability matrices but {len(weights)} weights"
        )

    blended = np.zeros_like(probas[0])
    for p, w in zip(probas, weights):
        if p.shape != blended.shape:
            raise ValueError(
                f"probability matrices must share one shape, got {p.shape} and {blended.shape}"
            )
        blended += p * w

    return np.argmax(blended, axis=1)

In [14]:
# Probability matrices to blend, in the same order as the optimised weights.
model_probas = [
    xgb_predict_proba,
    etc_predict_proba,
    qda_predict_proba,
    svc_predict_proba,
    knc_predict_proba,
]


# the weight evaluation function for weighting classifiers (blending coef) :
def objective(w):
    """Davies-Bouldin score of the soft-voted labels obtained with weights ``w``.

    Lower is better, so this is minimised directly.
    NOTE(review): the score is computed on the raw frame ``df`` (all columns,
    untransformed), whereas ``scores()`` defaults to ``df[best_features]`` --
    confirm this is the intended evaluation space.
    """
    sv_predict = soft_voting(model_probas, list(w))
    return davies_bouldin_score(df, sv_predict)


# the set of possible values of the variables (definition domain) :
bounds = [[1, 100] for _ in model_probas]

# Make sure the output folder exists before spending time on the optimisation.
submission_dir = CFG.PATH + "submissions/optimized_soft_voting/"
os.makedirs(submission_dir, exist_ok=True)

for ITER in [25, 50, 75]:

    # the Dual Annealing optimization :
    result = dual_annealing(objective, bounds, maxiter=ITER, seed=42)

    # results:
    print(f'\n------------- maxiter:{ITER} -------------\n')
    print('Success :', result['success'])
    print('Total Evaluations: %d' % result['nfev'])

    solution = result['x']
    # The optimiser already reports the objective value at `solution`; reuse it
    # instead of running another full Davies-Bouldin evaluation.
    evaluation = result['fun']
    print('Solution Weights : {} for minimum davies_bouldin_score = {}'.format((result['x'].astype(int)), evaluation))

    sv_predict = soft_voting(model_probas, solution.tolist())

    sub = pd.read_csv(CFG.PATH + "submissions/sample_submission.csv")
    sub['Predicted'] = sv_predict
    sub.to_csv(submission_dir + f"submission_iter-{ITER}.csv", index=False)


------------- maxiter:25 -------------

Success : True
Total Evaluations: 287
Solution Weights : [ 1 94  7  4 65] for minimum davies_bouldin_score = 2.7396752695087256

------------- maxiter:50 -------------

Success : True
Total Evaluations: 537
Solution Weights : [ 1 94  7  4 65] for minimum davies_bouldin_score = 2.7396752695087256

------------- maxiter:75 -------------

Success : True
Total Evaluations: 787
Solution Weights : [ 1 94  7  4 65] for minimum davies_bouldin_score = 2.7396752695087256
