In [49]:
import numpy as np
import pandas as pd
import pickle
pd.set_option('max_columns', 100)
pd.set_option('max_rows', 200)

import matplotlib.pyplot as plt
import seaborn as sns

import gc, random, os

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import PowerTransformer, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, balanced_accuracy_score, roc_auc_score
from sklearn.mixture import BayesianGaussianMixture

import lightgbm as lgb
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

import warnings
warnings.filterwarnings("ignore")

from drive.MyDrive.Kaggle.Clustering_072022.src.functions import *

In [2]:
SEED = 69
N_FOLDS = 10
N_CLUSTERS = 7
PATH = 'drive/MyDrive/Kaggle/Clustering_072022/'

In [42]:
def scores(preds, lib, df=data, verbose = True, compute_silhouette = True): 

    # Silhouette is very slow
    sil = 0
    if compute_silhouette:
        sil = silhouette_score(df, preds, metric='euclidean')

    s = (lib,
         sil, 
         calinski_harabasz_score(df, preds), 
         davies_bouldin_score(df, preds))

    if verbose:
        print(f"{s[0]} : Silhouette : {s[1]:.1%} | Calinski Harabasz : {s[2]:.1f} | Davis Bouldin : {s[3]:.3f}")

    return s

all_scores = []

In [101]:
data2 = pd.read_csv(PATH+'src/data_removed.csv', index_col='id')
data = pd.read_csv(PATH+'src/data.csv', index_col='id')
best_cols = data2.columns.tolist()
sub = pd.read_csv(PATH+'submissions/sample_submission.csv', index_col='Id')
cat_feats = data.columns[data.dtypes=='int'].tolist()
num_feats = data.columns[data.dtypes=='float'].tolist()

set_seed(3000)

In [61]:
ct = ColumnTransformer([('rs', RobustScaler(), num_feats)], remainder='passthrough')
pt = Pipeline([('ct', ct), ('pt', PowerTransformer())])

data = pd.DataFrame(pt.fit_transform(data), columns=num_feats+cat_feats)

In [90]:
best_proba = 0
best_median_mean = 0
for seed in range(1000, 3000, 20):
    BGM = BayesianGaussianMixture(
        n_components = N_CLUSTERS, 
        covariance_type = 'full', 
        random_state = seed, 
        n_init = 2, tol=.05)

    BGM.fit(data[best_feats])
    BGM_predict = BGM.predict(data[best_feats])
    BGM_predict_proba = BGM.predict_proba(data[best_feats])

    all_scores.append(scores(BGM_predict, lib="BayesianGaussianMixture after powertransformer"))
    # get trusted data to train LGB model.
    proba_threshold = .69

    data['predict'] = BGM_predict
    data['predict_proba'] = 0
    for n in range(N_CLUSTERS):
        data[f'predict_proba_{n}'] = BGM_predict_proba[:,n]
        data.loc[data.predict == n, 'predict_proba'] = data[f'predict_proba_{n}']
        
    median_list = []
    idxs = np.array([])
    for n in range(N_CLUSTERS):
        median = data[data.predict==n]['predict_proba'].median()
        median_list.append(median)
        idx = data[(data.predict==n) & (data.predict_proba > proba_threshold)].index
        idxs = np.concatenate((idxs, idx))
        print(f'Class n°{n}  |  Median : {median:.4f}  |  Training data : {len(idx)/len(data[(data.predict==n)]):.1%}')
    median_mean = np.mean(median_list)
    if median_mean > best_median_mean:
        best_median_mean = median_mean
        data['best_probas'] = data['predict_proba']
        data['best_preds'] = data['predict']
        best_idxs = idxs
    print(f'median mean: {best_median_mean}')
X = data.loc[best_idxs][best_feats]
y = data.loc[best_idxs]['predict']

BayesianGaussianMixture after powertransformer : Silhouette : -1.9% | Calinski Harabasz : 3624.3 | Davis Bouldin : 4.138
Class n°0  |  Median : 0.9376  |  Training data : 79.4%
Class n°1  |  Median : 0.9833  |  Training data : 88.5%
Class n°2  |  Median : 0.8677  |  Training data : 72.8%
Class n°3  |  Median : 0.8669  |  Training data : 73.2%
Class n°4  |  Median : 0.7294  |  Training data : 55.4%
Class n°5  |  Median : 0.9069  |  Training data : 77.9%
Class n°6  |  Median : 0.9119  |  Training data : 76.2%
median mean: 0.8862371269921613
BayesianGaussianMixture after powertransformer : Silhouette : -1.9% | Calinski Harabasz : 3624.2 | Davis Bouldin : 4.138
Class n°0  |  Median : 0.9377  |  Training data : 79.4%
Class n°1  |  Median : 0.9069  |  Training data : 77.9%
Class n°2  |  Median : 0.9833  |  Training data : 88.5%
Class n°3  |  Median : 0.8669  |  Training data : 73.2%
Class n°4  |  Median : 0.9119  |  Training data : 76.2%
Class n°5  |  Median : 0.8676  |  Training data : 72.8

In [91]:
params_lgb = {'learning_rate': 0.07,'objective': 'multiclass','boosting': 'gbdt','verbosity': -1,'n_jobs': -1, 'num_classes':N_CLUSTERS} 

lgbm_predict_proba = 0 ; classif_scores = []

gkf = StratifiedKFold(N_FOLDS, shuffle=True, random_state = SEED)
for fold, (trn_idx, val_idx) in enumerate(gkf.split(X,y)):   

    X_trn = lgb.Dataset(X.iloc[trn_idx], y.iloc[trn_idx])
    X_val = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])
    
    model = lgb.train(params = params_lgb, 
                train_set = X_trn, valid_sets =  X_val, 
                num_boost_round = 5000, 
                callbacks = [ lgb.early_stopping(stopping_rounds=100, verbose=True)])  

    y_pred_proba = model.predict(X.iloc[val_idx])
    y_pred = np.argmax(y_pred_proba, axis=1)
    
    s = (balanced_accuracy_score(y.iloc[val_idx], y_pred),
        roc_auc_score(y.iloc[val_idx], y_pred_proba, average="weighted", multi_class="ovo"))
    print(f"Fold n°{fold+1} on LGBM. AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}\n")
    classif_scores.append(s)

    lgbm_predict_proba += model.predict(data[best_feats]) / N_FOLDS
    
all_scores.append(scores(np.argmax(lgbm_predict_proba, axis=1), lib="LGBM after BayesianGaussianMixture - threshold 0.69"))

pd.DataFrame(classif_scores, columns = ["balanced_accuracy_score", "roc_auc_score"]).mean(0)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[619]	valid_0's multi_logloss: 0.0803497
[620]	valid_0's multi_logloss: 0.0803044
[621]	valid_0's multi_logloss: 0.0802554
[622]	valid_0's multi_logloss: 0.0802418
[623]	valid_0's multi_logloss: 0.0802319
[624]	valid_0's multi_logloss: 0.0801731
[625]	valid_0's multi_logloss: 0.0801694
[626]	valid_0's multi_logloss: 0.0801038
[627]	valid_0's multi_logloss: 0.0801201
[628]	valid_0's multi_logloss: 0.0801041
[629]	valid_0's multi_logloss: 0.0800753
[630]	valid_0's multi_logloss: 0.0800644
[631]	valid_0's multi_logloss: 0.0800081
[632]	valid_0's multi_logloss: 0.0799866
[633]	valid_0's multi_logloss: 0.0799587
[634]	valid_0's multi_logloss: 0.0799703
[635]	valid_0's multi_logloss: 0.0799475
[636]	valid_0's multi_logloss: 0.0799411
[637]	valid_0's multi_logloss: 0.0799607
[638]	valid_0's multi_logloss: 0.0799351
[639]	valid_0's multi_logloss: 0.0799171
[640]	valid_0's multi_logloss: 0.0798871
[641]	valid_0's multi_logloss: 0.

balanced_accuracy_score    0.964705
roc_auc_score              0.999277
dtype: float64

In [92]:
et_predict_proba = 0 ; classif_scores = []

gkf = StratifiedKFold(N_FOLDS, shuffle=True, random_state = SEED + 1)

for fold, (trn_idx, val_idx) in enumerate(gkf.split(X, y)):   

    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    
    model = ExtraTreesClassifier(n_estimators=100, random_state=SEED)
    model.fit(X_trn, y_trn)
    
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)
    
    s = (balanced_accuracy_score(y_val, y_pred),
        roc_auc_score(y_val, y_pred_proba, average="weighted", multi_class="ovo"))
    print(f"Fold n°{fold+1} on Extratree. AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}")
    classif_scores.append(s)

    et_predict_proba += model.predict_proba(data[best_feats]) / N_FOLDS

all_scores.append(scores(np.argmax(et_predict_proba, axis=1), lib="Extratree after BayesianGaussianMixture"))

pd.DataFrame(classif_scores, columns = ["balanced_accuracy_score", "roc_auc_score"]).mean(0)

Fold n°1 on Extratree. AUC : 0.996 | Accuracy : 91.4%
Fold n°2 on Extratree. AUC : 0.996 | Accuracy : 91.3%
Fold n°3 on Extratree. AUC : 0.996 | Accuracy : 91.2%
Fold n°4 on Extratree. AUC : 0.996 | Accuracy : 91.6%
Fold n°5 on Extratree. AUC : 0.996 | Accuracy : 91.6%
Fold n°6 on Extratree. AUC : 0.996 | Accuracy : 91.4%
Fold n°7 on Extratree. AUC : 0.996 | Accuracy : 91.5%
Fold n°8 on Extratree. AUC : 0.996 | Accuracy : 91.2%
Fold n°9 on Extratree. AUC : 0.996 | Accuracy : 91.8%
Fold n°10 on Extratree. AUC : 0.996 | Accuracy : 91.3%
Extratree after BayesianGaussianMixture : Silhouette : -1.6% | Calinski Harabasz : 3703.5 | Davis Bouldin : 4.043


balanced_accuracy_score    0.914121
roc_auc_score              0.996023
dtype: float64

In [93]:
qda_predict_proba = 0 ; classif_scores = []

gkf = StratifiedKFold(N_FOLDS, shuffle=True, random_state = SEED + 2)

for fold, (trn_idx, val_idx) in enumerate(gkf.split(X, y)):   

    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = QuadraticDiscriminantAnalysis(reg_param=1)
    model.fit(X_trn, y_trn) # on trusted data only
    
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)
    
    s = (balanced_accuracy_score(y_val, y_pred),
        roc_auc_score(y_val, y_pred_proba, average="weighted", multi_class="ovo"))
    print(f"Fold n°{fold+1} on QDA. AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}")
    classif_scores.append(s)

    qda_predict_proba += model.predict_proba(data[best_feats]) / N_FOLDS

all_scores.append(scores(np.argmax(qda_predict_proba, axis=1), lib="QuadraticDiscriminantAnalysis n°1 after BayesianGaussianMixture"))
pd.DataFrame(classif_scores, columns = ["balanced_accuracy_score", "roc_auc_score"]).mean(0)

Fold n°1 on QDA. AUC : 0.962 | Accuracy : 74.6%
Fold n°2 on QDA. AUC : 0.961 | Accuracy : 74.5%
Fold n°3 on QDA. AUC : 0.962 | Accuracy : 75.2%
Fold n°4 on QDA. AUC : 0.962 | Accuracy : 75.0%
Fold n°5 on QDA. AUC : 0.964 | Accuracy : 75.8%
Fold n°6 on QDA. AUC : 0.964 | Accuracy : 75.5%
Fold n°7 on QDA. AUC : 0.963 | Accuracy : 75.2%
Fold n°8 on QDA. AUC : 0.962 | Accuracy : 74.9%
Fold n°9 on QDA. AUC : 0.962 | Accuracy : 74.9%
Fold n°10 on QDA. AUC : 0.963 | Accuracy : 75.0%
QuadraticDiscriminantAnalysis n°1 after BayesianGaussianMixture : Silhouette : -0.2% | Calinski Harabasz : 4265.9 | Davis Bouldin : 3.414


balanced_accuracy_score    0.750681
roc_auc_score              0.962435
dtype: float64

In [94]:
qda_predict_proba = 0 ; classif_scores = []

gkf = StratifiedKFold(N_FOLDS, shuffle=True, random_state = SEED + 2)

for fold, (trn_idx, val_idx) in enumerate(gkf.split(X, y)):   

    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = QuadraticDiscriminantAnalysis(reg_param=0)
    model.fit(X_trn, y_trn) # on trusted data only
    
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)
    
    s = (balanced_accuracy_score(y_val, y_pred),
        roc_auc_score(y_val, y_pred_proba, average="weighted", multi_class="ovo"))
    print(f"Fold n°{fold+1} on QDA. AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}")
    classif_scores.append(s)

    qda_predict_proba += model.predict_proba(data[best_feats]) / N_FOLDS

all_scores.append(scores(np.argmax(qda_predict_proba, axis=1), lib="QuadraticDiscriminantAnalysis n°2 after BayesianGaussianMixture"))
pd.DataFrame(classif_scores, columns = ["balanced_accuracy_score", "roc_auc_score"]).mean(0)

Fold n°1 on QDA. AUC : 1.000 | Accuracy : 96.4%
Fold n°2 on QDA. AUC : 1.000 | Accuracy : 96.9%
Fold n°3 on QDA. AUC : 1.000 | Accuracy : 96.8%
Fold n°4 on QDA. AUC : 1.000 | Accuracy : 97.1%
Fold n°5 on QDA. AUC : 1.000 | Accuracy : 97.1%
Fold n°6 on QDA. AUC : 1.000 | Accuracy : 96.5%
Fold n°7 on QDA. AUC : 1.000 | Accuracy : 97.5%
Fold n°8 on QDA. AUC : 1.000 | Accuracy : 96.9%
Fold n°9 on QDA. AUC : 1.000 | Accuracy : 96.9%
Fold n°10 on QDA. AUC : 1.000 | Accuracy : 97.1%
QuadraticDiscriminantAnalysis n°2 after BayesianGaussianMixture : Silhouette : -1.8% | Calinski Harabasz : 3637.6 | Davis Bouldin : 4.070


balanced_accuracy_score    0.969162
roc_auc_score              0.999800
dtype: float64

In [95]:
GNB_predict_proba = 0 ; classif_scores = []

gkf = StratifiedKFold(N_FOLDS, shuffle=True, random_state = SEED + 2)

for fold, (trn_idx, val_idx) in enumerate(gkf.split(X, y)):   

    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = GaussianNB(var_smoothing=.1)
    model.fit(X_trn, y_trn) # on trusted data only
    
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)
    
    s = (balanced_accuracy_score(y_val, y_pred),
        roc_auc_score(y_val, y_pred_proba, average="weighted", multi_class="ovo"))
    print(f"Fold n°{fold+1} on GaussianNB. AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}")
    classif_scores.append(s)

    GNB_predict_proba += model.predict_proba(data[best_feats]) / N_FOLDS

all_scores.append(scores(np.argmax(GNB_predict_proba, axis=1), lib="GaussianNaïveBayes after BayesianGaussianMixture"))
pd.DataFrame(classif_scores, columns = ["balanced_accuracy_score", "roc_auc_score"]).mean(0)

Fold n°1 on GaussianNB. AUC : 0.987 | Accuracy : 86.1%
Fold n°2 on GaussianNB. AUC : 0.988 | Accuracy : 86.7%
Fold n°3 on GaussianNB. AUC : 0.987 | Accuracy : 86.0%
Fold n°4 on GaussianNB. AUC : 0.987 | Accuracy : 86.4%
Fold n°5 on GaussianNB. AUC : 0.988 | Accuracy : 86.1%
Fold n°6 on GaussianNB. AUC : 0.988 | Accuracy : 86.9%
Fold n°7 on GaussianNB. AUC : 0.988 | Accuracy : 86.7%
Fold n°8 on GaussianNB. AUC : 0.987 | Accuracy : 86.0%
Fold n°9 on GaussianNB. AUC : 0.988 | Accuracy : 86.7%
Fold n°10 on GaussianNB. AUC : 0.988 | Accuracy : 86.7%
GaussianNaïveBayes after BayesianGaussianMixture : Silhouette : -0.9% | Calinski Harabasz : 4034.6 | Davis Bouldin : 3.699


balanced_accuracy_score    0.864122
roc_auc_score              0.987573
dtype: float64

In [96]:
lda_predict_proba = 0 ; classif_scores = []

gkf = StratifiedKFold(N_FOLDS, shuffle=True, random_state = SEED + 2)

for fold, (trn_idx, val_idx) in enumerate(gkf.split(X, y)):   

    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = LinearDiscriminantAnalysis()
    model.fit(X_trn, y_trn) # on trusted data only
    
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)
    
    s = (balanced_accuracy_score(y_val, y_pred),
        roc_auc_score(y_val, y_pred_proba, average="weighted", multi_class="ovo"))
    print(f"Fold n°{fold+1} on LDA. AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}")
    classif_scores.append(s)

    lda_predict_proba += model.predict_proba(data[best_feats]) / N_FOLDS

all_scores.append(scores(np.argmax(lda_predict_proba, axis=1), lib="LinearDiscriminantAnalysis after BayesianGaussianMixture"))
pd.DataFrame(classif_scores, columns = ["balanced_accuracy_score", "roc_auc_score"]).mean(0)

Fold n°1 on LDA. AUC : 0.970 | Accuracy : 77.3%
Fold n°2 on LDA. AUC : 0.969 | Accuracy : 76.9%
Fold n°3 on LDA. AUC : 0.970 | Accuracy : 77.5%
Fold n°4 on LDA. AUC : 0.969 | Accuracy : 77.2%
Fold n°5 on LDA. AUC : 0.971 | Accuracy : 78.2%
Fold n°6 on LDA. AUC : 0.971 | Accuracy : 78.0%
Fold n°7 on LDA. AUC : 0.971 | Accuracy : 77.6%
Fold n°8 on LDA. AUC : 0.970 | Accuracy : 78.0%
Fold n°9 on LDA. AUC : 0.970 | Accuracy : 77.3%
Fold n°10 on LDA. AUC : 0.970 | Accuracy : 77.8%
LinearDiscriminantAnalysis after BayesianGaussianMixture : Silhouette : -0.6% | Calinski Harabasz : 4115.5 | Davis Bouldin : 3.595


balanced_accuracy_score    0.775678
roc_auc_score              0.970036
dtype: float64

In [97]:
def soft_voting(preds_probs):

    values = list(range(N_CLUSTERS))
    pred_test = pd.DataFrame(np.zeros((data.shape[0], 7)), columns = values)

    for i, p in enumerate(preds_probs):
    
        MAX = np.argmax(p, axis=1)
        data[f'pred_{i}'] = MAX
    
        # Sort of the prediction by same value of cluster
        pred_keys = data[f'pred_{i}'].value_counts().index.tolist()
        pred_dict = dict(zip(pred_keys, values))
        data[f'pred_{i}'] = data[f'pred_{i}'].map(pred_dict)

        pred_new = pd.DataFrame(p).rename(columns = pred_dict)
        pred_new = pred_new.reindex(sorted(pred_new.columns), axis=1)
        pred_test += pred_new # Soft voting by probabiliy addition

    return np.argmax(np.array(pred_test), axis=1)

In [98]:
sv1_predict = soft_voting([et_predict_proba, lgbm_predict_proba, qda_predict_proba, lda_predict_proba, GNB_predict_proba])
all_scores.append(scores(sv1_predict, lib="Soft voting n°1 : all"))

sv2_predict = soft_voting([et_predict_proba, lgbm_predict_proba, qda_predict_proba])
all_scores.append(scores(sv2_predict, lib="Soft voting n°2 : LGBM, extratree and QDA"))

sv3_predict = soft_voting([lgbm_predict_proba, qda_predict_proba])
all_scores.append(scores(sv3_predict, lib="Soft voting n°3 : LGBM and QDA"))

Soft voting n°1 : all : Silhouette : -1.5% | Calinski Harabasz : 3754.6 | Davis Bouldin : 3.936
Soft voting n°2 : LGBM, extratree and QDA : Silhouette : -1.8% | Calinski Harabasz : 3561.3 | Davis Bouldin : 4.203
Soft voting n°3 : LGBM and QDA : Silhouette : -1.8% | Calinski Harabasz : 3593.9 | Davis Bouldin : 4.130


In [99]:
pd.DataFrame(all_scores, columns=["Model", "silhouette", "Calinski_Harabasz", "Davis_Bouldin"])

Unnamed: 0,Model,silhouette,Calinski_Harabasz,Davis_Bouldin
0,BayesianGaussianMixture after powertransformer,-0.036132,3336.759921,4.426559
1,BayesianGaussianMixture after powertransformer,-0.018725,3624.26581,4.137904
2,BayesianGaussianMixture after powertransformer,-0.018721,3624.181187,4.13783
3,BayesianGaussianMixture after powertransformer,-0.018785,3621.539623,4.130396
4,BayesianGaussianMixture after powertransformer,-0.018717,3624.465123,4.137746
5,BayesianGaussianMixture after powertransformer,-0.02156,3647.61698,4.212987
6,BayesianGaussianMixture after powertransformer,-0.01872,3624.334671,4.137636
7,BayesianGaussianMixture after powertransformer,-0.021562,3647.576707,4.212884
8,BayesianGaussianMixture after powertransformer,-0.018718,3624.278487,4.137681
9,BayesianGaussianMixture after powertransformer,-0.018782,3623.989097,4.133281


In [100]:
sub['Predicted'] = np.argmax(lgbm_predict_proba, axis = 1)
sub.to_csv(PATH+"submissions/submission_lgbm2.csv",index = "Id")

sub['Predicted'] = np.argmax(et_predict_proba, axis = 1)
sub.to_csv(PATH+"submissions/submission_extratree2.csv", index = "Id")
 
sub['Predicted'] = np.argmax(qda_predict_proba, axis = 1)
sub.to_csv(PATH+"submissions/submission_qda2.csv", index = "Id")
 
sub['Predicted'] = np.argmax(lda_predict_proba, axis = 1)
sub.to_csv(PATH+"submissions/submission_lda2.csv", index = "Id")
 
sub['Predicted'] = np.argmax(GNB_predict_proba, axis = 1)
sub.to_csv(PATH+"submissions/submission_GNB2.csv", index = "Id")
 
sub['Predicted'] = sv1_predict
sub.to_csv(PATH+"submissions/submission_softvote12.csv", index = "Id")

sub['Predicted'] = sv2_predict
sub.to_csv(PATH+"submissions/submission_softvote22.csv", index = "Id")

sub['Predicted'] = sv3_predict
sub.to_csv(PATH+"submissions/submission_softvote32.csv", index = "Id")