In [None]:
import numpy as np
import pandas as pd
import pickle
# Use the fully-qualified option names: the short forms are resolved by pattern
# matching, and in recent pandas they also match the `styler.render.*` options,
# which makes set_option raise OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 200)

import matplotlib.pyplot as plt
import seaborn as sns

import gc, random, os

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import PowerTransformer, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, balanced_accuracy_score, roc_auc_score
from sklearn.mixture import BayesianGaussianMixture

import lightgbm as lgb
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

import warnings
warnings.filterwarnings("ignore")

from drive.MyDrive.Kaggle.Clustering_072022.src.functions import *

In [None]:
# Run-wide configuration, read by the cells below.
SEED = 1024       # base random_state for the mixture model, the CV splitters and ExtraTrees
N_FOLDS = 10      # number of stratified folds used for every classifier
N_CLUSTERS = 7    # number of mixture components, i.e. number of target classes
PATH = "drive/MyDrive/Kaggle/Clustering_072022/"  # root folder for inputs and submissions

In [None]:
# data_removed.csv is only used for its column names (kept as best_feats).
data2 = pd.read_csv(PATH+'src/data_removed.csv', index_col='id')
data = pd.read_csv(PATH+'src/data.csv', index_col='id')
best_feats = data2.columns.tolist()
sub = pd.read_csv(PATH+'submissions/sample_submission.csv', index_col='Id')

# Split the columns by dtype family. Comparing against the strings 'int' /
# 'float' depends on the platform's default integer width, so a frame read as
# int64 can end up with no integer columns detected at all; select_dtypes on
# the abstract numpy families does not have that problem.
cat_feats = data.select_dtypes(include=[np.integer]).columns.tolist()
num_feats = data.select_dtypes(include=[np.floating]).columns.tolist()

# set_seed comes from the project's `functions` module (star import); it
# presumably seeds the global RNGs -- TODO confirm. Note that it is called
# with 3000, not with SEED.
set_seed(3000)

In [None]:
# Robust-scale the float features, pass everything else through untouched,
# then apply a power transform to all columns.
ct = ColumnTransformer([('rs', RobustScaler(), num_feats)], remainder='passthrough')
pt = Pipeline([('ct', ct), ('pt', PowerTransformer())])

# ColumnTransformer emits the listed columns first (num_feats) and appends the
# passthrough columns in their original order, so label the output from what
# was actually passed through rather than assuming it equals cat_feats.
passthrough_feats = [col for col in data.columns if col not in num_feats]

# Keep the original 'id' index instead of silently replacing it with 0..n-1.
# NOTE: this cell overwrites `data`; re-running it without reloading the CSV
# would transform the already-transformed frame a second time.
data = pd.DataFrame(
    pt.fit_transform(data),
    columns=num_feats + passthrough_feats,
    index=data.index,
)

def scores(preds, lib, df=None, verbose = True, compute_silhouette = True):
    """Compute internal clustering metrics for one labelling.

    Parameters
    ----------
    preds : array-like
        Cluster label of every row of ``df``.
    lib : str
        Name shown in the printed line and stored as first element of the result.
    df : DataFrame, optional
        Feature matrix the metrics are computed on. When omitted, the feature
        columns (``num_feats + cat_feats``) of the global ``data`` frame are
        used. The previous default, ``df=data``, was bound once at definition
        time to a frame that later cells extend in place with label and
        probability columns, so those columns leaked into every later score.
    verbose : bool
        Print a one-line summary when True.
    compute_silhouette : bool
        Silhouette is very slow; pass False to skip it (0 is reported instead).

    Returns
    -------
    tuple
        ``(lib, silhouette, calinski_harabasz, davies_bouldin)``.
    """
    if df is None:
        # Resolve at call time and restrict to the actual features.
        df = data[num_feats + cat_feats]

    sil = 0
    if compute_silhouette:
        sil = silhouette_score(df, preds, metric='euclidean')

    s = (lib,
         sil,
         calinski_harabasz_score(df, preds),
         davies_bouldin_score(df, preds))

    if verbose:
        print(f"{s[0]} : Silhouette : {s[1]:.1%} | Calinski Harabasz : {s[2]:.1f} | Davis Bouldin : {s[3]:.3f}")

    return s

# Accumulates one `scores` tuple per model for the final comparison table.
all_scores = []

In [None]:
# Fit a full-covariance Bayesian Gaussian mixture on the selected feature
# subset, then keep both the hard labels and the per-cluster probabilities.
bgm_inputs = data[best_feats]

BGM = BayesianGaussianMixture(
    n_components=N_CLUSTERS,
    covariance_type='full',
    random_state=SEED,
    n_init=5,
    tol=1e-4,
)
BGM.fit(bgm_inputs)

BGM_predict = BGM.predict(bgm_inputs)
BGM_predict_proba = BGM.predict_proba(bgm_inputs)

bgm_scores = scores(BGM_predict, lib="BayesianGaussianMixture after powertransformer")
all_scores.append(bgm_scores)

BayesianGaussianMixture after powertransformer : Silhouette : 1.5% | Calinski Harabasz : 1639.9 | Davis Bouldin : 5.591


In [None]:
# Get trusted data to train the supervised models: keep only the rows whose
# probability for their assigned cluster exceeds `proba_threshold`.
proba_threshold = .69

data['predict'] = BGM_predict
# Probability of the assigned cluster, picked row by row. Building the column
# directly as floats avoids creating an int column and upcasting it afterwards.
data['predict_proba'] = BGM_predict_proba[np.arange(len(BGM_predict)), BGM_predict]
for n in range(N_CLUSTERS):
    data[f'predict_proba_{n}'] = BGM_predict_proba[:,n]

median_list = []
trusted_idx = []  # one array of index labels per cluster, keeps the index dtype
for n in range(N_CLUSTERS):
    cluster_proba = data.loc[data.predict == n, 'predict_proba']
    median = cluster_proba.median()
    median_list.append(median)

    idx = cluster_proba.index[(cluster_proba > proba_threshold).to_numpy()]
    trusted_idx.append(np.asarray(idx))

    # A mixture component may end up with no rows at all; report 0% instead of dividing by zero.
    share = len(idx) / len(cluster_proba) if len(cluster_proba) else 0.0
    print(f'Class n°{n}  |  Median : {median:.4f}  |  Training data : {share:.1%}')

print(f'median mean: {np.mean(median_list)}')

# Rows are ordered cluster by cluster, as before; labels are no longer coerced
# to float by concatenating onto an empty float array.
idxs = np.concatenate(trusted_idx)
X = data.loc[idxs, cat_feats+num_feats]
y = data.loc[idxs, 'predict']

Class n°0  |  Median : 0.8684  |  Training data : 72.9%
Class n°1  |  Median : 0.7311  |  Training data : 55.5%
Class n°2  |  Median : 0.8671  |  Training data : 73.2%
Class n°3  |  Median : 0.9833  |  Training data : 88.5%
Class n°4  |  Median : 0.9074  |  Training data : 78.0%
Class n°5  |  Median : 0.9119  |  Training data : 76.1%
Class n°6  |  Median : 0.9376  |  Training data : 79.4%
median mean: 0.8866725228749873


In [None]:
# LightGBM trained on the trusted rows with stratified CV; every fold's model
# also predicts the whole data set and the fold predictions are averaged.
params_lgb = {
    'learning_rate': 0.07,
    'objective': 'multiclass',
    'boosting': 'gbdt',
    'verbosity': -1,
    'n_jobs': -1,
    'num_classes': N_CLUSTERS,
}

lgbm_predict_proba = 0 ; classif_scores = []

# Predict on exactly the columns, in exactly the order, the boosters are
# trained on. X is built from cat_feats+num_feats, whereas data[best_feats]
# may hold a different column set or order, which would feed misaligned
# features to the model.
X_full = data[X.columns]

gkf = StratifiedKFold(N_FOLDS, shuffle=True, random_state = SEED)
for fold, (trn_idx, val_idx) in enumerate(gkf.split(X,y)):

    X_trn = lgb.Dataset(X.iloc[trn_idx], y.iloc[trn_idx])
    X_val = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

    model = lgb.train(params = params_lgb,
                train_set = X_trn, valid_sets =  X_val,
                num_boost_round = 5000,
                callbacks = [ lgb.early_stopping(stopping_rounds=300, verbose=True)])

    y_pred_proba = model.predict(X.iloc[val_idx])
    y_pred = np.argmax(y_pred_proba, axis=1)

    # s = (balanced accuracy, weighted one-vs-one AUC) on the validation fold
    s = (balanced_accuracy_score(y.iloc[val_idx], y_pred),
        roc_auc_score(y.iloc[val_idx], y_pred_proba, average="weighted", multi_class="ovo"))
    print(f"Fold n°{fold+1} on LGBM. AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}\n")
    classif_scores.append(s)

    lgbm_predict_proba += model.predict(X_full) / N_FOLDS

all_scores.append(scores(np.argmax(lgbm_predict_proba, axis=1), lib="LGBM after BayesianGaussianMixture - threshold 0.69"))

pd.DataFrame(classif_scores, columns = ["balanced_accuracy_score", "roc_auc_score"]).mean(0)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[884]	valid_0's multi_logloss: 0.0252393
[885]	valid_0's multi_logloss: 0.0252232
[886]	valid_0's multi_logloss: 0.0252196
[887]	valid_0's multi_logloss: 0.0252185
[888]	valid_0's multi_logloss: 0.0252084
[889]	valid_0's multi_logloss: 0.0251964
[890]	valid_0's multi_logloss: 0.0251821
[891]	valid_0's multi_logloss: 0.0251912
[892]	valid_0's multi_logloss: 0.0251901
[893]	valid_0's multi_logloss: 0.0251461
[894]	valid_0's multi_logloss: 0.0251286
[895]	valid_0's multi_logloss: 0.025103
[896]	valid_0's multi_logloss: 0.0250866
[897]	valid_0's multi_logloss: 0.0251012
[898]	valid_0's multi_logloss: 0.0251227
[899]	valid_0's multi_logloss: 0.0251138
[900]	valid_0's multi_logloss: 0.0251057
[901]	valid_0's multi_logloss: 0.0251044
[902]	valid_0's multi_logloss: 0.0250923
[903]	valid_0's multi_logloss: 0.0251029
[904]	valid_0's multi_logloss: 0.0250921
[905]	valid_0's multi_logloss: 0.0250757
[906]	valid_0's multi_logloss: 0.0

balanced_accuracy_score    0.991155
roc_auc_score              0.999926
dtype: float64

In [None]:
# ExtraTrees trained on the trusted rows with stratified CV; fold predictions
# on the whole data set are averaged into et_predict_proba.
et_predict_proba = 0 ; classif_scores = []

gkf = StratifiedKFold(N_FOLDS, shuffle=True, random_state = SEED + 1)

# Predict on the same columns, in the same order, as the training matrix X
# (cat_feats+num_feats). data[best_feats] may differ in column set or order.
X_full = data[X.columns]

for fold, (trn_idx, val_idx) in enumerate(gkf.split(X, y)):

    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = ExtraTreesClassifier(n_estimators=100, random_state=SEED)
    model.fit(X_trn, y_trn)

    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)

    # s = (balanced accuracy, weighted one-vs-one AUC) on the validation fold
    s = (balanced_accuracy_score(y_val, y_pred),
        roc_auc_score(y_val, y_pred_proba, average="weighted", multi_class="ovo"))
    print(f"Fold n°{fold+1} on Extratree. AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}")
    classif_scores.append(s)

    et_predict_proba += model.predict_proba(X_full) / N_FOLDS

all_scores.append(scores(np.argmax(et_predict_proba, axis=1), lib="Extratree after BayesianGaussianMixture"))

pd.DataFrame(classif_scores, columns = ["balanced_accuracy_score", "roc_auc_score"]).mean(0)

In [None]:
# QDA with full regularisation (reg_param=1), trained on the trusted rows.
# NOTE(review): `qda_predict_proba` is reassigned by the reg_param=0 run
# further down, so only that later result reaches soft voting and the
# submission file; this run only contributes a row to `all_scores`.
qda_predict_proba = 0 ; classif_scores = []

gkf = StratifiedKFold(N_FOLDS, shuffle=True, random_state = SEED + 2)

# Predict on the same columns, in the same order, as the training matrix X
# (cat_feats+num_feats). data[best_feats] may differ in column set or order.
X_full = data[X.columns]

for fold, (trn_idx, val_idx) in enumerate(gkf.split(X, y)):

    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = QuadraticDiscriminantAnalysis(reg_param=1)
    model.fit(X_trn, y_trn) # on trusted data only

    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)

    # s = (balanced accuracy, weighted one-vs-one AUC) on the validation fold
    s = (balanced_accuracy_score(y_val, y_pred),
        roc_auc_score(y_val, y_pred_proba, average="weighted", multi_class="ovo"))
    print(f"Fold n°{fold+1} on QDA. AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}")
    classif_scores.append(s)

    qda_predict_proba += model.predict_proba(X_full) / N_FOLDS

all_scores.append(scores(np.argmax(qda_predict_proba, axis=1), lib="QuadraticDiscriminantAnalysis n°1 after BayesianGaussianMixture"))
pd.DataFrame(classif_scores, columns = ["balanced_accuracy_score", "roc_auc_score"]).mean(0)

In [None]:
# QDA without regularisation (reg_param=0), trained on the trusted rows; fold
# predictions on the whole data set are averaged into qda_predict_proba.
qda_predict_proba = 0 ; classif_scores = []

gkf = StratifiedKFold(N_FOLDS, shuffle=True, random_state = SEED + 2)

# Predict on the same columns, in the same order, as the training matrix X
# (cat_feats+num_feats). data[best_feats] may differ in column set or order.
X_full = data[X.columns]

for fold, (trn_idx, val_idx) in enumerate(gkf.split(X, y)):

    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = QuadraticDiscriminantAnalysis(reg_param=0)
    model.fit(X_trn, y_trn) # on trusted data only

    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)

    # s = (balanced accuracy, weighted one-vs-one AUC) on the validation fold
    s = (balanced_accuracy_score(y_val, y_pred),
        roc_auc_score(y_val, y_pred_proba, average="weighted", multi_class="ovo"))
    print(f"Fold n°{fold+1} on QDA. AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}")
    classif_scores.append(s)

    qda_predict_proba += model.predict_proba(X_full) / N_FOLDS

all_scores.append(scores(np.argmax(qda_predict_proba, axis=1), lib="QuadraticDiscriminantAnalysis n°2 after BayesianGaussianMixture"))
pd.DataFrame(classif_scores, columns = ["balanced_accuracy_score", "roc_auc_score"]).mean(0)

In [None]:
# Gaussian naive Bayes trained on the trusted rows with stratified CV; fold
# predictions on the whole data set are averaged into GNB_predict_proba.
GNB_predict_proba = 0 ; classif_scores = []

gkf = StratifiedKFold(N_FOLDS, shuffle=True, random_state = SEED + 2)

# Predict on the same columns, in the same order, as the training matrix X
# (cat_feats+num_feats). data[best_feats] may differ in column set or order.
X_full = data[X.columns]

for fold, (trn_idx, val_idx) in enumerate(gkf.split(X, y)):

    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = GaussianNB(var_smoothing=.1)
    model.fit(X_trn, y_trn) # on trusted data only

    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)

    # s = (balanced accuracy, weighted one-vs-one AUC) on the validation fold
    s = (balanced_accuracy_score(y_val, y_pred),
        roc_auc_score(y_val, y_pred_proba, average="weighted", multi_class="ovo"))
    print(f"Fold n°{fold+1} on GaussianNB. AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}")
    classif_scores.append(s)

    GNB_predict_proba += model.predict_proba(X_full) / N_FOLDS

all_scores.append(scores(np.argmax(GNB_predict_proba, axis=1), lib="GaussianNaïveBayes after BayesianGaussianMixture"))
pd.DataFrame(classif_scores, columns = ["balanced_accuracy_score", "roc_auc_score"]).mean(0)

In [None]:
# LDA trained on the trusted rows with stratified CV; fold predictions on the
# whole data set are averaged into lda_predict_proba.
lda_predict_proba = 0 ; classif_scores = []

gkf = StratifiedKFold(N_FOLDS, shuffle=True, random_state = SEED + 2)

# Predict on the same columns, in the same order, as the training matrix X
# (cat_feats+num_feats). data[best_feats] may differ in column set or order.
X_full = data[X.columns]

for fold, (trn_idx, val_idx) in enumerate(gkf.split(X, y)):

    X_trn, y_trn = X.iloc[trn_idx], y.iloc[trn_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = LinearDiscriminantAnalysis()
    model.fit(X_trn, y_trn) # on trusted data only

    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)

    # s = (balanced accuracy, weighted one-vs-one AUC) on the validation fold
    s = (balanced_accuracy_score(y_val, y_pred),
        roc_auc_score(y_val, y_pred_proba, average="weighted", multi_class="ovo"))
    print(f"Fold n°{fold+1} on LDA. AUC : {s[1]:.3f} | Accuracy : {s[0]:.1%}")
    classif_scores.append(s)

    lda_predict_proba += model.predict_proba(X_full) / N_FOLDS

all_scores.append(scores(np.argmax(lda_predict_proba, axis=1), lib="LinearDiscriminantAnalysis after BayesianGaussianMixture"))
pd.DataFrame(classif_scores, columns = ["balanced_accuracy_score", "roc_auc_score"]).mean(0)

In [None]:
def soft_voting(preds_probs):
    """Soft-vote several probability matrices after aligning their clusters.

    For every matrix the clusters are relabelled by decreasing frequency of
    the hard (argmax) prediction: the most frequently predicted cluster
    becomes 0, the next one 1, and so on. The relabelled matrices are summed
    and the argmax of the sum is returned.

    Changes from the earlier version: the number of clusters is taken from
    the input instead of being hard-coded to 7; no ``pred_<i>`` helper columns
    are written into the global ``data`` frame any more; and clusters a model
    never predicts are placed after the predicted ones instead of producing
    duplicate column labels.

    Parameters
    ----------
    preds_probs : sequence of array-like, each of shape (n_rows, n_clusters)
        One probability matrix per model; all must share the same shape.

    Returns
    -------
    numpy.ndarray of shape (n_rows,)
        Voted cluster of every row, expressed in the frequency-ranked labels.

    Raises
    ------
    ValueError
        If ``preds_probs`` is empty or the matrices differ in shape.
    """
    prob_matrices = [np.asarray(p, dtype=float) for p in preds_probs]
    if not prob_matrices:
        raise ValueError("soft_voting needs at least one probability matrix")

    n_rows, n_clusters = prob_matrices[0].shape
    summed = np.zeros((n_rows, n_clusters))

    for p in prob_matrices:
        if p.shape != (n_rows, n_clusters):
            raise ValueError(
                f"all probability matrices must have shape {(n_rows, n_clusters)}, got {p.shape}"
            )

        hard_labels = np.argmax(p, axis=1)

        # Original cluster ids sorted by how often they are predicted.
        by_frequency = pd.Series(hard_labels).value_counts().index.tolist()
        # Clusters that never win the argmax keep a slot after the others.
        never_predicted = [k for k in range(n_clusters) if k not in by_frequency]
        column_order = by_frequency + never_predicted

        # Column j of the aligned matrix is the j-th most frequent cluster.
        summed += p[:, column_order]  # soft voting by probability addition

    return np.argmax(summed, axis=1)

In [None]:
# Three ensembles, from the widest to the narrowest set of classifiers.
# Each vote is scored right after it is computed.
vote_all = [et_predict_proba, lgbm_predict_proba, qda_predict_proba, lda_predict_proba, GNB_predict_proba]
sv1_predict = soft_voting(vote_all)
sv1_scores = scores(sv1_predict, lib="Soft voting n°1 : all")
all_scores.append(sv1_scores)

vote_trees_qda = [et_predict_proba, lgbm_predict_proba, qda_predict_proba]
sv2_predict = soft_voting(vote_trees_qda)
sv2_scores = scores(sv2_predict, lib="Soft voting n°2 : LGBM, extratree and QDA")
all_scores.append(sv2_scores)

vote_lgbm_qda = [lgbm_predict_proba, qda_predict_proba]
sv3_predict = soft_voting(vote_lgbm_qda)
sv3_scores = scores(sv3_predict, lib="Soft voting n°3 : LGBM and QDA")
all_scores.append(sv3_scores)

In [None]:
# One row per model, in the order the scores were appended.
score_columns = ["Model", "silhouette", "Calinski_Harabasz", "Davis_Bouldin"]
pd.DataFrame(all_scores, columns=score_columns)

In [None]:
# One submission file per model / ensemble. The dict keeps insertion order, so
# files are written in the same order as before and `sub` ends up holding the
# soft voting n°3 labels.
submissions = {
    "lgbm": np.argmax(lgbm_predict_proba, axis=1),
    "extratree": np.argmax(et_predict_proba, axis=1),
    "qda": np.argmax(qda_predict_proba, axis=1),
    "lda": np.argmax(lda_predict_proba, axis=1),
    "GNB": np.argmax(GNB_predict_proba, axis=1),
    "softvote1": sv1_predict,
    "softvote2": sv2_predict,
    "softvote3": sv3_predict,
}

for tag, labels in submissions.items():
    sub['Predicted'] = labels
    # `index` is a boolean switch; the former index="Id" only worked because a
    # non-empty string is truthy. The header still reads "Id" because that is
    # the name of sub's index (set by index_col='Id' when it was loaded).
    sub.to_csv(PATH + f"submissions/submission_{tag}_all_feats.csv", index=True)

In [None]:
# Display the submission frame; 'Predicted' holds the labels assigned last in
# the previous cell (soft voting n°3).
sub

Unnamed: 0_level_0,Predicted
Id,Unnamed: 1_level_1
0,4
1,1
2,0
3,0
4,2
...,...
97995,0
97996,3
97997,2
97998,2
