# Imports

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np

# scikit-learn building blocks: pipeline/column helpers, the candidate
# scalers and transformers, and the Bayesian Gaussian mixture model.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, PowerTransformer, MinMaxScaler, MaxAbsScaler, QuantileTransformer
from sklearn.mixture import BayesianGaussianMixture

# Project helper module stored on the mounted Drive.
# NOTE(review): the star import hides which names it provides; presumably
# this is where `set_seed` comes from -- confirm against src/functions.py.
from drive.MyDrive.Kaggle.Clustering_072022.src.functions import *

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Reading, Seed & Feature Definition

In [None]:
# Project root on the mounted Drive, relative to the working directory.
# Kept as a plain string with a trailing slash because paths are built
# with `PATH + '...'`.
PATH = 'drive/MyDrive/Kaggle/Clustering_072022/'
data = pd.read_csv(PATH + 'src/data_removed.csv', index_col='id')

# Split the columns by dtype: integer columns are treated as the
# categorical features, floating-point columns as the numerical features.
# select_dtypes with the abstract NumPy types is used instead of comparing
# `data.dtypes` to the strings 'int' / 'float': those strings resolve to a
# platform-dependent width, so the comparison can silently return no columns.
cat_feats = data.select_dtypes(include=np.integer).columns.tolist()
num_feats = data.select_dtypes(include=np.floating).columns.tolist()

# `set_seed` is not defined in this notebook; it is expected to come from
# the project's functions module (star import in the imports cell).
set_seed(50)

# Tests
I am testing different pipelines to see which combination of scalers and transformers gives me the best leaderboard score.

In [None]:
# Clustering model under test: a Bayesian Gaussian mixture with 7 components
# and full covariance matrices (max_iter=300, n_init=3).
bgm = BayesianGaussianMixture(
    n_components=7,
    covariance_type='full',
    max_iter=300,
    n_init=3,
)

# Preprocessing for this experiment: quantile-transform the numerical
# features, power-transform the categorical ones, then min-max scale all.
# NOTE: `data` is overwritten in place, so re-running this cell without
# re-running the data-reading cell applies the transforms a second time.
data[num_feats] = QuantileTransformer().fit_transform(data[num_feats])
data[cat_feats] = PowerTransformer().fit_transform(data[cat_feats])

# fit_transform returns a bare array; pass the index back explicitly so the
# `id` index set when the CSV was read is not replaced by a default one.
data = pd.DataFrame(
    MinMaxScaler().fit_transform(data),
    columns=data.columns,
    index=data.index,
)

# Fit the mixture, then write the cluster labels into the sample submission.
# The output file name encodes the preprocessing recipe used above.
preds = bgm.fit_predict(data)
sub = pd.read_csv(PATH + 'submissions/sample_submission.csv', index_col='Id')
sub['Predicted'] = preds
sub.to_csv(PATH + 'submissions/tests/BGM_numsQT_catsPT_allMMS.csv')

# Result Summary
- Using both features:
    - Just power transformer on all, seed 50: 0.60102
    - Robust scaler on num feats, power transformer on all, seed 50, weight prior 0.25: 0.60079
    - Robust scaler on num feats, then power transformer on all, seed 50: 0.60078
    - Robust scaler on num feats, power transformer on cat feats, seed 50: 0.60050
    - Max absolute scaler on num feats, power transformer on cat feats, seed 50: 0.60045
    - Minmax scaler on num feats, power transformer on cat feats, seed 50: 0.60038
- Using just cat features:
    - Power transformer, seed 50: 0.37492
    - Power transformer then robust scaler, seed 50: 0.37485
    - Minmax scaler, seed 50: 0.30510
    - Raw features, seed 50: 0.27872
- Using just num features:
    - Robust scaler on the num feats, seed 50: 0.04862

I was a little surprised at how well using just the categorical features did and how poorly the floats did on their own. This tells us there are interactions between these two groups of features, which are important for clustering.

The best-scoring combination makes sense: the numerical features contain outliers, and the categorical features are far from normally distributed, so they need the power transformer; if they were scaled …

In [None]:
# Nested lookup of the recorded scores, keyed first by the feature set used
# and then by the preprocessing choices applied to it.
summary = {
    'Using Both Features': {
        'RS on Nums': {
            'PT on all': {
                'Weight Prior': {
                    'Normal': 0.60078,
                    '0.25': 0.60079,
                },
            },
        },
        'PT on cats': 0.60050,
    },
}