In [None]:
import lib._util.visualplot as vp

# Pre-processing
from lib._class.DFBoxCoxTransformer import DFBoxCoxTransformer
from lib._class.DFDTypeTransformer import DFDTypeTransformer

# Feature encoding
from lib._class.DFOneHotEncoder import DFOneHotEncoder
from lib._class.DFBinaryEncoder import DFBinaryEncoder

# Feature scaling
from lib._class.DFStandardScaler import DFStandardScaler
from lib._class.DFMinMaxScaler import DFMinMaxScaler

# Feature extraction
from lib._class.DFPCA import DFPCA
from lib._class.DFMCA import DFMCA
from lib._class.DFIvis import DFIvis

In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np

# Scikit-learn
from sklearn.pipeline import Pipeline

# Plotly
import plotly.express as px

# Constants

In [None]:
# Directory prefix for input CSV files. load_data concatenates it directly with
# the file name, so the trailing slash is required.
SOURCE_PATH_DATA = 'resources/data/'
# Directory handed to the vp plotting helpers as `out_path`.
OUT_PATH_GRAPH   = 'resources/output/graph/'

# Phase 1 - Data Loading
- Reference: https://www.kaggle.com/c/allstate-claims-severity/data
- Each row in this dataset represents an insurance claim.
- Variables prefaced with 'cat' are categorical, while those prefaced with 'cont' are continuous.
- You must predict the value for the 'loss' column.

In [None]:
def load_data(filename, nrows=25_000, chunksize=50_000):
    """Read a CSV file located under ``SOURCE_PATH_DATA`` into one DataFrame.

    Parameters
    ----------
    filename : str
        File name, appended directly to ``SOURCE_PATH_DATA``.
    nrows : int or None, default 25_000
        Maximum number of data rows to read. ``None`` reads the whole file.
    chunksize : int or None, default 50_000
        Number of rows parsed per chunk. ``None`` reads the file in one call.
        With the defaults, ``nrows`` is smaller than ``chunksize``, so the
        sample is read as a single chunk.

    Returns
    -------
    pandas.DataFrame
        The rows read, with chunks concatenated in file order.
    """
    path = f'{SOURCE_PATH_DATA}{filename}'

    if chunksize is None:
        # Without a chunk size read_csv already returns a DataFrame.
        return pd.read_csv(path, sep=',', nrows=nrows)

    reader = pd.read_csv(path, sep=',', chunksize=chunksize, nrows=nrows)
    try:
        return pd.concat(reader)
    finally:
        # Release the file handle even when parsing fails part-way through.
        reader.close()

In [None]:
# Load the training set. load_data caps the read at 25,000 rows (its nrows
# setting), so this is a sample of the file, not necessarily all of it.
train_df = load_data('train.csv')

# Show (rows, columns) as the cell output.
train_df.shape

In [None]:
# Preview the first rows of the loaded frame.
train_df.head()

In [None]:
# Project helper from lib._util.visualplot; presumably a per-column summary of
# the frame — TODO confirm against the helper's implementation.
vp.faststat(train_df)

In [None]:
# Summary statistics of the 'loss' column as loaded (no transformation has been
# applied to it at this point in the notebook).
train_df['loss'].describe()

In [None]:
# Classification target: 1 when 'loss' is at least 10,000, otherwise 0.
# It is derived here from the 'loss' values as loaded.
# NOTE(review): the 10_000 threshold is a hard-coded magic number; consider a
# named constant next to the path constants.
train_df['target'] = np.where(train_df['loss'] >= 10_000, 1, 0)

# Project helper; presumably shows the frequency of each 'target' value — TODO confirm.
vp.value_count(train_df, 'target')

###### Histogram

In [None]:
# Histograms of every numeric column of train_df, laid out through the project
# plotting helper. The 0/1 'target' column created above is numeric, so it is
# included as well.
vp.histogram(train_df.select_dtypes(include='number'),
             bin_algo='count',
             max_col=4,
             title='Phase 1 - Histogram - Numerical',
             out_path=OUT_PATH_GRAPH,
             layout_kwargs={'height': 1000})

In [None]:
# Same plot for the object-dtype columns (the categorical features); a larger
# figure height is passed through layout_kwargs.
vp.histogram(train_df.select_dtypes(include='object'),
             bin_algo='count',
             max_col=4,
             title='Phase 1 - Histogram - Categorical',
             out_path=OUT_PATH_GRAPH,
             layout_kwargs={'height': 5000})

# Phase 2 - Data Preparation
- Handle skewness

In [None]:
# Fit the project's Box-Cox transformer on the 'loss' column only and apply it.
# NOTE(review): train_df is rebound to the transformed frame, so this cell is
# not idempotent — re-running it without reloading the data feeds the already
# transformed 'loss' back into the transformer.
boxcox_transformer = DFBoxCoxTransformer(columns=['loss'])
train_df = boxcox_transformer.fit_transform(train_df)

# Show (rows, columns) after the transformation.
train_df.shape

In [None]:
# Statistics table exposed by the fitted transformer (contents are defined by
# DFBoxCoxTransformer; not visible from this notebook).
boxcox_transformer.stat_df

In [None]:
# Re-run the project summary helper on the frame after the 'loss' transformation.
vp.faststat(train_df)

# Phase 3 - Data Preparation
- Feature encoding
- Feature scaling

In [None]:
# Separate the features from the two prediction targets: 'id', 'loss' and
# 'target' are excluded from the feature matrix (drop returns a new frame).
X = train_df.drop(columns=['id', 'loss', 'target'], errors='ignore')
y_regress = train_df['loss'].copy()
y_classif = train_df['target'].copy()

X.shape, y_regress.shape, y_classif.shape

In [None]:
# Split the object-dtype columns by cardinality: fewer than 10 distinct values
# -> one-hot encoder, everything else -> binary encoder.
object_columns     = X.select_dtypes(include='object').columns
low_cardinalities  = [col for col in object_columns if len(X[col].unique()) < 10]
high_cardinalities = [col for col in object_columns if col not in low_cardinalities]

onehot_encoder  = DFOneHotEncoder(columns=low_cardinalities, dtype='byte', drop='first')
binary_encoder  = DFBinaryEncoder(columns=high_cardinalities, drop_invariant=True)
# The scaler is given the columns that are numeric *before* any encoding runs.
standard_scaler = DFStandardScaler(columns=X.select_dtypes(include='number').columns)

steps = [
    ('onehot_encoder', onehot_encoder),
    ('binary_encoder', binary_encoder),
    ('standard_scaler', standard_scaler),
]
# NOTE: X is rebound to the encoded/scaled frame, so re-run the cell that
# builds X from train_df before re-running this one.
X = Pipeline(steps).fit_transform(X)

X.shape

In [None]:
# Project summary helper on the encoded and scaled feature frame.
vp.faststat(X)

# Phase 4 - Data Preparation
- Feature extraction (PCA + MCA)

In [None]:
# Fit the local PCA and MCA with every dimension kept, so that the inertia
# curves can show how many dimensions each of them actually needs.
numerics   = X.select_dtypes(include='float').columns
categories = X.select_dtypes(include='int8').columns

# The int8 columns are cast to 'str' ahead of the MCA step.
dtype_transformer = DFDTypeTransformer(dtype_dict={
    'str': categories
})

local_pca = DFPCA(columns=numerics,
                  n_components=len(numerics),
                  rescale_with_mean=False, rescale_with_std=False)

# MCA component count: total number of distinct values over all int8 columns.
local_mca = DFMCA(columns=categories,
                  n_components=X[categories].apply(lambda col: len(col.unique())).sum())

steps = [
    ('dtype_transformer', dtype_transformer),
    ('local_pca', local_pca),
    ('local_mca', local_mca),
]
pipeline = Pipeline(steps, verbose=True).fit(X)

###### Line

In [None]:
# Per-dimension and cumulative explained inertia reported by the local PCA.
vp.line(local_pca.stat_df,
        xy_tuples=[('dimension', 'explained_inertia'),
                   ('dimension', 'cumsum_explained_inertia')],
        title='Phase 4 - Inertia - Local PCA',
        out_path=OUT_PATH_GRAPH,
        scattergl=True)

In [None]:
# Per-dimension and cumulative explained inertia reported by the local MCA.
vp.line(local_mca.stat_df,
        xy_tuples=[('dimension', 'explained_inertia'),
                   ('dimension', 'cumsum_explained_inertia')],
        title='Phase 4 - Inertia - Local MCA',
        out_path=OUT_PATH_GRAPH,
        scattergl=True)

In [None]:
# Keep the N dimensions that explain 100% of the inertia.
# NOTE(review): 14 and 246 are hard-coded; presumably they were read off the
# local PCA / MCA inertia plots — TODO confirm, and revisit whenever the loaded
# rows or the encoding change.
local_pca = DFPCA(columns=numerics,
                  n_components=14,
                  rescale_with_mean=False, rescale_with_std=False)

local_mca  = DFMCA(columns=categories,
                   n_components=246)

steps = [
    ('dtype_transformer', dtype_transformer),
    ('local_pca', local_pca),
    ('local_mca', local_mca),
]
# Output of the local PCA + MCA pipeline on X.
pca_df = Pipeline(steps, verbose=True).fit_transform(X)

pca_df.shape

In [None]:
# Project summary helper on the local PCA + MCA output.
vp.faststat(pca_df)

In [None]:
# Keep all dimensions to evaluate how many are needed for the global PCA:
# n_components equals the current number of columns of pca_df.
global_standard_scaler = DFStandardScaler()
global_pca             = DFPCA(n_components=pca_df.shape[1],
                               rescale_with_mean=False, rescale_with_std=False)

steps = [
    ('global_standard_scaler', global_standard_scaler),
    ('global_pca', global_pca)
]
# Fit only; the fitted global_pca is inspected through its stat_df afterwards.
pipeline = Pipeline(steps, verbose=True).fit(pca_df)

###### Line

In [None]:
# Per-dimension and cumulative explained inertia reported by the global PCA.
vp.line(global_pca.stat_df,
        xy_tuples=[('dimension', 'explained_inertia'),
                   ('dimension', 'cumsum_explained_inertia')],
        title='Phase 4 - Inertia - Global PCA',
        out_path=OUT_PATH_GRAPH,
        scattergl=True)

In [None]:
# Keep the N dimensions that explain 90% of the inertia.
# NOTE(review): 225 is hard-coded; presumably it was read off the global PCA
# inertia plot — TODO confirm, and revisit when the input data changes.
global_pca = DFPCA(n_components=225,
                   rescale_with_mean=False, rescale_with_std=False)

steps = [
    ('global_standard_scaler', global_standard_scaler),
    ('global_pca', global_pca)
]
# NOTE: pca_df is both the input and the rebound output here, so this cell is
# not idempotent — re-create pca_df from the local PCA/MCA pipeline before
# re-running it.
pca_df = Pipeline(steps, verbose=True).fit_transform(pca_df)

pca_df.shape

In [None]:
# Project summary helper on the global PCA output.
vp.faststat(pca_df)

###### Scatter

In [None]:
# Scatter 'pca_0' against each of the first four other columns of pca_df,
# coloured by the regression target 'loss'.
vp.scatter(pd.concat([pca_df, y_regress], axis=1),
           xy_tuples=[('pca_0', col)
                      for col in pca_df.columns[pca_df.columns != 'pca_0'][:4]],
           color='loss',
           title='Phase 4 - Scatter - PCA - Regression',
           max_col=2,
           out_path=OUT_PATH_GRAPH)

In [None]:
# Attach the class label as a string column so it is coloured as a category.
tmp_df = pd.concat([pca_df, y_classif], axis=1).assign(
    target=lambda frame: frame['target'].astype(str))

# Scatter 'pca_0' against each of the first four other columns of pca_df.
vp.scatter(tmp_df,
           xy_tuples=[('pca_0', col)
                      for col in pca_df.columns[pca_df.columns != 'pca_0'][:4]],
           color='target',
           title='Phase 4 - Scatter - PCA - Classification',
           max_col=2,
           out_path=OUT_PATH_GRAPH)

# The plotting frame is only needed for this figure.
del tmp_df

# Phase 5 - Data Preparation
- Feature extraction (Ivis)

In [None]:
# Unsupervised ivis: the pipeline is fitted on X alone (no target is passed).
# Features go through the project's min-max scaler before the ivis step.
# NOTE(review): no random seed is set anywhere in this notebook; if DFIvis
# training is stochastic the embedding will differ between runs — TODO confirm.
minmax_scaler = DFMinMaxScaler()
ivis          = DFIvis(embedding_dims=5,
                       k=150, n_epochs_without_progress=10, model='szubert')

steps = [
    ('minmax_scaler', minmax_scaler),
    ('ivis', ivis),
]
ivis_df = Pipeline(steps, verbose=True).fit_transform(X)

In [None]:
# Project summary helper on the unsupervised ivis output.
vp.faststat(ivis_df)

###### Scatter

In [None]:
# Scatter 'ivis_0' against each of the first four other columns of ivis_df,
# coloured by the regression target 'loss'.
vp.scatter(pd.concat([ivis_df, y_regress], axis=1),
           xy_tuples=[('ivis_0', col)
                      for col in ivis_df.columns[ivis_df.columns != 'ivis_0'][:4]],
           color='loss',
           title='Phase 5 - Scatter - Unsupervised Ivis - Regression',
           max_col=2,
           out_path=OUT_PATH_GRAPH)

In [None]:
# Attach the class label as a string column so it is coloured as a category.
tmp_df = pd.concat([ivis_df, y_classif], axis=1).assign(
    target=lambda frame: frame['target'].astype(str))

# Scatter 'ivis_0' against each of the first four other columns of ivis_df.
vp.scatter(tmp_df,
           xy_tuples=[('ivis_0', col)
                      for col in ivis_df.columns[ivis_df.columns != 'ivis_0'][:4]],
           color='target',
           title='Phase 5 - Scatter - Unsupervised Ivis - Classification',
           max_col=2,
           out_path=OUT_PATH_GRAPH)

# The plotting frame is only needed for this figure.
del tmp_df

In [None]:
# Ivis (regression): same scaler + ivis pipeline as the unsupervised run, but
# with the supervision options set and the regression target y_regress passed
# to fit_transform.
# NOTE: ivis_df is rebound here, replacing the unsupervised embedding.
minmax_scaler = DFMinMaxScaler()
ivis          = DFIvis(embedding_dims=5,
                       k=150, n_epochs_without_progress=10, model='szubert',
                       supervision_weight=1, supervision_metric='mean_squared_error', distance='softmax_ratio')

steps = [
    ('minmax_scaler', minmax_scaler),
    ('ivis', ivis),
]
ivis_df = Pipeline(steps, verbose=True).fit_transform(X, y_regress)

In [None]:
# Project summary helper on the regression-supervised ivis output.
vp.faststat(ivis_df)

###### Scatter

In [None]:
# Scatter 'ivis_0' against each of the first four other columns of ivis_df,
# coloured by the regression target 'loss'.
vp.scatter(pd.concat([ivis_df, y_regress], axis=1),
           xy_tuples=[('ivis_0', col)
                      for col in ivis_df.columns[ivis_df.columns != 'ivis_0'][:4]],
           color='loss',
           title='Phase 5 - Scatter - Ivis - Regression',
           max_col=2,
           out_path=OUT_PATH_GRAPH)

In [None]:
# Ivis (classification): same scaler + ivis pipeline, supervised with the
# binary target y_classif passed to fit_transform.
# NOTE(review): confirm that DFIvis accepts 'binary_crossentropy' together with
# the 0/1 integer labels produced for 'target' — not verifiable from this notebook.
# NOTE: ivis_df is rebound here, replacing the regression embedding.
minmax_scaler = DFMinMaxScaler()
ivis          = DFIvis(embedding_dims=5,
                       k=150, n_epochs_without_progress=10, model='szubert',
                       supervision_weight=1, supervision_metric='binary_crossentropy', distance='softmax_ratio_pn')

steps = [
    ('minmax_scaler', minmax_scaler),
    ('ivis', ivis),
]
ivis_df = Pipeline(steps, verbose=True).fit_transform(X, y_classif)

###### Scatter

In [None]:
# Attach the class label as a string column so it is coloured as a category.
tmp_df = pd.concat([ivis_df, y_classif], axis=1).assign(
    target=lambda frame: frame['target'].astype(str))

# Scatter 'ivis_0' against each of the first four other columns of ivis_df.
vp.scatter(tmp_df,
           xy_tuples=[('ivis_0', col)
                      for col in ivis_df.columns[ivis_df.columns != 'ivis_0'][:4]],
           color='target',
           title='Phase 5 - Scatter - Ivis - Classification',
           max_col=2,
           out_path=OUT_PATH_GRAPH)

# The plotting frame is only needed for this figure.
del tmp_df