In [None]:
import lib._util.visualplot as vp
import lib._util.fileproc as fp

# Feature selection
from lib._class.feature_selection.filtering.DFChi2Threshold import DFChi2Threshold
from lib._class.feature_selection.filtering.DFMutualInfoClassifierThreshold import DFMutualInfoClassifierThreshold
from lib._class.feature_selection.filtering.DFANOVAClassifierThreshold import DFANOVAClassifierThreshold
from lib._class.feature_selection.filtering.DFROCAUCThreshold import DFROCAUCThreshold

from lib._class.feature_selection.embedding.DFLogisticRegressionSelector import DFLogisticRegressionSelector
from lib._class.feature_selection.embedding.DFRidgeClassifierSelector import DFRidgeClassifierSelector
from lib._class.feature_selection.embedding.DFDecisionTreeClassifierSelector import DFDecisionTreeClassifierSelector
from lib._class.feature_selection.embedding.DFExtraTreeClassifierSelector import DFExtraTreeClassifierSelector
from lib._class.feature_selection.embedding.DFRandomForestClassifierSelector import DFRandomForestClassifierSelector
from lib._class.feature_selection.embedding.DFXGBClassifierSelector import DFXGBClassifierSelector
from lib._class.feature_selection.embedding.DFCatBoostClassifierSelector import DFCatBoostClassifierSelector

# Feature encoding
from lib._class.DFOneHotEncoder import DFOneHotEncoder

# Feature scaling
from lib._class.DFMinMaxScaler import DFMinMaxScaler

# Feature extraction
from lib._class.DFIvis import DFIvis

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 100)

import numpy as np

# Plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.colors import DEFAULT_PLOTLY_COLORS

# Scikit-Learn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

# Tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall, AUC, CategoricalAccuracy
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.constraints import max_norm

# Useful Functions

In [None]:
# Company whose data is processed; interpolated into every path below.
COMPANY_CODE      = 'MAG'
# Label column used for modelling (one of the `target_d*` digit columns built in Phase 2).
TARGET            = 'target_d4'
# Input directory read by load_feature / load_target.
SOURCE_PATH_TRANS = f'resources/output/eda_trans/file/{COMPANY_CODE}/'
# Output directories for graphs and files; note that both already end with '/'.
OUT_PATH_GRAPH    = f'resources/output/ann_digit_trans/graph/{COMPANY_CODE}/'
OUT_PATH_FILE     = f'resources/output/ann_digit_trans/file/{COMPANY_CODE}/'

# Phase 1 - Feature Loading
- Load digit frequency

In [None]:
def load_feature(filename):
    """Load a ';'-separated feature CSV from SOURCE_PATH_TRANS into one DataFrame.

    Parameters
    ----------
    filename : str
        File name appended to SOURCE_PATH_TRANS.

    Returns
    -------
    pandas.DataFrame
        All rows of the file, with `draw_date` parsed as datetimes
        using the '%Y-%m-%d' format.
    """
    source_file = f'{SOURCE_PATH_TRANS}{filename}'
    # The file is read in chunks of 50,000 rows and stitched back together.
    df_chunks   = pd.read_csv(source_file, sep=';', chunksize=50_000)
    df          = pd.concat(df_chunks)

    # Parse the date explicitly: read_csv's `date_parser` argument is deprecated
    # since pandas 2.0, while pd.to_datetime(format=...) works on every version.
    df['draw_date'] = pd.to_datetime(df['draw_date'], format='%Y-%m-%d')
    return df

In [None]:
# Load the digit-frequency features for the configured company.
feature_df = load_feature(f'{COMPANY_CODE} - digit_frequency.csv')

# Project helper; presumably displays summary statistics of the frame — confirm in lib._util.visualplot.
vp.faststat(feature_df)

# Phase 2 - Target Loading
- Create target label

In [None]:
def load_target(filename):
    """Load a ';'-separated transactions CSV from SOURCE_PATH_TRANS into one DataFrame.

    The prize-number columns ('1st', '2nd', '3rd', 'Sp1'..'Sp10',
    'Cons1'..'Cons10') are read as strings so their characters (including
    any leading zeros) are kept exactly as written in the file.

    Parameters
    ----------
    filename : str
        File name appended to SOURCE_PATH_TRANS.

    Returns
    -------
    pandas.DataFrame
        All rows of the file, with `draw_date` parsed as datetimes
        using the '%Y-%m-%d' format.
    """
    source_file    = f'{SOURCE_PATH_TRANS}{filename}'
    number_columns = ['1st', '2nd', '3rd'] + \
                     [f'Sp{x +1}' for x in range(10)] + \
                     [f'Cons{x +1}' for x in range(10)]

    # The file is read in chunks of 50,000 rows and stitched back together.
    df_chunks = pd.read_csv(source_file, sep=';',
                            dtype={x: str for x in number_columns},
                            chunksize=50_000)
    df        = pd.concat(df_chunks)

    # Parse the date explicitly: read_csv's `date_parser` argument is deprecated
    # since pandas 2.0, while pd.to_datetime(format=...) works on every version.
    df['draw_date'] = pd.to_datetime(df['draw_date'], format='%Y-%m-%d')
    return df

In [None]:
# Load the draw transactions that the target label is derived from.
target_df = load_target(f'{COMPANY_CODE} - transactions.csv')

# Project helper; presumably displays summary statistics of the frame — confirm in lib._util.visualplot.
vp.faststat(target_df)

In [None]:
# Take target from following period: each row's target is the next row's '1st' value
# (the last row gets NaN). NOTE(review): this relies on the rows being in draw order — confirm.
target_df['target'] = target_df['1st'].shift(-1)

# Split prize numbers & target into digits.
# Character 0 becomes `<column>_d4`, character 3 becomes `<column>_d1`.
categories = ['1st', '2nd', '3rd'] + [f'Sp{x +1}' for x in range(10)] + [f'Cons{x +1}' for x in range(10)]
for column in categories + ['target']:
    for index in range(4):
        # Vectorised character access; missing values stay NaN, exactly as the
        # previous row-wise `apply` left them.
        target_df[f'{column}_d{4 - index}'] = target_df[column].str[index]

target_df.head()

In [None]:
# Remove the undivided number columns now that their per-digit columns exist.
# In-place: re-running the digit-split cell after this one fails because these columns are gone.
target_df.drop(columns=categories + ['target'], inplace=True)

# Phase 3 - Dataset
- Map target label to features

In [None]:
# Row/column counts of both inputs before they are merged.
feature_df.shape, target_df.shape

In [None]:
# Frequency columns are named after a single digit ('0'..'9') or a digit pair ('00'..'99');
# prefix them so they cannot be confused with other columns.
freq_columns = [str(digit) for digit in range(10)] + [f'{pair:02d}' for pair in range(100)]

# Keep only the draws present in both the feature and the target frame.
data_df = (
    feature_df
    .merge(target_df, how='inner', on=['draw_date', 'draw_period'])
    .rename(columns={name: f'_Freq_{name}' for name in freq_columns})
)

vp.faststat(data_df)

In [None]:
# Discard every row that has a missing value in any column
data_df.dropna(inplace=True)

# Digit columns (prize numbers and target) were read as text; store them as integers
digit_prefixes = tuple(categories + ['target'])
columns = [name for name in data_df.columns if name.startswith(digit_prefixes)]
data_df[columns] = data_df[columns].astype(int)

# Target distribution
print('Full dataset:')
vp.value_count(data_df, TARGET)

In [None]:
def balanced_target(df, target, n_remain, excludes=None, random_state=None):
    """Draw the same number of rows for every label of `target`.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; rows are identified by their index labels.
    target : str
        Name of the label column to balance on.
    n_remain : int
        Number of rows to draw (without replacement) for each label.
    excludes : iterable of index labels, optional
        Index labels that must not be drawn. Defaults to no exclusions.
    random_state : int, optional
        Seed passed to ``np.random.seed``. This reseeds NumPy's *global* RNG,
        which is kept as-is because callers may rely on that side effect.

    Returns
    -------
    pandas.DataFrame
        Copies of the selected rows, concatenated label by label (labels in
        sorted order, rows in their original order within each label).

    Raises
    ------
    ValueError
        If a label has fewer than `n_remain` rows left after exclusions.
    """
    np.random.seed(random_state)
    # `None` instead of a mutable default list shared between calls.
    excludes = [] if excludes is None else excludes

    dfs = []
    for target_label in np.unique(df[target]):
        indexes = df[df[target] == target_label].index
        # Vectorised exclusion; order is preserved so a seeded draw picks the
        # same rows as the element-by-element filter did.
        indexes = indexes[~indexes.isin(excludes)].to_numpy()

        if len(indexes) < n_remain:
            raise ValueError(
                f'Label {target_label!r} of {target!r} has only {len(indexes)} '
                f'eligible rows; cannot draw {n_remain} without replacement.'
            )

        choices = np.random.choice(indexes, size=n_remain, replace=False)
        dfs.append(df[df.index.isin(choices)].copy())

    return pd.concat(dfs)

In [None]:
# Training sample: 500 rows for each target label, drawn with a fixed seed
train_df = balanced_target(data_df, target=TARGET, n_remain=500, random_state=10000)

# Every row that was not drawn for training becomes test data
used_indexes = list(train_df.index)
test_df      = data_df.loc[~data_df.index.isin(used_indexes)].copy()

# Shuffle both splits and renumber their rows from zero
train_df, test_df = (
    split.sample(frac=1, random_state=0).reset_index(drop=True)
    for split in (train_df, test_df)
)

train_df.shape, test_df.shape

In [None]:
# Label counts per split: train was drawn balanced, test holds whatever was left over.
print('Train dataset:')
vp.value_count(train_df, TARGET)

print('\nTest dataset:')
vp.value_count(test_df, TARGET)

###### Bar

In [None]:
def sampling_period(df, title):
    """Plot how many rows of each dataset fall into each calendar month.

    `df` must provide a datetime `draw_date` column and a `dataset` column.
    The bar chart (one facet row per dataset) is titled `title` and handed to
    `vp.generate_plot`, which receives OUT_PATH_GRAPH and `title` as file name.
    """
    year_month = df['draw_date'].dt.to_period('M').astype(str)
    monthly_df = (
        df.assign(year_month=year_month)
          .groupby(['dataset', 'year_month'])
          .agg(count=('year_month', 'count'))
          .reset_index()
    )

    fig = px.bar(monthly_df, x='year_month', y='count', facet_row='dataset')
    fig.update_layout(title=title)
    vp.generate_plot(fig, out_path=OUT_PATH_GRAPH, out_filename=title)

In [None]:
# Tag each split so the plot can facet by it. This adds a `dataset` column to both
# frames in place; feature_target_split later filters that column out again.
train_df['dataset'] = 'train'
test_df['dataset']  = 'test'

# NOTE(review): the title says "Phase 2" although this cell sits under Phase 3 — confirm
# before renaming, since the title is also used as the output file name.
sampling_period(pd.concat([train_df, test_df]),
                title='Phase 2 - Bar - Draw Date (Sample)')

# Phase 4 - Feature Selection
- Classification method

In [None]:
def feature_target_split(df):
    """Split a dataset frame into its feature columns and its TARGET label column.

    Feature columns are all columns whose name does not start with
    'draw_date', 'draw_period', 'dataset' or 'target'.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the feature frame and ``y`` is ``df[TARGET]``.
    """
    non_feature_prefixes = ('draw_date', 'draw_period', 'dataset', 'target')
    feature_columns = [name for name in df.columns
                       if not name.startswith(non_feature_prefixes)]

    return df[feature_columns], df[TARGET]

In [None]:
# Separate features & target for both splits (TARGET is the only label column kept as y)
X_train, y_train = feature_target_split(train_df)
X_test,  y_test  = feature_target_split(test_df)

# Shapes as a sanity check: rows must match between X and y within each split
print('Train dataset:')
print(X_train.shape, y_train.shape)

print('\nTest dataset:')
print(X_test.shape, y_test.shape)

In [None]:
# Accumulators for the combined feature-selection figure:
# every selector cell below appends one trace group and one subplot title.
data_groups    = []
subplot_titles = []

# Splitter shared by every selector below: 5 stratified folds, repeated twice.
# NOTE(review): no random_state is given, so the folds depend on NumPy's global RNG state
# at fit time — pass an explicit seed if reproducible scores are required.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2)

###### Mutual Information

In [None]:
# Fit the mutual-information selector using the shared `cv` splitter.
# NOTE(review): train and test rows are concatenated for fitting, so the held-out
# test set also informs which features are selected — confirm this is intended.
mutual_info_threshold = DFMutualInfoClassifierThreshold(cv=cv)

steps = [
    ('mutual_info_threshold', mutual_info_threshold),
]
Pipeline(steps, verbose=2).fit(
    pd.concat([X_train, X_test], axis=0, ignore_index=True),
    pd.concat([y_train, y_test], axis=0, ignore_index=True)
)

# Graph data: one bar per feature, coloured by the selector's `support` flag.
# The sort is in place, so `stat_df` stays ordered by feature for later cells.
mutual_info_threshold.stat_df.sort_values(by='feature', inplace=True)
fig = go.Figure(
    data=[go.Bar(
            x=mutual_info_threshold.stat_df['feature'],
            y=mutual_info_threshold.stat_df['average_score'],
            marker_color=np.where(mutual_info_threshold.stat_df['support'],
                                  DEFAULT_PLOTLY_COLORS[2], DEFAULT_PLOTLY_COLORS[3])
    )]
)
data_groups.append(fig['data'])
subplot_titles.append('Mutual Information')

###### Univariate ANOVA

In [None]:
# Fit the univariate ANOVA selector using the shared `cv` splitter.
# NOTE(review): train and test rows are concatenated for fitting, so the held-out
# test set also informs which features are selected — confirm this is intended.
anova_threshold = DFANOVAClassifierThreshold(cv=cv)

steps = [
    ('anova_threshold', anova_threshold),
]
Pipeline(steps, verbose=2).fit(
    pd.concat([X_train, X_test], axis=0, ignore_index=True),
    pd.concat([y_train, y_test], axis=0, ignore_index=True)
)

# Graph data: one bar per feature, coloured by the selector's `support` flag.
# The sort is in place, so `stat_df` stays ordered by feature for later cells.
anova_threshold.stat_df.sort_values(by='feature', inplace=True)
fig = go.Figure(
    data=[go.Bar(
            x=anova_threshold.stat_df['feature'],
            y=anova_threshold.stat_df['average_score'],
            marker_color=np.where(anova_threshold.stat_df['support'],
                                  DEFAULT_PLOTLY_COLORS[2], DEFAULT_PLOTLY_COLORS[3])
    )]
)
data_groups.append(fig['data'])
# Subplot title previously read 'Univariate ANOV' (truncated).
subplot_titles.append('Univariate ANOVA')

###### Chi Square (χ2)

In [None]:
# Fit the chi-square selector using the shared `cv` splitter.
# NOTE(review): train and test rows are concatenated for fitting, so the held-out
# test set also informs which features are selected — confirm this is intended.
chi2_threshold = DFChi2Threshold(cv=cv)

steps = [
    ('chi2_threshold', chi2_threshold),
]
Pipeline(steps, verbose=2).fit(
    pd.concat([X_train, X_test], axis=0, ignore_index=True),
    pd.concat([y_train, y_test], axis=0, ignore_index=True)
)

# Graph data: one bar per feature, coloured by the selector's `support` flag.
# The sort is in place, so `stat_df` stays ordered by feature for later cells.
chi2_threshold.stat_df.sort_values(by='feature', inplace=True)
fig = go.Figure(
    data=[go.Bar(
            x=chi2_threshold.stat_df['feature'],
            y=chi2_threshold.stat_df['average_score'],
            marker_color=np.where(chi2_threshold.stat_df['support'],
                                  DEFAULT_PLOTLY_COLORS[2], DEFAULT_PLOTLY_COLORS[3])
    )]
)
data_groups.append(fig['data'])
subplot_titles.append('Chi-Square')

###### Univariate ROC-AUC

In [None]:
# Fit the univariate ROC-AUC selector (threshold 0.5, one-vs-rest) using the shared `cv` splitter.
# NOTE(review): train and test rows are concatenated for fitting, so the held-out
# test set also informs which features are selected — confirm this is intended.
rocauc_threshold = DFROCAUCThreshold(threshold=.5, cv=cv, multi_class='ovr')

steps = [
    ('rocauc_threshold', rocauc_threshold),
]
Pipeline(steps, verbose=2).fit(
    pd.concat([X_train, X_test], axis=0, ignore_index=True),
    pd.concat([y_train, y_test], axis=0, ignore_index=True)
)

# Graph data: one bar per feature, coloured by the selector's `support` flag.
# The sort is in place, so `stat_df` stays ordered by feature for later cells.
rocauc_threshold.stat_df.sort_values(by='feature', inplace=True)
fig = go.Figure(
    data=[go.Bar(
            x=rocauc_threshold.stat_df['feature'],
            y=rocauc_threshold.stat_df['average_score'],
            marker_color=np.where(rocauc_threshold.stat_df['support'],
                                  DEFAULT_PLOTLY_COLORS[2], DEFAULT_PLOTLY_COLORS[3])
    )]
)
data_groups.append(fig['data'])
subplot_titles.append('Univariate ROC-AUC')

###### Logistic Regression

In [None]:
# Fit the logistic-regression selector (up to 1,000 solver iterations) using the shared `cv` splitter.
# NOTE(review): train and test rows are concatenated for fitting, so the held-out
# test set also informs which features are selected — confirm this is intended.
logistic_selector = DFLogisticRegressionSelector(cv=cv, max_iter=1_000)

steps = [
    ('logistic_selector', logistic_selector),
]
Pipeline(steps, verbose=2).fit(
    pd.concat([X_train, X_test], axis=0, ignore_index=True),
    pd.concat([y_train, y_test], axis=0, ignore_index=True)
)

# Graph data: one bar per feature, coloured by the selector's `support` flag.
# The sort is in place, so `stat_df` stays ordered by feature for later cells.
logistic_selector.stat_df.sort_values(by='feature', inplace=True)
fig = go.Figure(
    data=[go.Bar(
            x=logistic_selector.stat_df['feature'],
            y=logistic_selector.stat_df['average_score'],
            marker_color=np.where(logistic_selector.stat_df['support'],
                                  DEFAULT_PLOTLY_COLORS[2], DEFAULT_PLOTLY_COLORS[3])
    )]
)
data_groups.append(fig['data'])
subplot_titles.append('Logistic Regression')

###### Ridge Classifier

In [None]:
# Fit the ridge-classifier selector using the shared `cv` splitter.
# NOTE(review): train and test rows are concatenated for fitting, so the held-out
# test set also informs which features are selected — confirm this is intended.
ridge_selector = DFRidgeClassifierSelector(cv=cv)

steps = [
    ('ridge_selector', ridge_selector),
]
Pipeline(steps, verbose=2).fit(
    pd.concat([X_train, X_test], axis=0, ignore_index=True),
    pd.concat([y_train, y_test], axis=0, ignore_index=True)
)

# Graph data: one bar per feature, coloured by the selector's `support` flag.
# The sort is in place, so `stat_df` stays ordered by feature for later cells.
ridge_selector.stat_df.sort_values(by='feature', inplace=True)
fig = go.Figure(
    data=[go.Bar(
            x=ridge_selector.stat_df['feature'],
            y=ridge_selector.stat_df['average_score'],
            marker_color=np.where(ridge_selector.stat_df['support'],
                                  DEFAULT_PLOTLY_COLORS[2], DEFAULT_PLOTLY_COLORS[3])
    )]
)
data_groups.append(fig['data'])
subplot_titles.append('Ridge Classifier')

###### Decision Tree Classifier

In [None]:
# Fit the decision-tree selector using the shared `cv` splitter.
# NOTE(review): train and test rows are concatenated for fitting, so the held-out
# test set also informs which features are selected — confirm this is intended.
decision_tree_selector = DFDecisionTreeClassifierSelector(cv=cv)

steps = [
    ('decision_tree_selector', decision_tree_selector),
]
Pipeline(steps, verbose=2).fit(
    pd.concat([X_train, X_test], axis=0, ignore_index=True),
    pd.concat([y_train, y_test], axis=0, ignore_index=True)
)

# Graph data: one bar per feature, coloured by the selector's `support` flag.
# The sort is in place, so `stat_df` stays ordered by feature for later cells.
decision_tree_selector.stat_df.sort_values(by='feature', inplace=True)
fig = go.Figure(
    data=[go.Bar(
            x=decision_tree_selector.stat_df['feature'],
            y=decision_tree_selector.stat_df['average_score'],
            marker_color=np.where(decision_tree_selector.stat_df['support'],
                                  DEFAULT_PLOTLY_COLORS[2], DEFAULT_PLOTLY_COLORS[3])
    )]
)
data_groups.append(fig['data'])
subplot_titles.append('Decision Tree Classifier')

###### Extra Tree Classifier

In [None]:
# Fit the extra-tree selector using the shared `cv` splitter.
# NOTE(review): train and test rows are concatenated for fitting, so the held-out
# test set also informs which features are selected — confirm this is intended.
extra_tree_selector = DFExtraTreeClassifierSelector(cv=cv)

steps = [
    ('extra_tree_selector', extra_tree_selector),
]
Pipeline(steps, verbose=2).fit(
    pd.concat([X_train, X_test], axis=0, ignore_index=True),
    pd.concat([y_train, y_test], axis=0, ignore_index=True)
)

# Graph data: one bar per feature, coloured by the selector's `support` flag.
# The sort is in place, so `stat_df` stays ordered by feature for later cells.
extra_tree_selector.stat_df.sort_values(by='feature', inplace=True)
fig = go.Figure(
    data=[go.Bar(
            x=extra_tree_selector.stat_df['feature'],
            y=extra_tree_selector.stat_df['average_score'],
            marker_color=np.where(extra_tree_selector.stat_df['support'],
                                  DEFAULT_PLOTLY_COLORS[2], DEFAULT_PLOTLY_COLORS[3])
    )]
)
data_groups.append(fig['data'])
subplot_titles.append('Extra Tree Classifier')

###### Random Forest Classifier

In [None]:
# Fit the random-forest selector using the shared `cv` splitter.
# NOTE(review): train and test rows are concatenated for fitting, so the held-out
# test set also informs which features are selected — confirm this is intended.
random_forest_selector = DFRandomForestClassifierSelector(cv=cv)

steps = [
    ('random_forest_selector', random_forest_selector),
]
Pipeline(steps, verbose=2).fit(
    pd.concat([X_train, X_test], axis=0, ignore_index=True),
    pd.concat([y_train, y_test], axis=0, ignore_index=True)
)

# Graph data: one bar per feature, coloured by the selector's `support` flag.
# The sort is in place, so `stat_df` stays ordered by feature for later cells.
random_forest_selector.stat_df.sort_values(by='feature', inplace=True)
fig = go.Figure(
    data=[go.Bar(
            x=random_forest_selector.stat_df['feature'],
            y=random_forest_selector.stat_df['average_score'],
            marker_color=np.where(random_forest_selector.stat_df['support'],
                                  DEFAULT_PLOTLY_COLORS[2], DEFAULT_PLOTLY_COLORS[3])
    )]
)
data_groups.append(fig['data'])
subplot_titles.append('Random Forest Classifier')

###### XGBoost Classifier

In [None]:
# Fit the XGBoost selector using the shared `cv` splitter.
# NOTE(review): train and test rows are concatenated for fitting, so the held-out
# test set also informs which features are selected — confirm this is intended.
xgboost_selector = DFXGBClassifierSelector(cv=cv)

steps = [
    ('xgboost_selector', xgboost_selector),
]
Pipeline(steps, verbose=2).fit(
    pd.concat([X_train, X_test], axis=0, ignore_index=True),
    pd.concat([y_train, y_test], axis=0, ignore_index=True)
)

# Graph data: one bar per feature, coloured by the selector's `support` flag.
# The sort is in place, so `stat_df` stays ordered by feature for later cells.
xgboost_selector.stat_df.sort_values(by='feature', inplace=True)
fig = go.Figure(
    data=[go.Bar(
            x=xgboost_selector.stat_df['feature'],
            y=xgboost_selector.stat_df['average_score'],
            marker_color=np.where(xgboost_selector.stat_df['support'],
                                  DEFAULT_PLOTLY_COLORS[2], DEFAULT_PLOTLY_COLORS[3])
    )]
)
data_groups.append(fig['data'])
subplot_titles.append('XGBoost Classifier')

###### CatBoost Classifier

In [None]:
# `categories` is rebuilt here with the same prize-column names used in Phase 2,
# so this cell does not depend on that earlier definition.
categories        = ['1st', '2nd', '3rd'] + [f'Sp{x +1}' for x in range(10)] + [f'Cons{x +1}' for x in range(10)]
# CatBoost selector: the per-digit prize columns ('<category>_d*') are declared categorical.
# `task_type='GPU'` is passed through — presumably this requires a CUDA-capable device; confirm.
catboost_selector = DFCatBoostClassifierSelector(
                        cv=cv,
                        iterations=10_000,
                        early_stopping_rounds=200,
                        verbose=1_000,
                        task_type='GPU',
                        # eval_metric='MultiClassOneVsAll',
                        loss_function='MultiClassOneVsAll',
                        cat_features=[x for x in X_train.columns if any([x.startswith(f'{y}_') for y in categories])]
                    )

# NOTE(review): train and test rows are concatenated for fitting, so the held-out
# test set also informs which features are selected — confirm this is intended.
steps = [
    ('catboost_selector', catboost_selector),
]
Pipeline(steps, verbose=2).fit(
    pd.concat([X_train, X_test], axis=0, ignore_index=True),
    pd.concat([y_train, y_test], axis=0, ignore_index=True)
)

# Graph data: one bar per feature, coloured by the selector's `support` flag.
# The sort is in place, so `stat_df` stays ordered by feature for later cells.
catboost_selector.stat_df.sort_values(by='feature', inplace=True)
fig = go.Figure(
    data=[go.Bar(
            x=catboost_selector.stat_df['feature'],
            y=catboost_selector.stat_df['average_score'],
            marker_color=np.where(catboost_selector.stat_df['support'],
                                  DEFAULT_PLOTLY_COLORS[2], DEFAULT_PLOTLY_COLORS[3])
    )]
)
data_groups.append(fig['data'])
subplot_titles.append('CatBoost Classifier')

In [None]:
# OUT_PATH_FILE already ends with '/', so no extra separator is added
# (the previous f-string produced '.../<company>//classification/').
out_path = f'{OUT_PATH_FILE}classification/'

# One workbook per selector; the file name is the selector's variable name.
selector_reports = {
    'mutual_info_threshold':  mutual_info_threshold,
    'anova_threshold':        anova_threshold,
    'chi2_threshold':         chi2_threshold,
    'rocauc_threshold':       rocauc_threshold,
    'logistic_selector':      logistic_selector,
    'ridge_selector':         ridge_selector,
    'decision_tree_selector': decision_tree_selector,
    'extra_tree_selector':    extra_tree_selector,
    'random_forest_selector': random_forest_selector,
    'xgboost_selector':       xgboost_selector,
    'catboost_selector':      catboost_selector,
}
for report_name, selector in selector_reports.items():
    fp.generate_excel(selector.stat_df, out_path, f'{report_name}.xlsx')

###### Top N Features

In [None]:
# Compile feature selection scores
# Method name -> fitted selector. Insertion order defines both the merge order
# and the column order of the compiled table.
selectors = {
    'mutual_info':   mutual_info_threshold,
    'anova':         anova_threshold,
    'chi2':          chi2_threshold,
    'roc_auc':       rocauc_threshold,
    'logistic':      logistic_selector,
    'ridge':         ridge_selector,
    'decision_tree': decision_tree_selector,
    'extra_tree':    extra_tree_selector,
    'random_forest': random_forest_selector,
    'xgboost':       xgboost_selector,
    'catboost':      catboost_selector,
}
methods = list(selectors)

# The mutual-information table is the base and keeps all of its columns;
# every other method contributes only its score and support columns.
feature_df = mutual_info_threshold.stat_df.rename(columns={
    'average_score': 'mutual_info_score',
    'support':       'mutual_info_support'
})
for method in methods[1:]:
    feature_df = feature_df.merge(
        selectors[method].stat_df[['feature', 'average_score', 'support']].rename(columns={
            'average_score': f'{method}_score',
            'support':       f'{method}_support'
        }), on='feature', how='left'
    )

# Normalize score: each method's scores are divided by that method's total,
# so methods with different score scales contribute comparably.
for method in methods:
    feature_df[f'{method}_score'] = feature_df[f'{method}_score'] / feature_df[f'{method}_score'].sum()

# Average score
feature_df['average_score'] = feature_df[[f'{x}_score' for x in methods]].sum(axis=1) / len(methods)

# Support ratio: share of methods that flagged the feature as supported
feature_df['support_ratio'] = feature_df[[f'{x}_support' for x in methods]].astype(int).sum(axis=1) / len(methods)

# Support score
feature_df['support_score'] = feature_df['average_score'] * feature_df['support_ratio']

# Supported features: above-average support score
feature_df['support'] = feature_df['support_score'] > feature_df['support_score'].mean()

# Top N supported features
top_k   = 20
rank_df = (
    feature_df
    .sort_values(by='support_score', ascending=False)[:top_k][['feature']]
    .assign(k_support=True)
)

# Plain boolean membership flag. The earlier merge followed by
# `feature_df['k_support'].fillna(False, inplace=True)` is a chained in-place
# fill that pandas does not guarantee to write back (it is a no-op under
# Copy-on-Write), which would leave NaN — and NaN is truthy in np.where below.
feature_df['k_support'] = feature_df['feature'].isin(rank_df['feature'])
feature_df.to_excel(f'{OUT_PATH_FILE}feature_selection.xlsx', index=False)

# Graph data: `support` colour wins, then `k_support`, then the default colour
feature_df.sort_values(by='feature', inplace=True)
fig = go.Figure(
    data=[go.Bar(
            x=feature_df['feature'],
            y=feature_df['support_score'],
            marker_color=np.where(feature_df['support'], DEFAULT_PLOTLY_COLORS[2],
                         np.where(feature_df['k_support'], DEFAULT_PLOTLY_COLORS[1], DEFAULT_PLOTLY_COLORS[3]))
    )]
)
data_groups.append(fig['data'])
subplot_titles.append(f'Top {top_k} Supported Features')

###### Bar

In [None]:
# Render every trace group collected in `data_groups` as one figure, two subplots per row,
# titled with the matching entries of `subplot_titles`.
# NOTE(review): `data_groups` / `subplot_titles` are appended to by each selector cell, so
# re-running a selector cell without resetting them duplicates its subplot.
vp.datagroups_subplots(
    data_groups,
    max_col=2,
    title='Phase 4 - Feature Selection - Classification',
    out_path=OUT_PATH_GRAPH,
    subplot_kwargs={
        'subplot_titles': subplot_titles,
        'vertical_spacing': .06
    },
    layout_kwargs={
        'height': 1500,
        'showlegend': False
    })

In [None]:
# Supported features
# features = feature_df[feature_df['support']].sort_values(by='support_score', ascending=False)['feature'].values
# features = ['Cons3_d2', 'Cons9_d4', '_Freq_0', 'Cons5_d1', 'Sp8_d2',
#             'Cons10_d4', 'Cons4_d3', 'Cons9_d3', '_Freq_3', '3rd_d4',
#             '1st_d1', '_Freq_77', '_Freq_2', 'Cons4_d2', '_Freq_6',
#             'Sp7_d2', '_Freq_9', 'Sp5_d2', 'Sp2_d2', 'Sp3_d2',
#             '1st_d4', 'Sp10_d3', 'Sp4_d2', 'Sp3_d4', 'Sp8_d4',
#             'Cons9_d2', '_Freq_11', 'Sp6_d2', 'Cons1_d1', '_Freq_4',
#             '_Freq_88', '_Freq_5', 'Cons2_d2', 'Cons8_d2', 'Sp4_d1',
#             '_Freq_00', 'Sp7_d1', 'Sp5_d1', '_Freq_66', '2nd_d1',
#             '1st_d3', 'Sp8_d1', '3rd_d1', '_Freq_7', 'Cons1_d2',
#             '2nd_d3', 'Sp10_d1', 'Cons7_d2', 'Cons1_d3', 'Cons3_d4',
#             'Cons1_d4', 'Sp6_d1', 'Cons10_d1', 'Sp7_d4', 'Cons5_d2',
#             '_Freq_1', 'Sp2_d1', 'Cons6_d1', 'Sp4_d4', 'Cons6_d3',
#             'Sp3_d3', 'Cons7_d3', '_Freq_55', 'Sp1_d4', 'Sp5_d4',
#             'Cons2_d1', 'Sp6_d4', 'Sp7_d3', 'Sp4_d3', '1st_d2',
#             'Cons2_d3', '_Freq_22', 'Cons7_d4', 'Cons3_d1', 'Cons3_d3',
#             '_Freq_07', 'Cons6_d4', '_Freq_8', 'Cons8_d3', 'Cons4_d1',
#             'Cons8_d1', 'Sp5_d3', 'Sp2_d4', '_Freq_36', 'Sp9_d3',
#             '3rd_d2', 'Cons10_d2', '3rd_d3', '2nd_d4', 'Sp1_d2',
#             '_Freq_35', 'Cons4_d4', '2nd_d2', 'Sp8_d3', 'Cons8_d4',
#             'Sp1_d1', 'Sp1_d3', 'Cons10_d3', '_Freq_68', 'Sp9_d1']

# Top N supported features
# `features` was never assigned (every candidate line was commented out), so on a
# fresh kernel the subsetting below raised NameError. Use the top-k flag computed
# in the feature-selection cell; `.eq(True)` also treats a missing flag as False.
# NOTE(review): swap `k_support` for `support` to keep every above-average feature instead.
features = feature_df[feature_df['k_support'].eq(True)].sort_values(by='support_score', ascending=False)['feature'].values
# Snapshot of a previous run, kept for reference only:
# features = ['Cons3_d2', 'Cons9_d4', '_Freq_0', 'Cons5_d1', 'Sp8_d2',
#             'Cons10_d4', 'Cons4_d3', 'Cons9_d3', '_Freq_3', '3rd_d4',
#             '1st_d1', '_Freq_77', '_Freq_2', 'Cons4_d2', '_Freq_6',
#             'Sp7_d2', '_Freq_9', 'Sp5_d2', 'Sp2_d2', 'Sp3_d2']

# Keep only the selected feature columns in both splits
X_train = X_train[features].copy()
X_test  = X_test[features].copy()

X_train.shape, X_test.shape

# Phase 5 - Feature Extraction
- Separate dataset to features & target
- Feature encoding
- Ivis dimension reduction

In [None]:
# Binary crossentropy target: one-hot encode the label column.
# The fitted encoder is reused in Phase 6 to encode the train/validation/test labels.
target_onehot_encoder = DFOneHotEncoder(dtype='byte')
y_train_binary = target_onehot_encoder.fit_transform(y_train.to_frame())

# Sparse categorical crossentropy target: the integer labels as they are
y_train_sparse = y_train.copy()

y_train_binary.shape, y_train_sparse.shape

In [None]:
# Feature extraction
# Per-digit prize columns are one-hot encoded (first level dropped); all remaining
# columns are min-max scaled. `categories` comes from an earlier cell.
onehot_columns = [x for x in X_train.columns if any([x.startswith(y) for y in categories])]
onehot_encoder = DFOneHotEncoder(columns=onehot_columns, dtype='byte', drop='first')

minmax_scaler  = DFMinMaxScaler(columns=[x for x in X_train.columns if x not in onehot_columns])

# TODO - define own model to prevent overfitting & explosive stacked_triplets_loss
# Two supervised Ivis reducers to 2 dimensions that differ only in the supervision loss.
ivis_binary    = DFIvis(embedding_dims=2, epochs=100,
                        k=15, n_epochs_without_progress=10, model='szubert',
                        supervision_weight=1, supervision_metric='binary_crossentropy', distance='pn')

ivis_sparse    = DFIvis(embedding_dims=2, epochs=100,
                        k=15, n_epochs_without_progress=10, model='szubert',
                        supervision_weight=1, supervision_metric='sparse_categorical_crossentropy', distance='pn')

# Binary crossentropy
# NOTE(review): `onehot_encoder` and `minmax_scaler` are the same objects used by the sparse
# pipeline in the next cell, which refits them (on the same X_train).
steps = [
    ('onehot_encoder', onehot_encoder),
    ('minmax_scaler', minmax_scaler),
    ('ivis_binary', ivis_binary),
]
ivis_binary_pipeline = Pipeline(steps, verbose=True)

# Fit on train only; the test split is transformed with the fitted pipeline
X_train_binary = ivis_binary_pipeline.fit_transform(X_train, y_train_binary)
X_test_binary  = ivis_binary_pipeline.transform(X_test)

X_train_binary.shape, X_test_binary.shape

In [None]:
# Sparse categorical crossentropy
# Same encoder and scaler objects as the binary pipeline; fitting here refits them on X_train.
steps = [
    ('onehot_encoder', onehot_encoder),
    ('minmax_scaler', minmax_scaler),
    ('ivis_sparse', ivis_sparse),
]
ivis_sparse_pipeline = Pipeline(steps, verbose=True)

# Fit on train only; the test split is transformed with the fitted pipeline
X_train_sparse = ivis_sparse_pipeline.fit_transform(X_train, y_train_sparse)
X_test_sparse  = ivis_sparse_pipeline.transform(X_test)

X_train_sparse.shape, X_test_sparse.shape

###### Line

In [None]:
names      = ['binary_loss', 'sparse_loss']
losses     = [ivis_binary.model.loss_history_, ivis_sparse.model.loss_history_]
max_length = max([len(x) for x in losses])

# One row per epoch; the shorter history is padded with NaN so both curves share the axis
tmp_df = pd.DataFrame([x+1 for x in range(max_length)], columns=['epoch'])
for name, loss in zip(names, losses):
    # Pad a copy: `loss += [...]` extended the model's own `loss_history_` list in place.
    tmp_df[name] = list(loss) + [np.nan] * (max_length - len(loss))

vp.line(tmp_df,
        xy_tuples=[('epoch', x) for x in names],
        title='Phase 5 - Line - Ivis Loss',
        out_path=OUT_PATH_GRAPH)

del tmp_df

###### Scatter

In [None]:
# Train embedding (binary supervision), coloured by label.
# The label is recovered as the argmax column position of the one-hot target —
# NOTE(review): this equals the digit only if the encoded columns are ordered 0..9; confirm.
title  = 'Phase 5 - Scatter - Ivis - Binary'
tmp_df = pd.concat([X_train_binary, pd.Series(np.argmax(y_train_binary.values, axis=1), name=TARGET).astype(str)], axis=1)

# Sorting by label keeps the legend order stable; labels are strings so colours are discrete
fig = px.scatter(
    tmp_df.sort_values(by=TARGET),
    x='ivis_0',
    y='ivis_1',
    color=TARGET,
    marginal_x='histogram'
)
fig.update_layout(title=title)

vp.generate_plot(fig,
                 out_path=OUT_PATH_GRAPH,
                 out_filename=title)

del tmp_df

In [None]:
# Train embedding (sparse categorical supervision), coloured by label.
# The concat aligns on index, so X_train_sparse and y_train_sparse must share row labels.
title  = 'Phase 5 - Scatter - Ivis - Sparse Categorical'
tmp_df = pd.concat([X_train_sparse, y_train_sparse.astype(str)], axis=1)

# Sorting by label keeps the legend order stable; labels are strings so colours are discrete
fig = px.scatter(
    tmp_df.sort_values(by=TARGET),
    x='ivis_0',
    y='ivis_1',
    color=TARGET,
    marginal_x='histogram'
)
fig.update_layout(title=title)

vp.generate_plot(fig,
                 out_path=OUT_PATH_GRAPH,
                 out_filename=title)

del tmp_df

In [None]:
# Test embedding (binary supervision), coloured by the true test label.
# The concat aligns on index, so X_test_binary and y_test must share row labels.
title  = 'Phase 5 - Scatter - Ivis - Binary (Test dataset)'
tmp_df = pd.concat([X_test_binary, y_test.astype(str)], axis=1)

# Sorting by label keeps the legend order stable; labels are strings so colours are discrete
fig = px.scatter(
    tmp_df.sort_values(by=TARGET),
    x='ivis_0',
    y='ivis_1',
    color=TARGET,
    marginal_x='histogram'
)
fig.update_layout(title=title)

vp.generate_plot(fig,
                 out_path=OUT_PATH_GRAPH,
                 out_filename=title)

del tmp_df

In [None]:
# Test embedding (sparse categorical supervision), coloured by the true test label.
# The concat aligns on index, so X_test_sparse and y_test must share row labels.
title  = 'Phase 5 - Scatter - Ivis - Sparse Categorical (Test dataset)'
tmp_df = pd.concat([X_test_sparse, y_test.astype(str)], axis=1)

# Sorting by label keeps the legend order stable; labels are strings so colours are discrete
fig = px.scatter(
    tmp_df.sort_values(by=TARGET),
    x='ivis_0',
    y='ivis_1',
    color=TARGET,
    marginal_x='histogram'
)
fig.update_layout(title=title)

vp.generate_plot(fig,
                 out_path=OUT_PATH_GRAPH,
                 out_filename=title)

del tmp_df

# Phase 6 - Classification
- Binary crossentropy features

In [None]:
# Split train & validation dataset with balanced target label
# 350 + 150 rows per label: together they use up the 500 rows per label drawn in Phase 3.
# `train_df` is rebound here — from now on it holds the Ivis embedding plus the label.
# NOTE(review): the Ivis embedding was fitted with supervision on all of these rows, so the
# validation rows are not unseen by the feature extractor — confirm this is acceptable.
train_df = balanced_target(pd.concat([X_train_binary, y_train], axis=1),
                           target=TARGET, n_remain=350, random_state=10000)
# Validation rows are drawn from whatever the training draw left over
valid_df = balanced_target(pd.concat([X_train_binary, y_train], axis=1),
                           target=TARGET, n_remain=150, random_state=10000, excludes=train_df.index)

# Shuffle dataset
train_df = train_df.sample(frac=1, random_state=0).reset_index(drop=True)
valid_df = valid_df.sample(frac=1, random_state=0).reset_index(drop=True)

train_df.shape, valid_df.shape

In [None]:
# Label counts per split; both were drawn balanced (350 / 150 rows per label)
print('Train dataset:')
vp.value_count(train_df, TARGET)

print('\nValidate dataset:')
vp.value_count(valid_df, TARGET)

In [None]:
# Separate features & target
# Rebinds X_train_binary / y_train_binary to the 350-per-label training subset.
X_train_binary, y_train_binary = feature_target_split(train_df)
X_valid_binary, y_valid_binary = feature_target_split(valid_df)

# Categorical crossentropy target: one-hot encode with the encoder fitted in Phase 5
y_train_binary = target_onehot_encoder.transform(y_train_binary.to_frame())
y_valid_binary = target_onehot_encoder.transform(y_valid_binary.to_frame())
y_test_binary  = target_onehot_encoder.transform(y_test.to_frame())

print('Train dataset:')
print(X_train_binary.shape, y_train_binary.shape)

print('\nValidate dataset:')
print(X_valid_binary.shape, y_valid_binary.shape)

In [None]:
# Feature scaling
# Fit on the training subset only, then apply the same scaling to validation and test.
# This rebinds the name `minmax_scaler`; the scaler object already held by the Ivis
# pipelines is a different object and is not affected.
minmax_scaler  = DFMinMaxScaler()
X_train_binary = minmax_scaler.fit_transform(X_train_binary)
X_valid_binary = minmax_scaler.transform(X_valid_binary)
X_test_binary  = minmax_scaler.transform(X_test_binary)

X_train_binary.shape, X_valid_binary.shape, X_test_binary.shape

In [None]:
def compile_model(X, n_classes=10):
    """Build and compile the dense softmax classifier.

    Architecture: two hidden blocks (128, then 64 units), each made of
    Dense (no bias, he_uniform initialiser, max-norm 5 constraint) ->
    BatchNormalization (renorm enabled, no scale) -> ReLU -> Dropout(0.2),
    followed by a softmax output layer.

    Parameters
    ----------
    X : object with a ``shape`` attribute
        Only ``X.shape[1]`` is read, to size the input of the first layer.
    n_classes : int, default 10
        Number of units of the softmax output layer.

    Returns
    -------
    tensorflow.keras.models.Sequential
        Model compiled with categorical cross-entropy, the Adam optimiser and
        categorical accuracy / AUC / precision / recall metrics.
    """
    model = Sequential()
    for position, units in enumerate((128, 64)):
        # Only the first layer declares the input width.
        input_kwargs = {'input_dim': X.shape[1]} if position == 0 else {}
        model.add(Dense(units,
                        kernel_initializer='he_uniform',
                        kernel_constraint=max_norm(5),
                        use_bias=False,
                        **input_kwargs))
        model.add(BatchNormalization(scale=False,
                                     renorm=True,
                                     renorm_clipping={ 'rmax': 1, 'rmin': 0, 'dmax': 0 }))
        model.add(Activation('relu'))
        model.add(Dropout(rate=.2))

    model.add(Dense(n_classes, activation='softmax'))

    # Reference: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data
    metrics = [
        CategoricalAccuracy(name='categorical_accuracy'),
        AUC(name='auc'),
        Precision(name='precision'),
        Recall(name='recall'),
    ]
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras releases reject.
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(learning_rate=.001, epsilon=.00001),
                  metrics=metrics)

    return model

In [None]:
# Reference: https://machinelearningmastery.com/understand-the-dynamics-of-learning-rate-on-deep-learning-neural-networks/
# `mode='max'` is required: in the default 'auto' mode Keras only treats monitors containing
# 'acc' as "higher is better", so 'val_auc' would be handled as a quantity to minimise and
# the learning rate would be cut while the AUC is still improving.
lrate = ReduceLROnPlateau(monitor='val_auc', mode='max', factor=0.95, patience=15)

# Reference: https://machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/
# Stop after 30 epochs without a better validation AUC and restore the best weights seen
es = EarlyStopping(monitor='val_auc', mode='max', verbose=1, patience=30, restore_best_weights=True)

model   = compile_model(X_train_binary)
history = model.fit(X_train_binary, y_train_binary,
                    validation_data=(X_valid_binary, y_valid_binary),
                    epochs=100,
                    callbacks=[lrate, es],
                    batch_size=32)

In [None]:
def eval_classif(y_true, y_pred):
    """Print a confusion matrix followed by a classification report.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels, passed straight to the scikit-learn metrics.
    y_pred : array-like
        Predicted class labels for the same samples.

    Returns
    -------
    None
        Both tables are written to stdout.
    """
    # Name the axes so the printed table shows which side is truth vs. prediction
    matrix = pd.DataFrame(confusion_matrix(y_true, y_pred)).rename_axis(index='True', columns='Pred')
    print(matrix)

    # Blank line between the two tables
    print()

    report = classification_report(y_true, y_pred, digits=5)
    print(report)

In [None]:
# Test set evaluation
# argmax over axis 1 turns the one-hot targets and the softmax outputs into class indices
eval_classif(y_true=np.argmax(y_test_binary.values, axis=1),
             y_pred=np.argmax(model.predict(X_test_binary), axis=1))

In [None]:
# Train set evaluation
# argmax over axis 1 turns the one-hot targets and the softmax outputs into class indices
eval_classif(y_true=np.argmax(y_train_binary.values, axis=1),
             y_pred=np.argmax(model.predict(X_train_binary), axis=1))

In [None]:
# Validation set evaluation
# argmax over axis 1 turns the one-hot targets and the softmax outputs into class indices
eval_classif(y_true=np.argmax(y_valid_binary.values, axis=1),
             y_pred=np.argmax(model.predict(X_valid_binary), axis=1))

###### Line

In [None]:
# Learning Rate: a single trace with no train/validation split
data = [go.Scatter(
    y=history.history['lr'],
    mode='lines',
    name='LR',
    marker={'color': DEFAULT_PLOTLY_COLORS[-1]},
)]
figures = [go.Figure(data=data)]

# (history key, legend name, trace class) for the remaining panels, in subplot order
panel_specs = [
    ('loss',                 'loss',      go.Scatter),    # Loss
    ('categorical_accuracy', 'accuracy',  go.Scattergl),  # Accuracy
    ('auc',                  'auc',       go.Scattergl),  # AUC
    ('precision',            'precision', go.Scatter),    # Precision
    ('recall',               'recall',    go.Scatter),    # Recall
]
for history_key, legend_name, trace_cls in panel_specs:
    # Train curve first (colour 0), then its `val_`-prefixed validation curve (colour 1)
    data = [
        trace_cls(
            y=history.history[prefix + history_key],
            mode='lines',
            name=prefix + legend_name,
            marker={'color': DEFAULT_PLOTLY_COLORS[color_idx]},
            legendgroup=group,
        )
        for prefix, color_idx, group in (('', 0, 'train'), ('val_', 1, 'validate'))
    ]
    figures.append(go.Figure(data=data))

# Keep the individual figure names available to any later cell that refers to them
fig1, fig2, fig3, fig4, fig5, fig6 = figures

data_groups = [figure['data'] for figure in figures]
vp.datagroups_subplots(data_groups,
                       max_col=3,
                       title='Phase 6 - Metrics - Ivis - Binary',
                       out_path=OUT_PATH_GRAPH,
                       subplot_kwargs={
                           'subplot_titles': ['Learning Rate', 'Loss', 'Accuracy', 'AUC', 'Precision', 'Recall']
                       })

# Phase 7 - Classification
- Sparse categorical crossentropy features

In [None]:
# Split train & validation dataset with balanced target label
train_df = balanced_target(
    pd.concat([X_train_sparse, y_train], axis='columns'),
    target=TARGET,
    n_remain=350,
    random_state=10000,
)
# The training rows are handed over as `excludes` -- presumably so the validation
# sample cannot overlap with them (verify against `balanced_target`).
valid_df = balanced_target(
    pd.concat([X_train_sparse, y_train], axis='columns'),
    target=TARGET,
    n_remain=150,
    random_state=10000,
    excludes=train_df.index,
)

# Shuffle dataset (fixed seed) and discard the original row labels
train_df, valid_df = (
    frame.sample(frac=1, random_state=0).reset_index(drop=True)
    for frame in (train_df, valid_df)
)

train_df.shape, valid_df.shape

In [None]:
# Target distribution of the sampled training set
print('Train dataset:')
vp.value_count(train_df, TARGET)

# Target distribution of the sampled validation set
# NOTE(review): only this last call is the cell's final expression; if `vp.value_count`
# returns its result rather than printing it, the train counts above are not shown -- confirm.
print('\nValidate dataset:')
vp.value_count(valid_df, TARGET)

In [None]:
# Separate features & target
X_train_sparse, y_train_sparse = feature_target_split(train_df)
X_valid_sparse, y_valid_sparse = feature_target_split(valid_df)

# Categorical crossentropy target: run every label series through the shared
# one-hot encoder (train, validation, then the untouched test labels)
y_train_sparse, y_valid_sparse, y_test_sparse = (
    target_onehot_encoder.transform(labels.to_frame())
    for labels in (y_train_sparse, y_valid_sparse, y_test)
)

# Report feature / target shapes for each split
print('Train dataset:', f'{X_train_sparse.shape} {y_train_sparse.shape}', sep='\n')
print('\nValidate dataset:', f'{X_valid_sparse.shape} {y_valid_sparse.shape}', sep='\n')

In [None]:
# Feature scaling: fit on the training features only, then apply the fitted
# scaler to the validation and test features
minmax_scaler  = DFMinMaxScaler()
X_train_sparse = minmax_scaler.fit_transform(X_train_sparse)
X_valid_sparse, X_test_sparse = (
    minmax_scaler.transform(frame)
    for frame in (X_valid_sparse, X_test_sparse)
)

tuple(frame.shape for frame in (X_train_sparse, X_valid_sparse, X_test_sparse))

In [None]:
# Reference: https://machinelearningmastery.com/understand-the-dynamics-of-learning-rate-on-deep-learning-neural-networks/
# `mode='max'` must be explicit: with the default `mode='auto'` Keras only treats
# monitors whose name contains 'acc' as higher-is-better, so `val_auc` would be
# minimised and the learning rate would be cut while the AUC is still improving.
lrate = ReduceLROnPlateau(monitor='val_auc', mode='max', factor=0.95, patience=15)

# Reference: https://machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/
# Stop after 30 epochs without a better val_auc and restore the best weights seen.
es = EarlyStopping(monitor='val_auc', mode='max', verbose=1, patience=30, restore_best_weights=True)

# Train a fresh network on the sparse-categorical feature set, validating on the held-out split
model   = compile_model(X_train_sparse)
history = model.fit(X_train_sparse, y_train_sparse,
                    validation_data=(X_valid_sparse, y_valid_sparse),
                    epochs=100,
                    callbacks=[lrate, es],
                    batch_size=32)

In [None]:
# Test set evaluation
# argmax over axis 1 turns the one-hot targets and the softmax outputs into class indices
eval_classif(y_true=np.argmax(y_test_sparse.values, axis=1),
             y_pred=np.argmax(model.predict(X_test_sparse), axis=1))

In [None]:
# Train set evaluation
# argmax over axis 1 turns the one-hot targets and the softmax outputs into class indices
eval_classif(y_true=np.argmax(y_train_sparse.values, axis=1),
             y_pred=np.argmax(model.predict(X_train_sparse), axis=1))

In [None]:
# Validation set evaluation
# argmax over axis 1 turns the one-hot targets and the softmax outputs into class indices
eval_classif(y_true=np.argmax(y_valid_sparse.values, axis=1),
             y_pred=np.argmax(model.predict(X_valid_sparse), axis=1))

###### Line

In [None]:
# Learning Rate: a single trace with no train/validation split
data = [go.Scatter(
    y=history.history['lr'],
    mode='lines',
    name='LR',
    marker={'color': DEFAULT_PLOTLY_COLORS[-1]},
)]
figures = [go.Figure(data=data)]

# (history key, legend name, trace class) for the remaining panels, in subplot order
panel_specs = [
    ('loss',                 'loss',      go.Scatter),    # Loss
    ('categorical_accuracy', 'accuracy',  go.Scattergl),  # Accuracy
    ('auc',                  'auc',       go.Scattergl),  # AUC
    ('precision',            'precision', go.Scatter),    # Precision
    ('recall',               'recall',    go.Scatter),    # Recall
]
for history_key, legend_name, trace_cls in panel_specs:
    # Train curve first (colour 0), then its `val_`-prefixed validation curve (colour 1)
    data = [
        trace_cls(
            y=history.history[prefix + history_key],
            mode='lines',
            name=prefix + legend_name,
            marker={'color': DEFAULT_PLOTLY_COLORS[color_idx]},
            legendgroup=group,
        )
        for prefix, color_idx, group in (('', 0, 'train'), ('val_', 1, 'validate'))
    ]
    figures.append(go.Figure(data=data))

# Keep the individual figure names available to any later cell that refers to them
fig1, fig2, fig3, fig4, fig5, fig6 = figures

data_groups = [figure['data'] for figure in figures]
vp.datagroups_subplots(data_groups,
                       max_col=3,
                       title='Phase 7 - Metrics - Ivis - Sparse Categorical',
                       out_path=OUT_PATH_GRAPH,
                       subplot_kwargs={
                           'subplot_titles': ['Learning Rate', 'Loss', 'Accuracy', 'AUC', 'Precision', 'Recall']
                       })

# Phase 8 - Classification
- Binary + Sparse categorical crossentropy features

In [None]:
# Rename maps that tag the `ivis_0` / `ivis_1` columns with the feature set they come from,
# so the two frames can sit side by side without clashing column names
binary_names = {'ivis_0': 'ivis_binary_0', 'ivis_1': 'ivis_binary_1'}
sparse_names = {'ivis_0': 'ivis_sparse_0', 'ivis_1': 'ivis_sparse_1'}

# NOTE(review): column-wise concat aligns rows on the index; this assumes the binary and
# sparse frames of each split hold the same samples under the same index -- confirm.
X_train_combine = pd.concat(
    [X_train_binary.rename(columns=binary_names), X_train_sparse.rename(columns=sparse_names)],
    axis='columns',
)
X_valid_combine = pd.concat(
    [X_valid_binary.rename(columns=binary_names), X_valid_sparse.rename(columns=sparse_names)],
    axis='columns',
)
X_test_combine = pd.concat(
    [X_test_binary.rename(columns=binary_names), X_test_sparse.rename(columns=sparse_names)],
    axis='columns',
)

tuple(frame.shape for frame in (X_train_combine, X_valid_combine, X_test_combine))

In [None]:
# The combined model reuses the binary phase's targets; copies keep the originals untouched
y_train_combine, y_valid_combine, y_test_combine = (
    target.copy()
    for target in (y_train_binary, y_valid_binary, y_test_binary)
)

tuple(target.shape for target in (y_train_combine, y_valid_combine, y_test_combine))

In [None]:
# Reference: https://machinelearningmastery.com/understand-the-dynamics-of-learning-rate-on-deep-learning-neural-networks/
# `mode='max'` must be explicit: with the default `mode='auto'` Keras only treats
# monitors whose name contains 'acc' as higher-is-better, so `val_auc` would be
# minimised and the learning rate would be cut while the AUC is still improving.
lrate = ReduceLROnPlateau(monitor='val_auc', mode='max', factor=0.95, patience=15)

# Reference: https://machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/
# Stop after 30 epochs without a better val_auc and restore the best weights seen.
es = EarlyStopping(monitor='val_auc', mode='max', verbose=1, patience=30, restore_best_weights=True)

# Train a fresh network on the combined (binary + sparse) feature set, validating on the held-out split
model   = compile_model(X_train_combine)
history = model.fit(X_train_combine, y_train_combine,
                    validation_data=(X_valid_combine, y_valid_combine),
                    epochs=100,
                    callbacks=[lrate, es],
                    batch_size=32)

In [None]:
# Test set evaluation
# argmax over axis 1 turns the one-hot targets and the softmax outputs into class indices
eval_classif(y_true=np.argmax(y_test_combine.values, axis=1),
             y_pred=np.argmax(model.predict(X_test_combine), axis=1))

In [None]:
# Train set evaluation
# argmax over axis 1 turns the one-hot targets and the softmax outputs into class indices
eval_classif(y_true=np.argmax(y_train_combine.values, axis=1),
             y_pred=np.argmax(model.predict(X_train_combine), axis=1))

In [None]:
# Validation set evaluation
# argmax over axis 1 turns the one-hot targets and the softmax outputs into class indices
eval_classif(y_true=np.argmax(y_valid_combine.values, axis=1),
             y_pred=np.argmax(model.predict(X_valid_combine), axis=1))

###### Line

In [None]:
# Learning Rate: a single trace with no train/validation split
data = [go.Scatter(
    y=history.history['lr'],
    mode='lines',
    name='LR',
    marker={'color': DEFAULT_PLOTLY_COLORS[-1]},
)]
figures = [go.Figure(data=data)]

# (history key, legend name, trace class) for the remaining panels, in subplot order
panel_specs = [
    ('loss',                 'loss',      go.Scatter),    # Loss
    ('categorical_accuracy', 'accuracy',  go.Scattergl),  # Accuracy
    ('auc',                  'auc',       go.Scattergl),  # AUC
    ('precision',            'precision', go.Scatter),    # Precision
    ('recall',               'recall',    go.Scatter),    # Recall
]
for history_key, legend_name, trace_cls in panel_specs:
    # Train curve first (colour 0), then its `val_`-prefixed validation curve (colour 1)
    data = [
        trace_cls(
            y=history.history[prefix + history_key],
            mode='lines',
            name=prefix + legend_name,
            marker={'color': DEFAULT_PLOTLY_COLORS[color_idx]},
            legendgroup=group,
        )
        for prefix, color_idx, group in (('', 0, 'train'), ('val_', 1, 'validate'))
    ]
    figures.append(go.Figure(data=data))

# Keep the individual figure names available to any later cell that refers to them
fig1, fig2, fig3, fig4, fig5, fig6 = figures

data_groups = [figure['data'] for figure in figures]
vp.datagroups_subplots(data_groups,
                       max_col=3,
                       title='Phase 8 - Metrics - Ivis - Binary + Sparse Categorical',
                       out_path=OUT_PATH_GRAPH,
                       subplot_kwargs={
                           'subplot_titles': ['Learning Rate', 'Loss', 'Accuracy', 'AUC', 'Precision', 'Recall']
                       })