In [None]:
import lib._util.visualplot as vp
import lib._util.fileproc as fp

from lib._class.DataGenerator import DataGenerator

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 100)

import numpy as np
import glob

# Plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.colors import DEFAULT_PLOTLY_COLORS

# Time measurement
import time
from datetime import timedelta, datetime

# Sound notification
import winsound

# Scikit-Learn
from sklearn.metrics import classification_report, confusion_matrix

# Tensorflow
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Useful Functions

In [None]:
# Run configuration.
# COMPANY_CODE selects the input/output folders below and the per-company start date;
# TARGET is the label column the classifier is trained on.
COMPANY_CODE      = 'MAG'
TARGET            = 'target4'
# Input folders: moving-RFM feature CSVs and transaction CSVs of the company.
SOURCE_PATH_RFM   = f'resources/output/eda_rfm/file/{COMPANY_CODE}/Moving RFM/'
SOURCE_PATH_TRANS = f'resources/output/eda_trans/file/{COMPANY_CODE}/'
# Output folders: graphs are written to OUT_PATH_GRAPH by the plotting cells.
# NOTE(review): OUT_PATH_FILE is not referenced by any cell visible here.
OUT_PATH_GRAPH    = f'resources/output/cnn_rfm/graph/{COMPANY_CODE}/'
OUT_PATH_FILE     = f'resources/output/cnn_rfm/file/{COMPANY_CODE}/'

In [None]:
def time_taken(seconds):
    """Report an elapsed duration and signal completion with two short beeps.

    Parameters
    ----------
    seconds : float
        Elapsed time in seconds; printed in the format of ``str(timedelta)``.

    Notes
    -----
    Uses ``winsound``, which only exists on Windows.
    """
    elapsed = timedelta(seconds=seconds)
    print('\nTime Taken: ' + str(elapsed))

    # (frequency in Hz, duration in ms) of the two notification tones
    for tone_hz, length_ms in ((1000, 100), (1500, 50)):
        winsound.Beep(frequency=tone_hz, duration=length_ms)

# Phase 1 - Feature Loading
- Load the periods in which every number has occurred at least once
- Convert features to transaction format

In [None]:
def load_moving_rfm(company_code, start_year=None, end_year=None):
    """Load the yearly moving-RFM CSV files of a company into one DataFrame.

    Files are searched with the pattern ``{SOURCE_PATH_RFM}{company_code} - *.csv``.
    The four characters right before the ``.csv`` extension are parsed as the
    year of the file and used for filtering and ordering.

    Parameters
    ----------
    company_code : str
        Prefix of the file names to load.
    start_year, end_year : int or None
        Inclusive year bounds; ``None`` disables the respective bound.

    Returns
    -------
    pandas.DataFrame
        Rows of all selected files, concatenated in ascending year order.
        'number' is kept as string and 'date' is parsed as ``%Y-%m-%d``.
        The per-file row index is kept as-is (it is not reset here).

    Raises
    ------
    ValueError
        If no file matches the pattern and year range, if a file name does not
        carry a 4-digit year before the extension, or if a date is malformed.
    """
    pattern = f'{SOURCE_PATH_RFM}{company_code} - *.csv'

    # glob only returns paths ending in the 4-character extension, so the year
    # is always at [-8:-4] regardless of what the directory part contains.
    year_by_file = {path: int(path[-8:-4]) for path in glob.glob(pattern)}

    # glob order is filesystem dependent; sort so the result is deterministic
    # and chronological.
    files = sorted(
        (path for path, year in year_by_file.items()
         if (start_year is None or year >= start_year)
         and (end_year is None or year <= end_year)),
        key=lambda path: (year_by_file[path], path),
    )
    if not files:
        raise ValueError(
            f'No moving RFM file matches "{pattern}" '
            f'for years {start_year} to {end_year}'
        )

    dfs = []
    for file in files:
        print(file)
        df_chunks = pd.read_csv(file, sep=';', dtype={'number': str},
                                chunksize=50_000)
        df = pd.concat(df_chunks)
        # Parse after loading: the `date_parser` argument of read_csv is
        # deprecated, while an explicit to_datetime works on every pandas version.
        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
        dfs.append(df)

    return pd.concat(dfs)

In [None]:
EXEC_START = time.time()

# Per company: (first draw date to keep, first year of RFM files to load).
# Unknown company codes fall back to (None, None).
_START_BY_COMPANY = {
    'MAG': ('2013-11-17', 2013),
    'DMC': ('2016-11-27', 2016),
    'ST':  ('2015-05-27', 2015),
}
date, start_year = _START_BY_COMPANY.get(COMPANY_CODE, (None, None))
end_year         = 2020

feature_df = load_moving_rfm(COMPANY_CODE, start_year=start_year, end_year=end_year)

# Keep only rows on/after the start date and renumber the rows from 0
feature_df = feature_df[feature_df['date'] >= date].reset_index(drop=True).copy()

vp.faststat(feature_df)

EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)

In [None]:
# Convert features to transaction format
def _to_wide(values, prefix):
    """Reshape a flat value array into rows of 10000 columns named ``{prefix}_0000`` .. ``{prefix}_9999``."""
    matrix = values.reshape(-1, 10000)
    names  = [f'{prefix}_{position:04d}' for position in range(matrix.shape[1])]
    return pd.DataFrame(matrix, columns=names)

recency_df   = _to_wide(feature_df['recency'].values, 'recency')
frequency_df = _to_wide(feature_df['frequency'].values, 'frequency')
monetary_df  = _to_wide(feature_df['monetary'].values, 'monetary')

# One row per distinct date, in order of first appearance
date_df = pd.DataFrame(feature_df['date'].unique(), columns=['draw_date'])

del feature_df

recency_df.shape, frequency_df.shape, monetary_df.shape, date_df.shape

In [None]:
# Compile transaction features: align the three wide frames to the draw dates by row position
feature_df = (
    date_df
    .merge(recency_df,   how='left', left_index=True, right_index=True)
    .merge(frequency_df, how='left', left_index=True, right_index=True)
    .merge(monetary_df,  how='left', left_index=True, right_index=True)
)

del recency_df, frequency_df, monetary_df, date_df

vp.faststat(feature_df)

# Phase 2 - Target Loading
- Create target label

In [None]:
def load_target(filename):
    """Load the draw date, draw period and '1st' value from a transactions CSV.

    Parameters
    ----------
    filename : str
        File name inside ``SOURCE_PATH_TRANS``.

    Returns
    -------
    pandas.DataFrame
        Columns 'draw_date' (datetime, parsed as ``%Y-%m-%d``), 'draw_period'
        and '1st' (kept as string).

    Raises
    ------
    ValueError
        If a 'draw_date' value does not match ``%Y-%m-%d``.
    """
    source_file = f'{SOURCE_PATH_TRANS}{filename}'
    df_chunks   = pd.read_csv(source_file, sep=';',
                              usecols=['draw_date', 'draw_period', '1st'],
                              dtype={'1st': str},
                              chunksize=50_000)
    target_df = pd.concat(df_chunks)

    # Parse after loading: the `date_parser` argument of read_csv is deprecated,
    # while an explicit to_datetime works on every pandas version.
    target_df['draw_date'] = pd.to_datetime(target_df['draw_date'], format='%Y-%m-%d')
    return target_df

In [None]:
# Load draw date, draw period and the '1st' value of every draw of the configured company
target_df = load_target(f'{COMPANY_CODE} - transactions.csv')

# Quick overview of the loaded frame
vp.faststat(target_df)

In [None]:
# Take target from following period (the last row has no successor and stays missing)
target_df['target'] = target_df['1st'].shift(-1)

# Split target into digits: character 0 becomes target4, ..., character 3 becomes target1
for position in range(4):
    digit_column = f'target{4 - position}'
    # `value != value` is only true for NaN, which is passed through unchanged
    target_df[digit_column] = target_df['target'].apply(
        lambda value: value if value != value else value[position]
    )
    target_df[digit_column] = target_df[digit_column].astype(float).astype('Int8')

target_df.head()

In [None]:
# '1st' and the shifted 'target' string were only needed to derive the digit columns
target_df.drop(columns=['1st', 'target'], inplace=True)

# Phase 3 - Dataset
- Map target label to features

In [None]:
# Shapes of the feature and target frames before they are joined on 'draw_date'
feature_df.shape, target_df.shape

In [None]:
# Keep only the draw dates present in both the features and the targets
data_df = pd.merge(feature_df, target_df, how='inner', on='draw_date')

del feature_df, target_df

vp.faststat(data_df)

In [None]:
# Rows with any missing value are removed in place
data_df.dropna(inplace=True)

# Label columns (prefix '1st' or 'target') become plain integers
label_columns = [name for name in data_df.columns if name.startswith(('1st', 'target'))]
data_df[label_columns] = data_df[label_columns].astype(int)

# Target distribution
print('Full dataset:')
vp.value_count(data_df, TARGET)

In [None]:
def balanced_target(df, target, n_remain, excludes=None, random_state=None):
    """Draw the same number of rows for every label of ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; rows are selected by index label.
    target : str
        Name of the label column to balance on.
    n_remain : int
        Number of rows to draw (without replacement) for each label.
    excludes : list-like of index labels, optional
        Index labels that must not be drawn. ``None`` (default) or an empty
        collection excludes nothing.
    random_state : int or None
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global generator.

    Returns
    -------
    pandas.DataFrame
        Copies of the drawn rows, grouped by label in ascending label order and
        in the original row order within each label.

    Raises
    ------
    ValueError
        If a label has fewer than ``n_remain`` rows left after exclusion.
    """
    np.random.seed(random_state)
    excluded = [] if excludes is None else excludes

    dfs = []
    for target_label in np.unique(df[target]):
        indexes = df[df[target] == target_label].index
        # Vectorised exclusion; keeps the original order of the candidates
        indexes = indexes[~indexes.isin(excluded)]

        if len(indexes) < n_remain:
            raise ValueError(
                f'Label {target_label!r} of "{target}" has only {len(indexes)} '
                f'row(s) available, cannot draw {n_remain}'
            )

        choices = np.random.choice(indexes.to_numpy(), size=n_remain, replace=False)
        dfs.append(df[df.index.isin(choices)].copy())

    return pd.concat(dfs)

In [None]:
# Split train & validation dataset with balanced target label
# (55 rows per label for training, 25 further rows per label for validation)
train_df = balanced_target(data_df, target=TARGET, n_remain=55, random_state=10000)
valid_df = balanced_target(data_df, target=TARGET, n_remain=25, random_state=10000, excludes=train_df.index)

# Remaining goes to test dataset
used_indexes = [*train_df.index, *valid_df.index]
test_df      = data_df.loc[~data_df.index.isin(used_indexes)].copy()

# Shuffle dataset (fixed seed keeps the order reproducible)
train_df, valid_df, test_df = (
    part.sample(frac=1, random_state=0) for part in (train_df, valid_df, test_df)
)

del data_df

train_df.shape, valid_df.shape, test_df.shape

In [None]:
# Show the TARGET label distribution of each split. Train and validation were
# drawn with a fixed number of rows per label; test holds the remaining rows.
print('Train dataset:')
vp.value_count(train_df, TARGET)

print('\nValidate dataset:')
vp.value_count(valid_df, TARGET)

print('\nTest dataset:')
vp.value_count(test_df, TARGET)

In [None]:
def sampling_period(df, title):
    """Plot how many rows of each dataset fall into each calendar month.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a datetime 'draw_date' column and a 'dataset' column.
        The frame itself is not modified.
    title : str
        Passed to ``vp.generate_plot`` as the output file name.
    """
    monthly_counts = (
        df.assign(year_month=df['draw_date'].dt.to_period('M').astype(str))
          .groupby(['dataset', 'year_month'])
          .agg(count=('year_month', 'count'))
          .reset_index()
    )

    # One bar chart row per dataset, months on the x axis
    fig = px.bar(monthly_counts, x='year_month', y='count', facet_row='dataset')
    vp.generate_plot(fig, out_path=OUT_PATH_GRAPH, out_filename=title)

In [None]:
# Tag every row with the split it belongs to; sampling_period groups and facets on this column
train_df['dataset'] = 'train'
valid_df['dataset'] = 'validate'
test_df['dataset']  = 'test'

# Bar chart of the number of rows per draw month, one facet row per split
sampling_period(pd.concat([train_df, valid_df, test_df]),
                title='Phase 3 - Bar - Draw Date (Sample)')

# Phase 4 - Classification
- Separate dataset to features & target
- Feature scaling
- Classification

In [None]:
def feature_scaling(X):
    """Min-max scale every matrix of ``X`` to the range 0 - 1 individually.

    Parameters
    ----------
    X : iterable of array-like
        Matrices to scale; each one is scaled with its own minimum and maximum.

    Returns
    -------
    numpy.ndarray
        The scaled matrices stacked along the first axis. A constant matrix
        (maximum equals minimum) is mapped to all zeros instead of NaN.
    """
    new_X = []

    # NOTE: Normalize each matrix to range from 0 - 1 individually
    for x in X:
        _min        = np.amin(x)
        value_range = np.amax(x) - _min

        # A constant matrix would give 0 / 0 = NaN, which would then poison
        # training. Dividing by 1 instead yields an all-zero matrix.
        if value_range == 0:
            value_range = 1

        new_X.append((x - _min) / value_range)

    return np.array(new_X)

def feature_target_split(df):
    """Turn a dataset frame into a CNN input tensor and its label column.

    For each of the column prefixes 'recency_', 'frequency_' and 'monetary_',
    the matching columns of every row are reshaped to a 100 x 100 matrix and
    scaled with ``feature_scaling``. The three results are stacked on a new
    last axis.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the stacked array and ``y`` is ``df[TARGET]``.
    """
    def _scaled_channel(prefix):
        channel_columns = [name for name in df.columns if name.startswith(prefix)]
        matrices = df[channel_columns].values.reshape(-1, 100, 100)
        return feature_scaling(matrices)

    channels = [_scaled_channel(prefix) for prefix in ('recency_', 'frequency_', 'monetary_')]

    return np.stack(channels, axis=3), df[TARGET]

In [None]:
# Separate features & target
X_train, y_train = feature_target_split(train_df)
X_valid, y_valid = feature_target_split(valid_df)
X_test,  y_test  = feature_target_split(test_df)

del train_df, valid_df, test_df

print('Train dataset:')
print(X_train.shape, y_train.shape)

print('\nValidate dataset:')
print(X_valid.shape, y_valid.shape)

print('\nTest dataset:')
print(X_test.shape, y_test.shape)

In [None]:
def target_reshape(y):
    """One-hot encode the class labels ``y`` as an int8 matrix via Keras ``to_categorical``."""
    one_hot = to_categorical(y, dtype='int8')
    return one_hot

In [None]:
# Target reshaping
y_train = target_reshape(y_train)
y_valid = target_reshape(y_valid)
y_test  = target_reshape(y_test)

print('Train target:')
print(y_train.shape)

print('\nValidate target:')
print(y_valid.shape)

print('\nTest target:')
print(y_test.shape)

In [None]:
def compile_model(X):
    """Build and compile the CNN classifier.

    Architecture: four 3x3 'same'-padded ReLU convolutions with 16, 32, 64 and
    128 filters, one 2x2 max-pooling layer, 50% dropout, dense layers of 128 and
    64 units, and a 10-unit softmax output.

    Parameters
    ----------
    X : array with a 4-dimensional ``shape``
        Only ``X.shape[1:4]`` is read; it becomes the input shape of the first
        layer.

    Returns
    -------
    Sequential
        Model compiled with categorical cross-entropy, Adam
        (learning rate 0.001, epsilon 1e-5) and the metrics named 'acc', 'auc',
        'precision' and 'recall'. These names are the keys under which the
        values appear in the training history and callback monitors.
    """
    input_shape = (X.shape[1], X.shape[2], X.shape[3])

    model = Sequential()
    for position, filters in enumerate((16, 32, 64, 128)):
        # Only the first layer declares the input shape
        first_layer_args = {'input_shape': input_shape} if position == 0 else {}
        model.add(Conv2D(filters, kernel_size=(3,3), strides=(1,1),
                         padding='same', activation='relu', kernel_initializer='he_uniform',
                         **first_layer_args))
    model.add(MaxPool2D(pool_size=(2,2)))
    model.add(Dropout(rate=.5))
    model.add(Flatten())
    model.add(Dense(128, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(64, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(10, activation='softmax'))

    # Reference: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data
    metrics = [
        'acc',
        AUC(name='auc'),
        Precision(name='precision'),
        Recall(name='recall'),
    ]
    # `learning_rate` is the supported argument name; `lr` is a deprecated
    # alias that newer Keras releases reject.
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(learning_rate=.001, epsilon=.00001),
                  metrics=metrics)

    return model

In [None]:
# Reference: https://machinelearningmastery.com/understand-the-dynamics-of-learning-rate-on-deep-learning-neural-networks/
# mode='max' is required: with the default mode='auto' only monitors whose name
# contains 'acc' are maximised, so 'val_auc' would be treated as a quantity to
# minimise and the learning rate would be reduced whenever AUC stops *decreasing*.
lrate = ReduceLROnPlateau(monitor='val_auc', mode='max', factor=0.95, patience=15)

# Reference: https://machinelearningmastery.com/how-to-stop-training-deep-neural-networks-at-the-right-time-using-early-stopping/
es = EarlyStopping(monitor='val_auc', mode='max', verbose=1, patience=30, restore_best_weights=True)

model   = compile_model(X_train)
# Model.fit accepts generators directly; fit_generator is deprecated.
history = model.fit(
    DataGenerator(X_train, y_train, batch_size=1),
    validation_data=DataGenerator(X_valid, y_valid, batch_size=1),
    epochs=100,
    callbacks=[lrate, es]
)

In [None]:
def eval_classif(X, y, model):
    """Print the confusion matrix and classification report of ``model`` on ``(X, y)``.

    Parameters
    ----------
    X : model input passed to ``model.predict``
    y : 2-D array of per-class scores/one-hot labels; the true class of a row
        is the position of its maximum.
    model : object with a ``predict`` method returning per-class scores.
    """
    y_true = np.argmax(y, axis=1)
    y_pred = np.argmax(model.predict(X), axis=1)

    # Rows are the true classes, columns the predicted classes
    cofmat_df = (
        pd.DataFrame(confusion_matrix(y_true, y_pred))
        .rename_axis(index='True', columns='Pred')
    )

    print(cofmat_df, classification_report(y_true, y_pred, digits=5), sep='\n\n')

In [None]:
# Test set evaluation: confusion matrix and classification report on the rows
# that were used for neither training nor validation
eval_classif(X_test, y_test, model)

In [None]:
# Train set evaluation: same report on the data the model was fitted on
eval_classif(X_train, y_train, model)

In [None]:
# Validation set evaluation: same report on the data monitored by the callbacks
eval_classif(X_valid, y_valid, model)

In [None]:
def _metric_figure(metric, trace_cls, label=None):
    """Build a figure with the training and validation curve of one history metric.

    ``metric`` is the key in ``history.history`` (the validation curve uses the
    ``val_`` prefixed key), ``trace_cls`` the Plotly trace class to use and
    ``label`` the legend name (defaults to ``metric``).
    """
    label = metric if label is None else label
    traces = [
        trace_cls(
            y=history.history[f'{prefix}{metric}'],
            mode='lines',
            name=f'{prefix}{label}',
            marker={'color': DEFAULT_PLOTLY_COLORS[position]},
            legendgroup=group,
        )
        for position, (prefix, group) in enumerate((('', 'train'), ('val_', 'validate')))
    ]
    return go.Figure(data=traces)

# Learning Rate
# The key written by ReduceLROnPlateau is 'lr' on older Keras releases;
# newer releases record it as 'learning_rate'.
lr_key = 'lr' if 'lr' in history.history else 'learning_rate'
fig1 = go.Figure(data=[go.Scatter(
    y=history.history[lr_key],
    mode='lines',
    name='LR',
    marker={'color': DEFAULT_PLOTLY_COLORS[-1]},
)])

# Loss
fig2 = _metric_figure('loss', go.Scatter)

# Accuracy (history key 'acc', shown as 'accuracy' / 'val_accuracy')
fig3 = _metric_figure('acc', go.Scattergl, label='accuracy')

# AUC
fig4 = _metric_figure('auc', go.Scattergl)

# Precision
fig5 = _metric_figure('precision', go.Scatter)

# Recall
fig6 = _metric_figure('recall', go.Scatter)

# One subplot per metric, laid out three per row
data_groups = [fig1['data'], fig2['data'], fig3['data'], fig4['data'], fig5['data'], fig6['data']]
vp.datagroups_subplots(data_groups,
                       max_col=3,
                       title='Phase 4 - Metrics',
                       out_path=OUT_PATH_GRAPH,
                       subplot_titles=['Learning Rate', 'Loss', 'Accuracy', 'AUC', 'Precision', 'Recall'])