In [None]:
import lib._util.visualplot as vp
import lib._util.fileproc as fp

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 100)

import numpy as np

# Plotly
import plotly.express as px
import plotly.graph_objects as go

# Time measurement
import time
from datetime import timedelta, datetime

# Sound notification
import winsound

# Useful Functions

In [None]:
# Company code; it is interpolated into every path below
COMPANY_CODE = 'MAG'

# Column of the dataset that is used as the classification label
TARGET = 'target4'

# Folder the input CSV files are read from
SOURCE_PATH_TRANS = f'resources/output/eda_trans/file/{COMPANY_CODE}/'

# Output folders for graphs and files
OUT_PATH_GRAPH = f'resources/output/cnn_00-99/graph/{COMPANY_CODE}/'
OUT_PATH_FILE = f'resources/output/cnn_00-99/file/{COMPANY_CODE}/'

In [None]:
def time_taken(seconds):
    """Print an elapsed duration and play a short two-tone beep.

    Args:
        seconds: Elapsed time in seconds; it is rendered through
            ``datetime.timedelta`` (e.g. ``0:01:05``).
    """
    elapsed = timedelta(seconds=seconds)
    print(f'\nTime Taken: {elapsed}')

    # Audible cue: a lower tone followed by a shorter, higher one
    for tone_hz, length_ms in ((1000, 100), (1500, 50)):
        winsound.Beep(frequency=tone_hz, duration=length_ms)

# Phase 1 - Feature Loading
- Load digit frequency

In [None]:
def load_feature(filename):
    """Read the feature CSV ``filename`` from ``SOURCE_PATH_TRANS``.

    Only ``draw_date``, ``draw_period`` and the 100 columns named ``00`` to
    ``99`` are loaded. The file is ';'-separated and is read in chunks of
    50,000 rows that are concatenated into a single DataFrame.

    Args:
        filename: File name relative to ``SOURCE_PATH_TRANS``.

    Returns:
        pandas.DataFrame with ``draw_date`` parsed using the ``%Y-%m-%d`` format.
    """
    digit_columns = [str(number).zfill(2) for number in range(100)]
    wanted_columns = ['draw_date', 'draw_period'] + digit_columns

    def parse_day(text):
        return pd.to_datetime(text, format='%Y-%m-%d')

    reader = pd.read_csv(
        f'{SOURCE_PATH_TRANS}{filename}',
        sep=';',
        usecols=wanted_columns,
        parse_dates=['draw_date'],
        date_parser=parse_day,
        chunksize=50_000,
    )
    return pd.concat(reader)

In [None]:
# Load the digit-frequency feature file of the configured company
feature_df = load_feature(f'{COMPANY_CODE} - digit_frequency.csv')

# NOTE(review): faststat is a project helper; presumably shows a quick summary of the frame - confirm
vp.faststat(feature_df)

# Phase 2 - Target Loading
- Create target label

In [None]:
def load_target(filename):
    """Read the transactions CSV ``filename`` from ``SOURCE_PATH_TRANS``.

    Only ``draw_date``, ``draw_period`` and ``1st`` are loaded. ``1st`` is kept
    as a string so that its individual characters stay addressable. The file
    is ';'-separated and is read in chunks of 50,000 rows that are
    concatenated into a single DataFrame.

    Args:
        filename: File name relative to ``SOURCE_PATH_TRANS``.

    Returns:
        pandas.DataFrame with ``draw_date`` parsed using the ``%Y-%m-%d`` format.
    """
    def parse_day(text):
        return pd.to_datetime(text, format='%Y-%m-%d')

    reader = pd.read_csv(
        f'{SOURCE_PATH_TRANS}{filename}',
        sep=';',
        usecols=['draw_date', 'draw_period', '1st'],
        dtype={'1st': str},
        parse_dates=['draw_date'],
        date_parser=parse_day,
        chunksize=50_000,
    )
    return pd.concat(reader)

In [None]:
# Load the transactions file of the configured company (draw date, draw period, '1st')
target_df = load_target(f'{COMPANY_CODE} - transactions.csv')

# NOTE(review): faststat is a project helper; presumably shows a quick summary of the frame - confirm
vp.faststat(target_df)

In [None]:
# The target of a row is the '1st' value of the row that follows it, so the
# last row has no target (NaN). Assumes rows are ordered by draw - TODO confirm
target_df['target'] = target_df['1st'].shift(-1)

# One nullable Int8 column per character of the target:
# position 0 -> 'target4', position 1 -> 'target3', ..., position 3 -> 'target1'
for index in range(4):
    column = f'target{4 - index}'
    target_df[column] = (
        target_df['target']
        # NaN != NaN, so missing targets are passed through untouched
        .map(lambda value: value if value != value else value[index])
        .astype(float)
        .astype('Int8')
    )

target_df.head()

In [None]:
# Remove the helper columns now that the per-digit targets exist.
# Re-assigning (instead of inplace=True) and ignoring columns that are already
# gone keeps this cell re-runnable: a second run no longer raises KeyError.
target_df = target_df.drop(columns=['1st', 'target'], errors='ignore')

# Phase 3 - Dataset
- Map target label to features

In [None]:
# Show (rows, columns) of both frames before they are merged
feature_df.shape, target_df.shape

In [None]:
# Inner join: only (draw_date, draw_period) pairs present in both frames are kept
data_df = pd.merge(
    feature_df,
    target_df,
    how='inner',
    on=['draw_date', 'draw_period'],
)

vp.faststat(data_df)

In [None]:
def val_dist(df, column):
    """Print the absolute and relative frequency of each value in ``df[column]``.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column whose value distribution is printed.

    Returns:
        None. A table with a ``Count`` and a ``Ratio`` column, indexed by the
        distinct values of the column, is written to stdout.
    """
    values = df[column]
    counts = values.value_counts().to_frame(name='Count')
    ratios = values.value_counts(normalize=True).to_frame(name='Ratio')

    # Both frames are indexed by the distinct values; align the ratios onto the counts
    print(counts.join(ratios, how='left'))

In [None]:
# Target distribution
# dropna() removes, in place, every row that contains a missing value. This
# covers rows without a target: the target was built with shift(-1), which
# leaves the last row unlabeled.
data_df.dropna(inplace=True)

print('Full dataset:')
val_dist(data_df, TARGET)

In [None]:
def balanced_target(df, target, n_remain, excludes=None, seed=10000):
    """Draw the same number of rows for every distinct label of ``target``.

    For each label, ``n_remain`` index labels are picked at random (without
    replacement) among the rows carrying that label, skipping any index label
    listed in ``excludes``.

    Args:
        df: Source DataFrame.
        target: Name of the label column to balance on.
        n_remain: Number of rows to keep per label.
        excludes: Optional iterable of index labels that must not be picked
            (e.g. the index of a previously drawn sample). ``None`` excludes
            nothing.
        seed: Seed passed to ``np.random.seed`` before sampling. The default
            reproduces the previously hard-coded value.

    Returns:
        pandas.DataFrame with the selected rows of every label concatenated,
        labels in ``np.unique`` order and rows in their original ``df`` order.

    Raises:
        ValueError: If a label has fewer than ``n_remain`` eligible rows.
    """
    # A set gives O(1) membership checks whatever container the caller passed
    excluded = set(excludes) if excludes is not None else set()

    # NOTE: this seeds NumPy's global generator, so later np.random calls in
    # the session are affected as well (unchanged from the original behavior)
    np.random.seed(seed)

    dfs = []
    for target_label in np.unique(df[target]):
        label_indexes = df[df[target] == target_label].index
        # Order is preserved so a given seed always selects the same rows
        candidates = [x for x in label_indexes if x not in excluded]

        if len(candidates) < n_remain:
            raise ValueError(
                f'Label {target_label!r} of {target!r} has only {len(candidates)} '
                f'eligible rows, cannot sample {n_remain} without replacement'
            )

        choices = np.random.choice(candidates, size=n_remain, replace=False)
        dfs.append(df[df.index.isin(choices)].copy())

    return pd.concat(dfs)

In [None]:
# Label-balanced sampling: 350 rows per label for training, then 150 rows per
# label for validation, drawn among the rows not already used for training
train_df = balanced_target(data_df, target=TARGET, n_remain=350)
valid_df = balanced_target(data_df, target=TARGET, n_remain=150, excludes=train_df.index)

# Every row that is in neither sample becomes test data
used_indexes = [*train_df.index, *valid_df.index]
test_df = data_df.loc[~data_df.index.isin(used_indexes)].copy()

# Shuffle each dataset with a fixed seed
train_df = train_df.sample(frac=1, random_state=0)
valid_df = valid_df.sample(frac=1, random_state=0)
test_df = test_df.sample(frac=1, random_state=0)

tuple(frame.shape for frame in (train_df, valid_df, test_df))

In [None]:
# Print the target distribution of every split under its own heading
for heading, frame in (
    ('Train dataset:', train_df),
    ('\nValidate dataset:', valid_df),
    ('\nTest dataset:', test_df),
):
    print(heading)
    val_dist(frame, TARGET)

In [None]:
def sampling_period(df, title):
    """Plot, per dataset, how many rows fall into each calendar month.

    Args:
        df: DataFrame with a datetime ``draw_date`` column and a ``dataset``
            column naming the split each row belongs to.
        title: Passed to ``vp.generate_plot`` as ``out_filename``.

    The bar chart (one facet row per dataset) is handed to ``vp.generate_plot``
    together with ``OUT_PATH_GRAPH`` as ``out_path``.
    """
    # Month of each draw as a 'YYYY-MM' string, then row count per (dataset, month)
    monthly_counts = (
        df.assign(year_month=df['draw_date'].dt.to_period('M').astype(str))
          .groupby(['dataset', 'year_month'])
          .agg(count=('year_month', 'count'))
          .reset_index()
    )

    figure = px.bar(monthly_counts, x='year_month', y='count', facet_row='dataset')
    vp.generate_plot(figure, out_path=OUT_PATH_GRAPH, out_filename=title)

In [None]:
# Tag every split with its name so the plot can facet on it
for frame, label in ((train_df, 'train'), (valid_df, 'validate'), (test_df, 'test')):
    frame['dataset'] = label

sampling_period(
    pd.concat([train_df, valid_df, test_df]),
    title='Phase 3 - Bar - Draw Date (Sample)',
)

# Phase 4 - Classification
- Separate the dataset into features & target
- Feature scaling
- Classification

In [None]:
# Separate features & target
# The 100 columns '00'..'99' of each row are reshaped into one 10 x 10 matrix
feature_columns = [f'{digit:02d}' for digit in range(100)]

X_train, X_valid, X_test = (
    frame[feature_columns].values.reshape(-1, 10, 10)
    for frame in (train_df, valid_df, test_df)
)
y_train, y_valid, y_test = (frame[TARGET] for frame in (train_df, valid_df, test_df))

for heading, features, labels in (
    ('Train dataset:', X_train, y_train),
    ('\nValidate dataset:', X_valid, y_valid),
    ('\nTest dataset:', X_test, y_test),
):
    print(heading)
    print(features.shape, labels.shape)

In [None]:
def feature_scaling(X):
    """Min-max scale every matrix in ``X`` to the range 0 - 1 on its own.

    Each element of ``X`` is scaled with its own minimum and maximum, so the
    result for one matrix does not depend on the other matrices.

    Args:
        X: Iterable of numeric arrays (e.g. an array of shape (n, 10, 10)).

    Returns:
        numpy.ndarray stacking the scaled matrices in input order. A matrix
        whose values are all equal is mapped to all zeros instead of NaN.
    """
    new_X = []

    # NOTE: Normalize each matrix to range from 0 - 1 individually
    for x in X:
        _min = np.amin(x)
        span = np.amax(x) - _min

        # A constant matrix has zero span and would produce 0 / 0 = NaN;
        # dividing by 1 instead yields zeros with the usual result dtype
        if span == 0:
            span = 1

        new_X.append((x - _min) / span)

    return np.array(new_X)

In [None]:
# Scale every split, each matrix individually
X_train, X_valid, X_test = map(feature_scaling, (X_train, X_valid, X_test))

# Show the overall value range of each scaled split
for heading, features in (
    ('Train dataset:', X_train),
    ('\nValidate dataset:', X_valid),
    ('\nTest dataset:', X_test),
):
    print(heading)
    print(np.amin(features), np.amax(features))

In [None]:
# TODO - create CNN model