In [None]:
import lib._util.visualplot as vp
import lib._util.fileproc as fp

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 100)

import copy
import tqdm

# Plotly
import plotly.express as px
import plotly.graph_objects as go

# Time measurement
import time
from datetime import timedelta, datetime

# Sound notification
import winsound

# Useful Functions

In [None]:
# Directory prefix for the input CSV files read by the loading helpers.
SOURCE_PATH_DATA = 'resources/data/'
# Base directory handed to the plotting helpers as `out_path` (a `<key>/` sub-folder is appended per company).
OUT_PATH_GRAPH   = 'resources/output/eda_trans/graph/'
# Base directory for exported CSV files (a `<key>/` sub-folder is appended per company).
OUT_PATH_FILE    = 'resources/output/eda_trans/file/'

In [None]:
def time_taken(seconds):
    """Print an elapsed duration and play a short two-tone notification.

    Parameters
    ----------
    seconds : int or float
        Elapsed time in seconds; rendered through ``datetime.timedelta``.
    """
    elapsed = timedelta(seconds=seconds)
    print(f'\nTime Taken: {elapsed}')

    # Audible cue that the timed step has finished (`winsound` is Windows-only).
    for frequency, duration in ((1000, 100), (1500, 50)):
        winsound.Beep(frequency=frequency, duration=duration)

def smart_title(text, sep=' '):
    """Title-case `text` word by word while leaving all-uppercase words untouched.

    Parameters
    ----------
    text : str
        Text to convert.
    sep : str, default ' '
        Separator used to split `text` into words.

    Returns
    -------
    str
        The converted words joined by a single space (regardless of `sep`).
    """
    words = []
    for word in text.split(sep):
        # Acronyms such as 'DMC' are already upper-case and must stay that way.
        words.append(word if word.isupper() else word.title())
    return ' '.join(words)

# Phase 1 - Data Loading
- Load timeseries result

In [None]:
def load_data(filename):
    """Load a semicolon-separated dataset and split it into one frame per company.

    Parameters
    ----------
    filename : str
        File name appended to ``SOURCE_PATH_DATA``.

    Returns
    -------
    dict
        Maps every distinct ``company_code`` value (in order of first
        appearance) to an independent copy of that company's rows.
    """
    source_file = f'{SOURCE_PATH_DATA}{filename}'

    # Read in chunks and stitch them together; `number` is kept as text
    # (later cleaning compares it against the '----' placeholder).
    chunks = pd.read_csv(source_file, sep=';',
                         dtype={'number': str},
                         chunksize=50_000)
    df = pd.concat(chunks)

    # Parse dates after reading: the `date_parser=` argument of `read_csv` is
    # deprecated since pandas 2.0, while an explicit `to_datetime` works on
    # every pandas version.
    df['draw_date'] = pd.to_datetime(df['draw_date'], format='%Y-%m-%d')

    # Separate by company
    return {
        company: df[df['company_code'] == company].copy()
        for company in df['company_code'].unique()
    }

In [None]:
# Load the raw dataset and split it into one DataFrame per company code.
df_dict = load_data('dataset.csv')
# Show which company codes were found.
print(df_dict.keys())

In [None]:
def faststat(df_dict):
    """Print every key of `df_dict` followed by `vp.faststat` of its frame.

    Parameters
    ----------
    df_dict : dict
        Maps a label (the company code in this notebook) to a DataFrame.
    """
    for company in df_dict:
        print(company)
        vp.faststat(df_dict[company])
        # Blank line to separate consecutive reports.
        print()

In [None]:
# Quick per-company summary of the freshly loaded data.
faststat(df_dict)

In [None]:
def histogram(df_dict, title, columns=None,
              max_col=2, layout_kwargs=None, to_image=True):
    """Draw `vp.histogram` for every frame in `df_dict`.

    Parameters
    ----------
    df_dict : dict
        Maps a label (the company code in this notebook) to a DataFrame.
    title : str
        Title suffix; each plot is titled ``'<key> - <title>'``.
    columns : list-like, optional
        Columns to plot. When omitted, each frame's own columns are used.
    max_col : int, default 2
        Forwarded to `vp.histogram`.
    layout_kwargs : dict, optional
        Forwarded to `vp.histogram`; an empty dict is used when omitted.
    to_image : bool, default True
        Forwarded to `vp.histogram`.
    """
    # A `{}` default would be one dict shared by every call of this function.
    if layout_kwargs is None:
        layout_kwargs = {}

    for key, df in df_dict.items():
        # Resolve the selection per frame. Rebinding `columns` here would pin
        # the first frame's columns onto every later frame.
        selected = df.columns if columns is None else columns
        vp.histogram(df[selected],
                     bin_algo='count',
                     title=f'{key} - {title}',
                     out_path=f'{OUT_PATH_GRAPH}{key}/',
                     max_col=max_col,
                     layout_kwargs=layout_kwargs,
                     to_image=to_image)

In [None]:
# Phase 1 histograms over every column of each company frame (max_col=3 is forwarded to vp.histogram).
histogram(df_dict,
          title='Phase 1 - Histogram',
          max_col=3)

# Phase 2 - Data Preparation
- Drop non-informative field
- Handle invalid position

In [None]:
for key, df in df_dict.items():
    # Drop company column: each frame already belongs to a single company (its dict key).
    # `errors='ignore'` keeps the cell idempotent, so re-running it after the
    # column is gone is a no-op instead of a KeyError.
    cleaned = df.drop(columns=['company_code'], errors='ignore')

    # Remove invalid number (rows whose number is the '----' placeholder)
    cleaned = cleaned[cleaned['number'] != '----'].reset_index(drop=True)

    # Re-assigning an existing key does not resize the dict, so it is safe while iterating.
    df_dict[key] = cleaned

In [None]:
# Handle invalid position on DMC due to duplication:
# keep only rows with position <= 10 and renumber the index.
df_dict['DMC'] = (
    df_dict['DMC']
    .loc[lambda frame: frame['position'] <= 10]
    .reset_index(drop=True)
)

In [None]:
# Re-plot the histograms on the cleaned frames for comparison with Phase 1.
histogram(df_dict,
          title='Phase 2 - Histogram')

# Phase 3 - Data Preparation
- Convert to transactional format

In [None]:
def period_feature(df_dict):
    """Attach a per-company running draw number (`draw_period`) to every frame.

    The draw calendar is read from ``4D_dates.csv`` under ``SOURCE_PATH_DATA``.
    For each company its dates are sorted ascending and numbered 1..n; that
    number is left-joined onto the company's frame via `draw_date`.

    Parameters
    ----------
    df_dict : dict
        Maps company code to a DataFrame that has a `draw_date` column.

    Returns
    -------
    dict
        New dict with the same keys; each value is a new DataFrame with an
        added `draw_period` column (missing where the date is not in the
        calendar). The input frames are not modified.
    """
    # Load date feature
    source_file = f'{SOURCE_PATH_DATA}4D_dates.csv'
    period_df   = pd.read_csv(source_file, sep=';')
    # Parse after reading: `date_parser=` is deprecated since pandas 2.0.
    period_df['draw_date'] = pd.to_datetime(period_df['draw_date'], format='%Y-%m-%d')

    newdf_dict = {}
    for key, df in df_dict.items():
        # Sort first, THEN renumber. Resetting the index before sorting would
        # number the draws in file order instead of date order.
        calendar = (
            period_df.loc[period_df['company_code'] == key, ['draw_date']]
            .sort_values(by='draw_date')
            .reset_index(drop=True)
            .assign(draw_period=lambda frame: frame.index + 1)
        )

        # Drop a stale `draw_period` so calling this twice does not yield
        # `draw_period_x` / `draw_period_y` columns.
        base = df.drop(columns=['draw_period'], errors='ignore')

        # `merge` returns a new frame, so no defensive deep copy is needed.
        newdf_dict[key] = base.merge(calendar, on='draw_date', how='left')

    return newdf_dict

In [None]:
# Attach `draw_period` to every company frame (this rebinds `df_dict` to the enriched frames).
df_dict = period_feature(df_dict)

# Re-check the per-company summaries with the new column.
faststat(df_dict)

In [None]:
def to_transaction(df_dict, trans_value):
    """Pivot each long-format frame into one row per draw ("transaction").

    Every draw must contribute exactly 23 rows (1st, 2nd, 3rd, Sp1-Sp10,
    Cons1-Cons10). The values of `trans_value` are laid out row-wise in that
    order, so the input is expected to list each draw's rows contiguously and
    already in category order.

    Parameters
    ----------
    df_dict : dict
        Maps company code to a DataFrame with `draw_date`, `draw_period` and
        the `trans_value` column.
    trans_value : str
        Name of the column whose values fill the category columns.

    Returns
    -------
    dict
        Maps company code to a DataFrame with columns
        ``draw_date, draw_period, 1st, 2nd, 3rd, Sp1..Sp10, Cons1..Cons10``.

    Raises
    ------
    AssertionError
        If a frame is empty or any draw does not have exactly 23 non-null
        `trans_value` rows.
    """
    newdf_dict = {}
    categories = ['1st', '2nd', '3rd'] + [f'Sp{x +1}' for x in range(10)] + [f'Cons{x +1}' for x in range(10)]
    expected   = len(categories)

    for key, df in df_dict.items():
        # One pass gives both the row count and the period of every draw.
        per_draw = df.groupby('draw_date').agg(
            count=(trans_value, 'count'),
            draw_period=('draw_period', 'max')
        ).reset_index()

        # Check EVERY draw, not just the smallest one: a draw with extra rows
        # would otherwise pass and break the reshape below.
        # Raised explicitly so the check also runs under `python -O`.
        counts = per_draw['count']
        if counts.empty or not (counts == expected).all():
            raise AssertionError(f'{key} - having invalid count')

        # Map draw date
        trans_df = pd.DataFrame(df[trans_value].values.reshape(-1, expected), columns=categories)
        trans_df.insert(0, 'draw_date', df['draw_date'].unique())

        # Map draw period
        trans_df = trans_df.merge(per_draw[['draw_date', 'draw_period']], on='draw_date', how='left')

        # Re-order columns
        newdf_dict[key] = trans_df[['draw_date', 'draw_period'] + categories]

    return newdf_dict

In [None]:
# Pivot every company frame to one row per draw, using the `number` column as cell values.
transdf_dict = to_transaction(df_dict, trans_value='number')

# Summaries of the resulting transaction tables.
faststat(transdf_dict)

In [None]:
def transaction_heatmap(df_dict, title,
                        heatmap_kwargs=None, layout_kwargs=None, to_image=True):
    """Plot one heatmap per digit position for every company's transactions.

    For each frame, four heatmaps are drawn via `vp.heatmap`: x is
    `draw_period`, y is the prize category and z is the integer digit found at
    one character position of the number string.

    Parameters
    ----------
    df_dict : dict
        Maps company code to a transaction DataFrame holding `draw_period` and
        the 23 category columns with digit strings.
    title : str
        Title part; plots are titled ``'<key> - <title> - Digit <n>'``.
    heatmap_kwargs : dict, optional
        Extra options forwarded to `vp.heatmap`. `hovertemplate` is always
        overridden. The caller's dict is not modified.
    layout_kwargs : dict, optional
        Forwarded to `vp.heatmap`; an empty dict is used when omitted.
    to_image : bool, default True
        Forwarded to `vp.heatmap`.
    """
    # Work on a private copy: writing into the argument would leak the hover
    # template into the caller's dict (and into a shared `{}` default).
    heatmap_kwargs = {
        **(heatmap_kwargs or {}),
        'hovertemplate': 'Draw Period: %{x}<br>Position: %{y}<br>Digit: %{z}',
    }
    if layout_kwargs is None:
        layout_kwargs = {}

    categories = ['1st', '2nd', '3rd'] + [f'Sp{x +1}' for x in range(10)] + [f'Cons{x +1}' for x in range(10)]

    # Character index within the number string -> label used in the plot title
    # (index 0, the leftmost character, is labelled 'Digit 4').
    digit_labels = {
        0: 'Digit 4',
        1: 'Digit 3',
        2: 'Digit 2',
        3: 'Digit 1'
    }

    for key, df in df_dict.items():
        numbers = df[categories]

        # Heatmap for each digits
        for position, label in digit_labels.items():
            # Convert to int so `z` is numeric rather than one-character strings.
            digits = numbers.apply(lambda column: column.str[position].astype(int))
            vp.heatmap(
                x=df['draw_period'],
                y=categories,
                z=digits.T.values,
                title=f'{key} - {title} - {label}',
                out_path=f'{OUT_PATH_GRAPH}{key}/',
                layout_kwargs=layout_kwargs,
                to_image=to_image,
                heatmap_kwargs=heatmap_kwargs
            )
        print()

In [None]:
# Per-digit heatmaps of the transaction tables.
# to_image=False is forwarded to vp.heatmap (presumably skips image export; confirm in vp).
transaction_heatmap(transdf_dict,
                    title=f'Phase 3 - Heatmap',
                    heatmap_kwargs={'colorscale': 'RdYlGn'},
                    to_image=False)

In [None]:
def transaction_export(df_dict):
    """Write every frame in `df_dict` to CSV through `fp.generate_csv`.

    Each frame goes to ``<OUT_PATH_FILE><key>/<key> - transactions.csv`` and
    is exported with ``export_index=False``.

    Parameters
    ----------
    df_dict : dict
        Maps company code to the DataFrame to export.
    """
    for company in df_dict:
        export_kwargs = {
            'out_path': f'{OUT_PATH_FILE}{company}/',
            'out_filename': f'{company} - transactions.csv',
            'export_index': False,
        }
        fp.generate_csv(df_dict[company], **export_kwargs)

In [None]:
# Export each company's transaction table to CSV under OUT_PATH_FILE.
transaction_export(transdf_dict)

# Phase 4 - Feature Engineering
- Digit frequency:
  - Calculate the frequency of each digit (0 - 9) in each period
  - Calculate the frequency of each two-digit combination (00 - 99) in each period

In [None]:
# Reference: https://www.youtube.com/watch?v=gY3KLGnJPWo
def digit_frequency(df_dict):
    """Replace the 23 prize columns by per-draw digit frequency features.

    For every draw (row) the following columns are produced:

    * ``'0'`` .. ``'9'``   - total occurrences of the digit across all 23
      prize numbers.
    * ``'00'`` .. ``'99'`` - number of prize numbers containing the pair as an
      unordered combination: a doubled pair such as ``'11'`` needs the digit
      at least twice; a mixed pair such as ``'12'`` needs both digits at least
      once (so ``'12'`` and ``'21'`` are equal).

    Parameters
    ----------
    df_dict : dict
        Maps company code to a transaction DataFrame holding the 23 category
        columns with digit strings.

    Returns
    -------
    dict
        Maps company code to a new DataFrame: the non-category columns
        followed by the 110 feature columns. The input frames are not modified.
    """
    categories = ['1st', '2nd', '3rd'] + [f'Sp{x +1}' for x in range(10)] + [f'Cons{x +1}' for x in range(10)]
    newdf_dict = {}

    for key, df in df_dict.items():
        prizes = df[categories]

        # Count each digit once per prize number; every feature below derives
        # from these ten tables, so the string scan is not repeated per feature.
        occurrences = {}
        features    = {}

        # 0 - 9
        for number in tqdm.tqdm(range(10)):
            digit              = str(number)
            occurrences[digit] = prizes.apply(lambda column: column.str.count(digit))
            features[digit]    = occurrences[digit].sum(axis=1)

        # 00 - 99
        for number in tqdm.tqdm(range(100)):
            pair          = str(number).zfill(2)
            first, second = pair

            if first == second:
                hit = occurrences[first] >= 2
            else:
                hit = (occurrences[first] >= 1) & (occurrences[second] >= 1)
            features[pair] = hit.sum(axis=1)

        # Attach all features in one concat; inserting 110 columns one by one
        # fragments the frame.
        newdf_dict[key] = pd.concat(
            [df.drop(columns=categories), pd.DataFrame(features, index=df.index)],
            axis=1
        )

    return newdf_dict

In [None]:
# Time the feature-engineering step and report the elapsed duration when it is done.
EXEC_START = time.time()

# Build the per-draw digit / digit-pair frequency features from the transaction tables.
digitdf_dict = digit_frequency(transdf_dict)

faststat(digitdf_dict)

EXEC_END = time.time()
time_taken(EXEC_END - EXEC_START)