In [None]:
import lib._util.visualplot as vp
import lib._util.fileproc as fp

In [None]:
import pandas as pd
pd.set_option('display.max_columns', 100)

import category_encoders as ce

# Plotly
import plotly.express as px
import plotly.graph_objects as go

# Time measurement
import time
from datetime import timedelta, datetime

# Sound notification
import winsound

# Useful Functions

In [None]:
# Directory the loader functions read their input CSV files from.
SOURCE_PATH_DATA = 'resources/data/'
# Directory handed to the plotting helpers as `out_path`.
OUT_PATH_GRAPH   = 'resources/output/eda_basic/graph/'
# Directory for file outputs; not referenced by the cells in this section.
OUT_PATH_FILE    = 'resources/output/eda_basic/file/'

In [None]:
def time_taken(seconds):
    """Print an elapsed duration and play a short two-tone beep.

    Args:
        seconds: Elapsed time in seconds; formatted through
            ``datetime.timedelta`` (e.g. ``3661`` -> ``1:01:01``).

    Note:
        The beep uses ``winsound``, which is only available on Windows.
    """
    elapsed = str(timedelta(seconds=seconds))
    print(f'\nTime Taken: {elapsed}')

    # Tones are played in order as (frequency, duration) pairs.
    for frequency, duration in ((1000, 100), (1500, 50)):
        winsound.Beep(frequency=frequency, duration=duration)

def smart_title(text, sep=' '):
    """Title-case ``text`` word by word while keeping upper-case words intact.

    Args:
        text: String to convert.
        sep: Separator used to split ``text`` into words.

    Returns:
        The words joined by a single space; words for which
        ``str.isupper()`` is true (e.g. ``CL1``) are left unchanged, all
        others go through ``str.title()``.
    """
    words = []
    for word in text.split(sep):
        words.append(word if word.isupper() else word.title())
    return ' '.join(words)

# Phase 1 - EDA
- Number features

###### Number Characteristic

In [None]:
def load_categories():
    """Load ``number_category.csv`` from ``SOURCE_PATH_DATA``.

    The file is parsed as a ';'-separated CSV and the ``number`` column is
    forced to ``str`` so leading zeros are not lost to numeric inference.

    Returns:
        pandas.DataFrame with the file's contents.
    """
    return pd.read_csv(
        f'{SOURCE_PATH_DATA}number_category.csv',
        sep=';',
        dtype={'number': str},
    )

In [None]:
# Load the number-category table into the notebook-level `df`.
df = load_categories()

# Project helper from lib._util.visualplot; presumably displays summary
# statistics for the frame -- confirm against the helper's implementation.
vp.faststat(df)

In [None]:
# Phase 1 histograms for the number-category frame.
vp.histogram(
    df,
    bin_algo='count',
    title='Phase 1 - Histogram - Number Characteristic',
    out_path=OUT_PATH_GRAPH,
    max_col=4,
)

###### Number Binary Cluster

In [None]:
# Reference: https://www.youtube.com/watch?v=kVgP_UpBXqM
def binary_cluster():
    """Build the binary-encoded cluster table for the numbers 0000-9999.

    Numbers ``0001``-``9999`` are encoded with ``ce.BinaryEncoder``. The
    encoder's output columns (expected to be named ``number_<i>``) are
    renamed by position to ``binary_CL<k>``: the first encoded column gets
    the highest ``k`` and the last one becomes ``binary_CL1``. If the
    highest-numbered column holds a single distinct value it is dropped.
    Finally a row for ``0000`` with every cluster column set to 0 is placed
    in front.

    Returns:
        pandas.DataFrame with a ``number`` column (zero-padded strings) and
        the ``binary_CL*`` columns, indexed 0..9999 without duplicates.
    """
    # Clustering on 1 - 9999
    numbers = pd.DataFrame({'number': [str(x).zfill(4) for x in range(1, 10000)]})
    encoded = ce.BinaryEncoder().fit_transform(numbers['number'])
    df = pd.concat([numbers, encoded], axis=1)

    # Rename columns by position so the mapping does not depend on the
    # numeric suffix the encoder happened to generate.
    columns     = [x for x in df.columns if x.startswith('number_')]
    count       = len(columns)
    rename_dict = {x: f'binary_CL{count - i}' for i, x in enumerate(columns)}
    df = df.rename(columns=rename_dict)

    # Remove non-informative cluster (a column with one constant value)
    last_cluster = f'binary_CL{count}'
    if len(df[last_cluster].unique()) == 1:
        df = df.drop(columns=[last_cluster])

    # Clustering on 0
    zero_df = pd.DataFrame({x: ['0000'] if x == 'number' else [0] for x in df.columns})

    # ignore_index avoids a duplicated index label 0 (zero row + first
    # encoded row), which would make label-based lookups ambiguous.
    return pd.concat([zero_df, df], ignore_index=True)

In [None]:
# Rebind the notebook-level `df` to the binary cluster table.
df = binary_cluster()

# Project helper from lib._util.visualplot; presumably displays summary
# statistics for the frame -- confirm against the helper's implementation.
vp.faststat(df)

In [None]:
# Split number to X & Y axis: the first two characters become the heatmap's
# x coordinate, characters 3-4 the y coordinate.
df['left_digits']  = df['number'].str[:2]
df['right_digits'] = df['number'].str[2:4]

In [None]:
def cluster_heatmap(df, title, max_col=1,
                    heatmap_kwargs=None, layout_kwargs=None, to_image=True):
    """Draw one heatmap per cluster column and arrange them as subplots.

    Every column of ``df`` other than ``number``, ``left_digits`` and
    ``right_digits`` becomes a ``go.Heatmap`` with ``left_digits`` on x,
    ``right_digits`` on y and the column values as z. The hover text shows
    the number and the cluster value. The traces are handed to
    ``vp.datagroups_subplots`` together with ``OUT_PATH_GRAPH``.

    Args:
        df: DataFrame holding ``number``, ``left_digits``, ``right_digits``
            and at least one cluster column.
        title: Figure title, forwarded to ``vp.datagroups_subplots``.
        max_col: Forwarded to ``vp.datagroups_subplots``.
        heatmap_kwargs: Optional extra keyword arguments applied to every
            ``go.Heatmap``. ``text`` and ``hoverinfo`` are always set by
            this function. The dict passed in is not modified.
        layout_kwargs: Optional dict forwarded to
            ``vp.datagroups_subplots`` (an empty dict when omitted).
        to_image: Forwarded to ``vp.datagroups_subplots``.
    """
    # Work on copies: the original wrote 'text'/'hoverinfo' straight into the
    # caller's dict (or into a shared mutable default).
    base_kwargs   = dict(heatmap_kwargs) if heatmap_kwargs else {}
    layout_kwargs = {} if layout_kwargs is None else layout_kwargs

    data_groups = []
    columns     = [x for x in df.columns if x not in ['number', 'left_digits', 'right_digits']]

    for column in columns:
        column_label = smart_title(column, sep='_')
        trace_kwargs = {
            **base_kwargs,
            'text': 'Number: ' + df['number'] + f'<br>{column_label}: ' + df[column].astype(str),
            'hoverinfo': 'text',
        }

        data = go.Heatmap(
            x=df['left_digits'],
            y=df['right_digits'],
            z=df[column].values,
            **trace_kwargs
        )
        # Wrap in a Figure so the trace tuple is taken from fig['data'],
        # matching what the subplot helper was given before.
        fig = go.Figure(data=data)
        data_groups.append(fig['data'])

    vp.datagroups_subplots(
        data_groups,
        max_col=max_col,
        title=title,
        subplot_titles=columns,
        out_path=OUT_PATH_GRAPH,
        layout_kwargs=layout_kwargs,
        to_image=to_image
    )

In [None]:
# Binary clusters: five heatmaps per row, colour scale bar hidden.
cluster_heatmap(
    df,
    title='Phase 1 - Heatmap - Binary Cluster',
    max_col=5,
    heatmap_kwargs={'colorscale': 'RdYlGn', 'showscale': False},
    layout_kwargs={'height': 1000},
    to_image=False,
)

###### Number Breakdown Cluster

In [None]:
# Reference: https://youtu.be/KQv0lEaDGco?t=117
def breakdown_cluster():
    """Build the digit-sum ("breakdown") cluster table for 0000-9999.

    Each number's digits are summed, and the summing is repeated on the
    whole column until no value exceeds 9.

    Returns:
        pandas.DataFrame with ``number`` (zero-padded 4-character strings)
        and ``breakdown_CL`` (integer in 0..9).
    """
    def _digit_sum(value):
        # Accepts either the number string or an intermediate integer sum.
        return sum(int(ch) for ch in str(value))

    df = pd.DataFrame({'number': [f'{n:04d}' for n in range(10000)]})
    df['breakdown_CL'] = df['number'].map(_digit_sum)

    # Keep collapsing until every value is a single digit.
    while df['breakdown_CL'].max() > 9:
        df['breakdown_CL'] = df['breakdown_CL'].map(_digit_sum)

    return df

In [None]:
# Rebind the notebook-level `df` to the breakdown cluster table.
df = breakdown_cluster()

# Project helper from lib._util.visualplot; presumably displays summary
# statistics for the frame -- confirm against the helper's implementation.
vp.faststat(df)

In [None]:
# Split number to X & Y axis: the first two characters become the heatmap's
# x coordinate, characters 3-4 the y coordinate.
df['left_digits']  = df['number'].str[:2]
df['right_digits'] = df['number'].str[2:4]

In [None]:
# Breakdown cluster: a single heatmap using the default one-column layout.
cluster_heatmap(
    df,
    title='Phase 1 - Heatmap - Breakdown Cluster',
    heatmap_kwargs={'colorscale': 'RdYlGn'},
    to_image=False,
)

# Phase 2 - EDA
- Draw dates

In [None]:
def load_dates():
    """Load ``4D_dates.csv`` from ``SOURCE_PATH_DATA`` with parsed draw dates.

    The file is read as a ';'-separated CSV. ``draw_date`` is read as text
    and then converted with ``pd.to_datetime`` using the strict
    ``%Y-%m-%d`` format. The conversion happens after reading instead of
    through ``read_csv``'s ``date_parser`` argument, which pandas has
    deprecated since 2.0; this form behaves the same on older and newer
    pandas releases.

    Returns:
        pandas.DataFrame with ``draw_date`` as a datetime column.

    Raises:
        KeyError: If the file has no ``draw_date`` column.
        ValueError: If a ``draw_date`` value does not match ``%Y-%m-%d``.
    """
    source_file = f'{SOURCE_PATH_DATA}4D_dates.csv'
    # Keep draw_date as text so the explicit format below is the only parser.
    df = pd.read_csv(source_file, sep=';', dtype={'draw_date': str})
    df['draw_date'] = pd.to_datetime(df['draw_date'], format='%Y-%m-%d')

    return df

In [None]:
# Rebind the notebook-level `df` to the draw-date table.
df = load_dates()

# Project helper from lib._util.visualplot; presumably displays summary
# statistics for the frame -- confirm against the helper's implementation.
vp.faststat(df)

In [None]:
# Phase 2 histograms for the draw-date frame.
vp.histogram(
    df,
    bin_algo='count',
    title='Phase 2 - Histogram - Draw Date',
    out_path=OUT_PATH_GRAPH,
)

In [None]:
# Monthly bucket label (e.g. '2020-01') derived from the draw date.
df['year_month'] = df['draw_date'].dt.to_period('M').astype(str)

# Number of non-null draw dates per company and month.
monthly_counts = (
    df.groupby(['company_code', 'year_month'])
      .agg(count=('draw_date', 'count'))
      .reset_index()
)

# One bar-chart row per company.
fig = px.bar(monthly_counts, x='year_month', y='count', facet_row='company_code')
vp.generate_plot(
    fig,
    out_path=OUT_PATH_GRAPH,
    out_filename='Phase 2 - Histogram - Draw Date (Company)',
)

# The aggregate is only needed for the plot above.
del monthly_counts