In [None]:
import os

import numpy as np
import pandas as pd
import itertools

# Bokeh
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import Legend, HoverTool, LinearColorMapper, BasicTicker, PrintfTickFormatter, ColorBar
from bokeh.palettes import Set1, Category10, Category20, Reds
from bokeh.layouts import gridplot
from bokeh.core.properties import value
from bokeh.io import show, output_notebook
output_notebook()

# 1. Data Extraction

In [None]:
# Data locations, all relative to the notebook's working directory.
# GLOBAL_IN_PATH is the prefix used by the read_csv cells in this notebook.
GLOBAL_IN_PATH  = '../resources/data/'
# NOTE(review): GLOBAL_OUT_PATH is not referenced in the visible cells — confirm it is still needed.
GLOBAL_OUT_PATH = '../resources/output/'

# NOTE(review): IN_PATH is not referenced in the visible cells — confirm it is still needed.
IN_PATH  = 'resources/data/'
# OUT_PATH is the directory the candidates CSV is written to (created on demand).
OUT_PATH = 'resources/output/'

In [None]:
# Load the raw draw results. `number` is read as text because it can contain
# the '----' placeholder, which is converted to NaN in the transformation step.
file = f'{GLOBAL_IN_PATH}4D_result_2018-01-01_2018-12-31.csv'
raw_df = pd.read_csv(file, sep=';', dtype={'number': str})

# Parse the draw dates after loading instead of through read_csv's
# `date_parser` argument, which is deprecated in pandas 2.x. The explicit
# format keeps the parsing strict and produces the same datetime column.
raw_df['draw_date'] = pd.to_datetime(raw_df['draw_date'], format='%Y-%m-%d')
# raw_df.info()
# raw_df.head()

# 2. Data Transformation
### 2.1 increase data readability

In [None]:
# Index by draw date and swap the short codes for human-readable labels.
# Values that are not listed in a mapping are left untouched.
data_df = raw_df.set_index('draw_date').replace({
    'company_code': {
        'DMC': 'Da Ma Cai',
        'MAG': 'Magnum',
        'ST': 'Sports Toto',
    },
    'category': {
        'FST': '1st',
        'SCD': '2nd',
        'TRD': '3rd',
        'SP': 'Special',
        'CONS': 'Consolation',
    },
})

# '----' marks a missing number: blank it out (NaN), then store the column as float.
data_df['number'] = data_df['number'].mask(data_df['number'] == '----').astype(float)
# data_df.head()

### 2.2 populate clustering categories

In [None]:
# Load the per-number clustering table. `number` is read as float so it can be
# matched against data_df['number'], which is also stored as float.
file = f'{GLOBAL_IN_PATH}number_category.csv'
category_df = pd.read_csv(file, sep=';', dtype={'number': float})
# category_df.info()
# category_df.head()

In [None]:
# Build dictionaries: one number -> label lookup per clustering column.
pattern_dict = dict(zip(category_df['number'], category_df['pattern']))
group_4_dict = dict(zip(category_df['number'], category_df['group_4']))
group_3_dict = dict(zip(category_df['number'], category_df['group_3']))
group_2_dict = dict(zip(category_df['number'], category_df['group_2']))
group_1_dict = dict(zip(category_df['number'], category_df['group_1']))
odd_even_dict = dict(zip(category_df['number'], category_df['odd_even']))
big_small_dict = dict(zip(category_df['number'], category_df['big_small']))

# Dictionaries lookup
# Each column is filled by iterating the `number` column directly rather than
# running a full `iterrows()` scan of the frame per column. `dict.get(..., '')`
# keeps the original fallback: numbers absent from the lookup (including the
# NaN placeholders) get an empty string, and found values keep their type.
for column, lookup in [
        ('pattern', pattern_dict),
        ('group_4', group_4_dict),
        ('group_3', group_3_dict),
        ('group_2', group_2_dict),
        ('group_1', group_1_dict),
        ('odd_even', odd_even_dict),
        ('big_small', big_small_dict),
]:
    data_df[column] = [lookup.get(number, '') for number in data_df['number']]
# data_df.head()

### 2.3 distribute data to dictionary

In [None]:
# One sub-frame per company, keyed by the (readable) company name.
data_df_dict = {
    company: data_df[data_df['company_code'] == company]
    for company in data_df['company_code'].unique()
}

print(data_df_dict.keys())

# 3. Data Visualization


In [None]:
def showGrid(fig_list, link_x=True, link_y=True, ncols=1, plot_width=950, plot_height=500, legend_font_size='10pt'):
    """Display the given figures as a grid, optionally sharing their ranges.

    Args:
        fig_list: figures to show, in display order.
        link_x: when true, every figure reuses the x range of the figure before it.
        link_y: same as ``link_x`` but for the y range.
        ncols: number of grid columns.
        plot_width: width passed to ``gridplot``.
        plot_height: height passed to ``gridplot``.
        legend_font_size: label font size applied to figures that have a legend.

    The figures in ``fig_list`` are modified in place (legend font, ranges).
    """
    # Legend font size is only touched on figures that actually carry a legend.
    for current in fig_list:
        if len(current.legend) > 0:
            current.legend.label_text_font_size = legend_font_size

    # Chain the ranges pairwise; since each figure takes the range object of
    # its predecessor, all figures end up sharing the first figure's range.
    for previous, current in zip(fig_list, fig_list[1:]):
        if link_x:
            current.x_range = previous.x_range
        if link_y:
            current.y_range = previous.y_range

    show(gridplot(fig_list, ncols=ncols, plot_width=plot_width, plot_height=plot_height))
    
def showGridByGroups(groups, fig_dict, grid_type='big', link_x=True, link_y=True):
    """Show one grid per group, made of the figures whose key contains the group.

    Args:
        groups: iterable of group names; a figure belongs to a group when the
            group name occurs inside its ``fig_dict`` key.
        fig_dict: mapping of key -> figure.
        grid_type: ``'small'`` renders a 3-column grid of 300x300 plots with a
            6pt legend; any other value uses ``showGrid``'s defaults.
        link_x: forwarded to ``showGrid``.
        link_y: forwarded to ``showGrid``.
    """
    if grid_type == 'small':
        layout = dict(ncols=3, plot_width=300, plot_height=300, legend_font_size='6pt')
    else:
        layout = dict()

    for group in groups:
        selected = [plot for name, plot in fig_dict.items() if group in name]
        showGrid(fig_list=selected, link_x=link_x, link_y=link_y, **layout)

### 3.1 Lottery Rewards

In [None]:
def lotteryRewardsFigures():
    """Build one date-vs-number scatter figure per company.

    Reads the notebook-level ``data_df_dict`` (company name -> DataFrame
    indexed by draw date with ``number`` and ``category`` columns). Each
    category found in a company's data becomes its own glyph series with a
    distinct marker and colour, and gets an entry in a clickable legend
    placed below the plot.

    Returns:
        dict: company name -> Bokeh figure.
    """
    # Marker used for the 1st, 2nd, ... category encountered; categories past
    # the end of this list all share the last marker.
    markers = ['triangle', 'square', 'circle', 'diamond', 'inverted_triangle']
    alpha = .5
    size = 10

    fig_dict = dict()

    for key, company_df in data_df_dict.items():
        # Restart the colour cycle for every company so that the n-th category
        # encountered always gets the n-th colour of the palette.
        palette_colors = itertools.cycle(Set1[max(Set1.keys())])

        fig = figure(title=f'{key} - Lottery Rewards',
                     x_axis_type='datetime',
                     x_axis_label='Date', y_axis_label='Number',
                     width=950, height=500,
                     toolbar_location='above')

        items = []
        for index, category in enumerate(company_df['category'].unique()):
            tmp_df = company_df[company_df['category'] == category]

            source = ColumnDataSource(data=dict(
                dates=tmp_df.index,
                numbers=tmp_df['number'],
                categories=tmp_df['category'],
            ))

            # `scatter(marker=...)` draws the same marker as the dedicated
            # per-marker methods, without a branch per marker type.
            glyph = fig.scatter('dates', 'numbers',
                                marker=markers[min(index, len(markers) - 1)],
                                color=next(palette_colors), alpha=alpha, size=size,
                                source=source)
            items.append((category, [glyph]))

        fig.add_layout(Legend(items=items, location='bottom_left', orientation='horizontal', click_policy='hide'), 'below')
        fig.add_tools(HoverTool(
            tooltips=[
                ('Date', '@dates{%F}'),
                ('Number', '@numbers'),
                # The category column holds the prize tier (1st, 2nd, ...).
                ('Prize', '@categories'),
            ],
            formatters={
                'dates': 'datetime',
            },
            mode='mouse'
        ))
        fig_dict[key] = fig

    return fig_dict

In [None]:
# Build the per-company reward scatter plots and show them in a single column.
# showGrid's defaults link the x and y ranges of consecutive figures.
fig_dict = lotteryRewardsFigures()

showGrid(fig_list=list(fig_dict.values()))

### 3.2 ABCD Pattern

In [None]:
def stackedAreaFigures(data_df_dict, group_by, splits_list, split_by='default', title='', palette=Category10):
    """Build stacked-area figures of per-date group occurrences for each company.

    For every company frame, rows with a NaN ``number`` are dropped, the
    remaining rows are counted per (index date, ``group_by`` value), and one
    figure is produced per entry of ``splits_list``.

    Args:
        data_df_dict: mapping of company name -> DataFrame indexed by draw
            date, containing ``number`` and the ``group_by`` column.
        group_by: name of the column whose values are stacked.
        splits_list: list of lists; each inner list selects the groups that go
            into one figure (see ``split_by``).
        split_by: ``'none'`` puts every group into every figure;
            ``'groups_index'`` treats the inner lists as positions into the
            sorted list of groups; any other value treats them as group labels.
        title: text inserted in the figure title before the subtitle.
        palette: Bokeh palette dict (size -> colours); the largest size is
            used, truncated to the number of stacked groups.

    Returns:
        dict: ``f'{company}_{subtitle}'`` -> Bokeh figure. Splits that select
        no group for a company are skipped (such a split has nothing to stack
        and would otherwise fail when the legend is looked up).
    """
    fig_dict = dict()

    for key in data_df_dict.keys():
        data_df = data_df_dict[key]
        # Boolean indexing returns a new frame, so the caller's frame is untouched.
        data_df = data_df[data_df['number'].notna()]

        # {(date, group): number of rows}
        counts = data_df.groupby([data_df.index, group_by]).size().to_dict()

        groups = data_df[group_by].unique()
        groups.sort()
        # [(group, [(date, count), ...]), ...] in sorted group order
        groups_results = [
            (group, [(date, count) for (date, label), count in counts.items() if label == group])
            for group in groups
        ]

        # Every date on which any group occurred, ascending, plus a position
        # lookup so filling the series does not need a linear search per entry.
        dates = sorted({date for _, series in groups_results for date, _ in series})
        date_position = {date: position for position, date in enumerate(dates)}

        splits_results = []
        for splits in splits_list:
            if split_by == 'none':
                splits_results.append(list(groups_results))
            elif split_by == 'groups_index':
                splits_results.append([entry for position, entry in enumerate(groups_results)
                                       if position in splits])
            else:
                splits_results.append([entry for entry in groups_results if entry[0] in splits])

        for index, results in enumerate(splits_results):
            if not results:
                # Nothing to stack for this company/split combination.
                continue

            tmp_list = sorted([x[0] for x in results])
            subtitle = ''

            if split_by == 'groups_index':
                subtitle = splits_list[index]
            else:
                # The first split that covers all selected groups names the
                # figure; a match on the last split is labelled as "Other".
                last_index = len(splits_list) - 1
                for index2, splits in enumerate(splits_list):
                    if tmp_list == sorted(splits) or all(x in splits for x in tmp_list):
                        subtitle = f'{splits[0]} Pattern' if index2 < last_index else 'Other Pattern'
                        break

            tmp_dict = dict()
            tmp_dict['dates'] = dates

            for group, series in results:
                # Dates on which the group did not occur stay at 0.
                occurrences = [0 for date in dates]
                for date, count in series:
                    if count == 0:
                        continue
                    occurrences[date_position[date]] = count

                tmp_dict[group] = occurrences

            tmp_df = pd.DataFrame(tmp_dict).set_index('dates')

            palette_colors = palette[max(palette.keys())][:len(tmp_df.columns)]

            fig = figure(title=f'{key.title()} - {title}{subtitle}',
                         x_axis_type='datetime',
                         x_axis_label='Date', y_axis_label='Occurrence',
                         width=950, height=500,
                         toolbar_location='above')

            source = ColumnDataSource(data=tmp_df)

            stackers = list(tmp_df.columns)
            fig.varea_stack(stackers=stackers, x='dates', color=palette_colors, legend=[value(x) for x in stackers], source=source)

            fig.y_range.start = 0
            fig.x_range.range_padding = 0.01

            # Hide the auto-generated in-plot legend and re-add its items as a
            # horizontal, clickable legend below the plot.
            legend = fig.legend[0]
            legend.visible = False
            fig.add_layout(Legend(items=legend.items, location='bottom_left', orientation='horizontal', click_policy='hide'), 'below')

            fig_dict[f'{key}_{subtitle}'] = fig

    return fig_dict

In [None]:
def patternABCDFigures():
    """Stacked-area figures of the ``pattern`` column, one figure per pattern family.

    Four families are listed explicitly; every other pattern found in the
    notebook-level ``pattern_dict`` goes into a final catch-all split.

    Returns:
        dict: figure key -> Bokeh figure, as produced by ``stackedAreaFigures``.
    """
    named_families = [
        ['AAAB', 'AABA', 'ABAA'],
        ['AABB', 'ABAB', 'ABBA'],
        ['AABC', 'ABAC', 'ABCA'],
        ['ABBC', 'ABCB'],
    ]
    claimed = [pattern for family in named_families for pattern in family]
    leftovers = [pattern for pattern in set(pattern_dict.values()) if pattern not in claimed]

    return stackedAreaFigures(data_df_dict, group_by='pattern',
                              splits_list=named_families + [leftovers])

In [None]:
# Build the ABCD-pattern figures and show, per company, a compact 3-column grid.
fig_dict = patternABCDFigures()

showGridByGroups(groups=data_df_dict.keys(), fig_dict=fig_dict, grid_type='small')

### 3.3 N\*** Pattern

In [None]:
def patternNXXXFigures(split_by):
    """Stacked-area figures of the ``group_4`` column (titled 'N*** Pattern').

    Args:
        split_by: ``'groups_index'`` produces three figures covering group
            positions 0-3, 4-6 and 7-9; any other value produces a single
            figure with all groups, coloured with ``Category20``.

    Returns:
        dict: figure key -> Bokeh figure, as produced by ``stackedAreaFigures``.
    """
    shared = dict(group_by='group_4', title='N*** Pattern')

    if split_by != 'groups_index':
        return stackedAreaFigures(data_df_dict, splits_list=[list(range(10))],
                                  split_by='none', palette=Category20, **shared)

    return stackedAreaFigures(data_df_dict, splits_list=[[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]],
                              split_by=split_by, **shared)

In [None]:
# Split figures are shown in the compact grid; the single combined figure
# uses the regular ('big') layout.
split_by = 'groups_index'
fig_dict = patternNXXXFigures(split_by=split_by)

showGridByGroups(groups=data_df_dict.keys(), fig_dict=fig_dict,
                 grid_type='small' if split_by == 'groups_index' else 'big')

### 3.4 \*N** Pattern

In [None]:
def patternXNXXFigures(split_by):
    """Stacked-area figures of the ``group_3`` column (titled '*N** Pattern').

    Args:
        split_by: ``'groups_index'`` produces three figures covering group
            positions 0-3, 4-6 and 7-9; any other value produces a single
            figure with all groups, coloured with ``Category20``.

    Returns:
        dict: figure key -> Bokeh figure, as produced by ``stackedAreaFigures``.
    """
    shared = dict(group_by='group_3', title='*N** Pattern')

    if split_by != 'groups_index':
        return stackedAreaFigures(data_df_dict, splits_list=[list(range(10))],
                                  split_by='none', palette=Category20, **shared)

    return stackedAreaFigures(data_df_dict, splits_list=[[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]],
                              split_by=split_by, **shared)

In [None]:
# Split figures are shown in the compact grid; the single combined figure
# uses the regular ('big') layout.
split_by = 'groups_index'
fig_dict = patternXNXXFigures(split_by=split_by)

showGridByGroups(groups=data_df_dict.keys(), fig_dict=fig_dict,
                 grid_type='small' if split_by == 'groups_index' else 'big')

### 3.5 \*\*N* Pattern

In [None]:
def patternXXNXFigures(split_by):
    """Stacked-area figures of the ``group_2`` column (titled '**N* Pattern').

    Args:
        split_by: ``'groups_index'`` produces three figures covering group
            positions 0-3, 4-6 and 7-9; any other value produces a single
            figure with all groups, coloured with ``Category20``.

    Returns:
        dict: figure key -> Bokeh figure, as produced by ``stackedAreaFigures``.
    """
    shared = dict(group_by='group_2', title='**N* Pattern')

    if split_by != 'groups_index':
        return stackedAreaFigures(data_df_dict, splits_list=[list(range(10))],
                                  split_by='none', palette=Category20, **shared)

    return stackedAreaFigures(data_df_dict, splits_list=[[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]],
                              split_by=split_by, **shared)

In [None]:
# Split figures are shown in the compact grid; the single combined figure
# uses the regular ('big') layout.
split_by = 'groups_index'
fig_dict = patternXXNXFigures(split_by=split_by)

showGridByGroups(groups=data_df_dict.keys(), fig_dict=fig_dict,
                 grid_type='small' if split_by == 'groups_index' else 'big')

### 3.6 \*\*\*N Pattern

In [None]:
def patternXXXNFigures(split_by):
    """Stacked-area figures of the ``group_1`` column (titled '***N Pattern').

    Args:
        split_by: ``'groups_index'`` produces three figures covering group
            positions 0-3, 4-6 and 7-9; any other value produces a single
            figure with all groups, coloured with ``Category20``.

    Returns:
        dict: figure key -> Bokeh figure, as produced by ``stackedAreaFigures``.
    """
    shared = dict(group_by='group_1', title='***N Pattern')

    if split_by != 'groups_index':
        return stackedAreaFigures(data_df_dict, splits_list=[list(range(10))],
                                  split_by='none', palette=Category20, **shared)

    return stackedAreaFigures(data_df_dict, splits_list=[[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]],
                              split_by=split_by, **shared)

In [None]:
# Split figures are shown in the compact grid; the single combined figure
# uses the regular ('big') layout.
split_by = 'groups_index'
fig_dict = patternXXXNFigures(split_by=split_by)

showGridByGroups(groups=data_df_dict.keys(), fig_dict=fig_dict,
                 grid_type='small' if split_by == 'groups_index' else 'big')

### 3.7 OEOE Pattern

In [None]:
def patternOEOEFigures():
    """Stacked-area figures of the ``odd_even`` column, one figure per family.

    Four families are listed explicitly; every other value found in the
    notebook-level ``odd_even_dict`` goes into a final catch-all split.

    Returns:
        dict: figure key -> Bokeh figure, as produced by ``stackedAreaFigures``.
    """
    named_families = [
        ['EEEO', 'EEOE', 'EOEE', 'OEEE'],
        ['OOOE', 'OOEO', 'OEOO', 'EOOO'],
        ['EEOO', 'EOEO', 'EOOE'],
        ['OOEE', 'OEOE', 'OEEO'],
    ]
    claimed = [pattern for family in named_families for pattern in family]
    leftovers = [pattern for pattern in set(odd_even_dict.values()) if pattern not in claimed]

    return stackedAreaFigures(data_df_dict, group_by='odd_even',
                              splits_list=named_families + [leftovers])

In [None]:
# Build the odd/even-pattern figures and show, per company, a compact 3-column grid.
fig_dict = patternOEOEFigures()

showGridByGroups(groups=data_df_dict.keys(), fig_dict=fig_dict, grid_type='small')

### 3.8 BSBS Pattern

In [None]:
def patternBSBSFigures():
    """Stacked-area figures of the ``big_small`` column, one figure per family.

    Four families are listed explicitly; every other value found in the
    notebook-level ``big_small_dict`` goes into a final catch-all split.

    Returns:
        dict: figure key -> Bokeh figure, as produced by ``stackedAreaFigures``.
    """
    named_families = [
        ['BBBS', 'BBSB', 'BSBB', 'SBBB'],
        ['SSSB', 'SSBS', 'SBSS', 'BSSS'],
        ['BBSS', 'BSBS', 'BSSB'],
        ['SSBB', 'SBSB', 'SBBS'],
    ]
    claimed = [pattern for family in named_families for pattern in family]
    leftovers = [pattern for pattern in set(big_small_dict.values()) if pattern not in claimed]

    return stackedAreaFigures(data_df_dict, group_by='big_small',
                              splits_list=named_families + [leftovers])

In [None]:
# Build the big/small-pattern figures and show, per company, a compact 3-column grid.
fig_dict = patternBSBSFigures()

showGridByGroups(groups=data_df_dict.keys(), fig_dict=fig_dict, grid_type='small')

### 3.9 Number Total Occurrence

In [None]:
def numberTotalOccurrenceFigures():
    """Build one occurrence heat map per company.

    Reads the notebook-level ``data_df_dict``. For each company the draws are
    counted per number, the number is rendered as a zero-padded 4-character
    string and split into its first two characters (y axis) and last two
    characters (x axis). Each cell is coloured by how often the number was
    drawn, using a light-to-dark ``Reds`` palette.

    Returns:
        dict: company name -> Bokeh figure.
    """
    fig_dict = dict()

    for key, company_df in data_df_dict.items():
        tmp_df = company_df.groupby(['number']).agg({
            'number': 'count'
        }).rename(columns={
            'number': 'occurrence'
        })

        # Float group keys -> zero-padded 4-character strings (e.g. 7.0 -> '0007').
        tmp_df.index = tmp_df.index.astype(int).astype(str).str.pad(width=4, side='left', fillchar='0')
        tmp_df = tmp_df.reset_index()

        tmp_df['y_prefix'] = tmp_df['number'].str.slice(0, 2)
        tmp_df['x_suffix'] = tmp_df['number'].str.slice(2, 4)
        tmp_df = tmp_df.drop(columns=['number'])

        # Use one colour per distinct occurrence count where the palette offers
        # that size; otherwise fall back to the largest available size below
        # it, or to the smallest size when there are too few distinct counts.
        # Looking sizes up this way also works when the palette's sizes are
        # not contiguous.
        palette = Reds
        available_sizes = sorted(palette.keys())
        distinct_counts = tmp_df['occurrence'].nunique()
        palette_set = max((size for size in available_sizes if size <= distinct_counts),
                          default=available_sizes[0])

        # Reversed so that the colour order is flipped relative to the palette.
        palette_colors = list(reversed(palette[palette_set]))
        mapper = LinearColorMapper(palette=palette_colors, low=tmp_df['occurrence'].min(), high=tmp_df['occurrence'].max())

        # Sort the categorical factors explicitly: `unique()` returns values in
        # first-appearance order, which leaves the x axis (last two digits)
        # unordered whenever the lowest prefix does not contain every suffix.
        x_factors = sorted(tmp_df['x_suffix'].unique())
        y_factors = sorted(tmp_df['y_prefix'].unique(), reverse=True)

        fig = figure(title=f'{key} - Number Total Occurrence',
                     width=950, height=500,
                     toolbar_location='above',
                     x_axis_location="above",
                     x_range=x_factors, y_range=y_factors,
                     tooltips=[('Number', '@y_prefix @x_suffix'), ('Occurrence', '@occurrence')])

        fig.rect(x="x_suffix", y="y_prefix", width=1, height=1,
                 fill_color={'field': 'occurrence', 'transform': mapper},
                 line_color=None, source=tmp_df)

        fig.axis.axis_line_color = None
        fig.xaxis.major_label_orientation = np.pi / 3

        color_bar = ColorBar(color_mapper=mapper,
                             major_label_text_font_size="8pt", label_standoff=8,
                             border_line_color=None, location=(0, 0),
                             ticker=BasicTicker(desired_num_ticks=len(palette_colors)),
                             formatter=PrintfTickFormatter(format="%d"))
        fig.add_layout(color_bar, 'right')
        fig_dict[key] = fig

    return fig_dict

In [None]:
# Build the per-company occurrence heat maps and show them in a single column.
# showGrid's defaults link the x and y ranges of consecutive figures.
fig_dict = numberTotalOccurrenceFigures()

showGrid(fig_list=list(fig_dict.values()))

# 6. Candidates
### 6.1 Cluster's Count

In [None]:
def candidateCount(category_df, groupby):
    """Print how many numbers fall into each cluster of ``category_df``.

    Args:
        category_df: DataFrame with the clustering columns (``pattern``,
            ``odd_even``, ``big_small``, ``group_4`` .. ``group_1``).
        groupby: ``'pattern'``, ``'odd_even'`` or ``'big_small'`` prints one
            numbered row per pattern family (plus a final row with all values
            not belonging to a named family); ``'digit_groups'`` prints one
            numbered row per rank, holding one cell for each of
            ``group_4`` .. ``group_1`` (that column's value and its count).

    Raises:
        ValueError: if ``groupby`` is not one of the supported names, or the
            requested column is missing from ``category_df``.
    """
    delimiter = ''
    separator = f'{delimiter:>5} '

    # The families are defined here so the function does not depend on
    # variables that only exist as locals of the pattern*Figures functions.
    families_by_column = {
        'pattern': ('ABCD Pattern', [
            ['AAAB', 'AABA', 'ABAA'],
            ['AABB', 'ABAB', 'ABBA'],
            ['AABC', 'ABAC', 'ABCA'],
            ['ABBC', 'ABCB'],
        ]),
        'odd_even': ('OEOE Pattern', [
            ['EEEO', 'EEOE', 'EOEE', 'OEEE'],
            ['OOOE', 'OOEO', 'OEOO', 'EOOO'],
            ['EEOO', 'EOEO', 'EOOE'],
            ['OOEE', 'OEOE', 'OEEO'],
        ]),
        'big_small': ('BSBS Pattern', [
            ['BBBS', 'BBSB', 'BSBB', 'SBBB'],
            ['SSSB', 'SSBS', 'SBSS', 'BSSS'],
            ['BBSS', 'BSBS', 'BSSB'],
            ['SSBB', 'SBSB', 'SBBS'],
        ]),
    }

    if groupby == 'digit_groups':
        title = 'Digit Groups'
        cells_by_column = []
        for by in ['group_4', 'group_3', 'group_2', 'group_1']:
            count_dict = category_df.groupby(by).size().to_dict()
            cells_by_column.append([f'{x}: {count : >5}' for x, count in count_dict.items()])

        # Transpose so that row i holds the i-th cell of every column; columns
        # with fewer distinct values are padded with empty cells.
        output_rows = [separator.join(cells)
                       for cells in itertools.zip_longest(*cells_by_column, fillvalue='')]

    elif groupby in families_by_column:
        if groupby not in category_df.columns:
            raise ValueError(f'column {groupby!r} not found in category_df')

        title, families = families_by_column[groupby]
        count_dict = category_df.groupby(groupby).size().to_dict()

        # Everything present in the data that no named family claims.
        claimed = {pattern for family in families for pattern in family}
        others = sorted(set(count_dict) - claimed)

        # Patterns absent from the data are reported with a count of 0.
        output_rows = [separator.join(f'{x}: {count_dict.get(x, 0) : >5}' for x in family)
                       for family in families + [others]]

    else:
        raise ValueError(f'unsupported groupby: {groupby!r}')

    print(f'\n{title}')
    for index, output in enumerate(output_rows):
        print(f'{index+1:>2}. {output}')

In [None]:
# Print the cluster sizes for every clustering dimension of category_df.
candidateCount(category_df, 'pattern')
candidateCount(category_df, 'odd_even')
candidateCount(category_df, 'big_small')
candidateCount(category_df, 'digit_groups')

### 6.2 Selection

In [None]:
# Candidate filters, one list of allowed values per clustering column.
# An empty list means "no restriction" on that column.
# The pattern values are spelled out because the pattern-family lists are not
# defined at notebook scope (they only exist inside patternABCDFigures).
pattern_cand = ['AABC', 'ABAC', 'ABCA', 'ABBC', 'ABCB', 'ABCD']
group_4_cand = []
group_3_cand = []
group_2_cand = []
group_1_cand = []
odd_even_cand = []
big_small_cand = []

candidate_filters = {
    'pattern': pattern_cand,
    'group_4': group_4_cand,
    'group_3': group_3_cand,
    'group_2': group_2_cand,
    'group_1': group_1_cand,
    'odd_even': odd_even_cand,
    'big_small': big_small_cand,
}

# Start from "keep everything" and narrow down with every non-empty filter.
selection_mask = pd.Series(True, index=category_df.index)
for column, allowed in candidate_filters.items():
    if len(allowed) > 0:
        selection_mask &= category_df[column].isin(allowed)

candidates = category_df[selection_mask]

# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs(OUT_PATH, exist_ok=True)

out_file = f'{OUT_PATH}candidates.csv'
candidates.to_csv(out_file, sep=';', index=False, header=True)
candidates.describe(include='all')