In [None]:
import pandas as pd
import itertools

# Bokeh
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import Legend, HoverTool
from bokeh.core.properties import value
from bokeh.palettes import Set1, Category20
from bokeh.io import show, output_notebook
output_notebook()

# 1. Data Extraction

In [None]:
filename = '../resources/data/4D_result_2018-01-01_2018-12-31.csv'
# `number` is read as text so values are kept exactly as written in the file.
raw_df = pd.read_csv(filename, sep=';', dtype={'number': str})
# Parse the draw date after loading instead of through `date_parser`:
# that read_csv argument is deprecated in pandas 2.x, while an explicit
# to_datetime call with a fixed format behaves the same on old and new versions.
raw_df['draw_date'] = pd.to_datetime(raw_df['draw_date'], format='%Y-%m-%d')
# raw_df.info()
# raw_df.head()

# 2. Data Transformation

In [None]:
# Display labels for the codes used in the raw file; codes not listed here stay unchanged.
company_labels = {'DMC': 'Da Ma Cai', 'MAG': 'Magnum', 'ST': 'Sports Toto'}
category_labels = {'FST': '1st', 'SCD': '2nd', 'TRD': '3rd', 'SP': 'Special', 'CONS': 'Consolation'}

# Index the rows by draw date, then swap each known code for its label.
transform_df = raw_df.set_index('draw_date')
transform_df['company_code'] = transform_df['company_code'].replace(company_labels)
transform_df['category'] = transform_df['category'].replace(category_labels)
# transform_df.head()

# 3. Data Filtering

In [None]:
# Keep only rows whose number is not the literal '----'; copy so that columns
# added to data_df later do not write back into transform_df.
data_df = transform_df.loc[transform_df['number'].ne('----')].copy()

# 4. Data Manipulation
### 4.1 Populate clustering categories for analysis

In [None]:
filename = '../resources/data/number_category.csv'
category_df = pd.read_csv(filename, sep=';', dtype={'number': str})

# Build dictionaries (number -> cluster label), one per clustering column
pattern_dict = dict(zip(category_df['number'], category_df['pattern']))
group_4_dict = dict(zip(category_df['number'], category_df['group_4']))
group_3_dict = dict(zip(category_df['number'], category_df['group_3']))
group_2_dict = dict(zip(category_df['number'], category_df['group_2']))
group_1_dict = dict(zip(category_df['number'], category_df['group_1']))
odd_even_dict = dict(zip(category_df['number'], category_df['odd_even']))
big_small_dict = dict(zip(category_df['number'], category_df['big_small']))

# Fail fast when a drawn number has no category entry. All seven dictionaries
# share the same keys, so checking one of them is enough. Without this check
# Series.map would silently fill the gaps with NaN.
missing_numbers = set(data_df['number']) - set(pattern_dict)
if missing_numbers:
    sample = sorted(map(str, missing_numbers))[:10]
    raise KeyError(f'{len(missing_numbers)} drawn number(s) have no entry in {filename}, e.g. {sample}')

# Dictionaries lookup: one vectorised map per column instead of a full
# iterrows() pass over data_df for each of the seven columns.
category_lookups = {
    'pattern': pattern_dict,
    'group_4': group_4_dict,
    'group_3': group_3_dict,
    'group_2': group_2_dict,
    'group_1': group_1_dict,
    'odd_even': odd_even_dict,
    'big_small': big_small_dict,
}
for column, lookup in category_lookups.items():
    # to_numpy() assigns positionally, so the repeated draw-date index is never re-aligned.
    data_df[column] = data_df['number'].map(lookup).to_numpy()
# data_df.head()
# data_df.head()

# 5. Data Visualization
### 5.1 Lottery Rewards

In [None]:
# One scatter chart per company: drawn numbers over time, one colour per prize category.
for company in data_df['company_code'].unique():
    colors = itertools.cycle(Set1[9])
    fig = figure(title=f'{company.title()} - Lottery Rewards',
                 x_axis_type='datetime',
                 x_axis_label='Dates', y_axis_label='Number',
                 width=950, height=500,
                 toolbar_location='above')

    items = []
    for category in data_df['category'].unique():
        tmp_df = data_df[(data_df['category'] == category) & (data_df['company_code'] == company)]

        source = ColumnDataSource(data=dict(
            dates=tmp_df.index,
            # Text form, shown in the tooltip exactly as drawn.
            number=tmp_df['number'],
            # Numeric form for the y coordinate: the y axis is a plain numeric
            # axis, so it should not be fed the string column. Values that are
            # not numeric become NaN instead of raising.
            number_value=pd.to_numeric(tmp_df['number'], errors='coerce'),
            category=tmp_df['category'],
        ))
        glyph = fig.circle('dates', 'number_value', color=next(colors), alpha=.5, source=source)
        items.append((category, [glyph]))

    fig.add_layout(Legend(items=items, location='bottom_left', orientation='horizontal', click_policy='hide'), 'below')
    fig.add_tools(HoverTool(
        tooltips=[
            ('Date', '@dates{%F}'),
            ('Number', '@number'),
            # The category column holds the prize tier ('1st', 'Special', ...),
            # so the label is 'Prize' (was the typo 'Price').
            ('Prize', '@category'),
        ],
        # NOTE(review): recent Bokeh releases appear to key formatters by '@dates'
        # rather than 'dates' - confirm against the installed version.
        formatters={
            'dates': 'datetime',
        },
        mode='mouse'
    ))
    show(fig)

### 5.2 ABCD Pattern

In [None]:
def populateLineChart(data_df, group_by, splits_list, split_by='default', color_palette=Set1[9], title=''):
    """Show per-company line charts of how often each `group_by` value occurs per index value.

    For every company in `data_df['company_code']` the rows are counted per
    (index value, `group_by` value). One Bokeh figure is shown per entry of
    `splits_list`; each selected group is drawn as a circle + line series over
    all index values seen for that company, with 0 where the group is absent.

    Args:
        data_df: DataFrame with a `company_code` column and the `group_by`
            column. Its index supplies the x values (the x axis is a datetime axis).
        group_by: Name of the column whose distinct values become the series.
        splits_list: List of lists; each inner list selects the groups drawn
            on one figure (interpreted according to `split_by`).
        split_by: 'none' draws every group on each figure, 'groups_index'
            treats the inner lists as positions in the sorted list of groups,
            any other value treats them as group values.
        color_palette: Sequence of colours cycled through within a figure.
        title: Optional text shown after the company name in the figure title.

    Returns:
        None. Figures are rendered with `show`.
    """
    for company in data_df['company_code'].unique():
        tmp_df = data_df[(data_df['company_code'] == company)]

        # {(index value, group): row count}, iterated in the groupby's sorted key order.
        counts = tmp_df.groupby([tmp_df.index, group_by]).size().to_dict()

        groups = tmp_df[group_by].unique()
        groups.sort()

        # Bucket the counts per group in one pass instead of re-filtering the
        # full list once per group.
        points_by_group = {group: [] for group in groups}
        for (date, group), count in counts.items():
            if group in points_by_group:
                points_by_group[group].append((date, count))
        groups_results = [(group, points_by_group[group]) for group in groups]

        # Shared x values for every series of this company, plus a position
        # lookup so filling a series does not need list.index() per point.
        dates = sorted({date for _, points in groups_results for date, _ in points})
        date_position = {date: position for position, date in enumerate(dates)}

        splits_results = []
        for splits in splits_list:
            if split_by == 'none':
                selected = list(groups_results)
            elif split_by == 'groups_index':
                selected = [result for position, result in enumerate(groups_results) if position in splits]
            else:
                selected = [result for result in groups_results if result[0] in splits]
            splits_results.append(selected)

        last_split = len(splits_list) - 1
        # Each result list is paired with the split that produced it, so an
        # empty result is no longer labelled with the first split's name.
        for split_index, (splits, results) in enumerate(zip(splits_list, splits_results)):
            subtitle = ''
            # With split_by='none' nothing is split, so there is no subset to name.
            if split_by != 'none' and all(name in splits for name, _ in results):
                if split_index < last_split and splits:
                    subtitle = f'{splits[0]} Pattern'
                else:
                    subtitle = 'Other Pattern'

            # Join the non-empty parts with a separator; previously a title and
            # a subtitle were glued together ("N*** PatternOther Pattern").
            heading = ' - '.join(part for part in (title, subtitle) if part)

            colors = itertools.cycle(color_palette)
            fig = figure(title=f'{company.title()} - {heading}',
                         x_axis_type='datetime',
                         x_axis_label='Dates', y_axis_label='Number',
                         width=950, height=500,
                         toolbar_location='above')

            items = []
            for name, points in results:
                occurrences = [0] * len(dates)
                for date, count in points:
                    if count == 0:
                        continue
                    occurrences[date_position[date]] = count

                source = ColumnDataSource(data=dict(
                    dates=dates,
                    occurrences=occurrences,
                ))
                color = next(colors)
                glyph1 = fig.circle('dates', 'occurrences', color=color, alpha=.5, source=source)
                glyph2 = fig.line('dates', 'occurrences', color=color, alpha=.8, source=source)
                # Legend labels are text; str() keeps non-string group values usable.
                items.append((str(name), [glyph1, glyph2]))

            fig.add_layout(Legend(items=items, location='bottom_left', orientation='horizontal', click_policy='hide'), 'below')
            fig.add_tools(HoverTool(
                tooltips=[
                    ('Date', '@dates{%F}'),
                    ('Occurrence', '@occurrences'),
                ],
                # NOTE(review): recent Bokeh releases appear to key formatters by
                # '@dates' rather than 'dates' - confirm against the installed version.
                formatters={
                    'dates': 'datetime',
                },
                mode='vline'
            ))
            show(fig)

In [None]:
# Distinct pattern labels, sorted: iterating a set of strings gives an order
# that can change between kernel restarts, which made the order of
# ABCD_other_patterns (and anything printed from it) non-reproducible.
pattern_list = sorted(set(pattern_dict.values()), key=str)

AAAB_patterns = ['AAAB', 'AABA', 'ABAA']
AABB_patterns = ['AABB', 'ABAB', 'ABBA']
AABC_patterns = ['AABC', 'ABAC', 'ABCA']
ABBC_patterns = ['ABBC', 'ABCB']
# Everything not covered by the four named families above.
ABCD_named_patterns = set(AAAB_patterns + AABB_patterns + AABC_patterns + ABBC_patterns)
ABCD_other_patterns = [x for x in pattern_list if x not in ABCD_named_patterns]

splits_list = [AAAB_patterns, AABB_patterns, AABC_patterns, ABBC_patterns, ABCD_other_patterns]
populateLineChart(data_df, group_by='pattern', splits_list=splits_list)

### 5.3 N*** Pattern

In [None]:
# All `group_4` groups on one chart per company (split_by='none' plots every group).
# Alternative kept for reference: three charts per company, selected by group position -
#   splits_list = [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]] with split_by='groups_index'.
splits_list = [list(range(10))]
populateLineChart(
    data_df,
    group_by='group_4',
    splits_list=splits_list,
    split_by='none',
    color_palette=Category20[20],
    title='N*** Pattern',
)

### 5.4 \*N** Pattern

In [None]:
# All `group_3` groups on one chart per company (split_by='none' plots every group).
# Alternative kept for reference: three charts per company, selected by group position -
#   splits_list = [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]] with split_by='groups_index'.
splits_list = [list(range(10))]
populateLineChart(
    data_df,
    group_by='group_3',
    splits_list=splits_list,
    split_by='none',
    color_palette=Category20[20],
    title='*N** Pattern',
)

### 5.5 \*\*N* Pattern

In [None]:
# All `group_2` groups on one chart per company (split_by='none' plots every group).
# Alternative kept for reference: three charts per company, selected by group position -
#   splits_list = [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]] with split_by='groups_index'.
splits_list = [list(range(10))]
populateLineChart(
    data_df,
    group_by='group_2',
    splits_list=splits_list,
    split_by='none',
    color_palette=Category20[20],
    title='**N* Pattern',
)

### 5.6 \*\*\*N Pattern

In [None]:
# All `group_1` groups on one chart per company (split_by='none' plots every group).
# Alternative kept for reference: three charts per company, selected by group position -
#   splits_list = [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]] with split_by='groups_index'.
splits_list = [list(range(10))]
populateLineChart(
    data_df,
    group_by='group_1',
    splits_list=splits_list,
    split_by='none',
    color_palette=Category20[20],
    title='***N Pattern',
)

### 5.7 OEOE Pattern

In [None]:
# Distinct odd/even labels, sorted: iterating a set of strings gives an order
# that can change between kernel restarts, which made the order of
# OEOE_other_patterns (and anything printed from it) non-reproducible.
odd_even_list = sorted(set(odd_even_dict.values()), key=str)

EEEO_patterns = ['EEEO', 'EEOE', 'EOEE', 'OEEE']
OOOE_patterns = ['OOOE', 'OOEO', 'OEOO', 'EOOO']
EEOO_patterns = ['EEOO', 'EOEO', 'EOOE']
OOEE_patterns = ['OOEE', 'OEOE', 'OEEO']
# Everything not covered by the four named families above.
OEOE_named_patterns = set(EEEO_patterns + OOOE_patterns + EEOO_patterns + OOEE_patterns)
OEOE_other_patterns = [x for x in odd_even_list if x not in OEOE_named_patterns]

splits_list = [EEEO_patterns, OOOE_patterns, EEOO_patterns, OOEE_patterns, OEOE_other_patterns]
populateLineChart(data_df, group_by='odd_even', splits_list=splits_list)

### 5.8 BSBS Pattern

In [None]:
# Distinct big/small labels, sorted: iterating a set of strings gives an order
# that can change between kernel restarts, which made the order of
# BSBS_other_patterns (and anything printed from it) non-reproducible.
big_small_list = sorted(set(big_small_dict.values()), key=str)

BBBS_patterns = ['BBBS', 'BBSB', 'BSBB', 'SBBB']
SSSB_patterns = ['SSSB', 'SSBS', 'SBSS', 'BSSS']
BBSS_patterns = ['BBSS', 'BSBS', 'BSSB']
SSBB_patterns = ['SSBB', 'SBSB', 'SBBS']
# Everything not covered by the four named families above.
BSBS_named_patterns = set(BBBS_patterns + SSSB_patterns + BBSS_patterns + SSBB_patterns)
BSBS_other_patterns = [x for x in big_small_list if x not in BSBS_named_patterns]

splits_list = [BBBS_patterns, SSSB_patterns, BBSS_patterns, SSBB_patterns, BSBS_other_patterns]
populateLineChart(data_df, group_by='big_small', splits_list=splits_list)

### 5.9 Number Summation

In [None]:
# One chart per company: for each prize category, the sum of the drawn numbers per draw date.
for company in data_df['company_code'].unique():
    colors = itertools.cycle(Category20[9])
    fig = figure(title=f'{company.title()} - Number Summation',
                 x_axis_type='datetime',
                 x_axis_label='Dates', y_axis_label='Sum',
                 width=950, height=500,
                 toolbar_location='above')

    items = []
    for category in data_df['category'].unique():
        tmp_df = data_df[(data_df['category'] == category) & (data_df['company_code'] == company)]

        # Sum the numbers as integers per draw date. sort=False keeps the dates
        # in order of first appearance, as the previous join/split/int loop did.
        daily_totals = tmp_df['number'].astype(int).groupby(level=0, sort=False).sum()

        source = ColumnDataSource(data=dict(
            dates=daily_totals.index,
            totals=daily_totals.to_numpy(),
        ))
        # Take ONE colour per category so the circles and the line of a series
        # match; calling next(colors) for each glyph gave them different colours.
        color = next(colors)
        glyph1 = fig.circle('dates', 'totals', color=color, alpha=.5, source=source)
        glyph2 = fig.line('dates', 'totals', color=color, alpha=.8, source=source)
        items.append((category, [glyph1, glyph2]))

    fig.add_layout(Legend(items=items, location='bottom_left', orientation='horizontal', click_policy='hide'), 'below')
    fig.add_tools(HoverTool(
        tooltips=[
            ('Date', '@dates{%F}'),
            ('Sum', '@totals'),
        ],
        # NOTE(review): recent Bokeh releases appear to key formatters by '@dates'
        # rather than 'dates' - confirm against the installed version.
        formatters={
            'dates': 'datetime',
        },
        mode='vline'
    ))
    show(fig)

# 6. Candidates
### 6.1 Cluster Counts

In [None]:
def candidateCount(category_df, groupby):
    """Print how many rows of `category_df` fall into each cluster of a grouping.

    Args:
        category_df: DataFrame holding the clustering columns (`pattern`,
            `odd_even`, `big_small`, `group_4` ... `group_1`, as needed by
            the chosen `groupby`).
        groupby: One of 'pattern', 'odd_even', 'big_small' (one printed line
            per pattern family defined at notebook level) or 'digit_groups'
            (the value counts of `group_4`, `group_3`, `group_2` and
            `group_1` side by side, one printed line per row).

    Raises:
        ValueError: If `groupby` is not one of the supported names.
    """
    delimiter = ''
    separator = f'{delimiter:>5} '

    # The pattern-family lists are notebook-level names; they are only looked
    # up in the branch that needs them.
    if groupby == 'pattern':
        title = 'ABCD Pattern'
        clusters = [AAAB_patterns, AABB_patterns, AABC_patterns, ABBC_patterns, ABCD_other_patterns]
    elif groupby == 'odd_even':
        title = 'OEOE Pattern'
        clusters = [EEEO_patterns, OOOE_patterns, EEOO_patterns, OOEE_patterns, OEOE_other_patterns]
    elif groupby == 'big_small':
        title = 'BSBS Pattern'
        clusters = [BBBS_patterns, SSSB_patterns, BBSS_patterns, SSBB_patterns, BSBS_other_patterns]
    elif groupby == 'digit_groups':
        title = 'Digit Groups'
        clusters = None
    else:
        # Previously an unknown name fell through to an unbound `title`.
        raise ValueError(
            f"unsupported groupby {groupby!r}; expected 'pattern', 'odd_even', 'big_small' or 'digit_groups'"
        )

    if groupby == 'digit_groups':
        columns = []
        for by in ['group_4', 'group_3', 'group_2', 'group_1']:
            count_dict = category_df.groupby(by).size().to_dict()
            columns.append([f'{x}: {count:>5}' for x, count in count_dict.items()])
        # zip_longest pads shorter columns with blanks, so columns with
        # different numbers of distinct values no longer raise IndexError
        # or lose rows.
        rows = [separator.join(cells) for cells in itertools.zip_longest(*columns, fillvalue='')]
    else:
        count_dict = category_df.groupby(groupby).size().to_dict()
        # A listed value that never occurs counts as 0 (formatting None would raise).
        rows = [
            separator.join(f'{x}: {count_dict.get(x, 0):>5}' for x in cluster)
            for cluster in clusters
        ]

    print(f'\n{title}')
    for index, row in enumerate(rows):
        print(f'{index+1:>2}. {row}')

In [None]:
# Print the cluster sizes for every supported grouping, in the same order as before.
for grouping in ('pattern', 'odd_even', 'big_small', 'digit_groups'):
    candidateCount(category_df, grouping)

### 6.2 Selection

In [None]:
# Wanted values per clustering column; an empty list places no restriction on that column.
pattern_cand = [*AABC_patterns, *ABBC_patterns, 'ABBC', 'ABCD']
group_4_cand = []
group_3_cand = []
group_2_cand = []
group_1_cand = []
odd_even_cand = []
big_small_cand = []

selections = {
    'pattern': pattern_cand,
    'group_4': group_4_cand,
    'group_3': group_3_cand,
    'group_2': group_2_cand,
    'group_1': group_1_cand,
    'odd_even': odd_even_cand,
    'big_small': big_small_cand,
}

# Start from "keep everything" and AND in one condition per column.
mask = pd.Series(True, index=category_df.index)
for column, wanted in selections.items():
    mask &= (len(wanted) <= 0) | category_df[column].isin(wanted)
candidates = category_df[mask]

# candidates.to_csv(f'candidates.csv', sep=';', index=None, header=True)
candidates.describe(include='all')