In [None]:
import pandas as pd
import itertools
from datetime import datetime as dt

# Bokeh
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import Legend, HoverTool
from bokeh.core.properties import value
from bokeh.palettes import Set1, Category10, Category20
from bokeh.io import show, output_notebook
output_notebook()

# 1. Data Extraction
### 1.1. Candidate Data

In [None]:
# Candidate numbers to be evaluated, read from a ';'-separated CSV.
# `number` is read as text so pandas does not infer a numeric dtype for it.
file = 'resources/data/candidates.csv'
candidate_df = pd.read_csv(
    file,
    sep=';',
    dtype=dict(number=str),
)

### 1.2 Result Data

In [None]:
# Draw results, read from a ';'-separated CSV.
# `number` is read as text so it can later be compared against string values.
file = 'resources/data/4D_result.csv'
result_df = pd.read_csv(
    file,
    sep=';',
    dtype=dict(number=str),
)

# 2. Data Transformation

In [None]:
# Display labels for the coded values in the raw results, keyed by column.
# Values not listed here are left untouched.
display_labels = {
    'company_code': {
        'DMC': 'Da Ma Cai',
        'MAG': 'Magnum',
        'ST': 'Sports Toto',
    },
    'category': {
        'FST': '1st',
        'SCD': '2nd',
        'TRD': '3rd',
        'SP': 'Special',
        'CONS': 'Consolation',
    },
}

# Work on a copy so the raw results frame stays as loaded.
transform_df = result_df.copy()
for column_name, code_to_label in display_labels.items():
    for code, label in code_to_label.items():
        transform_df.loc[transform_df[column_name] == code, column_name] = label

# 3. Data Filtering

In [None]:
# Drop rows whose number is the literal '----' (presumably a placeholder for
# "no number drawn" -- confirm against the data source).
has_number = transform_df['number'] != '----'
data_df = transform_df[has_number].copy()

# 4. Data Manipulation
### 4.1 Look up the periods in which each candidate occurred

In [None]:
# For every company, annotate a copy of the candidate table with the categories
# and draw dates on which each candidate number appeared in that company's
# results, write it to CSV, and collect all companies into one frame.
company_candidate_frames = []

for company in data_df['company_code'].unique():
    company_result_df = data_df[data_df['company_code'] == company]

    # One row per drawn number. Several appearances of the same number are
    # collapsed into ', '-separated strings; both columns are joined in the
    # same row order, so their i-th entries belong together.
    occurrence_df = company_result_df.groupby('number').agg({
        'draw_date': ', '.join,
        'category': ', '.join
    })

    company_candidate_df = candidate_df.copy()
    company_candidate_df['company_code'] = company

    # Vectorized lookup by number; candidates that never appeared get ''.
    for column in ('category', 'draw_date'):
        company_candidate_df[column] = (
            company_candidate_df['number'].map(occurrence_df[column]).fillna('')
        )

    # NOTE: "accurancy" is kept as-is because it is part of the output file name.
    out_file = f'resources/output/{company}_candidates_accurancy.csv'
    company_candidate_df.to_csv(out_file, sep=';', index=False, header=True)

    company_candidate_frames.append(company_candidate_df)

# DataFrame.append was removed in pandas 2.0, so the per-company frames are
# concatenated once here. Row labels are kept (no ignore_index), and the result
# stays None when there are no companies at all.
all_company_candidate_df = (
    pd.concat(company_candidate_frames) if company_candidate_frames else None
)

### 4.2 Data Pre-Processing

In [None]:
# Split candidates that occurred on several periods into one row per
# occurrence, then write the per-company result to CSV and put it back into
# all_company_candidate_df.
for company in data_df['company_code'].unique():
    company_candidate_df = all_company_candidate_df[all_company_candidate_df['company_code'] == company]

    # A comma in `category` marks a row that aggregates several periods.
    # Rows are selected with a boolean mask: feeding labels from `.index` into
    # positional `.iloc` only works while labels happen to equal positions.
    is_multi_period = company_candidate_df['category'].str.contains(',', regex=False, na=False)

    # Distribute numbers occurred on multiple periods to new rows
    # e.g. 8152 appeared at 2019-07-03 and 2019-07-27
    expanded_rows = []
    for _, candidate in company_candidate_df[is_multi_period].iterrows():
        draw_dates = candidate['draw_date'].split(',')
        categories = candidate['category'].split(',')
        for draw_date, category in zip(draw_dates, categories):
            # Start from the full row so every candidate column is carried over.
            expanded_row = candidate.to_dict()
            expanded_row['category'] = category.strip()
            expanded_row['draw_date'] = draw_date.strip()
            expanded_rows.append(expanded_row)

    # Replace the aggregated rows with their expanded counterparts in a single
    # concat (DataFrame.append was removed in pandas 2.0).
    frames = [company_candidate_df[~is_multi_period]]
    if expanded_rows:
        frames.append(pd.DataFrame(expanded_rows, columns=company_candidate_df.columns))
    company_candidate_df = pd.concat(frames, ignore_index=True)
    company_candidate_df = company_candidate_df.sort_values(by=['number', 'draw_date'])

    # Swap this company's rows in the combined frame for the expanded ones.
    all_company_candidate_df = pd.concat([
        all_company_candidate_df[all_company_candidate_df['company_code'] != company],
        company_candidate_df,
    ])

    # NOTE: "accurancy" is kept as-is because it is part of the output file name.
    out_file = f'resources/output/{company}_candidates_accurancy_preprocessed.csv'
    company_candidate_df.to_csv(out_file, sep=';', index=False, header=True)

# 5. Accuracy Calculation

In [None]:
def calculateAccuracy(results_df=None, candidates_df=None, total_price_count=23):
    """Compute per-company, per-draw-date match statistics for the candidates.

    Args:
        results_df: Frame with at least `company_code` and `draw_date`
            columns. Defaults to the notebook-level `data_df`.
        candidates_df: Frame with at least `company_code`, `number`,
            `category` and `draw_date` columns, one row per occurrence (an
            empty `category` means "no match"). Defaults to the notebook-level
            `all_company_candidate_df`.
        total_price_count: Denominator for `match_price_percent`
            (presumably the number of prizes per draw -- confirm).

    Returns:
        dict with two entries per company:
        `'{company}_matched_results'`, a list of single-key dicts
        `{draw_date: stats}` sorted by draw date, and
        `'{company}_candidate_count'`, the number of distinct candidate
        numbers. Percentages are rounded to two decimals; a zero denominator
        yields 0.0. Counts are plain Python ints.
    """
    if results_df is None:
        results_df = data_df
    if candidates_df is None:
        candidates_df = all_company_candidate_df

    def _percent(part, whole):
        # Round after scaling so the result carries no float noise from the
        # multiplication; guard against an empty denominator.
        return round(part / whole * 100, 2) if whole else 0.0

    accuracy_dict = dict()

    # Every company is evaluated against every draw date present in the
    # results, including dates contributed by other companies.
    draw_dates = sorted(results_df['draw_date'].unique())

    for company in results_df['company_code'].unique():
        company_candidate_df = candidates_df[candidates_df['company_code'] == company]
        total_candidate_count = int(company_candidate_df['number'].nunique())

        matched_results = []
        for date in draw_dates:
            matched_df = company_candidate_df[(company_candidate_df['category'] != '') &
                                              (company_candidate_df['draw_date'] == date)]
            # Only rows with a non-null number are counted.
            matched_count = int(matched_df['number'].count())
            category_counts = matched_df.loc[matched_df['number'].notna(), 'category'].value_counts()

            matched_results.append({
                date: {
                    'match_count': matched_count,
                    'match_price_percent': _percent(matched_count, total_price_count),
                    'match_number_percent': _percent(matched_count, total_candidate_count),
                    'first_count': int(category_counts.get('1st', 0)),
                    'second_count': int(category_counts.get('2nd', 0)),
                    'third_count': int(category_counts.get('3rd', 0)),
                    'special_count': int(category_counts.get('Special', 0)),
                    'consolation_count': int(category_counts.get('Consolation', 0)),
                }
            })

        accuracy_dict[f'{company}_matched_results'] = matched_results
        accuracy_dict[f'{company}_candidate_count'] = total_candidate_count

    return accuracy_dict

In [None]:
# Compute the accuracy statistics once and keep them in a notebook-level dict,
# keyed by '{company}_matched_results' and '{company}_candidate_count'.
accuracy_dict = calculateAccuracy()

# 6. Accuracy Visualization

In [None]:
# One figure per company: percentage of prizes matched ("accuracy") and
# percentage of candidates matched ("fitness") per draw date.
for company in data_df['company_code'].unique():
    matched_results = accuracy_dict[f'{company}_matched_results']
    total_candidate_count = accuracy_dict[f'{company}_candidate_count']

    # Each entry is a single-key dict {draw_date: stats}; unpack to pairs.
    daily_stats = [next(iter(matched_result.items())) for matched_result in matched_results]
    dates = [dt.strptime(draw_date, '%Y-%m-%d') for draw_date, _ in daily_stats]
    match_price_percent_list = [stats['match_price_percent'] for _, stats in daily_stats]
    match_number_percent_list = [stats['match_number_percent'] for _, stats in daily_stats]

    colors = itertools.cycle(Set1[9])
    fig = figure(title=f'{company.title()} - Accuracy of {total_candidate_count} Candidates',
                 x_axis_type='datetime',
                 x_axis_label='Dates', y_axis_label='Accuracy (%)',
                 width=950, height=500,
                 toolbar_location='above')

    source = ColumnDataSource(data=dict(
        dates=dates,
        accuracy=match_price_percent_list,
        fitness=match_number_percent_list,
    ))
    accuracy_line = fig.line('dates', 'accuracy', color=next(colors), alpha=.8, source=source)
    fitness_line = fig.line('dates', 'fitness', color=next(colors), alpha=.8, source=source)

    legend_items = [
        ('Price Matched', [accuracy_line]),
        ('Fitness Rate', [fitness_line]),
    ]

    fig.add_layout(Legend(items=legend_items, location='bottom_left', orientation='horizontal', click_policy='hide'), 'below')
    fig.add_tools(HoverTool(
        tooltips=[
            ('Date', '@dates{%F}'),
            ('Accuracy', '@accuracy%'),
            ('Fitness', '@fitness%'),
        ],
        # Formatter keys must name the field exactly as it is written in the
        # tooltip, including the '@' prefix (Bokeh 2.0+); without it the
        # datetime formatter is not applied to the date.
        formatters={
            '@dates': 'datetime',
        },
        mode='vline'
    ))
    show(fig)

In [None]:
# One figure per company: number of matched prizes per draw date, broken down
# by prize category plus the overall total.
for company in data_df['company_code'].unique():
    matched_results = accuracy_dict[f'{company}_matched_results']
    # Look the count up for THIS company; it was previously never assigned in
    # this cell, so every title showed whatever value an earlier cell left behind.
    total_candidate_count = accuracy_dict[f'{company}_candidate_count']

    # Each entry is a single-key dict {draw_date: stats}; unpack to pairs.
    daily_stats = [next(iter(matched_result.items())) for matched_result in matched_results]
    dates = [dt.strptime(draw_date, '%Y-%m-%d') for draw_date, _ in daily_stats]

    # (legend/tooltip label, key in the stats dict), in drawing order.
    series = [
        ('1st', 'first_count'),
        ('2nd', 'second_count'),
        ('3rd', 'third_count'),
        ('Special', 'special_count'),
        ('Consolation', 'consolation_count'),
        ('Total', 'match_count'),
    ]

    columns = dict(dates=dates)
    for _, stat_key in series:
        columns[f'{stat_key}_list'] = [stats[stat_key] for _, stats in daily_stats]
    source = ColumnDataSource(data=columns)

    colors = itertools.cycle(Category10[10])
    fig = figure(title=f'{company.title()} - Accuracy of {total_candidate_count} Candidates',
                 x_axis_type='datetime',
                 x_axis_label='Dates', y_axis_label='Price Matched',
                 width=950, height=500,
                 toolbar_location='above')

    legend_items = []
    tooltips = [('Date', '@dates{%F}')]
    for label, stat_key in series:
        column_name = f'{stat_key}_list'
        line = fig.line('dates', column_name, color=next(colors), alpha=.8, source=source)
        legend_items.append((label, [line]))
        tooltips.append((label, f'@{column_name}'))

    fig.add_layout(Legend(items=legend_items, location='bottom_left', orientation='horizontal', click_policy='hide'), 'below')
    fig.add_tools(HoverTool(
        tooltips=tooltips,
        # Formatter keys must name the field exactly as it is written in the
        # tooltip, including the '@' prefix (Bokeh 2.0+); without it the
        # datetime formatter is not applied to the date.
        formatters={
            '@dates': 'datetime',
        },
        mode='vline'
    ))
    show(fig)