In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from datetime import datetime as dt

# plot pandas dates
from pandas.tseries import converter
pd.plotting.register_matplotlib_converters()

# interactive graphs on jupyter notebook
import mpld3

# 1. Data Extraction

In [None]:
# Read the semicolon-separated draw results; `number` is forced to text
# because later cells compare it with '----' and convert it with int().
filename = '../resources/data/4D_result_2018-01-01_2018-12-31.csv'
raw_data = pd.read_csv(
    filename,
    dtype={'number': str},
    sep=';',
)
# raw_data.info()
# raw_data.describe(include='all')

# 2. Data Transformation

In [None]:
# Display labels for the coded values found in the raw export.
company_names = {
    'DMC': 'Da Ma Cai',
    'MAG': 'Magnum',
    'ST': 'Sports Toto',
}
category_names = {
    'FST': '1st',
    'SCD': '2nd',
    'TRD': '3rd',
    'SP': 'Special',
    'CONS': 'Consolation',
}

# Work on a copy so raw_data stays as loaded; values without a mapping are
# left unchanged.
transform_data = raw_data.copy()
transform_data['company_code'] = transform_data['company_code'].replace(company_names)
transform_data['category'] = transform_data['category'].replace(category_names)
# transform_data

# 3. Data Filtering

In [None]:
# Default to the full span of draw dates present in the data. Only the
# `draw_date` column is reduced: calling min()/max() on the whole frame would
# also reduce every other column and fails if any of them is not orderable.
date_from = transform_data['draw_date'].min()
date_to = transform_data['draw_date'].max()

# date_from = '2019-07-01'
# date_to = '2019-07-30'

company_code = 'Magnum'

# Keep one company's rows inside the (inclusive) date window and drop rows
# whose number is the '----' placeholder (presumably "no number" - confirm
# against the data source).
has_number = transform_data['number'] != '----'
is_company = transform_data['company_code'] == company_code
in_window = (transform_data['draw_date'] >= date_from) & (transform_data['draw_date'] <= date_to)

data = transform_data[has_number & is_company & in_window]
data = data.sort_values(by=['draw_date', 'company_code', 'position'])
# data

# 4. Data Manipulation
### 4.1 Populate `time_position` as the sort order used to plot on the line graph

In [None]:
# For every draw (draw_date, company_code) keep its numbers in ascending
# string order. Built once per draw instead of re-splitting a joined string
# for every row.
sorted_numbers_by_draw = {
    draw_key: sorted(numbers)
    for draw_key, numbers in data.groupby(['draw_date', 'company_code'])['number']
}

# time_position is the rank of the number inside its draw, written as a fake
# 'HH:00' time so that it can later be appended to draw_date and parsed with
# '%Y-%m-%d %H:%M'. list.index returns the first match, so a number repeated
# within one draw gets the same position for each occurrence.
data['time_position'] = [
    f'{sorted_numbers_by_draw[(draw_date, company)].index(number):02d}:00'
    for draw_date, company, number in zip(data['draw_date'], data['company_code'], data['number'])
]

data = data.sort_values(by=['draw_date', 'company_code', 'time_position'])

### 4.2 Populate clustering categories for analysis

In [None]:
# Read the per-number classification table; `number` is kept as text so it can
# be used as a lookup key for the `number` strings in `data`.
filename = '../resources/data/number_category.csv'
category_data = pd.read_csv(filename, sep=';', dtype={'number': str})
# category_data.info()
# category_data.describe(include='all')

# Build dictionaries (number -> category value), one per classification column
pattern_dict = dict(zip(category_data['number'], category_data['pattern']))
group_4_dict = dict(zip(category_data['number'], category_data['group_4']))
group_3_dict = dict(zip(category_data['number'], category_data['group_3']))
group_2_dict = dict(zip(category_data['number'], category_data['group_2']))
group_1_dict = dict(zip(category_data['number'], category_data['group_1']))
odd_even_dict = dict(zip(category_data['number'], category_data['odd_even']))
big_small_dict = dict(zip(category_data['number'], category_data['big_small']))

# Dictionaries lookup: add one column per classification to `data`.
# Iterating the `number` column directly avoids a full iterrows() pass per
# column; indexing with [] keeps the KeyError for a number that is missing
# from number_category.csv.
category_lookups = {
    'pattern': pattern_dict,
    'group_4': group_4_dict,
    'group_3': group_3_dict,
    'group_2': group_2_dict,
    'group_1': group_1_dict,
    'odd_even': odd_even_dict,
    'big_small': big_small_dict,
}
for category_column, lookup in category_lookups.items():
    data[category_column] = [lookup[number] for number in data['number']]
# data

# 5. Data Visualization
### 5.1 Number Plotting

In [None]:
# One frame per prize category; these are reused by the pie-chart cells below.
first_data = data[data['category'] == '1st']
second_data = data[data['category'] == '2nd']
third_data = data[data['category'] == '3rd']
special_data = data[data['category'] == 'Special']
consolation_data = data[data['category'] == 'Consolation']


def _draw_datetimes(frame):
    """Return one datetime per row, built from draw_date plus the 'HH:MM' time_position."""
    return [
        dt.strptime(f'{draw_date} {time_position}', '%Y-%m-%d %H:%M')
        for draw_date, time_position in zip(frame['draw_date'], frame['time_position'])
    ]


def _draw_numbers(frame):
    """Return the `number` column of frame converted to int, in row order."""
    return [int(number) for number in frame['number']]


first_datetimes = _draw_datetimes(first_data)
second_datetimes = _draw_datetimes(second_data)
third_datetimes = _draw_datetimes(third_data)
special_datetimes = _draw_datetimes(special_data)
consolation_datetimes = _draw_datetimes(consolation_data)
data_datetimes = _draw_datetimes(data)

first_numbers = _draw_numbers(first_data)
second_numbers = _draw_numbers(second_data)
third_numbers = _draw_numbers(third_data)
special_numbers = _draw_numbers(special_data)
consolation_numbers = _draw_numbers(consolation_data)
data_numbers = _draw_numbers(data)

mpld3.enable_notebook()
plt.rcParams['figure.figsize'] = [13, 7]

# (x values, y values, marker, colour, marker size, legend label) per category.
# Marker sizes are numeric; the original passed them as strings.
category_series = [
    (first_datetimes, first_numbers, '*', 'r', 10, '1st'),
    (second_datetimes, second_numbers, '*', 'g', 10, '2nd'),
    (third_datetimes, third_numbers, '*', 'b', 10, '3rd'),
    (special_datetimes, special_numbers, 'x', 'm', 8, 'Special'),
    (consolation_datetimes, consolation_numbers, 'x', 'c', 8, 'Consolation'),
]
for x_values, y_values, marker, colour, marker_size, label in category_series:
    plt.plot(x_values, y_values, marker=marker, c=colour, ls='none', ms=marker_size, label=label)

# Series with neither marker nor line and no legend label: it draws nothing
# visible. NOTE(review): presumably kept to fix the axis extent to all rows -
# confirm before removing.
plt.plot(data_datetimes, data_numbers, marker='', c='k', ls='none')

plt.title(f'{company_code} Analysis ({date_from} to {date_to})', fontsize=15)
plt.xlabel('Draw Date', fontsize=12)
plt.ylabel('4D Number', fontsize=12)
plt.legend()
plt.grid()
plt.show()

### 5.2 ABCD Pattern Plotting

In [None]:
def populateLineChart(data, group_by, title, date_range, splits_list, split_by='default'):
    """Show one line chart per entry of splits_list with per-date occurrence counts.

    Every distinct value of data[group_by] is a "group". For each group the
    number of rows per draw_date is counted; dates on which a group does not
    occur are plotted as 0. Each inner list of splits_list selects the groups
    drawn together on one chart.

    Args:
        data: DataFrame with a 'draw_date' column of '%Y-%m-%d' strings (they
            are parsed with that format) and the column named by group_by.
        group_by: Name of the column whose values become the plotted lines.
        title: Prefix of every chart title; a subtitle is appended to it.
        date_range: Text inserted into the x-axis label.
        splits_list: List of lists, one inner list per chart.
        split_by: With 'index' the inner lists hold positions in the order the
            groups first appear in data; with any other value they hold the
            group values themselves.

    Returns:
        None. Each chart is displayed with plt.show().
    """
    # (group, draw_date) -> number of rows.
    counts = data.groupby([group_by, 'draw_date']).size().to_dict()

    # group -> [(draw_date, count), ...], collected in a single pass.
    counts_by_group = {}
    for (group, draw_date), count in counts.items():
        counts_by_group.setdefault(group, []).append((draw_date, count))

    # Groups in order of first appearance; a value that groupby dropped
    # (e.g. NaN) keeps an empty list.
    groups_results = [(group, counts_by_group.get(group, [])) for group in data[group_by].unique()]

    # Shared x axis: every date on which any group occurs. Parsed once here
    # instead of once per plotted line.
    str_all_dates = sorted({draw_date for _, draw_date in counts})
    date_position = {str_date: position for position, str_date in enumerate(str_all_dates)}
    dates = [dt.strptime(str_date, '%Y-%m-%d') for str_date in str_all_dates]

    splits_results = []
    for splits in splits_list:
        if split_by == 'index':
            selected = [entry for position, entry in enumerate(groups_results) if position in splits]
        else:
            selected = [entry for entry in groups_results if entry[0] in splits]
        splits_results.append(selected)

    last_split = len(splits_list) - 1
    for results in splits_results:
        for group, dated_counts in results:
            occurrences = [0] * len(dates)
            for str_date, count in dated_counts:
                occurrences[date_position[str_date]] = count

            plt.plot(dates, occurrences, marker='o', ls='--', label=group)

        # The subtitle comes from the first split that contains every plotted
        # group value: '<first entry> Pattern', or 'Other Pattern' when that
        # split is the last one; no match leaves it empty.
        # NOTE(review): this matches by value even when split_by == 'index',
        # and an empty `results` vacuously matches the first split - confirm
        # whether labelling by the split's own position was intended.
        group_names = [group for group, _ in results]
        subtitle = ''
        for split_position, splits in enumerate(splits_list):
            if all(name in splits for name in group_names):
                if split_position < last_split:
                    subtitle = f'{splits[0]} Pattern'
                else:
                    subtitle = 'Other Pattern'
                break

        plt.title(title + subtitle, fontsize=15)
        plt.xlabel(f'Draw Date ({date_range})', fontsize=12)
        plt.ylabel('Occurrence', fontsize=12)
        plt.legend()
        plt.grid()
        plt.show()

In [None]:
def populatePieChart(pie_data, group_by, title):
    """Show a pie chart of how the rows of pie_data are distributed over group_by.

    Args:
        pie_data: DataFrame containing the column named by group_by.
        group_by: Column whose distinct values become the pie slices.
        title: Chart title.

    Returns:
        None. The chart is displayed with plt.show().
    """
    counts = pie_data.groupby(group_by).size().to_dict()

    # Slices ordered from smallest to largest count; sorted() is stable, so
    # equal counts keep the order produced by groupby.
    slice_labels = sorted(counts, key=counts.get)
    slice_sizes = [counts[label] for label in slice_labels]
    # Same small offset for every slice so they are drawn slightly apart.
    slice_offsets = [.0175] * len(slice_labels)

    plt.title(title, fontsize=15)
    plt.axis('equal')
    plt.pie(
        slice_sizes,
        labels=slice_labels,
        autopct='%0.2f%%',
        startangle=90,
        explode=slice_offsets,
    )
    plt.tight_layout()
    plt.show()

In [None]:
# Distinct patterns present in number_category.csv. Sorted so that the order
# is the same in every kernel session: iterating a set of strings follows hash
# order, and numberCount() prints the "other" patterns in list order.
pattern_list = sorted(set(pattern_dict.values()), key=str)

AAAB_patterns = ['AAAB', 'AABA', 'ABAA']
AABB_patterns = ['AABB', 'ABAB', 'ABBA']
AABC_patterns = ['AABC', 'ABAC', 'ABCA']
ABBC_patterns = ['ABBC', 'ABCB']

# Every pattern that is not part of one of the four named families.
named_ABCD_patterns = set(AAAB_patterns + AABB_patterns + AABC_patterns + ABBC_patterns)
ABCD_other_patterns = [pattern for pattern in pattern_list if pattern not in named_ABCD_patterns]

# One line chart per family; populateLineChart appends the subtitle to `title`.
title = f'{company_code} - '
date_range = f'{date_from} to {date_to}'
splits_list = [AAAB_patterns, AABB_patterns, AABC_patterns, ABBC_patterns, ABCD_other_patterns]
populateLineChart(data, group_by='pattern', title=title, date_range=date_range, splits_list=splits_list)

In [None]:
# One pie chart per prize category, sliced by the `pattern` column.
for pie_data in (first_data, second_data, third_data, special_data, consolation_data):
    # Each frame holds one category, so this is normally a single name.
    category_names = pie_data['category'].unique()
    str_categories = ', '.join(category_names)
    title = f'{company_code} {str_categories} Price - ABCD Pattern ({date_from} to {date_to})'
    populatePieChart(
        pie_data,
        group_by='pattern',
        title=title,
    )

### 5.3 N*** Pattern Plotting

In [None]:
# Line charts for the `group_4` column, three charts in total.
# With split_by='index' each inner list holds positions of the groups in the
# order they first appear in `data`, not the group values themselves.
title = f'{company_code} - N*** Pattern'
date_range = f'{date_from} to {date_to}'
splits_list = [list(range(0, 4)), list(range(4, 7)), list(range(7, 10))]
populateLineChart(
    data,
    group_by='group_4',
    title=title,
    date_range=date_range,
    splits_list=splits_list,
    split_by='index',
)

In [None]:
# One pie chart per prize category, sliced by the `group_4` column.
for pie_data in (first_data, second_data, third_data, special_data, consolation_data):
    category_names = pie_data['category'].unique()
    str_categories = ', '.join(category_names)
    title = f'{company_code} {str_categories} Price - N*** Pattern ({date_from} to {date_to})'
    populatePieChart(
        pie_data,
        group_by='group_4',
        title=title,
    )

### 5.4 \*N** Pattern Plotting

In [None]:
# Line charts for the `group_3` column, three charts in total.
# With split_by='index' each inner list holds positions of the groups in the
# order they first appear in `data`, not the group values themselves.
title = f'{company_code} - *N** Pattern'
date_range = f'{date_from} to {date_to}'
splits_list = [list(range(0, 4)), list(range(4, 7)), list(range(7, 10))]
populateLineChart(
    data,
    group_by='group_3',
    title=title,
    date_range=date_range,
    splits_list=splits_list,
    split_by='index',
)

In [None]:
# One pie chart per prize category, sliced by the `group_3` column.
for pie_data in (first_data, second_data, third_data, special_data, consolation_data):
    category_names = pie_data['category'].unique()
    str_categories = ', '.join(category_names)
    title = f'{company_code} {str_categories} Price - *N** Pattern ({date_from} to {date_to})'
    populatePieChart(
        pie_data,
        group_by='group_3',
        title=title,
    )

### 5.5 \*\*N* Pattern Plotting

In [None]:
# Line charts for the `group_2` column, three charts in total.
# With split_by='index' each inner list holds positions of the groups in the
# order they first appear in `data`, not the group values themselves.
title = f'{company_code} - **N* Pattern'
date_range = f'{date_from} to {date_to}'
splits_list = [list(range(0, 4)), list(range(4, 7)), list(range(7, 10))]
populateLineChart(
    data,
    group_by='group_2',
    title=title,
    date_range=date_range,
    splits_list=splits_list,
    split_by='index',
)

In [None]:
# One pie chart per prize category, sliced by the `group_2` column.
for pie_data in (first_data, second_data, third_data, special_data, consolation_data):
    category_names = pie_data['category'].unique()
    str_categories = ', '.join(category_names)
    title = f'{company_code} {str_categories} Price - **N* Pattern ({date_from} to {date_to})'
    populatePieChart(
        pie_data,
        group_by='group_2',
        title=title,
    )

### 5.6 \*\*\*N Pattern Plotting

In [None]:
# Line charts for the `group_1` column, three charts in total.
# With split_by='index' each inner list holds positions of the groups in the
# order they first appear in `data`, not the group values themselves.
title = f'{company_code} - ***N Pattern'
date_range = f'{date_from} to {date_to}'
splits_list = [list(range(0, 4)), list(range(4, 7)), list(range(7, 10))]
populateLineChart(
    data,
    group_by='group_1',
    title=title,
    date_range=date_range,
    splits_list=splits_list,
    split_by='index',
)

In [None]:
# One pie chart per prize category, sliced by the `group_1` column.
for pie_data in (first_data, second_data, third_data, special_data, consolation_data):
    category_names = pie_data['category'].unique()
    str_categories = ', '.join(category_names)
    title = f'{company_code} {str_categories} Price - ***N Pattern ({date_from} to {date_to})'
    populatePieChart(
        pie_data,
        group_by='group_1',
        title=title,
    )

### 5.7 OEOE Pattern Plotting

In [None]:
# Distinct odd/even patterns present in number_category.csv. Sorted so that
# the order is the same in every kernel session: iterating a set of strings
# follows hash order, and numberCount() prints the "other" patterns in list
# order.
odd_even_list = sorted(set(odd_even_dict.values()), key=str)

EEEO_patterns = ['EEEO', 'EEOE', 'EOEE', 'OEEE']
OOOE_patterns = ['OOOE', 'OOEO', 'OEOO', 'EOOO']
EEOO_patterns = ['EEOO', 'EOEO', 'EOOE']
OOEE_patterns = ['OOEE', 'OEOE', 'OEEO']

# Every pattern that is not part of one of the four named families.
named_OEOE_patterns = set(EEEO_patterns + OOOE_patterns + EEOO_patterns + OOEE_patterns)
OEOE_other_patterns = [pattern for pattern in odd_even_list if pattern not in named_OEOE_patterns]

# One line chart per family; populateLineChart appends the subtitle to `title`.
title = f'{company_code} - '
date_range = f'{date_from} to {date_to}'
splits_list = [EEEO_patterns, OOOE_patterns, EEOO_patterns, OOEE_patterns, OEOE_other_patterns]
populateLineChart(data, group_by='odd_even', title=title, date_range=date_range, splits_list=splits_list)

In [None]:
# One pie chart per prize category, sliced by the `odd_even` column.
for pie_data in (first_data, second_data, third_data, special_data, consolation_data):
    category_names = pie_data['category'].unique()
    str_categories = ', '.join(category_names)
    title = f'{company_code} {str_categories} Price - OEOE Pattern ({date_from} to {date_to})'
    populatePieChart(
        pie_data,
        group_by='odd_even',
        title=title,
    )

### 5.8 BSBS Pattern Plotting

In [None]:
# Distinct big/small patterns present in number_category.csv. Sorted so that
# the order is the same in every kernel session: iterating a set of strings
# follows hash order, and numberCount() prints the "other" patterns in list
# order.
big_small_list = sorted(set(big_small_dict.values()), key=str)

BBBS_patterns = ['BBBS', 'BBSB', 'BSBB', 'SBBB']
SSSB_patterns = ['SSSB', 'SSBS', 'SBSS', 'BSSS']
BBSS_patterns = ['BBSS', 'BSBS', 'BSSB']
SSBB_patterns = ['SSBB', 'SBSB', 'SBBS']

# Every pattern that is not part of one of the four named families.
named_BSBS_patterns = set(BBBS_patterns + SSSB_patterns + BBSS_patterns + SSBB_patterns)
BSBS_other_patterns = [pattern for pattern in big_small_list if pattern not in named_BSBS_patterns]

# One line chart per family; populateLineChart appends the subtitle to `title`.
title = f'{company_code} - '
date_range = f'{date_from} to {date_to}'
splits_list = [BBBS_patterns, SSSB_patterns, BBSS_patterns, SSBB_patterns, BSBS_other_patterns]
populateLineChart(data, group_by='big_small', title=title, date_range=date_range, splits_list=splits_list)

In [None]:
# One pie chart per prize category, sliced by the `big_small` column.
for pie_data in (first_data, second_data, third_data, special_data, consolation_data):
    category_names = pie_data['category'].unique()
    str_categories = ', '.join(category_names)
    title = f'{company_code} {str_categories} Price - BSBS Pattern ({date_from} to {date_to})'
    populatePieChart(
        pie_data,
        group_by='big_small',
        title=title,
    )

### 5.9 Number Summation Plotting

In [None]:
# draw_date -> {'number': 'n1, n2, ...'} for each prize category and for all rows.
first_dict = first_data.groupby('draw_date').agg({'number': ', '.join}).to_dict('index')
second_dict = second_data.groupby('draw_date').agg({'number': ', '.join}).to_dict('index')
third_dict = third_data.groupby('draw_date').agg({'number': ', '.join}).to_dict('index')
special_dict = special_data.groupby('draw_date').agg({'number': ', '.join}).to_dict('index')
consolation_dict = consolation_data.groupby('draw_date').agg({'number': ', '.join}).to_dict('index')
data_dict = data.groupby('draw_date').agg({'number': ', '.join}).to_dict('index')


def _draw_total(numbers_by_date, draw_date):
    """Sum the numbers recorded for draw_date.

    Returns 0 when numbers_by_date has no entry for that date, so a draw that
    lacks one prize category no longer raises TypeError on `None['number']`.
    """
    entry = numbers_by_date.get(draw_date)
    if entry is None:
        return 0
    return sum(int(number) for number in entry['number'].replace(' ', '').split(','))


# The x axis follows the dates of the complete selection (data_dict).
draw_dates = list(data_dict.keys())
dates = [dt.strptime(draw_date, '%Y-%m-%d') for draw_date in draw_dates]

first_totals = [_draw_total(first_dict, draw_date) for draw_date in draw_dates]
second_totals = [_draw_total(second_dict, draw_date) for draw_date in draw_dates]
third_totals = [_draw_total(third_dict, draw_date) for draw_date in draw_dates]
special_totals = [_draw_total(special_dict, draw_date) for draw_date in draw_dates]
consolation_totals = [_draw_total(consolation_dict, draw_date) for draw_date in draw_dates]
data_totals = [_draw_total(data_dict, draw_date) for draw_date in draw_dates]

# (totals, colour, legend label) for each plotted line.
total_series = [
    (first_totals, 'r', '1st'),
    (second_totals, 'g', '2nd'),
    (third_totals, 'b', '3rd'),
    (special_totals, 'm', 'Special'),
    (consolation_totals, 'c', 'Consolation'),
    (data_totals, 'k', 'Total'),
]
for totals, colour, label in total_series:
    plt.plot(dates, totals, marker='None', c=colour, ls='-', label=label)

plt.title(f'{company_code} - Number Summation Analysis ({date_from} to {date_to})', fontsize=15)
plt.xlabel('Draw Date', fontsize=12)
plt.ylabel('Sum', fontsize=12)
plt.legend()
plt.grid()
plt.show()

# 6. Cluster's Candidates Count

In [None]:
def numberCount(category_data, groupby):
    """Print how many rows of category_data fall into each cluster.

    Args:
        category_data: DataFrame with the classification columns ('pattern',
            'odd_even', 'big_small', 'group_4' ... 'group_1').
        groupby: One of 'pattern', 'odd_even', 'big_small' or 'digit_groups'.
            The first three print one numbered line per pattern family, using
            the module-level pattern lists defined in the plotting cells.
            'digit_groups' prints a table with one column each for group_4,
            group_3, group_2 and group_1.

    Returns:
        None. The result is printed.

    Raises:
        ValueError: If groupby is not one of the supported values.
    """
    delimiter = ''
    separator = f'{delimiter:>5} '

    if groupby == 'digit_groups':
        title = 'Digit Groups'
        # One list of 'value: count' cells per digit-group column.
        table_columns = []
        for by in ['group_4', 'group_3', 'group_2', 'group_1']:
            digit_counts = category_data.groupby(by).size().to_dict()
            table_columns.append([f'{value}: {count : >5}' for value, count in digit_counts.items()])

        # Columns may hold different numbers of distinct values: print as many
        # rows as the longest column and leave the missing cells empty.
        row_total = max((len(cells) for cells in table_columns), default=0)
        output_lines = [
            separator.join(cells[row] if row < len(cells) else '' for cells in table_columns)
            for row in range(row_total)
        ]
    else:
        if groupby == 'pattern':
            title = 'ABCD Pattern'
            families = [AAAB_patterns, AABB_patterns, AABC_patterns, ABBC_patterns, ABCD_other_patterns]
        elif groupby == 'odd_even':
            title = 'OEOE Pattern'
            families = [EEEO_patterns, OOOE_patterns, EEOO_patterns, OOEE_patterns, OEOE_other_patterns]
        elif groupby == 'big_small':
            title = 'BSBS Pattern'
            families = [BBBS_patterns, SSSB_patterns, BBSS_patterns, SSBB_patterns, BSBS_other_patterns]
        else:
            raise ValueError(
                f"Unsupported groupby {groupby!r}; expected 'pattern', 'odd_even', "
                "'big_small' or 'digit_groups'"
            )

        count_dict = category_data.groupby(groupby).size().to_dict()
        # A listed pattern that does not occur in the data is reported as 0
        # (formatting None with ' >5' would raise TypeError).
        output_lines = [
            separator.join(f'{x}: {count_dict.get(x, 0) : >5}' for x in family)
            for family in families
        ]

    print(f'\n{title}')
    for index, output in enumerate(output_lines):
        print(f'{index+1:>2}. {output}')

In [None]:
# Print the candidate counts for every supported clustering, in this order.
for count_by in ('pattern', 'odd_even', 'big_small', 'digit_groups'):
    numberCount(category_data, count_by)

# 7. Candidates Selection

In [None]:
# Allowed values per classification column; an empty list means the column is
# not used as a filter.
pattern_cand = [*AABC_patterns, *ABBC_patterns, 'ABBC', 'ABCD']
group_4_cand = []
group_3_cand = []
group_2_cand = []
group_1_cand = []
odd_even_cand = []
big_small_cand = []

candidate_filters = {
    'pattern': pattern_cand,
    'group_4': group_4_cand,
    'group_3': group_3_cand,
    'group_2': group_2_cand,
    'group_1': group_1_cand,
    'odd_even': odd_even_cand,
    'big_small': big_small_cand,
}

# Start with every row selected and narrow the selection with each non-empty
# candidate list; a row must satisfy all active filters.
is_candidate = pd.Series(True, index=category_data.index)
for filter_column, allowed_values in candidate_filters.items():
    if len(allowed_values) > 0:
        is_candidate = is_candidate & category_data[filter_column].isin(allowed_values)

candidates = category_data[is_candidate]

# candidates.to_csv(f'{company_code}_candidates_based_on_data_{date_from}_{date_to}.csv', sep=';', index=None, header=True)
candidates.describe(include='all')