In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, CuDNNLSTM, Dropout

# change tensorflow default behavior (where it uses all of the memory at the outset)
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
# Build a TF1-compat session config with GPU `allow_growth` switched on, then open
# an InteractiveSession that uses it. `session` is not referenced again in the
# visible cells, so it appears to exist only for this configuration side effect.
config = ConfigProto()
config.gpu_options.allow_growth = True  # grow GPU memory on demand instead of reserving it all at the outset
session = InteractiveSession(config=config)

from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime as dt

import pandas as pd
# Register pandas' date/time converters with matplotlib so pandas dates can be plotted.
# NOTE(review): `converter` is not referenced in any visible cell, and the
# `pandas.tseries.converter` module is believed to be absent from pandas >= 1.0 --
# confirm whether this import is still needed on the target pandas version.
from pandas.tseries import converter
pd.plotting.register_matplotlib_converters()

# interactive graphs on jupyter notebook
import mpld3

# 1. Data Extraction

In [None]:
# Semicolon-delimited results file, addressed relative to the notebook's working directory.
filename = '../resources/data/4D_result_2018-01-01_2018-12-31.csv'

# `number` is read as text so values such as the '----' placeholder (filtered out
# in the Data Filtering cell) stay as strings instead of being coerced.
raw_data = pd.read_csv(
    filepath_or_buffer=filename,
    delimiter=';',
    dtype={'number': str},
)
# raw_data.info()

# 2. Data Transformation

In [None]:
# Lookup tables that expand the abbreviated codes found in the raw data into
# display labels. Values that are not listed in a table are left unchanged.
company_labels = {
    'DMC': 'Da Ma Cai',
    'MAG': 'Magnum',
    'ST': 'Sports Toto',
}
category_labels = {
    'FST': '1st',
    'SCD': '2nd',
    'TRD': '3rd',
    'SP': 'Special',
    'CONS': 'Consolation',
}

# Work on a copy so `raw_data` keeps the values exactly as loaded and this cell
# can be re-run without touching the extraction step's output.
transform_data = raw_data.copy()
transform_data['company_code'] = transform_data['company_code'].replace(company_labels)
transform_data['category'] = transform_data['category'].replace(category_labels)

# 3. Data Filtering

In [None]:
# Default to the full date span present in the data. Only the `draw_date` column
# is reduced: `transform_data.min()` / `.max()` would aggregate every column of
# the frame (text columns included) just to read a single value back out.
date_from = transform_data['draw_date'].min()
date_to = transform_data['draw_date'].max()

# Uncomment to restrict the analysis to a fixed window instead:
# date_from = '2019-01-01'
# date_to = '2019-07-31'

# Prize categories to keep when filtering the draws.
categories = ['1st', '2nd', '3rd', 'Special', 'Consolation']

# How many numbers each category contributes to one draw (presumably the number
# of prize slots per category -- confirm against the data source).
numbers_per_category = {
    '1st': 1,
    '2nd': 1,
    '3rd': 1,
    'Special': 10,
    'Consolation': 10,
}

# Total numbers per draw across the selected categories; the model's output
# layer is sized from this value. Categories missing from the table count as 0.
price_count = sum(numbers_per_category.get(category, 0) for category in categories)

company_code = 'Magnum'

# Each row condition gets its own name so the final selection reads as a list of rules.
has_number = transform_data['number'] != '----'
is_company = transform_data['company_code'] == company_code
on_or_after_start = transform_data['draw_date'] >= date_from
on_or_before_end = transform_data['draw_date'] <= date_to
in_categories = transform_data['category'].isin(categories)

selected_rows = has_number & is_company & on_or_after_start & on_or_before_end & in_categories

# Order the kept rows by draw, then company, category and position.
data = transform_data[selected_rows].sort_values(
    by=['draw_date', 'company_code', 'category', 'position']
)

# 4. Pre-Processing
### 4.1 Input & Target Preparation

In [None]:
# Group the filtered rows by draw date; `.groups` maps each date to the index
# labels of the rows belonging to that draw.
period_dict = data.groupby('draw_date').groups

# One list of number strings per draw, in the order the groups are stored.
# Selecting a whole group with a single `.loc[labels, 'number']` avoids building
# a full row Series for every individual number.
period_arr = [
    data.loc[row_labels, 'number'].tolist()
    for row_labels in period_dict.values()
]

In [None]:
# Divisor used to normalise every number (9999 is the largest four-digit value).
max_value = 9999
# Number of consecutive draws that make up one input sample.
period_count = 1

input_data = []
target_data = []

# Slide a window of `period_count` draws over the history: the window is the
# input, the draw immediately after it is the target. Stopping `period_count`
# draws before the end guarantees a target exists for every window, and simply
# yields no samples (instead of an IndexError) when the history is too short.
for start in range(len(period_arr) - period_count):
    window = period_arr[start:start + period_count]
    following_draw = period_arr[start + period_count]

    # Data Normalization
    input_data.append(
        [[float(number) / max_value for number in draw] for draw in window]
    )
    target_data.append(
        [float(number) / max_value for number in following_draw]
    )

In [None]:
# Turn the nested Python lists into float arrays before handing them to the model.
input_data, target_data = (
    np.array(samples, dtype=float) for samples in (input_data, target_data)
)

# Print both shapes as a quick sanity check.
for label, array in (('input_data.shape\t', input_data),
                     ('target_data.shape\t', target_data)):
    print(label, array.shape)

### 4.2 Train & Test Preparation

In [None]:
# Hold out 20% of the samples; the fixed `random_state` keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    input_data,
    target_data,
    test_size=0.2,
    random_state=4,
)

for label, features in (('x_train.shape', x_train), ('x_test.shape', x_test)):
    print(label, features.shape)

# print('x_train:')
# print(x_train[:3])
# print('y_train:')
# print(y_train[:3])
# print()

# start_index = list(filter(lambda x: x[1][0] == '2644', enumerate(period_arr)))[0][0]
# print('input:')
# for n in range(period_count):
#     print(period_arr[start_index + n])
    
# print('target:')
# print(period_arr[start_index + period_count])

# 5. Neural Network
### 5.1 Model Definition

In [None]:
# Three stacked LSTM layers, each followed by dropout, feeding a dense layer that
# emits `price_count` values (one per number in a draw).
# CuDNNLSTM (imported at the top of the notebook) can be swapped in for LSTM,
# with the same arguments, when running on a CUDA GPU.
lstm_units = 128
dropout_rate = 0.2

model = Sequential()
# The first layer declares the per-sample input shape taken from the training array.
model.add(LSTM(units=lstm_units, input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=True))
model.add(Dropout(dropout_rate))

# The middle layer still returns the full sequence so the next LSTM can consume it.
model.add(LSTM(units=lstm_units, return_sequences=True))
model.add(Dropout(dropout_rate))

# The last recurrent layer returns only its final output.
model.add(LSTM(units=lstm_units))
model.add(Dropout(dropout_rate))

model.add(Dense(units=price_count))

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
opt = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)
# NOTE(review): 'accuracy' is kept so the keys in `history.history` do not change,
# but it carries little meaning for continuous targets trained with an MAE loss.
model.compile(loss='mean_absolute_error', optimizer=opt, metrics=['accuracy'])

model.summary()

### 5.2 Train Model

In [None]:
# Wall-clock timestamps taken around training, already formatted for printing later.
timestamp_format = "%Y-%m-%d %H:%M:%S.%f"

start_time = dt.now().strftime(timestamp_format)
history = model.fit(
    x_train,
    y_train,
    epochs=200,
    validation_data=(x_test, y_test),
)
end_time = dt.now().strftime(timestamp_format)

### 5.3 Result Prediction

In [None]:
# Training window captured by the training cell.
print(f'Train Start:\t{start_time}', f'Train End:\t{end_time}', sep='\n')

# Run the trained model on the held-out samples.
results = model.predict(x_test)

# Show prediction and ground-truth shapes together for comparison.
for label, array in (('results.shape', results), ('y_test.shape', y_test)):
    print(label, array.shape)

# 6. Result Visualization

In [None]:
# Route matplotlib figures through mpld3 for display in the notebook.
mpld3.enable_notebook()
plt.rcParams['figure.figsize'] = [6, 4]

# x positions shared by every figure: one slot per predicted value.
slots = range(results.shape[1])

# One figure per test sample (first ten): red stars = prediction, green crosses = target.
for sample_no, (predicted, actual) in enumerate(zip(results, y_test[:10]), start=1):
    plt.title(f'Test Data: {sample_no}')
    plt.plot(slots, predicted, c='r', marker='*', ls='none')
    plt.plot(slots, actual, c='g', marker='x', ls='none')
    plt.show()

In [None]:
# Training vs. validation loss per epoch, drawn on the same axes.
for series_name, colour in (('loss', 'g'), ('val_loss', 'b')):
    plt.plot(history.history[series_name], c=colour, label=series_name)
plt.legend()
plt.show()