In [1]:
import numpy as np
import pandas as pd
import os
import time
import random
from sklearn.model_selection import train_test_split

# Location of the full feature table, relative to the notebook folder
featuresCsvPath = os.path.join('..', 'features', 'featuresFull.csv')
featuresDf = pd.read_csv(featuresCsvPath)
print(featuresDf.shape)
featuresDf.head()

(827559, 153)


Unnamed: 0,id,WinNo1,WinNo2,WinNo3,WinNo4,WinNo5,WinNo6,WinNo7,WinNo8,WinNo9,...,e_sinId%43,e_cosId%43,sq_sinId%43,sq_cosId%43,sinId%47,cosId%47,e_sinId%47,e_cosId%47,sq_sinId%47,sq_cosId%47
0,1,2,5,7,8,22,33,38,44,46,...,1.156735,2.689468,0.0212,0.9788,0.133287,0.991077,1.142578,2.694136,0.017765,0.982235
1,2,1,9,11,14,33,37,41,42,43,...,1.333889,2.605437,0.083001,0.916999,0.264195,0.964469,1.302383,2.623395,0.069799,0.930201
2,3,1,2,5,8,13,16,19,21,34,...,1.52876,2.47304,0.180164,0.819836,0.390389,0.92065,1.477556,2.510922,0.152404,0.847596
3,4,1,3,6,12,17,24,25,26,40,...,1.73632,2.302505,0.304448,0.695552,0.509617,0.860402,1.664653,2.36411,0.259709,0.740291
4,5,6,7,8,21,30,32,35,37,39,...,1.949005,2.105962,0.445314,0.554686,0.61975,0.784799,1.858463,2.191967,0.38409,0.61591


In [21]:
# Sizes shared by the helpers below
winNos = 20        # winning-number columns read per draw
lookback = 10      # draws of history attached to every example
batchsize = 100    # draw ids sampled per training batch
batchesNo = 1      # number of batches to prepare

# Lookup tables keyed by draw id; prepareData fills them in place
seqFDict, contFDict, winNumbersTDict = {}, {}, {}

def getPastWinningNumbers(datavalues, drawId, lookbackDraws=None, numbersPerDraw=None, firstDrawId=1):
    """Returns the winning numbers of the draws that precede a given draw.

    The window ends right before the requested draw, so the draw's own
    numbers (the prediction target) are never part of its history.

    Args:
        datavalues: a 2-D numpy array with one row per draw, ordered by draw
            id; column 0 holds the id and the following columns the winning
            numbers
        drawId: the id of the draw we want past winning numbers for
        lookbackDraws: how many preceding draws to return; defaults to the
            module-level `lookback`
        numbersPerDraw: how many winning-number columns to return; defaults
            to the module-level `winNos`
        firstDrawId: the id stored in row 0 of datavalues; ids are assumed to
            be consecutive, so row position = drawId - firstDrawId
    Returns:
        A slice (view) of the input that contains lookbackDraws rows of
        winning numbers, e.g. for draw 12 and lookback 5 it contains the
        numbers of draws 7..11 as a (5,20) matrix
    Raises:
        ValueError: if the draw does not have lookbackDraws rows of history
            inside datavalues
    """
    window = lookback if lookbackDraws is None else lookbackDraws
    width = winNos if numbersPerDraw is None else numbersPerDraw

    # Row position of the requested draw itself; it is used as the exclusive
    # end of the slice so the window stops at the previous draw.
    drawRow = int(drawId) - firstDrawId
    firstRow = drawRow - window

    # A negative start would silently wrap around to the end of the array and
    # an end past the last row would silently return a shorter window.
    if firstRow < 0 or drawRow > len(datavalues):
        raise ValueError(
            "draw %s does not have %s preceding draws in the data" % (drawId, window)
        )

    return datavalues[firstRow:drawRow, 1:width + 1]

def prepareData(data, randomState=None):
    """ Prepares the dictionaries that will be used during batch prep
        and defines the train, eval and test examples
    Args:
        data: DataFrame with one row per draw. Column 0 is read as the draw
            id (also accessed as data['id']), the next winNos columns as the
            winning numbers and the remaining columns as continuous features.
            The first `lookback` rows serve only as history and produce no
            example.
        randomState: optional seed forwarded to train_test_split so the split
            can be reproduced; None keeps the split random
    Returns:
        seqFDict, contFDict, winNumbersTDict (the module-level dictionaries,
        filled in place and keyed by draw id) followed by the sets of draw ids
        of the train, eval and test splits
    """
    print("\tCreating dictionaries for batchPrep")
    datavalues = data.values
    # Iterate over rows of the array that is already materialised instead of
    # converting the frame a second time and copying the rows into a list.
    for example in datavalues[lookback:]:
        drawId = int(example[0])
        seqFDict[drawId] = getPastWinningNumbers(datavalues, drawId)   # winning numbers of the 'lookback' window
        contFDict[drawId] = example[winNos+1:]                         # continuous features
        winNumbersTDict[drawId] = example[1:winNos+1].astype(int)      # the targets (winNos numbers)

    # Train/eval/test split (80-10-10). Only the ids are needed afterwards, so
    # the id column is split rather than the whole feature frame.
    exampleIds = data['id'].iloc[lookback:]
    trainIds, heldOutIds = train_test_split(
        exampleIds, train_size=0.8, test_size=0.2, random_state=randomState)
    testIds, evalIds = train_test_split(
        heldOutIds, train_size=0.5, test_size=0.5, random_state=randomState)

    trainDrawIds = set(trainIds)
    evalDrawIds = set(evalIds)
    testDrawIds = set(testIds)
    return seqFDict, contFDict, winNumbersTDict, trainDrawIds, evalDrawIds, testDrawIds

# Build the lookup tables and the train/eval/test id sets from the feature frame
(seqFDict, contFDict, winNumbersTDict,
 trainDrawIds, evalDrawIds, testDrawIds) = prepareData(featuresDf)


def batchPrep():
    """ Prepares one training batch using the prebuilt dictionaries

    Samples batchsize draw ids from trainDrawIds without replacement and
    stacks the matching dictionary entries.

    Returns:
        A (contFeatures, seqFeatures, targets) tuple of object-dtype numpy
        arrays whose first axis runs over the sampled draws
    Raises:
        ValueError: if batchsize is larger than the number of training ids
    """
    # random.sample() requires a sequence: passing a set was deprecated in
    # Python 3.9 and raises TypeError from 3.11 on.
    batchDrawIds = random.sample(tuple(trainDrawIds), batchsize)

    # The former `for i in range(batchesNo)` wrapper returned during its first
    # iteration, so exactly one batch was ever produced; that is kept here.
    contFeatures = np.array([contFDict[x] for x in batchDrawIds], dtype=object)
    seqFeatures = np.array([seqFDict[x] for x in batchDrawIds], dtype=object)
    targets = np.array([winNumbersTDict[x] for x in batchDrawIds], dtype=object)
    return contFeatures, seqFeatures, targets

	Creating dictionaries for batchPrep


In [67]:
# Draw one batch and keep its first target row as a worked example

oneBatch = batchPrep()
contFeatures, seqFeatures, targets = oneBatch
sampleTarget = targets[0]
print(targets.shape, sampleTarget.shape)
print(sampleTarget)

(100, 20) (20,)
[1 3 4 14 19 23 26 32 48 49 50 58 60 65 66 69 71 72 73 78]


In [82]:
# Display the sampled target row as an array repr (which also shows its dtype)
sampleTarget

array([1, 3, 4, 14, 19, 23, 26, 32, 48, 49, 50, 58, 60, 65, 66, 69, 71,
       72, 73, 78], dtype=object)

In [70]:
# Wrap the first number of the sampled row in a numpy array for the
# single-value one-hot example
sampleTargetValue = np.array(sampleTarget[0])
sampleTargetValue

array(1)

# 0D (single value) to 1-hot encoded

In [98]:
# Show the first three numbers of the sampled row for reference
print(sampleTarget[:3])
# One row per encoded value and 80 columns, all zeros to start with
b = np.zeros(shape=(sampleTargetValue.size, 80))
# The -1 shifts the 1-based number onto a 0-based column index
b[0][sampleTarget[0] - 1] = 1
print(b.shape)
b

[1 3 4]
(1, 80)


array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

# 1D to 2D 1-hot encoded

In [97]:
# NOTE(review): although this section is titled "1D to 2D", the cell repeats
# the single-value example: only row 0 is filled, using the first number of
# sampleTarget, so b has sampleTargetValue.size rows rather than one row per
# number of the draw. Confirm whether it should be adapted or removed.
print(sampleTarget[:3])
b = np.zeros((sampleTargetValue.size, 80))
b[0,sampleTarget[0]-1] = 1                     # -1 shifts the 1-based number onto a 0-based column index
print(b.shape)
b

[1 3 4]
(1, 80)


array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

# 1D to 2D 1-hot encoded

In [100]:
# Give sampleTarget a trailing axis, shift its numbers to 0-based indices and
# compare them against the 80 column indices; broadcasting produces one
# indicator row per number.
targets2 = np.equal(np.arange(80), sampleTarget[..., np.newaxis] - 1).astype(int)
print(targets2.shape)
targets2

(20, 80)


array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

# 2D to 3D 1-hot encoded

In [105]:
# Shape of the targets array returned by batchPrep, before one-hot encoding
print(targets.shape) # a whole batch

(100, 20)


In [104]:
# Same comparison trick applied to the whole batch: the trailing axis added to
# targets broadcasts against the 80 column indices, adding one indicator axis.
targets2 = np.equal(np.arange(80), targets[..., np.newaxis] - 1).astype(int)
print(targets2.shape)
targets2

(100, 20, 80)


array([[[1, 0, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0]],

       [[1, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0]],

       [[0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 1]],

       ...,

       [[1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[1, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 

In [130]:
# Undo the one-hot encoding: position of the maximum along the last axis,
# shifted back by one to recover the 1-based numbers
effort = targets2.argmax(axis=2) + 1
print(effort.shape)
effort

(100, 20)


array([[ 1,  3,  4, ..., 72, 73, 78],
       [ 1,  2,  4, ..., 74, 77, 78],
       [ 2,  4,  6, ..., 68, 78, 80],
       ...,
       [ 1,  4, 18, ..., 67, 73, 76],
       [ 1,  2,  4, ..., 70, 75, 76],
       [ 2,  4,  8, ..., 72, 76, 79]], dtype=int64)

In [131]:
# Second name bound to the same array object as effort (no copy is made);
# it is paired with effort in the overlap computation that follows.
effort2 = effort

In [138]:
# For each pair of rows count how many numbers the two rows have in common
commonList = [
    len(set(tarRow) & set(predRow))
    for tarRow, predRow in zip(effort, effort2)
]

print("Average common numbers in ", effort.shape[0], " contests is ", np.array(commonList).mean())

Average common numbers in  100  contests is  20.0
