# California lotto number prediction project

### data load

In [1]:
import pandas as pd

# df = pd.read_csv("lotto.csv", index_col=0) # No order
df = pd.read_csv("lotto_ord.csv", index_col=0) # with order

Use only the most recent n data

In [2]:
train = df#[-200:]

In [3]:
train

Unnamed: 0,1,2,3,4,5,6,B
1,37,23,10,33,29,40,16
2,42,21,9,25,32,13,2
3,31,21,27,19,11,16,30
4,40,30,14,42,31,27,2
5,16,42,29,40,24,41,3
...,...,...,...,...,...,...,...
997,16,7,4,44,24,14,20
998,18,45,13,20,17,42,41
999,9,1,28,3,18,14,34
1000,19,2,8,22,42,32,39


Process the winning number list to create sequences

In [4]:
sequences = list()
for _, seq in train.iterrows():
    for i in range(1, len(seq)):
        sequence = list(seq)[:i+1]
        sequences.append(sequence)

print('Number of samples to use for training: %d' % len(sequences))

Number of samples to use for training: 6006


In [5]:
train.iloc[0].tolist()

[37, 23, 10, 33, 29, 40, 16]

In [6]:
sequences[:5]

[[37, 23],
 [37, 23, 10],
 [37, 23, 10, 33],
 [37, 23, 10, 33, 29],
 [37, 23, 10, 33, 29, 40]]

잘린 sequences의 길이를 padding

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

sequences = pad_sequences(sequences, maxlen=7, padding='pre')

In [8]:
import numpy as np

sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]
print("X.shape:", X.shape)
print("y.shape:", y.shape)

X.shape: (6006, 6)
y.shape: (6006,)


It is padded with zeros after padding. (0-45 total size of 46)

In [9]:
sequences[:5]

array([[ 0,  0,  0,  0,  0, 37, 23],
       [ 0,  0,  0,  0, 37, 23, 10],
       [ 0,  0,  0, 37, 23, 10, 33],
       [ 0,  0, 37, 23, 10, 33, 29],
       [ 0, 37, 23, 10, 33, 29, 40]])

Separate Test/Verification Datasets

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
X_valid[:3]

array([[ 0,  0,  0,  0,  6, 12],
       [ 0,  0,  0,  0, 35,  5],
       [ 0,  0, 11, 23, 40, 17]])

### create model

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import Embedding

model = Sequential()
model.add(Embedding(45+1, 10))
model.add(LSTM(64, input_shape=(6, 1), return_sequences=True))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(128, activation='relu'))
model.add(Dense(45+1, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics='acc')

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import Embedding

model = Sequential()
model.add(Embedding(45+1, 10))
model.add(Bidirectional(LSTM(64, input_shape=(6, 1), return_sequences=True)))
model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dense(128, activation='relu'))
model.add(Dense(45+1, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics='acc')

In [14]:
model.fit(X_train, y_train, batch_size=64, epochs=200, verbose=1, validation_data=(X_valid, y_valid))

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200


Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200


Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200


Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x27012b149e8>

### function returning a list of numbers

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def seq_generate(model, input_num, verbose=False): 
    sequence = [input_num]

    while len(sequence) < 7:
        encoded = pad_sequences([sequence], maxlen=7, padding='pre')
        result = model.predict(encoded, verbose=0)
        
        mask = np.zeros(result.size, dtype=bool)
        mask[sequence] = True
        result = np.ma.array(result, mask=mask)
        result = np.argmax(result)

        sequence.append(result)
        
        if verbose:
            print("sequence:", sequence)

    return [sorted(sequence[:-1]), sequence[-1]]

테스트

In [16]:
test_num = 2
seq = [[test_num]]
encoded = pad_sequences(seq, maxlen=7, padding='pre')
result = model.predict(encoded, verbose=0)

print(result)
print(np.argmax(result))

seq_generate(model, test_num, verbose=True)

[[1.0314834e-17 8.9763565e-04 3.1238937e-07 4.7520551e-08 1.3271838e-05
  7.9703137e-02 2.2938398e-01 1.2309606e-03 1.2171864e-01 9.8727259e-04
  2.7534894e-05 5.7089310e-02 5.0864004e-02 8.9254837e-07 3.9849801e-05
  9.2154165e-04 5.5561084e-02 5.2318823e-02 4.5428376e-07 8.0502074e-04
  5.8052409e-04 7.8816875e-04 8.4111176e-02 4.8434456e-05 4.7312543e-02
  1.4438300e-03 6.1117564e-05 3.3328225e-04 5.8724955e-02 3.4168109e-02
  1.8935837e-02 5.4489370e-03 3.2386813e-02 6.5905544e-05 2.8633702e-05
  6.1230071e-02 1.3459387e-05 4.4922122e-05 2.3244693e-05 4.1275352e-04
  2.1671376e-05 2.4509643e-07 3.5191953e-04 4.8888690e-04 1.3760059e-03
  3.4814140e-05]]
6
sequence: [2, 6]
sequence: [2, 6, 13]
sequence: [2, 6, 13, 27]
sequence: [2, 6, 13, 27, 43]
sequence: [2, 6, 13, 27, 43, 17]
sequence: [2, 6, 13, 27, 43, 17, 23]


[[2, 6, 13, 17, 27, 43], 23]

### performance result

Print the first number in order of frequency

In [17]:
first_num = df['1'].value_counts()[:6]
first_num

37    35
25    31
26    31
23    29
4     28
45    28
Name: 1, dtype: int64

In [18]:
for num, _ in first_num.iteritems():
    print(f"{num:2}:", seq_generate(model, num))

37: [[10, 18, 20, 31, 37, 42], 27]
25: [[6, 15, 19, 25, 33, 38], 26]
26: [[5, 20, 26, 27, 35, 45], 23]
23: [[3, 4, 22, 23, 36, 41], 9]
 4: [[3, 4, 16, 37, 38, 40], 30]
45: [[1, 13, 20, 26, 29, 45], 39]


모든 숫자에 대해 결과 출력

In [19]:
for i in range(1, 46):
    print(f"{i:2}:", seq_generate(model, i))

 1: [[1, 3, 12, 13, 24, 44], 31]
 2: [[2, 6, 13, 17, 27, 43], 23]
 3: [[3, 8, 21, 22, 34, 41], 12]
 4: [[3, 4, 16, 37, 38, 40], 30]
 5: [[4, 5, 9, 15, 26, 27], 42]
 6: [[2, 4, 6, 10, 11, 37], 28]
 7: [[1, 2, 7, 15, 23, 34], 17]
 8: [[8, 10, 14, 31, 34, 36], 12]
 9: [[6, 9, 12, 14, 26, 37], 31]
10: [[5, 10, 13, 21, 28, 35], 9]
11: [[10, 11, 18, 22, 28, 39], 30]
12: [[1, 10, 12, 18, 24, 35], 31]
13: [[6, 13, 25, 28, 38, 45], 39]
14: [[8, 14, 18, 30, 31, 44], 15]
15: [[6, 10, 11, 15, 19, 34], 25]
16: [[16, 17, 23, 24, 29, 44], 3]
17: [[4, 5, 6, 8, 17, 39], 25]
18: [[13, 15, 17, 18, 25, 33], 37]
19: [[6, 11, 16, 19, 20, 28], 12]
20: [[3, 20, 22, 33, 40, 44], 8]
21: [[11, 15, 21, 22, 26, 35], 37]
22: [[9, 14, 20, 22, 33, 34], 13]
23: [[3, 4, 22, 23, 36, 41], 9]
24: [[8, 15, 24, 26, 39, 41], 22]
25: [[6, 15, 19, 25, 33, 38], 26]
26: [[5, 20, 26, 27, 35, 45], 23]
27: [[2, 13, 21, 25, 27, 42], 38]
28: [[5, 18, 19, 28, 31, 32], 12]
29: [[7, 8, 11, 29, 32, 34], 20]
30: [[10, 12, 13, 26, 29, 30],