# 로또 번호 예측 프로젝트

### 데이터 로드

In [1]:
import pandas as pd

# df = pd.read_csv("lotto.csv", index_col=0) # 순서 미포함
df = pd.read_csv("lotto_ord.csv", index_col=0) # 순서 포함

최근 n개의 데이터만 사용

In [2]:
train = df#[-200:]

In [3]:
train

Unnamed: 0,1,2,3,4,5,6,B
1,37,23,10,33,29,40,16
2,42,21,9,25,32,13,2
3,31,21,27,19,11,16,30
4,40,30,14,42,31,27,2
5,16,42,29,40,24,41,3
...,...,...,...,...,...,...,...
996,6,11,24,15,32,39,28
997,16,7,4,44,24,14,20
998,18,45,13,20,17,42,41
999,9,1,28,3,18,14,34


1\~45의 번호를 0\~44로 변경

In [4]:
train = train - 1

당첨번호 리스트를 가공하여 sequences 생성

In [5]:
sequences = list()
for _, seq in train.iterrows():
    for i in range(1, len(seq)):
        sequence = list(seq)[:i+1]
        sequences.append(sequence)

print('학습에 사용할 샘플의 개수: %d' % len(sequences))

학습에 사용할 샘플의 개수: 6000


In [6]:
sequences[:5]

[[36, 22],
 [36, 22, 9],
 [36, 22, 9, 32],
 [36, 22, 9, 32, 28],
 [36, 22, 9, 32, 28, 39]]

잘린 sequences의 길이를 padding

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

sequences = pad_sequences(sequences, maxlen=7, padding='pre')

In [8]:
import numpy as np

sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]
print("X.shape:", X.shape)
print("y.shape:", y.shape)

X.shape: (6000, 6)
y.shape: (6000,)


테스트 / 검증 데이터셋 분리

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=42)

In [10]:
X_valid[:3]

array([[ 0,  0,  0,  0,  0,  4],
       [25,  5, 37, 26,  4, 38],
       [29, 32, 34,  6, 26, 36]])

### 모델 생성

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import Embedding

model = Sequential()
model.add(Embedding(45, 10))
model.add(LSTM(64, input_shape=(6, 1), return_sequences=True))
model.add(LSTM(64, return_sequences=False))
model.add(Dense(128, activation='relu'))
model.add(Dense(45, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics='acc')

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import Embedding

model = Sequential()
model.add(Embedding(45, 10))
model.add(Bidirectional(LSTM(64, input_shape=(6, 1), return_sequences=True)))
model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dense(128, activation='relu'))
model.add(Dense(45, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics='acc')

In [13]:
model.fit(X_train, y_train, batch_size=64, epochs=200, verbose=1, validation_data=(X_valid, y_valid))

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 1/200
63/63 - 7s - loss: 3.8071 - acc: 0.0187 - val_loss: 3.8070 - val_acc: 0.0222
Epoch 2/200
63/63 - 1s - loss: 3.8060 - acc: 0.0209 - val_loss: 3.8072 - val_acc: 0.0162
Epoch 3/200
63/63 - 1s - loss: 3.8042 - acc: 0.0234 - val_loss: 3.8074 - val_acc: 0.0227
Epoch 4/200
63/63 - 1s - loss: 3.8019 - acc: 0.0244 - val_loss: 3.8121 - val_acc: 0.0227
Epoch 5/200
63/63 - 1s - loss: 3.7965 - acc: 0.0284 - val_loss: 3.8119 - val_acc: 0.0222
Epoch 6/200
63/63 - 1s - loss: 3.7899 - acc: 0.0303 - val_loss: 3.8160 - val_acc: 0.0207
Epoch 7/200
63/63 - 1s - loss: 3.7795 - acc: 0.0328 - val_loss: 3.8220 - val_acc: 0.0232
Epoch 8/200
63/63 - 1s - loss: 3.7713 - acc: 0.0333 - val_loss: 3.8278 - val_acc: 0.0202
Epoch 9/200
63/63 - 1s - loss: 3.7615 - acc: 0.0363 - val_loss: 3.8425 - val_acc: 0.0182
Epoch 10/200
63/63 - 1s - loss: 3.7543 - acc: 0.0396 - val_lo

Epoch 88/200
63/63 - 1s - loss: 0.7508 - acc: 0.7933 - val_loss: 8.5601 - val_acc: 0.0268
Epoch 89/200
63/63 - 1s - loss: 0.7391 - acc: 0.7955 - val_loss: 8.6762 - val_acc: 0.0232
Epoch 90/200
63/63 - 1s - loss: 0.7323 - acc: 0.7975 - val_loss: 8.7843 - val_acc: 0.0258
Epoch 91/200
63/63 - 1s - loss: 0.7298 - acc: 0.7923 - val_loss: 8.8032 - val_acc: 0.0217
Epoch 92/200
63/63 - 1s - loss: 0.7179 - acc: 0.7990 - val_loss: 8.7837 - val_acc: 0.0202
Epoch 93/200
63/63 - 1s - loss: 0.7081 - acc: 0.7975 - val_loss: 8.8883 - val_acc: 0.0242
Epoch 94/200
63/63 - 1s - loss: 0.7024 - acc: 0.7973 - val_loss: 8.9187 - val_acc: 0.0237
Epoch 95/200
63/63 - 1s - loss: 0.6978 - acc: 0.7983 - val_loss: 8.9933 - val_acc: 0.0247
Epoch 96/200
63/63 - 1s - loss: 0.6915 - acc: 0.8010 - val_loss: 9.0605 - val_acc: 0.0212
Epoch 97/200
63/63 - 1s - loss: 0.6922 - acc: 0.8010 - val_loss: 9.0573 - val_acc: 0.0232
Epoch 98/200
63/63 - 1s - loss: 0.6851 - acc: 0.7990 - val_loss: 9.0640 - val_acc: 0.0273
Epoch 99/2

Epoch 178/200
63/63 - 1s - loss: 0.6922 - acc: 0.7744 - val_loss: 10.7622 - val_acc: 0.0258
Epoch 179/200
63/63 - 1s - loss: 0.6525 - acc: 0.7838 - val_loss: 10.6621 - val_acc: 0.0232
Epoch 180/200
63/63 - 1s - loss: 0.6202 - acc: 0.7963 - val_loss: 10.8212 - val_acc: 0.0258
Epoch 181/200
63/63 - 1s - loss: 0.5842 - acc: 0.8017 - val_loss: 10.8342 - val_acc: 0.0237
Epoch 182/200
63/63 - 1s - loss: 0.5792 - acc: 0.8032 - val_loss: 10.8146 - val_acc: 0.0237
Epoch 183/200
63/63 - 1s - loss: 0.5781 - acc: 0.8060 - val_loss: 10.8439 - val_acc: 0.0237
Epoch 184/200
63/63 - 1s - loss: 0.5784 - acc: 0.8020 - val_loss: 10.8265 - val_acc: 0.0253
Epoch 185/200
63/63 - 1s - loss: 0.5753 - acc: 0.7978 - val_loss: 10.8764 - val_acc: 0.0237
Epoch 186/200
63/63 - 1s - loss: 0.5748 - acc: 0.8052 - val_loss: 10.8983 - val_acc: 0.0253
Epoch 187/200
63/63 - 1s - loss: 0.5731 - acc: 0.8022 - val_loss: 10.8677 - val_acc: 0.0217
Epoch 188/200
63/63 - 1s - loss: 0.5740 - acc: 0.8035 - val_loss: 10.8613 - val_

<keras.callbacks.History at 0x1ed1d21dfd0>

### 번호 리스트를 반환하는  함수

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def seq_generate(model, input_num, verbose=False): 
    sequence = [input_num-1]

    while len(sequence) < 7:
        encoded = pad_sequences([sequence], maxlen=7, padding='pre')
        result = model.predict(encoded, verbose=0)
        
        mask = np.zeros(result.size, dtype=bool)
        mask[sequence] = True
        result = np.ma.array(result, mask=mask)
        result = np.argmax(result)

        sequence.append(result)
        
        if verbose:
            print("sequence:", np.array(sequence)+1)

    return sorted(np.array(sequence)+1)

테스트

In [15]:
test_num = 2
seq = [[test_num]]
encoded = pad_sequences(np.array(seq)-1, maxlen=7, padding='pre')
result = model.predict(encoded, verbose=0)

print(result)
print(np.argmax(result)+1)

seq_generate(model, test_num, verbose=True)

[[6.52778544e-04 3.42292037e-06 3.04023473e-11 5.37682754e-05
  2.31783949e-02 1.24109417e-01 1.67978124e-03 9.71946418e-02
  3.85511044e-07 5.79439131e-07 3.66964377e-02 2.15722024e-02
  1.70113216e-03 1.37454990e-04 1.17460033e-03 4.83730659e-02
  1.03015425e-02 1.27624153e-05 1.68326346e-06 7.90973172e-06
  2.64510118e-06 4.82845381e-02 6.68205803e-06 4.44977321e-02
  1.81600379e-04 1.57314978e-04 2.37600616e-04 9.02232006e-02
  3.47046182e-02 1.19572721e-01 1.19874812e-06 1.24370031e-01
  1.14929734e-03 8.59234603e-08 1.67844102e-01 6.54901955e-09
  1.03396466e-07 3.67716042e-04 8.24172166e-04 6.91287059e-05
  3.10795847e-04 6.12104777e-05 2.35283529e-04 3.26485929e-06
  4.29480497e-05]]
35
sequence: [ 2 35]
sequence: [ 2 35 28]
sequence: [ 2 35 28 24]
sequence: [ 2 35 28 24 45]
sequence: [ 2 35 28 24 45 11]
sequence: [ 2 35 28 24 45 11 19]


[2, 11, 19, 24, 28, 35, 45]

### 수행 결과

첫번째로 나오는 번호 빈도순으로 출력

In [16]:
first_num = df['1'].value_counts()[:6]
first_num

37    35
25    31
26    31
23    29
4     28
45    28
Name: 1, dtype: int64

In [17]:
for num, _ in first_num.iteritems():
    print(f"{num:2}:", seq_generate(model, num))

37: [1, 2, 11, 22, 28, 30, 37]
25: [9, 20, 25, 33, 36, 39, 43]
26: [1, 8, 26, 37, 38, 39, 42]
23: [3, 7, 14, 22, 23, 24, 26]
 4: [2, 4, 25, 29, 33, 37, 43]
45: [4, 12, 28, 36, 38, 41, 45]


모든 숫자에 대해 결과 출력

In [18]:
for i in range(1, 46):
    print(f"{i:2}:", seq_generate(model, i))

 1: [1, 5, 7, 8, 15, 30, 43]
 2: [2, 11, 19, 24, 28, 35, 45]
 3: [3, 6, 10, 21, 32, 40, 43]
 4: [2, 4, 25, 29, 33, 37, 43]
 5: [5, 6, 22, 34, 35, 38, 45]
 6: [3, 6, 8, 13, 16, 30, 43]
 7: [6, 7, 18, 22, 24, 31, 34]
 8: [6, 8, 16, 24, 32, 37, 43]
 9: [4, 8, 9, 13, 33, 40, 43]
10: [1, 10, 11, 29, 33, 38, 42]
11: [3, 5, 8, 11, 36, 39, 44]
12: [12, 15, 24, 32, 33, 40, 41]
13: [8, 13, 19, 20, 35, 42, 45]
14: [2, 14, 31, 36, 40, 41, 43]
15: [1, 2, 8, 15, 25, 28, 36]
16: [4, 16, 17, 23, 29, 36, 42]
17: [17, 23, 25, 35, 36, 39, 44]
18: [1, 11, 13, 18, 19, 25, 45]
19: [2, 6, 9, 15, 16, 19, 42]
20: [5, 10, 20, 33, 36, 41, 44]
21: [8, 17, 21, 34, 35, 40, 43]
22: [1, 19, 20, 22, 25, 40, 43]
23: [3, 7, 14, 22, 23, 24, 26]
24: [9, 12, 19, 24, 30, 43, 44]
25: [9, 20, 25, 33, 36, 39, 43]
26: [1, 8, 26, 37, 38, 39, 42]
27: [10, 14, 21, 27, 30, 33, 38]
28: [1, 2, 14, 22, 25, 28, 36]
29: [7, 19, 26, 29, 38, 39, 40]
30: [7, 11, 14, 30, 33, 36, 41]
31: [6, 7, 18, 24, 31, 38, 43]
32: [1, 21, 32, 36, 38, 40,