# 로또 번호 예측 프로젝트

### 데이터 로드

In [1]:
import pandas as pd

# df = pd.read_csv("lotto.csv", index_col=0) # 순서 미포함
df = pd.read_csv("lotto_ord.csv", index_col=0) # 순서 포함

최근 n개의 데이터만 사용

In [2]:
train = df#[-200:]

In [3]:
train

Unnamed: 0,1,2,3,4,5,6,B
1,37,23,10,33,29,40,16
2,42,21,9,25,32,13,2
3,31,21,27,19,11,16,30
4,40,30,14,42,31,27,2
5,16,42,29,40,24,41,3
...,...,...,...,...,...,...,...
996,6,11,24,15,32,39,28
997,16,7,4,44,24,14,20
998,18,45,13,20,17,42,41
999,9,1,28,3,18,14,34


당첨번호 리스트를 가공하여 sequences 생성

In [4]:
sequences = list()
for _, seq in train.iterrows():
    for i in range(1, len(seq)):
        sequence = list(seq)[:i+1]
        sequences.append(sequence)

print('학습에 사용할 샘플의 개수: %d' % len(sequences))

학습에 사용할 샘플의 개수: 6000


In [5]:
train.iloc[0].tolist()

[37, 23, 10, 33, 29, 40, 16]

In [6]:
sequences[:5]

[[37, 23],
 [37, 23, 10],
 [37, 23, 10, 33],
 [37, 23, 10, 33, 29],
 [37, 23, 10, 33, 29, 40]]

잘린 sequences의 길이를 padding

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

sequences = pad_sequences(sequences, maxlen=7, padding='pre')

In [8]:
import numpy as np

sequences = np.array(sequences)
X = sequences[:,:-1]
y = sequences[:,-1]
print("X.shape:", X.shape)
print("y.shape:", y.shape)

X.shape: (6000, 6)
y.shape: (6000,)


padding후 0으로 채워진다. (0~45 총 46의 크기)

In [9]:
sequences[:5]

array([[ 0,  0,  0,  0,  0, 37, 23],
       [ 0,  0,  0,  0, 37, 23, 10],
       [ 0,  0,  0, 37, 23, 10, 33],
       [ 0,  0, 37, 23, 10, 33, 29],
       [ 0, 37, 23, 10, 33, 29, 40]])

### 모델 생성

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import Embedding

model = Sequential()
model.add(Embedding(45+1, 10))
model.add(LSTM(64, input_shape=(6, 1), return_sequences=False))
model.add(Dense(128, activation='relu'))
model.add(Dense(45+1, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics='acc')

In [11]:
model.fit(X, y, batch_size=64, epochs=200, verbose=1)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200


Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x2b88b5a6c88>

### 번호 리스트를 반환하는  함수

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def seq_generate(model, input_num, verbose=False): 
    sequence = [input_num]

    while len(sequence) < 7:
        encoded = pad_sequences([sequence], maxlen=7, padding='pre')
        result = model.predict(encoded, verbose=0)
        
        mask = np.zeros(result.size, dtype=bool)
        mask[sequence] = True
        result = np.ma.array(result, mask=mask)
        result = np.argmax(result)

        sequence.append(result)
        
        if verbose:
            print("sequence:", sequence)

    return sorted(sequence)

테스트

In [13]:
test_num = 2
seq = [[test_num]]
encoded = pad_sequences(seq, maxlen=7, padding='pre')
result = model.predict(encoded, verbose=0)

print(result)
print(np.argmax(result))

seq_generate(model, test_num, verbose=True)

[[1.26557035e-15 1.08643947e-02 1.00932139e-05 1.43000980e-05
  8.05335306e-03 5.33186411e-03 1.40156541e-02 1.02390014e-02
  4.81116399e-02 5.50538711e-02 3.80362011e-03 4.24289657e-03
  2.92597506e-02 9.20619641e-04 1.18962368e-02 5.01444098e-03
  1.92005280e-02 1.53621912e-01 1.38556375e-03 1.33279755e-04
  3.95237934e-04 4.06804914e-03 7.42011219e-02 2.90906732e-03
  2.42458005e-02 1.97518058e-03 2.44950003e-04 9.51450411e-03
  5.64707071e-02 3.59455086e-02 1.22266589e-02 1.36154677e-05
  1.39822394e-01 2.16146559e-02 1.36946574e-05 6.64970726e-02
  1.53096637e-03 3.20121821e-04 1.09302765e-02 8.81507352e-04
  6.83743638e-05 1.88312342e-03 2.55329511e-03 3.22266744e-04
  1.49313003e-01 8.65859212e-04]]
17
sequence: [2, 17]
sequence: [2, 17, 24]
sequence: [2, 17, 24, 19]
sequence: [2, 17, 24, 19, 45]
sequence: [2, 17, 24, 19, 45, 3]
sequence: [2, 17, 24, 19, 45, 3, 25]


[2, 3, 17, 19, 24, 25, 45]

### 수행 결과

첫번째로 나오는 번호 빈도순으로 출력

In [18]:
first_num = df['1'].value_counts()[:6]
first_num

37    35
25    31
26    31
23    29
4     28
45    28
Name: 1, dtype: int64

In [15]:
for num, _ in first_num.iteritems():
    print(f"{num:2}:", seq_generate(model, num))

37: [4, 17, 25, 33, 37, 41, 43]
25: [2, 4, 7, 22, 24, 25, 43]
26: [1, 9, 26, 28, 34, 41, 42]
23: [5, 10, 14, 20, 23, 27, 42]
 4: [3, 4, 5, 8, 11, 27, 37]
45: [3, 7, 22, 36, 40, 42, 45]


모든 숫자에 대해 결과 출력

In [16]:
for i in range(1, 46):
    print(f"{i:2}:", seq_generate(model, i))

 1: [1, 12, 16, 32, 41, 42, 45]
 2: [2, 3, 17, 19, 24, 25, 45]
 3: [3, 5, 12, 13, 26, 30, 42]
 4: [3, 4, 5, 8, 11, 27, 37]
 5: [5, 10, 14, 18, 26, 39, 43]
 6: [6, 12, 13, 18, 27, 31, 43]
 7: [1, 7, 8, 31, 36, 37, 43]
 8: [8, 12, 13, 14, 18, 21, 28]
 9: [7, 9, 11, 16, 31, 38, 39]
10: [4, 6, 10, 15, 27, 41, 42]
11: [5, 8, 11, 15, 21, 40, 42]
12: [10, 12, 14, 18, 24, 27, 45]
13: [1, 7, 8, 13, 25, 36, 44]
14: [3, 4, 14, 20, 28, 42, 44]
15: [4, 12, 14, 15, 17, 23, 28]
16: [12, 16, 17, 23, 25, 40, 45]
17: [5, 12, 17, 27, 32, 37, 38]
18: [4, 12, 18, 26, 27, 29, 33]
19: [4, 7, 13, 19, 25, 34, 45]
20: [5, 7, 17, 19, 20, 24, 27]
21: [1, 6, 14, 18, 21, 29, 34]
22: [10, 20, 22, 33, 40, 41, 45]
23: [5, 10, 14, 20, 23, 27, 42]
24: [3, 24, 26, 27, 29, 37, 45]
25: [2, 4, 7, 22, 24, 25, 43]
26: [1, 9, 26, 28, 34, 41, 42]
27: [9, 14, 25, 27, 28, 33, 37]
28: [3, 19, 23, 28, 34, 38, 42]
29: [1, 2, 7, 9, 19, 29, 42]
30: [1, 3, 7, 14, 18, 26, 30]
31: [8, 13, 14, 18, 31, 39, 40]
32: [3, 4, 11, 17, 27, 32, 45