# 로또 번호 예측 프로젝트

### 데이터 로드

In [1]:
import pandas as pd

# df = pd.read_csv("lotto.csv", index_col=0) # 순서 미포함
df = pd.read_csv("lotto_ord.csv", index_col=0) # 순서 포함

최근 n개의 데이터만 사용

In [2]:
train = df[-200:]

In [3]:
valid = df[-300:-200]

In [17]:
train

Unnamed: 0,1,2,3,4,5,6,B
801,36,16,24,42,43,27,1
802,11,17,23,10,41,9,26
803,25,8,4,13,42,29,1
804,35,25,12,0,9,31,8
805,17,31,11,2,30,12,41
...,...,...,...,...,...,...,...
996,5,10,23,14,31,38,27
997,15,6,3,43,23,13,19
998,17,44,12,19,16,41,40
999,8,0,27,2,17,13,33


1\~45의 번호를 0\~44로 변경

In [4]:
train = train - 1

당첨번호 리스트를 가공하여 sequences 생성

In [5]:
train_sequences = list()
for _, seq in train.iterrows():
    for i in range(1, len(seq)):
        sequence = list(seq)[:i+1]
        train_sequences.append(sequence)

print('학습에 사용할 샘플의 개수: %d' % len(train_sequences))

학습에 사용할 샘플의 개수: 1200


In [6]:
train_sequences[:5]

[[36, 16],
 [36, 16, 24],
 [36, 16, 24, 42],
 [36, 16, 24, 42, 43],
 [36, 16, 24, 42, 43, 27]]

검증 데이터에도 동일하게 적용

In [7]:
valid_sequences = list()
for _, seq in valid.iterrows():
    for i in range(1, len(seq)):
        sequence = list(seq)[:i+1]
        valid_sequences.append(sequence)
        
print('검증에 사용할 샘플의 개수: %d' % len(valid_sequences))

검증에 사용할 샘플의 개수: 1200


잘린 sequences의 길이를 padding

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_sequences = pad_sequences(train_sequences, maxlen=7, padding='pre')
valid_sequences = pad_sequences(valid_sequences, maxlen=7, padding='pre')

In [9]:
import numpy as np

train_sequences = np.array(train_sequences)
valid_sequences = np.array(valid_sequences)
X_train = train_sequences[:,:-1]
y_train = train_sequences[:,-1]
X_valid = valid_sequences[:,:-1]
y_valid = valid_sequences[:,-1]

print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)
print("X_valid.shape:", X_valid.shape)
print("y_valid.shape:", y_valid.shape)

X_train.shape: (1200, 6)
y_train.shape: (1200,)
X_valid.shape: (600, 6)
y_valid.shape: (600,)


### 모델 생성

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import Embedding

model = Sequential()
model.add(Embedding(45, 10))
model.add(LSTM(64, input_shape=(6, 1), return_sequences=False))
model.add(Dense(128, activation='relu'))
model.add(Dense(45, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics='acc')

In [11]:
model.fit(X_train, y_train, epochs=200, verbose=1, validation_data=(X_valid, y_valid))

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200


Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200


Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200


Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x1bc810e8400>

### 번호 리스트를 반환하는  함수

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def seq_generate(model, input_num, verbose=False): 
    sequence = [input_num-1]

    while len(sequence) < 7:
        encoded = pad_sequences([sequence], maxlen=7, padding='pre')
        result = model.predict(encoded, verbose=0)
        
        mask = np.zeros(result.size, dtype=bool)
        mask[sequence] = True
        result = np.ma.array(result, mask=mask)
        result = np.argmax(result)

        sequence.append(result)
        
        if verbose:
            print("sequence:", np.array(sequence)+1)

    return sorted(np.array(sequence)+1)

테스트

In [13]:
test_num = 2
seq = [[test_num]]
encoded = pad_sequences(np.array(seq)-1, maxlen=7, padding='pre')
result = model.predict(encoded, verbose=0)

print(result)
print(np.argmax(result)+1)

seq_generate(model, test_num, verbose=True)

[[5.2144099e-04 1.7221245e-03 2.0790415e-02 4.0945569e-03 7.2814822e-05
  8.6311527e-02 1.7217269e-05 3.1626038e-02 4.4568129e-05 1.2823209e-04
  2.0120796e-03 6.1575707e-02 3.2064196e-04 1.5749189e-03 4.8646745e-03
  5.7824992e-02 2.7600804e-02 3.4341536e-02 7.4056420e-04 2.8835844e-02
  3.1393385e-03 5.3425538e-03 6.2311320e-03 4.4577294e-03 9.8606710e-05
  4.2753897e-04 5.8926247e-02 3.5226934e-02 4.7481764e-02 1.7538142e-01
  2.1246040e-02 5.2361344e-03 8.1400282e-07 5.8825933e-03 9.5033705e-02
  1.3609278e-02 6.9210009e-04 2.9512795e-03 2.2023295e-03 4.1593242e-02
  3.7730773e-04 5.6720413e-03 6.6656433e-04 1.0303250e-01 7.0117581e-05]]
30
sequence: [ 2 30]
sequence: [ 2 30 28]
sequence: [ 2 30 28 45]
sequence: [ 2 30 28 45 43]
sequence: [ 2 30 28 45 43 38]
sequence: [ 2 30 28 45 43 38 40]


[2, 28, 30, 38, 40, 43, 45]

### 수행 결과

첫번째로 나오는 번호 빈도순으로 출력

In [14]:
first_num = df['1'].value_counts()[:6]
first_num

37    35
25    31
26    31
23    29
4     28
45    28
Name: 1, dtype: int64

In [15]:
for num, _ in first_num.iteritems():
    print(f"{num:2}:", seq_generate(model, num))

37: [3, 8, 17, 26, 37, 38, 39]
25: [9, 10, 18, 23, 25, 34, 35]
26: [2, 7, 8, 13, 23, 26, 44]
23: [14, 17, 18, 23, 24, 26, 34]
 4: [1, 4, 8, 28, 36, 42, 45]
45: [8, 11, 19, 21, 25, 36, 45]


모든 숫자에 대해 결과 출력

In [16]:
for i in range(1, 46):
    print(f"{i:2}:", seq_generate(model, i))

 1: [1, 2, 8, 14, 23, 27, 44]
 2: [2, 28, 30, 38, 40, 43, 45]
 3: [3, 6, 11, 13, 27, 31, 34]
 4: [1, 4, 8, 28, 36, 42, 45]
 5: [5, 13, 21, 23, 27, 39, 43]
 6: [1, 6, 11, 17, 18, 20, 37]
 7: [7, 15, 20, 25, 29, 30, 34]
 8: [8, 10, 18, 22, 35, 42, 43]
 9: [7, 9, 12, 15, 19, 23, 24]
10: [1, 3, 10, 11, 12, 35, 38]
11: [11, 20, 22, 25, 31, 40, 41]
12: [5, 7, 12, 13, 31, 38, 43]
13: [13, 15, 16, 36, 38, 39, 45]
14: [2, 8, 13, 14, 23, 27, 44]
15: [13, 15, 18, 25, 31, 33, 43]
16: [2, 8, 13, 16, 19, 39, 44]
17: [7, 11, 13, 17, 18, 29, 43]
18: [5, 14, 18, 22, 31, 43, 44]
19: [4, 8, 19, 23, 28, 36, 39]
20: [1, 8, 19, 20, 21, 37, 39]
21: [7, 11, 21, 27, 29, 43, 44]
22: [4, 14, 22, 23, 27, 28, 45]
23: [14, 17, 18, 23, 24, 26, 34]
24: [12, 13, 16, 24, 29, 43, 44]
25: [9, 10, 18, 23, 25, 34, 35]
26: [2, 7, 8, 13, 23, 26, 44]
27: [3, 23, 24, 27, 34, 41, 43]
28: [1, 9, 11, 14, 28, 31, 44]
29: [4, 7, 13, 18, 29, 31, 39]
30: [14, 27, 30, 31, 35, 40, 43]
31: [4, 8, 10, 16, 31, 35, 36]
32: [4, 9, 18, 20, 2