In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import *

In [2]:
text = '해보지 않으면 해낼 수 없다'

In [3]:
result = text_to_word_sequence(text)

In [4]:
docs = ['먼저 텍스트의 각 단어를 나누어 토큰화 합니다.', '텍스트의 단어로 토큰화해야 딥러닝에서 인식합니다.', '토큰화 한 결과는 딥러닝에서 사용할 수 있습니다.']

In [5]:
token = Tokenizer()
token.fit_on_texts(docs)

In [6]:
print(token.word_counts)

OrderedDict([('먼저', 1), ('텍스트의', 2), ('각', 1), ('단어를', 1), ('나누어', 1), ('토큰화', 2), ('합니다', 1), ('단어로', 1), ('토큰화해야', 1), ('딥러닝에서', 2), ('인식합니다', 1), ('한', 1), ('결과는', 1), ('사용할', 1), ('수', 1), ('있습니다', 1)])


In [9]:
print(token.document_count)
print(token.word_docs)
print(token.word_index)



3
defaultdict(<class 'int'>, {'토큰화': 2, '먼저': 1, '나누어': 1, '합니다': 1, '텍스트의': 2, '각': 1, '단어를': 1, '토큰화해야': 1, '인식합니다': 1, '단어로': 1, '딥러닝에서': 2, '있습니다': 1, '결과는': 1, '사용할': 1, '한': 1, '수': 1})
{'텍스트의': 1, '토큰화': 2, '딥러닝에서': 3, '먼저': 4, '각': 5, '단어를': 6, '나누어': 7, '합니다': 8, '단어로': 9, '토큰화해야': 10, '인식합니다': 11, '한': 12, '결과는': 13, '사용할': 14, '수': 15, '있습니다': 16}


In [10]:
docs = ['너무 재미네요',' 최고에요', '참 잘 만든 영화에요', '추천하고 싶은 영화입니다.', ' 한번도 보고 싶네요', '글쎄요', '별로에요', '생각보다 지루해요', '연기가 어색해요', '재미없어요']
classes = np.array([1,1,1,1,1,0,0,0,0,0])

In [14]:
token = Tokenizer()
token.fit_on_texts(docs)
print(token.word_index)
x = token.texts_to_sequences(docs)
#x에 저장된 서로 다른 단어의 개수(길이)를 4로 통일(패딩)

paddedX = pad_sequences(x, 4)
print('패딩결과:', paddedX)

{'너무': 1, '재미네요': 2, '최고에요': 3, '참': 4, '잘': 5, '만든': 6, '영화에요': 7, '추천하고': 8, '싶은': 9, '영화입니다': 10, '한번도': 11, '보고': 12, '싶네요': 13, '글쎄요': 14, '별로에요': 15, '생각보다': 16, '지루해요': 17, '연기가': 18, '어색해요': 19, '재미없어요': 20}
패딩결과: [[ 0  0  1  2]
 [ 0  0  0  3]
 [ 4  5  6  7]
 [ 0  8  9 10]
 [ 0 11 12 13]
 [ 0  0  0 14]
 [ 0  0  0 15]
 [ 0  0 16 17]
 [ 0  0 18 19]
 [ 0  0  0 20]]


# 딥러닝 모델 작성

In [19]:
wordSize = len(token.word_index)+1 #1을 더한이유는? 

In [25]:
model = Sequential()
# 자연어처리를 위해서 임베딩이 중요
model.add(Embedding(wordSize, 8, input_length=4))
#embading(단어 갯수(voc)), 임베딩벡터크기(8차원), 입력시 시퀀스 길이)

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics= ['accuracy'])
model.fit(paddedX, classes, epochs=20)
print('정확도 : %.4f' %(model.evaluate(paddedX, classes)[1]))

model.summary()

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
정확도 : 0.9000
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 8)              168       
_________________________________________________________________
flatten_1 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 201
Trainable params: 201
Non-trainable params: 0
_________________________________________________________________


단어 종류 : 21가지-> 벡터 공간 -> 21차원 -> 임베딩 => 2차원  
100000...0000 => [1.5, 3.7]  
100000...0001 => [1.0, 2.7]  

임베딩 : 단어 -> 밀집 벡터 (Dense vector ))   
            신경말   
ex) 원핫인코딩 단어 벡터의 차원이 50000 차원 => 임베딩 => 2차원   
 

In [27]:
from keras.datasets import reuters
from keras.utils import np_utils

In [28]:
(xTrain, yTrain), (xTest, yTest) = reuters.load_data(num_words=1000, test_split= 0.2)

In [29]:
category = np.max(yTrain) +1
print(category)
print(xTrain)
print(xTest)

46
[list([1, 2, 2, 8, 43, 10, 447, 5, 25, 207, 270, 5, 2, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 2, 2, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 2, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12])
 list([1, 2, 699, 2, 2, 56, 2, 2, 9, 56, 2, 2, 81, 5, 2, 57, 366, 737, 132, 20, 2, 7, 2, 49, 2, 2, 2, 2, 699, 2, 8, 7, 10, 241, 16, 855, 129, 231, 783, 5, 4, 587, 2, 2, 2, 775, 7, 48, 34, 191, 44, 35, 2, 505, 17, 12])
 list([1, 53, 12, 284, 15, 14, 272, 26, 53, 959, 32, 818, 15, 14, 272, 26, 39, 684, 70, 11, 14, 12, 2, 18, 180, 183, 187, 70, 11, 14, 102, 32, 11, 29, 53, 44, 704, 15, 14, 19, 758, 15, 53, 959, 47, 2, 15, 14, 19, 132, 15, 39, 965, 32, 11, 14, 147, 72, 11, 180, 183, 187, 44, 11, 14, 102, 19, 11, 123, 186, 90, 67, 960, 4, 78, 13, 68, 467, 511, 110, 59, 89, 90, 67, 2, 55, 2, 92, 617, 80, 2, 46, 905, 220, 13, 4, 346, 48, 235,

# 데이터 전처리

In [30]:
xTrain=pad_sequences(xTrain,maxlen=100)
xTest=pad_sequences(xTest,maxlen=100)
yTrain=np_utils.to_categorical(yTrain)
yTest=np_utils.to_categorical(yTest)

In [31]:
#모델 생성
model=Sequential()
model.add(Embedding(1000, 100))
model.add(LSTM(100, activation='tanh'))
model.add(Dense(46, activation='softmax'))

In [32]:
model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [33]:
history=model.fit(xTrain, yTrain, batch_size=100, epochs=20,
         validation_data=(xTest, yTest))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [38]:
sample = 'hihello'
set(sample)
charSet = list(set(sample))

In [40]:
chardict = {w: i for i, w in enumerate(charSet)}

In [42]:
chardict

{'h': 0, 'l': 1, 'i': 2, 'e': 3, 'o': 4}

In [61]:
xStr = sample[:-1]
yStr = sample[1:]


In [62]:
dataDim = len(charSet)
numClasses = len(charSet)


In [64]:
x = [chardict[c] for c in xStr] # [0, 2, 0, 3, 1, 1]
y = [chardict[c] for c in yStr] # [2, 0, 3, 1, 1, 4]


AttributeError: 'list' object has no attribute 'shape'

# 원핫인코딩

In [67]:
x = np_utils.to_categorical(x, num_classes=numClasses)
y = np_utils.to_categorical(y, num_classes=numClasses)


In [68]:
x = np.reshape(x, (-1, len(x), dataDim))
x.shape

(1, 6, 5)

In [73]:
y = np.reshape(y, (-1, len(y), dataDim))
y.shape

(1, 6, 5)

In [79]:
model = Sequential()

# 좋지 않은 모델
# model.add(LSTM(5,input_shape = (6, 5), return_sequences=True))
 # 5 :          (출력결과의 차원, timesteps(6글자 입력), dataDim(입력문자의 차원))
model.add(LSTM(5*128,input_shape = (6, 5), return_sequences=True))
model.add(LSTM(5*64, return_sequences=True))
model.add(LSTM(5*16, return_sequences=True))
model.add(LSTM(5*4, return_sequences=True))
model.add(Dense(5))
model.add(Activation('softmax'))
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 6, 640)            1653760   
_________________________________________________________________
lstm_6 (LSTM)                (None, 6, 320)            1230080   
_________________________________________________________________
lstm_7 (LSTM)                (None, 6, 80)             128320    
_________________________________________________________________
lstm_8 (LSTM)                (None, 6, 20)             8080      
_________________________________________________________________
dense_2 (Dense)              (None, 6, 5)              105       
_________________________________________________________________
activation_3 (Activation)    (None, 6, 5)              0         
Total params: 3,020,345
Trainable params: 3,020,345
Non-trainable params: 0
____________________________________________

In [80]:
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics= ['accuracy'])
model.fit(x, y, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x214392c9488>

In [77]:
prediction = model.predict(x)
prediction

array([[[0.20054449, 0.20210536, 0.19993378, 0.216926  , 0.18049039],
        [0.1780259 , 0.20876324, 0.1970203 , 0.2287071 , 0.18748349],
        [0.18320197, 0.21113561, 0.19580962, 0.23735392, 0.17249887],
        [0.17818666, 0.20739934, 0.19365577, 0.22826122, 0.19249699],
        [0.17067021, 0.19621138, 0.2042416 , 0.20549288, 0.2233839 ],
        [0.16630659, 0.18643355, 0.21271063, 0.18641794, 0.24813128]]],
      dtype=float32)

In [78]:
for i, pred in enumerate(prediction):
    xindex = np.argmax(x[i], axis = 1)
    xstr = [charSet[j] for j in xindex]
    print(xindex, " ".join(xstr))

    ind = np.argmax(pred, axis=1)
    res = [charSet[j] for j in ind]
    print(ind, "".join(res))

[0 2 0 3 1 1] h i h e l l
[3 3 3 3 4 4] eeeeoo


In [109]:
code2idx = {'c4':0, 'd4':1, 'e4':2, 'f4':3, 'g4':4, 'a4':5, 'b4':6,
            'c8':7, 'd8':8, 'e8':9, 'f8':10, 'g8':11, 'a8':12, 'b8':13}
idx2code = {0:'c4', 1:'d4', 2:'e4', 3:'f4', 4:'g4', 5:'a4', 6:'b4',
            7:'c8', 8:'d8', 9:'e8', 10:'f8', 11:'g8', 12:'a8', 13:'b8'}
seq = ['g8', 'e8', 'e4', 'f8', 'd8', 'd4', 'c8', 'd8', 'e8', 'f8', 'g8', 'g8', 'g4','g8', 'e8', 'e8', 'e8', 'f8', 'd8', 'd4', 'c8', 'e8', 'g8', 'g8', 'e8', 'e8', 'e4','d8', 'd8', 'd8', 'd8', 'd8', 'e8', 'f4', 'e8', 'e8', 'e8', 'e8', 'e8', 'f8', 'g4','g8', 'e8', 'e4', 'f8', 'd8', 'd4', 'c8', 'e8', 'g8', 'g8', 'e8', 'e8', 'e4']

In [110]:
def seq2dataset(seq, windowSize):
    dataset = []
    for i in range(len(seq)-windowSize):
        subset = seq[i:(i+windowSize+1)]
        dataset.append([code2idx[item] for item in subset])
    return np.array(dataset)

dataset = seq2dataset(seq, windowSize = 4)

In [111]:
dataset.shape
xTrain = dataset[: , 0:4]
yTrain = dataset[:,4]
xTrain = xTrain/11
yTrain = np_utils.to_categorical(yTrain)

In [89]:
yTrain.shape

(50, 12)

In [112]:
onehotVecSize = yTrain.shape[1]

# LSTM으로 구성

In [119]:
dataset.shape
xTrain = dataset[: , 0:4]
yTrain = dataset[:,4]
xTrain = np.reshape(xTrain, (50, 4,1))
xTrain = xTrain/11
yTrain = np_utils.to_categorical(yTrain)

In [120]:
onehotVecSize = yTrain.shape[1]

In [122]:
model = Sequential()
model.add(LSTM(128, batch_input_shape = (1, 4,1), stateful= True))
# model.add(Dense(128, activation='relu'))
model.add(Dense(onehotVecSize, activation='softmax'))

# 신경망 모델

In [91]:
model = Sequential()
model.add(Dense(128, input_dim = 4, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(onehotVecSize, activation='softmax'))

In [125]:
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.fit(xTrain, yTrain, epochs=2000, batch_size=10)

In [126]:
for epoch_idx in range(2000):
    print('epoch : '+str(epoch_idx))
    model.fit(xTrain, yTrain, epochs=1, batch_size=1, shuffle=False)
    model.reset_states()

epoch : 0
epoch : 1
epoch : 2
epoch : 3
epoch : 4
epoch : 5
epoch : 6
epoch : 7
epoch : 8
epoch : 9
epoch : 10
epoch : 11
epoch : 12
epoch : 13
epoch : 14
epoch : 15
epoch : 16
epoch : 17
epoch : 18
epoch : 19
epoch : 20
epoch : 21
epoch : 22
epoch : 23
epoch : 24
epoch : 25
epoch : 26
epoch : 27
epoch : 28
epoch : 29
epoch : 30
epoch : 31
epoch : 32
epoch : 33
epoch : 34
epoch : 35
epoch : 36
epoch : 37
epoch : 38
epoch : 39
epoch : 40
epoch : 41
epoch : 42
epoch : 43
epoch : 44
epoch : 45
epoch : 46
epoch : 47
epoch : 48
epoch : 49
epoch : 50
epoch : 51
epoch : 52
epoch : 53
epoch : 54
epoch : 55
epoch : 56
epoch : 57
epoch : 58
epoch : 59
epoch : 60
epoch : 61
epoch : 62
epoch : 63
epoch : 64
epoch : 65
epoch : 66
epoch : 67
epoch : 68
epoch : 69
epoch : 70
epoch : 71
epoch : 72
epoch : 73
epoch : 74
epoch : 75
epoch : 76
epoch : 77
epoch : 78
epoch : 79
epoch : 80
epoch : 81
epoch : 82
epoch : 83
epoch : 84
epoch : 85
epoch : 86
epoch : 87
epoch : 88
epoch : 89
epoch : 90
epoch : 9

In [93]:
model.evaluate(xTrain, yTrain)



[0.13589219748973846, 0.9200000166893005]

In [96]:
seqOut = ['g8', 'e8', 'e4', 'f8'] # 입력값

predOut = model.predict(xTrain)

In [97]:
for i in range(50):
   idx = np.argmax(predOut[i])
   seqOut.append(idx2code[idx])

In [None]:
# seqOut 애는 최종 악보가 저장됨

In [100]:
print(seqOut)

['g8', 'e8', 'e4', 'f8', 'd8', 'd4', 'c8', 'e8', 'e8', 'f8', 'g8', 'g8', 'g4', 'g8', 'e8', 'e8', 'e8', 'f8', 'd8', 'd4', 'c8', 'e8', 'g8', 'g8', 'e8', 'e8', 'e4', 'd8', 'd8', 'd8', 'd8', 'd8', 'd8', 'f4', 'e8', 'e8', 'e8', 'e8', 'f8', 'f8', 'd8', 'g8', 'e8', 'e4', 'f8', 'd8', 'd4', 'c8', 'e8', 'g8', 'g8', 'e8', 'e8', 'e4']


In [106]:
from pygame import mixer

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


In [107]:
# w = wave.open('sound1.mp3', 'r')
mixer.init()
mixer.music.load('sound1.mp3')
mixer.music.play()

error: DirectSoundCreate: No audio device found