In [14]:
import numpy as np
import pandas as pd
from konlpy.tag import Komoran, Kkma, Okt
from collections import Counter
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense

In [15]:
data = pd.read_csv("kedi.csv",encoding='cp949')

In [16]:
okt = Okt()

In [17]:
def make_nouns(x):
    return okt.nouns(x)

In [18]:
data = data[["프로그램명","소분류코드","소분류"]]

In [19]:
data.columns = ['program_name','code','code_name']

In [20]:
data = data.iloc[1:,:]

In [21]:
data.reset_index(inplace=True)

In [22]:
data.drop('index',axis=1,inplace=True)

In [23]:
X = data.program_name.apply(make_nouns)

In [24]:
def remove_one(x):
    new_x = []
    for i in x:
        if len(i) > 1:
            new_x.append(i)
    return new_x

In [25]:
X = X.apply(remove_one)

In [26]:
full_text = []
for i in X:
    full_text.extend(i)

In [27]:
unique_word = list(set(full_text))

In [34]:
word_cnt = Counter(full_text)
common_word = word_cnt.most_common(1000)

In [35]:
unique_word = [ i for i,j in common_word]

In [36]:
word_index = { k:v for k,v in enumerate(unique_word) }

In [37]:
index_word = { v:k for k,v in word_index.items()}

In [38]:
## 정수 인코딩
def label_encode(x):
    encoded_x = []
    for i in x:
        encoded_x.append(index_word.get(i,0))
    return encoded_x

In [39]:
label_encoded = X.apply(label_encode)

In [40]:
def vectorize_word(x,dimension=1000):
    t = np.zeros((x.shape[0],dimension))
    for k,v in enumerate(x.values):
        for i in v:
            if i < dimension:
                t[k,i] += 1
    return t

In [41]:
one_hot = vectorize_word(label_encoded)

In [42]:
code_text = []
for i in data.code:
    code_text.append(i)

In [43]:
unique_code = list(set(code_text))

In [44]:
code_index = { k:v for k,v in enumerate(unique_code)} 
index_code = { v:k for k,v in code_index.items()}

In [45]:
def code_encode(x):
    encoded_x = []
    for i in x:
        encoded_x.append(index_code.get(i,0))
    return encoded_x

In [46]:
label_code = []
for i in data.code:
    label_code.append(index_code[i])
data['label'] = label_code

In [47]:
y = to_categorical(data.label.values)

In [90]:
data[["code_name","label"]]

Unnamed: 0,code_name,label
0,한국어,72
1,한국어,72
2,한국어,72
3,한국어,72
4,한국어,72
...,...,...
125164,입시 기타,32
125165,입시 기타,32
125166,입시 기타,32
125167,입시 기타,32


In [91]:
code_dictionary = {}
for i in range(data.code_name.size):
    code_dictionary[data.label[i]] = data.code_name[i]

In [48]:
dt = pd.DataFrame(np.c_[one_hot,y])

In [49]:
idx = np.random.choice(dt.index,len(dt.index),replace=False)

In [50]:
tr_idx = idx[:int(0.8*len(idx))]
te_idx = idx[int(0.8*len(idx)):]

In [51]:
X_train = dt.iloc[tr_idx,:1000].values
X_test = dt.iloc[te_idx,:1000].values
y_train = dt.iloc[tr_idx,1000:].values
y_test = dt.iloc[te_idx,1000:].values

In [52]:
input_shape = X_train.shape[1]
output_shape = y_train.shape[1]

In [58]:
model = Sequential()
model.add(Dense(256,activation='relu',input_shape=(input_shape,)))
model.add(Dense(512,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(output_shape,activation='softmax'))

In [59]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 256)               256256    
_________________________________________________________________
dense_4 (Dense)              (None, 512)               131584    
_________________________________________________________________
dense_5 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_6 (Dense)              (None, 121)               31097     
Total params: 550,265
Trainable params: 550,265
Non-trainable params: 0
_________________________________________________________________


In [60]:
optimizer = 'adam'
loss = 'categorical_crossentropy'
metrics = ['accuracy']
model.compile(optimizer=optimizer,
             loss=loss,
             metrics=metrics)

In [61]:
epochs = 100
batch_size = 300
validation_data = (X_test,y_test)
model.fit(X_train,
         y_train,
         epochs=epochs,
         batch_size=batch_size,
         validation_data=validation_data)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f4a19ee7a60>

In [62]:
model.evaluate(X_test,y_test)



[2.0314998626708984, 0.6911001205444336]

In [92]:
def predict_subject(subject):
    x = okt.nouns(subject)
    x = label_encode(x)
    x = vectorize_list(x)
    result = np.argmax(model.predict(x))
    result = code_dictionary[result]
    return f'입력한 {subject}의 분야는 {result}입니다.'

In [102]:
subject = '초등학교 논리수업'
predict_subject(subject)

'입력한 초등학교 논리수업의 분야는 아동발달입니다.'

In [84]:
subject = vectorize_list(subject)

In [82]:
def vectorize_list(x,dimension=1000):
    t = np.zeros(dimension)
    for i in x:
        t[i] = 1
    return np.array(t).reshape(1,-1)

In [None]:
def vectorize_word(x,dimension=1000):
    t = np.zeros((x.shape[0],dimension))
    for k,v in enumerate(x.values):
        for i in v:
            if i < dimension:
                t[k,i] += 1
    return t

In [65]:
np.argmax(model.predict(X_train[[0]]))

19

In [66]:
code_dictionary[]

0         72
1         72
2         72
3         72
4         72
          ..
125164    32
125165    32
125166    32
125167    32
125168    32
Name: label, Length: 125169, dtype: int64