# 청와대 코드 분석하기

In [1]:
# Mount Google Drive at /content/gdrive so the dataset files can be opened by path.
# Requires the Colab runtime (google.colab is only available there).
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Load the morphologically analysed petition corpus.
# Each line is read as "<label>\t<space-separated tokens>".
label = []    # class label of every petition, kept as the raw string from the file
check = []    # flat list of every token in the corpus (used for frequency statistics)
dataset = []  # tokenised petition sentences, one list of tokens per input line

# The context manager closes the file handle even if a line fails to parse.
with open("/content/gdrive/MyDrive/Colab Notebooks/Big AI Study/1주차/데이터/morphed.txt", "r", encoding = "cp949") as data:
    for line in data:
        temp = line.split('\t')
        tokens = temp[1].rstrip("\n").split(" ")  # tokenise once, reuse for both containers
        label.append(temp[0])
        check.extend(tokens)
        dataset.append(tokens)
print(label[:5])
print(check[:5])
print(dataset[:5])

['2', '0', '1', '1', '1']
['신혼부부', '위하다', '주택정책', '보육', '시설']
[['신혼부부', '위하다', '주택정책', '보육', '시설', '늘리다', '국민', '세금', '일부', '위하다', '정책', '펴지다', '말다', '보편적', '모든', '국민', '수긍', '복지', '정책', '펴다', '주다', '바라다', '저', '신혼부부', '당첨', '되다', '사람', '로또', '되다', '이런', '주택정책', '반대', '국민', '세금', '일부', '사람', '퍼', '주기식', '되다', '안되다', '그', '세금', '아이', '안전', '맡기다', '보육', '시설', '전국', '설치다', '하다', '대기업', '솔선수범', '모든', '사업장', '의무', '설치다', '하다', '있다', '집', '애', '맡', '길', '없다', '경력', '단절', '되다', '더', '괴롭다', '집다', '개인', '능력', '키워', '살다', '맞다', '그', '능력', '켜다', '울다', '있다', '육아', '전담', '힘', '기울이다', '맞다', '아이', '부모', '키우다', '맞다', '이제', '국가', '책임지다', '시대', '가다', '맞다', '그렇잖다', '부동산', '가격', '자꾸', '올라가다', '정부', '정책', '잘못', '되다', '부동산', '그냥', '내버리다', '좀', '건들다', '역효과'], ['학교', '이름', '남자', '붙이다', '울산', '여자', '중학교', '재학', '학생', '최근', '양성평등', '글짓기', '하다', '생각하다', '울산', '울산중', '학교', '두개', '남중', '여', '중', '어째서', '학교', '여자', '붙이다', '하다', '하다', '남자', '우위', '때', '짓다', '학교', '그런', '지다', '모르다', '울산중', '학교', '남중', '남자', '붙이다', '울

In [3]:
from collections import Counter
import numpy as np

# Corpus statistics used to pick the vocabulary size and the padded sequence length.
data_freq = Counter(check)           # token -> number of occurrences
text_len = list(map(len, dataset))   # token count of every sentence

# Number of distinct tokens (roughly 50,000 in this corpus).
distinct_words = len(set(check))
print("단어의 총 갯수 : ", distinct_words)

# Tokens occurring at least 100 times; let's use 5000 as the vocabulary cutoff.
frequent_words = sum(np.array(list(data_freq.values())) >= 100)
print("단어가 100번 이상 나오는 단어 갯수 : ", frequent_words)

# Sentence-length percentiles; let's use 300 as the maximum length.
length_percentiles = np.percentile(text_len, [0, 25, 50, 75, 90, 91, 95, 100])
print("구간별 최대 길이 : ", length_percentiles)

단어의 총 갯수 :  50564
단어가 100번 이상 나오는 단어 갯수 :  5233
구간별 최대 길이 :  [1.000e+00 3.400e+01 7.100e+01 1.380e+02 2.580e+02 2.750e+02 3.930e+02
 7.958e+03]


## Embedding 구성하기

In [4]:
from gensim.models import Word2Vec

# Train Word2Vec vectors (sg = 1 selects skip-gram) on the tokenised corpus and save the model.
# Words occurring fewer than min_count = 100 times are left out of the vocabulary.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed it to `vector_size`) — confirm the installed version.
embedding_dim = 100
model = Word2Vec(sentences = dataset, size = embedding_dim, window = 10, min_count = 100, workers=4, sg = 1)
model.save("embedding.model")

In [5]:
# Export the trained vectors as plain text: one line per word, formatted as
# "<word> <v1> <v2> ... <vN> " (every field is followed by a single space).
# NOTE(review): `model.wv.vocab` is the gensim 3.x API — confirm the installed version.
words = list(model.wv.vocab)

# Write explicitly as UTF-8: the file is read back with encoding="utf-8", and the
# platform default encoding is not guaranteed to match for Korean text.
with open("embedding.txt", "w", encoding="utf-8") as f:
  for word in words:
    # Word first, then its vector components.
    row = [word] + model.wv[word].tolist()
    f.write("".join("%s " % item for item in row))
    f.write("\n")

Output hidden; open in https://colab.research.google.com to view.

## 훈련 데이터 전처리 및 Padding

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import math

# Vocabulary cap and fixed sequence length used for the network input.
max_words = 5000
maxlen = 300

# Fit the word -> integer index on the tokenised training sentences.
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(dataset)
word_index = tokenizer.word_index

In [7]:
# Replace every token of every sentence with its integer id from the fitted tokenizer.
data = tokenizer.texts_to_sequences(dataset)

print("data 0 : ", data[0])

data 0 :  [13, 190, 479, 535, 10, 271, 294, 13, 126, 96, 3862, 66, 10, 678, 126, 2541, 41, 58, 250, 2, 8, 3103, 2, 35, 148, 10, 271, 294, 8, 2558, 2, 138, 15, 271, 6, 353, 902, 190, 479, 469, 1371, 1, 1759, 66, 3954, 237, 1371, 1, 3, 204, 283, 683, 4, 1126, 2394, 2, 18, 2825, 1335, 182, 462, 2771, 74, 124, 15, 462, 606, 496, 3, 669, 2675, 238, 2486, 124, 6, 62, 222, 124, 141, 39, 1000, 384, 31, 124, 2366, 1057, 1963, 1190, 51, 126, 134, 2, 2366, 266, 132]


In [8]:
# Bring every sequence to exactly `maxlen` ids; the result is a 2-D array
# (the printed rows show zero padding at the front).
data = pad_sequences(data, maxlen = maxlen)
print("data : ", data)
print("data 0 : ", data[0])
print(data.shape)
print(data[0].shape)

data :  [[   0    0    0 ... 2366  266  132]
 [   0    0    0 ...  876   26  179]
 [   0    0    0 ...  571    3 2028]
 ...
 [   0    0    0 ...   22 3436    4]
 [   0    0    0 ...  399   27    2]
 [   0    0    0 ...    2  467    2]]
data 0 :  [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    

In [9]:
from tensorflow.keras.utils import to_categorical

# Turn the list of label strings into an array, then one-hot encode it for the softmax output.
label = np.array(label)
label = to_categorical(label)

## 임베딩 딕셔너리 생성

In [10]:
# Rebuild a {word: vector} lookup from the exported embedding.txt.
# Each line holds the word followed by its whitespace-separated float components.
embeddings_index = {}
# The context manager closes the file even if a malformed line raises while parsing.
with open("embedding.txt", encoding = "utf-8") as f:
  for line in f:
    values = line.split()
    if not values:
      # Blank line: nothing to parse.
      continue
    word = values[0]
    coefs = np.array(values[1:], dtype = "float32")
    embeddings_index[word] = coefs

print("%s개의 단어 벡터를 찾았습니다." % len(embeddings_index))

5233개의 단어 벡터를 찾았습니다.


In [11]:
# Weight matrix for the Embedding layer: row i holds the pretrained vector of the
# word whose tokenizer index is i. Rows without a pretrained vector stay all-zero.
embedding_matrix = np.zeros((max_words, embedding_dim))

for token, idx in word_index.items():
  if idx >= max_words:
    # Index falls outside the matrix.
    continue
  vector = embeddings_index.get(token)
  if vector is None:
    # No pretrained vector for this word.
    continue
  embedding_matrix[idx] = vector

## Model 구성하기

In [17]:
# Training hyper-parameters.
class_number = 3  # width of the softmax output layer
epochs = 50
batch_size = 32
embedding_dim = 100  # re-declared here with the same value used for Word2Vec training

In [18]:
from keras import models
from keras import layers

# Dense classifier over the flattened sequence of word embeddings:
# Embedding -> Flatten -> Dense(64) -> Dense(32) -> softmax over the classes.
model = models.Sequential([
    layers.Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=maxlen),
    layers.Flatten(),
    layers.Dense(64, activation = "relu"),
    layers.Dense(32, activation = "relu"),
    layers.Dense(class_number, activation = "softmax"),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 300, 100)          500000    
                                                                 
 flatten_1 (Flatten)         (None, 30000)             0         
                                                                 
 dense_3 (Dense)             (None, 64)                1920064   
                                                                 
 dense_4 (Dense)             (None, 32)                2080      
                                                                 
 dense_5 (Dense)             (None, 3)                 99        
                                                                 
Total params: 2,422,243
Trainable params: 2,422,243
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Load the pretrained Word2Vec matrix into the embedding layer, then freeze it.
# `set_weights` is a method and has to be *called*: writing
# `set_weights = ([embedding_matrix])` only rebinds the attribute, leaving the
# layer with its random initial weights, which would then be frozen.
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [20]:
# categorical_crossentropy pairs with the one-hot labels; "acc" reports accuracy per epoch.
model.compile(optimizer = "adam", loss = "categorical_crossentropy", metrics = ["acc"])

In [21]:
# Train on the whole padded dataset; no validation data is passed to fit().
history = model.fit(data, label, epochs = epochs, batch_size = batch_size, verbose = 1)
dic = history.history  # training history recorded by fit()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


## 테스트데이터 전처리

In [22]:
# Load the morphologically analysed test petitions.
# Tabs inside a line are treated like spaces, so every field ends up in one flat token list.
test_data = []  # one list of tokens per input line

# The context manager closes the file handle even if reading fails part-way.
with open("/content/gdrive/MyDrive/Colab Notebooks/Big AI Study/1주차/데이터/morphed_test.txt", "r", encoding = "cp949") as test:
    for line in test:
        # Same result as joining the tab-separated fields with a space.
        test_data.append(line.rstrip("\n").replace("\t", " ").split(" "))
print(test_data[:5])

[['소년법', '폐지', '법', '아래', '보호', '받다', '아이', '법', '인하다', '보호', '받다', '오히려', '법', '악용', '사례', '늘어나다', '그', '강도', '높다', '소년법', '폐지', '부탁', '드리다'], ['국공립', '유치원', '증설', '관하다', '국공립', '유치원', '부지', '학보', '립', '및', '증설', '지역', '어린이', '놀이터', '부지', '지역', '방대', '주민', '센터', '휴', '계부', '및', '구청', '시청', '군청', '청사', '공간', '부지', '활용', '청년', '실업', '퇴직', '희망자', '재교육', '통하다', '유아', '유치', '업무', '종사', '방법', '불가능', '하다'], ['나경원', '파면', '나경원', '의원', '동계', '올림픽', '위원', '파면'], ['국민', '위원', '삼성', '편만', '들다', '삼성', '간', '일하다', '혈', '암', '백혈병', '진단', '받다', '사람', '많다', '그래서', '산업', '제외', '받다', '위하다', '환경', '평가', '표', '받다', '그래', '신청', '하다', '법원', '평가', '표', '공개', '판결', '나다', '국가', '국민', '위원', '공개', '하다', '하다', '삼성', '환경', '평', '표', '산업', '기밀', '다시', '막다', '단', '하루', '피해자', '동안', '고통', '받다', '있다', '제발', '국민', '위원', '국민', '위하다', '일해', '주다', '그리고', '국민', '위원', '독단', '처리', '하다', '다시', '감독', '하다'], ['방과', '후', '유치원', '어린이집', '영어', '교육', '유지', '시키다', '아이', '키우다', '평범', '주부', '학교', '방과', '후', '어린이집', '받다', '영어', '교육', '

In [23]:
# Encode and pad the test sentences with the same tokenizer and maxlen as the training data.
test_data = tokenizer.texts_to_sequences(test_data)
test_data = pad_sequences(test_data, maxlen = maxlen)

## 예측해보기

In [24]:
# Softmax output for every test row (one score per class).
predict = model.predict(test_data)

In [32]:
predict = np.array(predict)

# Pick the index of the highest-scoring class for each test row.
answer = np.argmax(predict, axis = 1)
answer = list(answer)

print(predict.shape)
print(len(answer))
print(answer)

(5000, 3)
5000
[0, 2, 1, 0, 2, 0, 1, 1, 1, 2, 0, 0, 1, 0, 0, 1, 2, 0, 1, 1, 0, 0, 0, 1, 2, 1, 1, 2, 1, 2, 2, 0, 0, 0, 0, 2, 2, 1, 0, 2, 2, 0, 1, 1, 1, 0, 0, 2, 2, 1, 0, 1, 2, 2, 1, 0, 0, 0, 2, 0, 0, 2, 1, 0, 1, 1, 0, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 0, 0, 2, 1, 1, 0, 2, 0, 0, 1, 1, 0, 1, 2, 0, 2, 1, 0, 2, 0, 2, 0, 2, 2, 1, 2, 1, 0, 0, 0, 0, 1, 0, 1, 0, 2, 2, 2, 0, 2, 0, 1, 0, 0, 1, 0, 1, 2, 1, 0, 1, 2, 1, 1, 2, 0, 1, 0, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 2, 1, 1, 0, 0, 0, 2, 1, 1, 2, 0, 2, 0, 1, 2, 2, 2, 0, 0, 2, 1, 2, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 2, 2, 1, 1, 2, 1, 1, 1, 0, 0, 2, 2, 0, 1, 2, 2, 2, 0, 0, 2, 1, 0, 1, 1, 2, 2, 0, 0, 0, 1, 2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 1, 0, 2, 2, 1, 0, 2, 0, 1, 1, 2, 1, 0, 0, 1, 0, 2, 1, 0, 1, 2, 1, 0, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 0, 2, 1, 2, 0, 2, 0, 0, 2, 2, 2, 2, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 2, 0, 0, 2, 0, 2, 1, 2, 0, 0, 0, 1, 0, 1, 2, 2, 2, 1, 1, 1, 2, 0, 2, 0, 0, 0, 2, 1, 1, 0, 2, 0, 0, 1, 0, 1, 

In [37]:
# Build the submission rows: a header followed by one [index, category] pair
# per prediction, in prediction order.
# The loop variable is deliberately not named `label`: that name holds the
# one-hot training labels, and rebinding it here would break a re-run of the
# training cell.
submit = [["index", "category"]]
submit.extend([i, category] for i, category in enumerate(answer))

print(submit)

[['index', 'category'], [0, 0], [1, 2], [2, 1], [3, 0], [4, 2], [5, 0], [6, 1], [7, 1], [8, 1], [9, 2], [10, 0], [11, 0], [12, 1], [13, 0], [14, 0], [15, 1], [16, 2], [17, 0], [18, 1], [19, 1], [20, 0], [21, 0], [22, 0], [23, 1], [24, 2], [25, 1], [26, 1], [27, 2], [28, 1], [29, 2], [30, 2], [31, 0], [32, 0], [33, 0], [34, 0], [35, 2], [36, 2], [37, 1], [38, 0], [39, 2], [40, 2], [41, 0], [42, 1], [43, 1], [44, 1], [45, 0], [46, 0], [47, 2], [48, 2], [49, 1], [50, 0], [51, 1], [52, 2], [53, 2], [54, 1], [55, 0], [56, 0], [57, 0], [58, 2], [59, 0], [60, 0], [61, 2], [62, 1], [63, 0], [64, 1], [65, 1], [66, 0], [67, 0], [68, 0], [69, 1], [70, 0], [71, 2], [72, 0], [73, 0], [74, 1], [75, 0], [76, 0], [77, 0], [78, 0], [79, 2], [80, 1], [81, 1], [82, 0], [83, 2], [84, 0], [85, 0], [86, 1], [87, 1], [88, 0], [89, 1], [90, 2], [91, 0], [92, 2], [93, 1], [94, 0], [95, 2], [96, 0], [97, 2], [98, 0], [99, 2], [100, 2], [101, 1], [102, 2], [103, 1], [104, 0], [105, 0], [106, 0], [107, 0], [108, 

In [41]:
import csv

# Write the submission rows to write.csv.
# newline="" lets the csv module control line endings itself (otherwise extra
# blank rows can appear on platforms that translate "\n"); the context manager
# guarantees the file is flushed and closed even if writing fails.
with open("write.csv", "w", newline="") as f:
  writer = csv.writer(f)
  writer.writerows(submit)