# 분류

In [2]:
# Project root directory; most data and model paths in later cells are built by appending to it.
base_dir = r'C:\Users\Yang\Desktop\project'

In [2]:
from gensim.models import Word2Vec
from gensim.models import FastText
from gensim.models.doc2vec import Doc2Vec
import csv
import re



# (1) Import model (단어를 벡터로 바꾸는 아래의 3가지 모델 중 하나 선택)

## (1-1) Word2vec model import

In [13]:
# Load the saved Word2vec model and collect the vector of every vocabulary entry.
model = Word2Vec.load(base_dir + r'\word_model_70000\Word2vec.model')
word_vectors = model.wv  # keyed vectors of the loaded model
vocabs = word_vectors.vocab.keys()
word_vectors_list = list(map(word_vectors.__getitem__, vocabs))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## (1-2) Fasttext model import

In [22]:
# Load the saved FastText model and collect the vector of every vocabulary entry.
model = FastText.load(base_dir + r'\word_model\Fasttext.model')
word_vectors = model.wv  # keyed vectors of the loaded model
vocabs = word_vectors.vocab.keys()
word_vectors_list = list(map(word_vectors.__getitem__, vocabs))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## (1-3) Doc2vec model import

In [40]:
# Load the saved Doc2vec model and collect the word vector of every vocabulary entry.
model = Doc2Vec.load(base_dir + r'\word_model\Doc2vec.model')
word_vectors = model.wv  # keyed vectors of the loaded model
vocabs = word_vectors.vocab.keys()
word_vectors_list = list(map(word_vectors.__getitem__, vocabs))

## (1-4) Bag of words

In [7]:
# Rebuild every tokenized sentence as one space-separated string of bare words
# (the trailing '/POS' tag is stripped) so it can be fed to CountVectorizer.
corpus = []
for token_sent in token_sents:
    # Split on the LAST '/' only: the tag is always the final component, so a
    # word that itself contains '/' is kept whole. A token without any '/' is
    # passed through unchanged.
    words = [token.rsplit('/', 1)[0] for token in token_sent]
    # str.join puts the separator only between words, which replaces the
    # original "is this the last index?" check (that check compared ints with
    # `is not`, which is an identity test and not reliable for integers).
    corpus.append(' '.join(words))

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus and encode every sentence as a dense
# row of token counts in a single step.
vect = CountVectorizer()
vec_sents = vect.fit_transform(corpus).toarray()

## (2) Train data import

In [3]:
from sentence_utility import read_damage_csv
# Load the raw sentences and their damage labels from the training CSV.
# NOTE(review): labels appear to be strings such as '1' (they are compared to '1' and
# converted with int() in later cells) — confirm against read_damage_csv.
sents, label_damage = read_damage_csv(base_dir + r'\data\data.csv')

In [4]:
# POS tagging: split the text into morphemes and pair each morpheme with its part of speech.
# Stemming reduces words to their base form; normalization merges differently written
# variants of a word into one form.
import json
import os
from konlpy.tag import Okt

okt = Okt()


def tokenize(doc):
    """Tag *doc* with the Okt analyzer and return its tokens as 'word/POS' strings.

    Okt is called with norm=True (normalize) and stem=True (reduce to stems).
    """
    tagged = okt.pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged]


os.chdir(base_dir)  # make the project folder the working directory

In [5]:
# The keys of word_vectors have the form 'word/POS', so every sentence is run
# through tokenize() to bring its tokens into that same form.
token_sents = [tokenize(sentence) for sentence in sents]

## (3) Vectorization(word2vec, fasttext, doc2vec)

In [17]:
from sentence_utility import token2vec
# Turn each tokenized sentence into vectors using the embedding selected above.
# The helper also returns the labels; presumably they are re-aligned when sentences
# are dropped — TODO confirm in sentence_utility.
vec_sents, label_damage = token2vec(token_sents, label_damage, word_vectors)

# 모델링 - Machine learning

## (1) 문장 속 단어들의 Vector(word2vec, fasttext, doc2vec)의 평균값을 이용해서 문장 분류

In [9]:
## When an embedding (word2vec / fasttext / doc2vec) is used
import numpy as np

sentences = np.asarray(vec_sents)
# One feature vector per sentence: the mean over the sentence's first axis
# (assumes each sentence is a (words, dim) array — TODO confirm against token2vec).
mean_sents = [np.mean(sentence, axis=0) for sentence in sentences]

In [10]:
## When bag of words is used
# (Do NOT run this cell unless the bag-of-words features are in use!)
# The count vectors are already one fixed-length row per sentence, so they are used as-is.
mean_sents = vec_sents

In [11]:
# Fraction of samples whose label is the string '1'.
(np.asarray(label_damage) == '1').sum() / len(label_damage)

0.3789924267369114

### (1-1) Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Logistic regression with default settings, scored by 5-fold cross-validation.
logreg = LogisticRegression()
scores = cross_val_score(logreg, mean_sents, label_damage, cv=5)
print(f'cross-val-score \n{scores}')
print(f'cross-val-score.mean \n{scores.mean():.3f}')
# Original remark: with 57% zeros and 43% ones overall, a result of 84% seems fine.
# NOTE(review): those figures do not match the outputs recorded in this notebook
# (positive ratio 0.379, mean score 0.793); they look like they came from an earlier run.



cross-val-score 
[0.63486842 0.85361842 0.8583196  0.86820428 0.74958814]
cross-val-score.mean 
0.793


### (1-2) SVM

In [13]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Linear-kernel SVM scored by 5-fold cross-validation.
# gamma is only a coefficient of the 'rbf', 'poly' and 'sigmoid' kernels and is
# ignored by kernel='linear', so it is no longer passed (results are unchanged).
svm = SVC(kernel='linear', C=1.0, random_state=0)
scores_svm = cross_val_score(svm, mean_sents, label_damage, cv=5)
print('cross-val-score \n{}'.format(scores_svm))
print('cross-val-score.mean \n{:.3f}'.format(scores_svm.mean()))

cross-val-score 
[0.60690789 0.82565789 0.81878089 0.82701812 0.71828666]
cross-val-score.mean 
0.759


### (1-3) Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest (100 trees, depth <= 5) scored by 5-fold cross-validation.
# cross_val_score comes from the import in the Logistic Regression cell.
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
scores_rf = cross_val_score(rf, mean_sents, label_damage, cv=5)
print(f'cross-val-score \n{scores_rf}')
print(f'cross-val-score.mean \n{scores_rf.mean():.3f}')

cross-val-score 
[0.62828947 0.63404605 0.62602965 0.62355848 0.62438221]
cross-val-score.mean 
0.627


### (1-4) XGBoost

In [15]:
import xgboost as xgb

# XGBoost needs numeric labels; label_damage holds them as strings.
label_damage2 = list(map(int, label_damage))
# Hand DMatrix a real ndarray: mean_sents may be a plain list of vectors.
data_dmatrix = xgb.DMatrix(data=np.asarray(mean_sents), label=label_damage2)
# The target is binary and the run is scored with AUC, so use the logistic
# objective. The previous 'reg:linear' is a squared-error regression objective
# (and a deprecated alias of 'reg:squarederror').
params = {"objective": "binary:logistic", 'colsample_bytree': 0.3, 'learning_rate': 0.1,
          'max_depth': 5, 'alpha': 10}
# 3-fold CV, up to 50 boosting rounds, stopping after 10 rounds without AUC improvement.
cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50, early_stopping_rounds=10, metrics="auc",
                    as_pandas=True, seed=123)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Show the last rows of the cross-validation results (as_pandas=True was requested from xgb.cv).
cv_results.tail()

In [None]:
# Print the last 10 entries of the mean test AUC column.
print((cv_results["test-auc-mean"]).tail(10))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing the XGBoost model; the seed is fixed for repeatability.
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(mean_sents, label_damage2, test_size=0.2, random_state=123)

In [None]:
# The labels are binary (0/1) and the predictions are thresholded at 0.5 in a
# later cell, so fit with the logistic objective: predict() then returns
# probabilities in [0, 1]. The previous 'reg:linear' is a squared-error
# regression objective and a deprecated alias of 'reg:squarederror'.
xg_reg = xgb.XGBRegressor(objective='binary:logistic', colsample_bytree=0.3, learning_rate=0.1,
                          max_depth=5, alpha=10, n_estimators=100)

In [None]:
# Fit the XGBoost model on the training split.
xg_reg.fit(X_train_xgb,y_train_xgb)

In [None]:
from sklearn.metrics import mean_squared_error

preds = xg_reg.predict(X_test_xgb)
# Turn the continuous predictions into class labels with a 0.5 cut-off.
preds_cat = [int(score > 0.5) for score in preds]
# Share of test samples whose thresholded prediction equals the true label.
np.sum(np.asarray(y_test_xgb) == np.asarray(preds_cat)) / len(y_test_xgb)

In [60]:
import pickle

# Save the fitted model. The context manager guarantees the file is flushed
# and closed even if pickling fails (the bare open() call leaked the handle).
with open("xgb.dat", "wb") as model_file:
    pickle.dump(xg_reg, model_file)

# 모델링 - Neural Network

### Embedding 된 데이터 불러오기

In [18]:
from keras.layers import LSTM
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Bidirectional
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import numpy as np

# Wrap the per-sentence vectors in an array for the padding helpers used in the next cell.
# NOTE(review): if sentences differ in length this is a ragged conversion — confirm the
# padding helpers expect that.
sentences = np.asarray(vec_sents)

In [19]:
from sentence_utility import zero_padding, circular_padding, reverse_padding
# Pad the sentences with one of three strategies; exactly one of the calls below should be active.
# NOTE(review): presumably padding brings every sentence to a common length — confirm in sentence_utility.
new_sentences = zero_padding(sentences)
#new_sentences = circular_padding(sentences)
#new_sentences = reverse_padding(sentences)

In [20]:
# Inputs are the padded sentences; targets are the labels in one-hot form.
x = new_sentences
y = np_utils.to_categorical(np.asarray(label_damage))
# Shuffled 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1, shuffle=True
)

## (2) LSTM을 이용해서 문장 분류

### (2-1) Keras 내부 Embedding layer 사용

In [None]:
# Map every word to an integer id.
def word2num(list_2d):
    """Encode the words of each document as integer ids.

    Ids start at 1 and are handed out in order of first appearance across
    all documents.

    Args:
        list_2d: Sequence of documents. Each document is an iterable of
            words, or a single ``str`` that is treated as ONE word (it is
            not split into characters). Falsy documents (empty list, empty
            string, None) are skipped and encoded as ``[]``.

    Returns:
        A tuple ``(num_list, w2n_dic, n2w_dic)``:
        ``num_list`` holds one list of ids per input document,
        ``w2n_dic`` maps word -> id, and ``n2w_dic`` maps id -> word so ids
        can be turned back into words later.
    """
    w2n_dic = dict()  # word -> id
    n2w_dic = dict()  # id -> word
    num_list = [[] for _ in range(len(list_2d))]  # encoded documents

    def _index_of(word):
        # Register an unseen word under the next free id (ids start at 1).
        if word not in w2n_dic:
            new_idx = len(w2n_dic) + 1
            w2n_dic[word] = new_idx
            n2w_dic[new_idx] = word
        return w2n_dic[word]

    for k, doc in enumerate(list_2d):
        if not doc:
            continue
        # A bare string is a one-word document; iterating over it would split
        # it into characters. (This branch used to read from an undefined
        # name `dic` and raised NameError.)
        words = [doc] if isinstance(doc, str) else doc
        num_list[k] = [_index_of(word) for word in words]
    return num_list, w2n_dic, n2w_dic

In [None]:
# Integer-encode the tokenized sentences and keep both lookup tables.
num_list, w2n_dic, n2w_dic = word2num(token_sents)

In [None]:
# Bring every id sequence to a fixed length of 50 so the sequences form one 2-D array.
x = sequence.pad_sequences(num_list, maxlen=50)
# One-hot encode the labels.
y = np_utils.to_categorical(np.asarray(label_damage))
# Shuffled 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=1, shuffle=True)

In [None]:
# LSTM classifier that learns its own 100-dimensional embedding of the word ids.
model = Sequential([
    # +1 because word2num hands out ids starting at 1, so the largest id is len(w2n_dic).
    Embedding(len(w2n_dic) + 1, 100),
    LSTM(100, dropout=0.5, recurrent_dropout=0.5, activation='tanh'),
    Dense(2, activation='softmax'),  # two-class probability output
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Train for 100 epochs in batches of 100.
# NOTE(review): the held-out test split is also passed as validation data, so the
# validation metrics reported during training are not an independent estimate.
history = model.fit(x_train, y_train, batch_size=100, epochs=100, validation_data=(x_test, y_test))

### (2-2) 이미 Embedding된 데이터 사용

In [150]:
# LSTM classifier fed directly with already-embedded sentences (no Embedding layer).
model = Sequential([
    LSTM(100, dropout=0.2, recurrent_dropout=0.2, activation='tanh'),
    Dense(2, activation='softmax'),  # two-class probability output
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [151]:
# Train for 25 epochs in batches of 100.
# NOTE(review): the held-out test split is also passed as validation data, so the
# validation metrics reported during training are not an independent estimate.
history = model.fit(x_train, y_train, batch_size=100, epochs=25, validation_data=(x_test, y_test))

Train on 4859 samples, validate on 1215 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [None]:
# Save under the project folder, like every other path in this notebook.
# The previous literal '\model\...' was a non-raw string with invalid '\m'
# escapes and pointed at the root of the current drive instead of base_dir.
model.save(base_dir + r'\model\model_doc2vec_reversePadding.h5')

## (3) CNN을 이용해서 문장 분류

In [None]:
from keras.models import Sequential
from keras import layers

In [None]:
# 1-D CNN sentence classifier: three convolutions of shrinking kernel size,
# global max pooling over the sequence axis, then a small dense head.
model = Sequential()
model.add(layers.Conv1D(filters = 50, kernel_size = 10, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Conv1D(filters = 30, kernel_size = 5, activation='relu'))
model.add(layers.Conv1D(filters = 20, kernel_size = 3, activation='relu'))
#model.add(layers.LSTM(15))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dropout(0.2))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(2, activation='softmax'))
# The output is a 2-unit softmax trained on one-hot labels, so use the same
# categorical cross-entropy as the other networks in this notebook instead of
# 'binary_crossentropy' (which treats the two units as independent outputs).
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Build the model explicitly because no layer declares an input shape.
# NOTE(review): presumably (batch, 50 time steps, 100-dim vectors), matching the
# padded input — TODO confirm against the padding helper.
input_shape = (None, 50, 100)
model.build(input_shape)
#model.summary()

In [None]:
# Train for 30 epochs in batches of 100.
# NOTE(review): the held-out test split is also passed as validation data, so the
# validation metrics reported during training are not an independent estimate.
history = model.fit(x_train, y_train, batch_size=100, epochs=30, validation_data=(x_test, y_test))

## (4) BRNN

In [56]:
# Early-stopping callback with a patience of 5 epochs.
early_stopping = EarlyStopping(patience=5)

# Bidirectional LSTM classifier fed with already-embedded sentences.
model = Sequential([
    Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2, activation='tanh')),
    Dense(2, activation='softmax'),  # two-class probability output
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [57]:
# Train for at most 25 epochs in batches of 100, with the early-stopping callback attached.
# NOTE(review): the held-out test split is also passed as validation data, so both early
# stopping and the reported validation metrics are driven by the test set.
history = model.fit(x_train, y_train, batch_size=100, epochs=25, validation_data=(x_test, y_test), callbacks=[early_stopping])

Train on 4858 samples, validate on 1215 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25


## (5) 앙상블 모델

### 비관련 3천개 데이터 복원 추출해서 앙상블

In [None]:
# The import cell of section (2) (LSTM sentence classification) must be run before this cell.
from sentence_utility import read_damage_csv, token2vec, zero_padding, circular_padding, reverse_padding
from keras.callbacks import EarlyStopping
from keras import backend as K

early_stopping = EarlyStopping(patience = 10) # early-stopping callback
total_num_model = 20 # number of models to build
# Derive the output folder from the ensemble size so the two cannot drift apart
# (with 20 models this is the same '\model_ensemble_20' folder as before).
model_dir = base_dir + r'\model_ensemble_' + str(total_num_model)

for i in range(total_num_model):
    # Release the Keras state of the previous member; building many models in
    # one process otherwise keeps accumulating memory.
    K.clear_session()

    # Each member is trained on its own resampled data file.
    sents, label_damage = read_damage_csv(base_dir + r'\ensemble_data\data'+str(i)+r'.csv')

    # The keys of word_vectors have the form 'word/POS', so every sentence is
    # tokenized into that same form.
    token_sents = [tokenize(sent) for sent in sents]

    vec_sents, label_damage = token2vec(token_sents, label_damage, word_vectors)

    sentences = np.asarray(vec_sents)
    new_sentences = zero_padding(sentences)
    #new_sentences = circular_padding(sentences)
    #new_sentences = reverse_padding(sentences)

    x = new_sentences
    y = np_utils.to_categorical(np.asarray(label_damage))
    # Shuffled 70/30 train/test split with a fixed seed.
    x_train, x_test, y_train, y_test = train_test_split(new_sentences,y,test_size=0.3,random_state=1, shuffle=True)

    # Small LSTM classifier (10 units) for this ensemble member.
    model = Sequential()
    model.add(LSTM(10, dropout=0.2, recurrent_dropout=0.2, activation='tanh'))
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    history = model.fit(x_train, y_train, batch_size=100, epochs=30, validation_data=(x_test, y_test), callbacks=[early_stopping])
    model.save(model_dir + r'\lstm_zeroPadding'+str(i)+'.h5')

Train on 4251 samples, validate on 1823 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Train on 4251 samples, validate on 1823 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Train on 4251 samples, validate on 1823 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 1

In [32]:
# Load the trained ensemble members for testing.
from keras.models import load_model

model_list = [] # holds the loaded models
total_num_model = 40 # number of models that were built
# Raw strings avoid the invalid '\m' and '\l' escapes of the previous literal;
# the folder name follows the ensemble size (same '\model_ensemble_40' path as before).
model_dir = base_dir + r'\model_ensemble_' + str(total_num_model)

for i in range(total_num_model):
    print("Model: "+str(i))
    model = load_model(model_dir + r'\lstm_zeroPadding' + str(i) + '.h5') # model saved by the training cell
    model_list.append(model)

Model: 0
Model: 1
Model: 2
Model: 3
Model: 4
Model: 5
Model: 6
Model: 7
Model: 8
Model: 9
Model: 10
Model: 11
Model: 12
Model: 13
Model: 14
Model: 15
Model: 16
Model: 17
Model: 18
Model: 19
Model: 20
Model: 21
Model: 22
Model: 23
Model: 24
Model: 25
Model: 26
Model: 27
Model: 28
Model: 29
Model: 30
Model: 31
Model: 32
Model: 33
Model: 34
Model: 35
Model: 36
Model: 37
Model: 38
Model: 39


In [33]:
# Running sum of the two-class outputs predicted by each ensemble member.
result = np.zeros((x_test.shape[0], 2))

for i in range(total_num_model):
    print(f'Model: {i}')
    model = model_list[i]
    result = result + model.predict(x_test)

# Predict with the average over all members.
result /= total_num_model

Model: 0
Model: 1
Model: 2
Model: 3
Model: 4
Model: 5
Model: 6
Model: 7
Model: 8
Model: 9
Model: 10
Model: 11
Model: 12
Model: 13
Model: 14
Model: 15
Model: 16
Model: 17
Model: 18
Model: 19
Model: 20
Model: 21
Model: 22
Model: 23
Model: 24
Model: 25
Model: 26
Model: 27
Model: 28
Model: 29
Model: 30
Model: 31
Model: 32
Model: 33
Model: 34
Model: 35
Model: 36
Model: 37
Model: 38
Model: 39


In [34]:
# Class 1 is chosen when its value exceeds 0.5; the same rule turns the
# one-hot test labels back into 0/1 labels.
sample_range = range(len(result))
y_pred_ensemble = [1 if result[i, 1] > 0.5 else 0 for i in sample_range]
y_real = [1 if y_test[i, 1] > 0.5 else 0 for i in sample_range]

In [35]:
# Ensemble test result: share of samples whose predicted label equals the true label.
np.equal(np.array(y_pred_ensemble), np.array(y_real)).sum() / len(result)

0.854320987654321