In [1]:
import pandas as pd
import numpy as np
import os
import collections
import pickle
import datetime
import re
from tqdm import trange
import argparse
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm
from gensim.models import Word2Vec

In [3]:
with open('../LSTM/train_kr_all.pkl', 'rb') as f :
    train_kr = pickle.load(f)

In [4]:
train_kr['temp_code'] = train_kr.Script_num.map(lambda x : '-'.join(x.split('-')[:3]))

In [5]:
temp_code = 'UL-MV-01'
temp_script = []
temp_category = []
temp = ''
for i, row in train_kr.iterrows() :
    if temp_code == row.temp_code :
        temp += ' ' + ' '.join(row.Script)
    else :
        temp_code = row.temp_code
        temp_script.append(temp.strip())
        temp_category.append(row.Category)
        temp = ''

In [6]:
temp_category = np.array(temp_category)

In [8]:
count_vectorizer = CountVectorizer(min_df = 10)

In [73]:
raw_data = pd.DataFrame(count_vectorizer.fit_transform(temp_script).toarray())

In [75]:
cv = KFold(n_splits = 5, shuffle = True, random_state = 7)

In [76]:
gnb = GaussianNB()

In [77]:
for train_index, test_index in cv.split(raw_data) :
    gnb.fit(raw_data.iloc[train_index], temp_category[train_index])
    yhat = gnb.predict(raw_data.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))

             precision    recall  f1-score   support

         도선       0.39      0.40      0.40        47
         선박       0.37      0.43      0.40        53
         양묘       0.08      0.69      0.14        16
         육상       0.54      0.17      0.25       133
         이동       0.16      0.24      0.19        49
         입항       0.23      0.31      0.26        54
         접안       0.23      0.32      0.27        53
         출항       0.58      0.21      0.30        92
         통과       0.37      0.11      0.17       100
         투묘       0.31      0.25      0.28        40
         횡단       0.27      0.52      0.36        23

avg / total       0.38      0.26      0.27       660

             precision    recall  f1-score   support

         도선       0.33      0.43      0.38        35
         선박       0.34      0.29      0.31        55
         양묘       0.05      0.29      0.09        21
         육상       0.33      0.18      0.23       125
         이동       0.24      0.33      0.28

In [78]:
bnb = BernoulliNB()

In [79]:
for train_index, test_index in cv.split(raw_data) :
    bnb.fit(raw_data.iloc[train_index], temp_category[train_index])
    yhat = bnb.predict(raw_data.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))

             precision    recall  f1-score   support

         도선       0.75      0.77      0.76        47
         선박       0.69      0.81      0.75        53
         양묘       0.55      0.38      0.44        16
         육상       0.76      0.68      0.71       133
         이동       0.62      0.71      0.67        49
         입항       0.81      0.72      0.76        54
         접안       0.65      0.85      0.74        53
         출항       0.96      0.80      0.88        92
         통과       0.75      0.72      0.73       100
         투묘       0.62      0.65      0.63        40
         횡단       0.56      0.78      0.65        23

avg / total       0.74      0.73      0.73       660

             precision    recall  f1-score   support

         도선       0.53      0.71      0.61        35
         선박       0.62      0.76      0.68        55
         양묘       0.58      0.33      0.42        21
         육상       0.66      0.63      0.64       125
         이동       0.71      0.70      0.71

In [80]:
mnb = MultinomialNB()

In [81]:
for train_index, test_index in cv.split(raw_data) :
    mnb.fit(raw_data.iloc[train_index], temp_category[train_index])
    yhat = mnb.predict(raw_data.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))

             precision    recall  f1-score   support

         도선       0.78      0.74      0.76        47
         선박       0.83      0.75      0.79        53
         양묘       0.73      0.50      0.59        16
         육상       0.76      0.69      0.72       133
         이동       0.58      0.71      0.64        49
         입항       0.82      0.74      0.78        54
         접안       0.66      0.81      0.73        53
         출항       0.88      0.83      0.85        92
         통과       0.72      0.71      0.72       100
         투묘       0.61      0.68      0.64        40
         횡단       0.52      0.74      0.61        23

avg / total       0.74      0.73      0.74       660

             precision    recall  f1-score   support

         도선       0.56      0.69      0.62        35
         선박       0.78      0.64      0.70        55
         양묘       0.80      0.38      0.52        21
         육상       0.65      0.70      0.67       125
         이동       0.65      0.70      0.67

### Using tf-idf

In [83]:
tf = TfidfVectorizer(vocabulary = count_vectorizer.vocabulary_.keys())

In [84]:
raw_data2 = pd.DataFrame(tf.fit_transform(temp_script).toarray())

In [86]:
xgbc = xgboost.XGBClassifier(max_depth = 4, n_jobs = 12, n_estimators = 500)

In [87]:
for train_index, test_index in cv.split(raw_data) :
    xgbc.fit(raw_data.iloc[train_index], temp_category[train_index])
    yhat = xgbc.predict(raw_data.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))

             precision    recall  f1-score   support

         도선       0.76      0.66      0.70        47
         선박       0.79      0.83      0.81        53
         양묘       0.60      0.75      0.67        16
         육상       0.73      0.72      0.73       133
         이동       0.76      0.71      0.74        49
         입항       0.80      0.74      0.77        54
         접안       0.85      0.89      0.87        53
         출항       0.80      0.85      0.83        92
         통과       0.80      0.72      0.76       100
         투묘       0.69      0.78      0.73        40
         횡단       0.62      0.78      0.69        23

avg / total       0.77      0.76      0.76       660

             precision    recall  f1-score   support

         도선       0.61      0.71      0.66        35
         선박       0.83      0.71      0.76        55
         양묘       0.75      0.57      0.65        21
         육상       0.61      0.72      0.66       125
         이동       0.76      0.74      0.75

In [88]:
xgbc2 = xgboost.XGBClassifier(max_depth = 4, n_jobs = 12, n_estimators = 500)

In [89]:
for train_index, test_index in cv.split(raw_data2) :
    xgbc2.fit(raw_data2.iloc[train_index], temp_category[train_index])
    yhat = xgbc2.predict(raw_data2.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))

             precision    recall  f1-score   support

         도선       0.79      0.64      0.71        47
         선박       0.73      0.81      0.77        53
         양묘       0.57      0.75      0.65        16
         육상       0.71      0.74      0.72       133
         이동       0.70      0.71      0.71        49
         입항       0.84      0.76      0.80        54
         접안       0.88      0.83      0.85        53
         출항       0.81      0.83      0.82        92
         통과       0.73      0.69      0.71       100
         투묘       0.73      0.75      0.74        40
         횡단       0.60      0.65      0.63        23

avg / total       0.75      0.75      0.75       660

             precision    recall  f1-score   support

         도선       0.70      0.74      0.72        35
         선박       0.84      0.75      0.79        55
         양묘       0.65      0.62      0.63        21
         육상       0.59      0.70      0.64       125
         이동       0.68      0.68      0.68

In [19]:
rfc = RandomForestClassifier(max_depth = 15, n_jobs = 12, n_estimators = 1000)

In [116]:
for train_index, test_index in cv.split(raw_data) :
    rfc.fit(raw_data.iloc[train_index], temp_category[train_index])
    yhat = rfc.predict(raw_data.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))

             precision    recall  f1-score   support

         도선       0.85      0.60      0.70        47
         선박       0.89      0.60      0.72        53
         양묘       0.86      0.38      0.52        16
         육상       0.60      0.82      0.69       133
         이동       0.71      0.71      0.71        49
         입항       0.90      0.69      0.78        54
         접안       0.82      0.89      0.85        53
         출항       0.81      0.87      0.84        92
         통과       0.76      0.78      0.77       100
         투묘       0.77      0.60      0.68        40
         횡단       0.73      0.70      0.71        23

avg / total       0.77      0.75      0.74       660

             precision    recall  f1-score   support

         도선       0.72      0.60      0.66        35
         선박       0.82      0.51      0.63        55
         양묘       0.57      0.19      0.29        21
         육상       0.52      0.77      0.62       125
         이동       0.69      0.65      0.67

In [20]:
rfc2 = RandomForestClassifier(max_depth = None, n_jobs = 12, n_estimators = 1000)

In [117]:
for train_index, test_index in cv.split(raw_data2) :
    rfc2.fit(raw_data2.iloc[train_index], temp_category[train_index])
    yhat = rfc2.predict(raw_data2.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))

             precision    recall  f1-score   support

         도선       0.82      0.68      0.74        47
         선박       0.79      0.72      0.75        53
         양묘       0.62      0.31      0.42        16
         육상       0.68      0.83      0.75       133
         이동       0.78      0.73      0.76        49
         입항       0.86      0.70      0.78        54
         접안       0.82      0.85      0.83        53
         출항       0.78      0.86      0.82        92
         통과       0.75      0.76      0.76       100
         투묘       0.74      0.65      0.69        40
         횡단       0.73      0.70      0.71        23

avg / total       0.76      0.76      0.76       660

             precision    recall  f1-score   support

         도선       0.73      0.69      0.71        35
         선박       0.86      0.56      0.68        55
         양묘       0.55      0.29      0.37        21
         육상       0.52      0.78      0.63       125
         이동       0.60      0.63      0.62

In [22]:
lrc = LogisticRegression(C = 0.8, max_iter = 500, n_jobs = 12, solver = 'sag')

In [160]:
for train_index, test_index in cv.split(raw_data2) :
    lrc.fit(raw_data2.iloc[train_index], temp_category[train_index])
    yhat = lrc.predict(raw_data2.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))



              precision    recall  f1-score   support

          도선       0.79      0.66      0.72        47
          선박       0.90      0.72      0.80        53
          양묘       0.75      0.56      0.64        16
          육상       0.65      0.83      0.73       133
          이동       0.73      0.67      0.70        49
          입항       0.87      0.76      0.81        54
          접안       0.91      0.81      0.86        53
          출항       0.80      0.83      0.81        92
          통과       0.72      0.75      0.74       100
          투묘       0.64      0.62      0.63        40
          횡단       0.74      0.61      0.67        23

   micro avg       0.75      0.75      0.75       660
   macro avg       0.77      0.71      0.74       660
weighted avg       0.76      0.75      0.75       660





              precision    recall  f1-score   support

          도선       0.75      0.60      0.67        35
          선박       0.81      0.62      0.70        55
          양묘       0.67      0.38      0.48        21
          육상       0.54      0.80      0.65       125
          이동       0.67      0.67      0.67        57
          입항       0.81      0.74      0.78        47
          접안       0.98      0.88      0.93        51
          출항       0.75      0.84      0.79        98
          통과       0.80      0.74      0.77        89
          투묘       0.81      0.63      0.71        54
          횡단       0.93      0.46      0.62        28

   micro avg       0.72      0.72      0.72       660
   macro avg       0.77      0.67      0.71       660
weighted avg       0.75      0.72      0.72       660





              precision    recall  f1-score   support

          도선       0.53      0.48      0.50        21
          선박       0.79      0.50      0.61        52
          양묘       0.79      0.50      0.61        30
          육상       0.59      0.83      0.69       132
          이동       0.78      0.68      0.73        66
          입항       0.81      0.70      0.75        54
          접안       0.89      0.83      0.86        47
          출항       0.74      0.75      0.74        95
          통과       0.70      0.75      0.73        93
          투묘       0.76      0.70      0.73        50
          횡단       0.83      0.50      0.62        20

   micro avg       0.71      0.71      0.71       660
   macro avg       0.75      0.66      0.69       660
weighted avg       0.73      0.71      0.71       660





              precision    recall  f1-score   support

          도선       0.88      0.70      0.78        43
          선박       0.82      0.56      0.67        55
          양묘       0.60      0.21      0.32        14
          육상       0.53      0.85      0.65       123
          이동       0.82      0.83      0.82        59
          입항       0.82      0.62      0.71        45
          접안       0.95      0.84      0.89        50
          출항       0.81      0.79      0.80        91
          통과       0.79      0.77      0.78        96
          투묘       0.78      0.67      0.72        58
          횡단       0.88      0.54      0.67        26

   micro avg       0.74      0.74      0.74       660
   macro avg       0.79      0.67      0.71       660
weighted avg       0.77      0.74      0.74       660





              precision    recall  f1-score   support

          도선       0.76      0.53      0.63        30
          선박       0.76      0.59      0.67        49
          양묘       0.89      0.57      0.70        14
          육상       0.60      0.86      0.71       128
          이동       0.81      0.73      0.77        64
          입항       0.84      0.65      0.74        49
          접안       0.93      0.75      0.83        53
          출항       0.87      0.81      0.84        97
          통과       0.74      0.85      0.79       113
          투묘       0.82      0.70      0.76        47
          횡단       1.00      0.56      0.72        16

   micro avg       0.76      0.76      0.76       660
   macro avg       0.82      0.69      0.74       660
weighted avg       0.78      0.76      0.76       660



In [23]:
lrc2 = LogisticRegression(C = 0.8, max_iter = 500, n_jobs = 12, solver = 'sag')

In [163]:
for train_index, test_index in cv.split(raw_data) :
    lrc2.fit(raw_data.iloc[train_index], temp_category[train_index])
    yhat = lrc2.predict(raw_data.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))



              precision    recall  f1-score   support

          도선       0.77      0.64      0.70        47
          선박       0.76      0.85      0.80        53
          양묘       0.65      0.69      0.67        16
          육상       0.68      0.69      0.69       133
          이동       0.73      0.67      0.70        49
          입항       0.84      0.78      0.81        54
          접안       0.86      0.83      0.85        53
          출항       0.83      0.83      0.83        92
          통과       0.69      0.74      0.71       100
          투묘       0.64      0.68      0.66        40
          횡단       0.65      0.65      0.65        23

   micro avg       0.74      0.74      0.74       660
   macro avg       0.74      0.73      0.73       660
weighted avg       0.74      0.74      0.74       660





              precision    recall  f1-score   support

          도선       0.69      0.69      0.69        35
          선박       0.79      0.67      0.73        55
          양묘       0.75      0.57      0.65        21
          육상       0.55      0.70      0.62       125
          이동       0.69      0.67      0.68        57
          입항       0.72      0.77      0.74        47
          접안       0.90      0.88      0.89        51
          출항       0.81      0.82      0.81        98
          통과       0.77      0.74      0.75        89
          투묘       0.83      0.65      0.73        54
          횡단       0.71      0.54      0.61        28

   micro avg       0.72      0.72      0.72       660
   macro avg       0.75      0.70      0.72       660
weighted avg       0.73      0.72      0.72       660





              precision    recall  f1-score   support

          도선       0.56      0.48      0.51        21
          선박       0.83      0.73      0.78        52
          양묘       0.78      0.60      0.68        30
          육상       0.63      0.73      0.68       132
          이동       0.77      0.67      0.72        66
          입항       0.78      0.74      0.76        54
          접안       0.88      0.89      0.88        47
          출항       0.73      0.78      0.76        95
          통과       0.67      0.69      0.68        93
          투묘       0.71      0.72      0.71        50
          횡단       0.71      0.50      0.59        20

   micro avg       0.72      0.72      0.72       660
   macro avg       0.73      0.68      0.70       660
weighted avg       0.72      0.72      0.72       660





              precision    recall  f1-score   support

          도선       0.78      0.74      0.76        43
          선박       0.78      0.78      0.78        55
          양묘       0.45      0.36      0.40        14
          육상       0.60      0.76      0.67       123
          이동       0.78      0.80      0.79        59
          입항       0.71      0.67      0.69        45
          접안       0.93      0.84      0.88        50
          출항       0.80      0.77      0.79        91
          통과       0.78      0.75      0.77        96
          투묘       0.75      0.69      0.72        58
          횡단       0.83      0.58      0.68        26

   micro avg       0.74      0.74      0.74       660
   macro avg       0.75      0.70      0.72       660
weighted avg       0.75      0.74      0.74       660





              precision    recall  f1-score   support

          도선       0.72      0.60      0.65        30
          선박       0.79      0.69      0.74        49
          양묘       0.75      0.64      0.69        14
          육상       0.63      0.77      0.69       128
          이동       0.84      0.73      0.78        64
          입항       0.74      0.69      0.72        49
          접안       0.88      0.79      0.83        53
          출항       0.86      0.85      0.85        97
          통과       0.72      0.81      0.77       113
          투묘       0.82      0.70      0.76        47
          횡단       0.82      0.56      0.67        16

   micro avg       0.76      0.76      0.76       660
   macro avg       0.78      0.71      0.74       660
weighted avg       0.77      0.76      0.76       660



In [25]:
lgbmc = lightgbm.LGBMClassifier(n_jobs = 12, n_estimators = 500, max_depth = -1)

In [172]:
for train_index, test_index in cv.split(raw_data2) :
    lgbmc.fit(raw_data2.iloc[train_index].values, temp_category[train_index])
    yhat = lgbmc.predict(raw_data2.iloc[test_index].values)
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.80      0.60      0.68        47
          선박       0.70      0.75      0.73        53
          양묘       0.56      0.62      0.59        16
          육상       0.67      0.71      0.69       133
          이동       0.70      0.65      0.67        49
          입항       0.84      0.76      0.80        54
          접안       0.87      0.85      0.86        53
          출항       0.78      0.85      0.81        92
          통과       0.74      0.67      0.70       100
          투묘       0.60      0.70      0.64        40
          횡단       0.52      0.57      0.54        23

   micro avg       0.72      0.72      0.72       660
   macro avg       0.71      0.70      0.70       660
weighted avg       0.73      0.72      0.72       660

              precision    recall  f1-score   support

          도선       0.64      0.71      0.68        35
          선박       0.80      0.75      0.77        55
          양묘       0.57 

In [26]:
lgbmc2 = lightgbm.LGBMClassifier(n_jobs = 12, n_estimators = 500, max_depth = -1)

In [170]:
for train_index, test_index in cv.split(raw_data) :
    lgbmc2.fit(raw_data.iloc[train_index].values, temp_category[train_index])
    yhat = lgbmc2.predict(raw_data.iloc[test_index].values)
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.76      0.60      0.67        47
          선박       0.71      0.75      0.73        53
          양묘       0.55      0.69      0.61        16
          육상       0.70      0.71      0.70       133
          이동       0.70      0.71      0.71        49
          입항       0.80      0.76      0.78        54
          접안       0.85      0.87      0.86        53
          출항       0.78      0.86      0.82        92
          통과       0.81      0.72      0.76       100
          투묘       0.65      0.70      0.67        40
          횡단       0.62      0.65      0.64        23

   micro avg       0.74      0.74      0.74       660
   macro avg       0.72      0.73      0.72       660
weighted avg       0.74      0.74      0.74       660

              precision    recall  f1-score   support

          도선       0.60      0.69      0.64        35
          선박       0.71      0.76      0.74        55
          양묘       0.48 

In [27]:
from sklearn.svm import LinearSVC

In [28]:
lsvc = LinearSVC(C = 0.7)

In [189]:
for train_index, test_index in cv.split(raw_data2) :
    lsvc.fit(raw_data2.iloc[train_index].values, temp_category[train_index])
    yhat = lsvc.predict(raw_data2.iloc[test_index].values)
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.78      0.66      0.71        47
          선박       0.78      0.81      0.80        53
          양묘       0.75      0.75      0.75        16
          육상       0.71      0.71      0.71       133
          이동       0.71      0.65      0.68        49
          입항       0.84      0.78      0.81        54
          접안       0.83      0.83      0.83        53
          출항       0.81      0.85      0.83        92
          통과       0.70      0.74      0.72       100
          투묘       0.69      0.72      0.71        40
          횡단       0.67      0.70      0.68        23

   micro avg       0.75      0.75      0.75       660
   macro avg       0.75      0.75      0.75       660
weighted avg       0.75      0.75      0.75       660

              precision    recall  f1-score   support

          도선       0.72      0.74      0.73        35
          선박       0.74      0.67      0.70        55
          양묘       0.76 

In [29]:
lsvc2 = LinearSVC(C = 0.7)

In [191]:
for train_index, test_index in cv.split(raw_data) :
    lsvc2.fit(raw_data.iloc[train_index].values, temp_category[train_index])
    yhat = lsvc2.predict(raw_data.iloc[test_index].values)
    print(classification_report(temp_category[test_index], yhat))



              precision    recall  f1-score   support

          도선       0.70      0.66      0.68        47
          선박       0.78      0.87      0.82        53
          양묘       0.60      0.75      0.67        16
          육상       0.72      0.63      0.67       133
          이동       0.65      0.63      0.64        49
          입항       0.76      0.76      0.76        54
          접안       0.80      0.85      0.83        53
          출항       0.79      0.83      0.81        92
          통과       0.69      0.66      0.67       100
          투묘       0.63      0.72      0.67        40
          횡단       0.71      0.74      0.72        23

   micro avg       0.72      0.72      0.72       660
   macro avg       0.71      0.74      0.72       660
weighted avg       0.72      0.72      0.72       660





              precision    recall  f1-score   support

          도선       0.61      0.71      0.66        35
          선박       0.75      0.69      0.72        55
          양묘       0.55      0.52      0.54        21
          육상       0.59      0.63      0.61       125
          이동       0.69      0.67      0.68        57
          입항       0.65      0.72      0.69        47
          접안       0.90      0.90      0.90        51
          출항       0.80      0.80      0.80        98
          통과       0.72      0.73      0.73        89
          투묘       0.76      0.59      0.67        54
          횡단       0.68      0.61      0.64        28

   micro avg       0.70      0.70      0.70       660
   macro avg       0.70      0.69      0.69       660
weighted avg       0.71      0.70      0.70       660





              precision    recall  f1-score   support

          도선       0.48      0.48      0.48        21
          선박       0.75      0.63      0.69        52
          양묘       0.72      0.60      0.65        30
          육상       0.63      0.70      0.66       132
          이동       0.72      0.67      0.69        66
          입항       0.75      0.74      0.75        54
          접안       0.90      0.91      0.91        47
          출항       0.77      0.78      0.77        95
          통과       0.64      0.62      0.63        93
          투묘       0.64      0.74      0.69        50
          횡단       0.56      0.45      0.50        20

   micro avg       0.69      0.69      0.69       660
   macro avg       0.69      0.67      0.67       660
weighted avg       0.70      0.69      0.69       660





              precision    recall  f1-score   support

          도선       0.77      0.70      0.73        43
          선박       0.73      0.67      0.70        55
          양묘       0.40      0.43      0.41        14
          육상       0.53      0.67      0.59       123
          이동       0.69      0.71      0.70        59
          입항       0.71      0.67      0.69        45
          접안       0.93      0.86      0.90        50
          출항       0.74      0.70      0.72        91
          통과       0.77      0.72      0.74        96
          투묘       0.75      0.69      0.72        58
          횡단       0.76      0.62      0.68        26

   micro avg       0.70      0.70      0.70       660
   macro avg       0.71      0.68      0.69       660
weighted avg       0.71      0.70      0.70       660

              precision    recall  f1-score   support

          도선       0.67      0.60      0.63        30
          선박       0.85      0.69      0.76        49
          양묘       0.64 



### Word Embedding으로 calssification 해보면?

In [31]:
embedding_model = Word2Vec.load('../LSTM/kr_word2vec_all.model')

In [32]:
def get_mean_vector(sentence) :
    words = sentence.split()
    result = []
    for word in words :
        try :
            result.append(embedding_model.wv.get_vector(word))
        except :
            continue
    result = np.array(result)
    if result.sum() == 0 :
        result = np.zeros((1, 300))
    result = result.mean(axis = 0)
    return result
    

In [33]:
embedding_data = np.array(None)
for sentence in temp_script :
    embedding_data = np.append(embedding_data, get_mean_vector(sentence))

In [34]:
embedding_data = np.delete(embedding_data, 0)

In [35]:
embedding_data = embedding_data.reshape(-1, 300)

In [36]:
xgbc3 = xgboost.XGBClassifier(max_depth = 4, n_jobs = 12, n_estimators = 500, random_state = 777)

In [199]:
for train_index, test_index in cv.split(embedding_data) :
    xgbc3.fit(embedding_data[train_index], temp_category[train_index])
    yhat = xgbc3.predict(embedding_data[test_index])
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.73      0.68      0.70        47
          선박       0.71      0.66      0.69        53
          양묘       0.50      0.19      0.27        16
          육상       0.66      0.77      0.71       133
          이동       0.64      0.61      0.62        49
          입항       0.81      0.72      0.76        54
          접안       0.87      0.77      0.82        53
          출항       0.76      0.83      0.79        92
          통과       0.74      0.73      0.73       100
          투묘       0.59      0.65      0.62        40
          횡단       0.60      0.52      0.56        23

   micro avg       0.71      0.71      0.71       660
   macro avg       0.69      0.65      0.66       660
weighted avg       0.71      0.71      0.71       660

              precision    recall  f1-score   support

          도선       0.59      0.77      0.67        35
          선박       0.79      0.62      0.69        55
          양묘       0.57 

In [192]:
# lrc2 = LogisticRegression(C = 0.8, max_iter = 500, n_jobs = 12, solver = 'lbfgs')

In [193]:
# for train_index, test_index in cv.split(embedding_data) :
#     lrc2.fit(embedding_data[train_index], temp_category[train_index])
#     yhat = lrc2.predict(embedding_data[test_index])
#     print(classification_report(temp_category[test_index], yhat))

logistic regression은 성능이 너무 안나와서 코드 실행 x

### N-gram 사용

In [37]:
with open('../LSTM/data_for_classification.pickle', 'rb') as f :
    raw_data = pickle.load(f)

In [38]:
raw_data = raw_data[raw_data.Language == 'KR']

In [39]:
raw_data['temp_code'] = raw_data.Script_num.map(lambda x : '-'.join(x.split('-')[:3]))

In [40]:
raw_data.head()

Unnamed: 0,Script_num,Script,Language,Category,temp_code
0,UL-MV-01-02,아 울산VTS [SN] 감도 있습니까?,KR,이동,UL-MV-01
1,UL-MV-01-03,네 [SN],KR,이동,UL-MV-01
2,UL-MV-01-04,네 수고 많으십니다 본선 델타라인 통과해서 본선 1부두 접안하러 이동하겠습니다,KR,이동,UL-MV-01
3,UL-MV-01-05,들어오세요,KR,이동,UL-MV-01
4,UL-MV-01-06,네 저희 지금…,KR,이동,UL-MV-01


In [41]:
temp_code2 = 'UL-MV-01'
temp_script2 = []
temp_category2 = []
temp2 = ''
for i, row in raw_data.iterrows() :
    if temp_code2 == row.temp_code :
        temp2 += ' ' + ''.join(row.Script)
    else :
        temp_code2 = row.temp_code
        temp_script2.append(temp2.strip())
        temp_category2.append(row.Category)
        temp2 = ''

In [43]:
ngram_vectorizer = CountVectorizer(ngram_range = (1, 2), min_df = 10)

In [44]:
ngram = ngram_vectorizer.fit_transform(temp_script2)

In [45]:
ngram = ngram.toarray()

In [46]:
ngram.shape

(3300, 989)

In [55]:
xgbc3 = xgboost.XGBClassifier(max_depth = 5, n_jobs = 12, n_estimators = 500)

In [56]:
for train_index, test_index in cv.split(ngram) :
    xgbc3.fit(ngram[train_index], temp_category[train_index])
    yhat = xgbc3.predict(ngram[test_index])
    print(classification_report(temp_category[test_index], yhat))

             precision    recall  f1-score   support

         도선       0.78      0.74      0.76        47
         선박       0.80      0.83      0.81        53
         양묘       0.56      0.56      0.56        16
         육상       0.70      0.77      0.73       133
         이동       0.66      0.59      0.62        49
         입항       0.82      0.76      0.79        54
         접안       0.86      0.83      0.85        53
         출항       0.76      0.76      0.76        92
         통과       0.76      0.77      0.77       100
         투묘       0.56      0.60      0.58        40
         횡단       0.72      0.57      0.63        23

avg / total       0.74      0.74      0.74       660

             precision    recall  f1-score   support

         도선       0.69      0.77      0.73        35
         선박       0.71      0.73      0.72        55
         양묘       0.62      0.38      0.47        21
         육상       0.57      0.76      0.65       125
         이동       0.72      0.58      0.64

### Time information 반영

In [57]:
temp_code = 'UL-MV-01'
temp_script2 = []
temp_category2 = []
temp2 = ''
step = []
temp_step = 0
for i, row in train_kr.iterrows() :
    temp_step += 1
    if temp_code == row.temp_code :
        temp2 += ' ' + ' '.join(row.Script)
        temp_script2.append(temp2)
        temp_category2.append(row.Category)
        step.append(temp_step)
    else :
        temp_code = row.temp_code
        temp_script2.append(temp2)
        temp_category2.append(row.Category)
        temp2 = ''
        step.append(temp_step)
        temp_step = 0

In [91]:
time_tf = pd.DataFrame(tf.transform(temp_script2).toarray())

In [92]:
time_tf['category'] = temp_category2
time_tf['step'] = step

In [93]:
time_bow = pd.DataFrame(count_vectorizer.transform(temp_script2).toarray())

In [94]:
time_bow['category'] = temp_category2
time_bow['step'] = step

상위 3개 예측 category에 실제 label이 들어가면 정답

In [102]:
raw_data.shape

(3300, 464)

In [107]:
time_tf[time_tf.step == 2].shape

(3229, 466)

In [103]:
time_tf[time_tf.step == 3].shape

(3160, 466)

In [98]:
for i in range(1, 25) :
    data = time_tf[time_tf.step == i].drop(['category', 'step'], axis = 1)
    yhat = xgbc2.predict(data)
    print(classification_report(time_tf[time_tf.step == i].category.values, yhat), i)

             precision    recall  f1-score   support

         도선       0.23      0.43      0.30       175
         선박       0.60      0.81      0.69       264
         양묘       0.29      0.02      0.04        95
         육상       0.28      0.49      0.35       642
         이동       0.48      0.07      0.13       296
         입항       0.53      0.14      0.23       249
         접안       0.75      0.06      0.11       254
         출항       0.18      0.03      0.05       473
         통과       0.28      0.68      0.40       492
         투묘       0.47      0.03      0.05       248
         횡단       0.88      0.20      0.33       113

avg / total       0.39      0.32      0.26      3301
 1
             precision    recall  f1-score   support

         도선       0.34      0.41      0.37       175
         선박       0.69      0.85      0.77       253
         양묘       0.11      0.26      0.16        95
         육상       0.45      0.65      0.53       611
         이동       0.24      0.27      0.

             precision    recall  f1-score   support

         선박       0.00      0.00      0.00         0
         육상       1.00      0.80      0.89         5
         입항       1.00      1.00      1.00         1
         접안       0.00      0.00      0.00         1
         횡단       0.00      0.00      0.00         0

avg / total       0.86      0.71      0.78         7
 21
             precision    recall  f1-score   support

         육상       1.00      1.00      1.00         4

avg / total       1.00      1.00      1.00         4
 22
             precision    recall  f1-score   support

         육상       1.00      1.00      1.00         3

avg / total       1.00      1.00      1.00         3
 23
             precision    recall  f1-score   support

         육상       1.00      1.00      1.00         2

avg / total       1.00      1.00      1.00         2
 24


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [99]:
for i in range(1, 25) :
    data = time_bow[time_bow.step == i].drop(['category', 'step'], axis = 1)
    yhat = xgbc.predict(data)
    print(classification_report(time_bow[time_bow.step == i].category.values, yhat), i)

             precision    recall  f1-score   support

         도선       0.53      0.17      0.25       175
         선박       0.76      0.72      0.74       264
         양묘       0.21      0.03      0.06        95
         육상       0.24      0.89      0.38       642
         이동       0.53      0.06      0.11       296
         입항       0.58      0.10      0.18       249
         접안       0.75      0.06      0.11       254
         출항       0.35      0.02      0.04       473
         통과       0.54      0.51      0.53       492
         투묘       0.47      0.03      0.06       248
         횡단       0.86      0.21      0.34       113

avg / total       0.49      0.35      0.28      3301
 1
             precision    recall  f1-score   support

         도선       0.42      0.30      0.35       175
         선박       0.69      0.84      0.76       253
         양묘       0.11      0.31      0.16        95
         육상       0.33      0.78      0.47       611
         이동       0.58      0.10      0.

             precision    recall  f1-score   support

         선박       1.00      1.00      1.00         1
         육상       0.92      1.00      0.96        12
         입항       0.75      1.00      0.86         3
         접안       0.00      0.00      0.00         1
         출항       1.00      0.67      0.80         3
         통과       0.00      0.00      0.00         2
         투묘       1.00      1.00      1.00         1
         횡단       0.50      1.00      0.67         1

avg / total       0.78      0.83      0.80        24
 16
             precision    recall  f1-score   support

         선박       1.00      1.00      1.00         1
         육상       0.90      1.00      0.95         9
         입항       0.75      1.00      0.86         3
         접안       0.00      0.00      0.00         1
         출항       1.00      0.50      0.67         2
         통과       0.00      0.00      0.00         2
         투묘       1.00      1.00      1.00         1
         횡단       0.50      1.00      0

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [143]:
np.array(list(count_vectorizer.vocabulary_.keys()))[rfc.feature_importances_.argsort()[::-1]][:15]   # bow feature importance

array(['오른쪽', '그렇', '습니다', '떨어지', '여요', '외부', 'ㄴ다구요', '올라가', '하세', '붙이',
       '호출', 'ㅂ시오', '투묘하', '바지', '대도'], 
      dtype='<U14')

In [144]:
np.array(list(tf.vocabulary_.keys()))[rfc2.feature_importances_.argsort()[::-1]][:15]   # tf-idf feature importance

array(['길이', '바깥쪽', '앵커', 'calling', '교신', '리지', '전방', '서쪽', '포트', '코드',
       '미터', '아마', '건너가', 'is', '치고'], 
      dtype='<U14')

### 마지막 실험!! (Fasttext pre-trained vector 사용하기) 

In [2]:
from gensim.models.wrappers import FastText
import pickle

In [3]:
#! wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.ko.zip

In [4]:
ftmodel = FastText.load_fasttext_format('wiki.ko')

In [5]:
with open('../LSTM/train_kr_all.pkl', 'rb') as f :
    raw_data = pickle.load(f)

In [6]:
word_vocab = ftmodel.wv.index2word

In [7]:
raw_data.head()

Unnamed: 0,Script_num,Script,Language,Category
0,UL-MV-01-02,"[아, 아, 울산, vts, sn, 감도, 있, 습니까, ?]",KR,이동
1,UL-MV-01-03,"[네, sn]",KR,이동
2,UL-MV-01-04,"[네, 수고, 많, 으시, ㅂ니다, 보, ㄴ, 선, 델타, 라인, 통과, 하, 어서...",KR,이동
3,UL-MV-01-05,"[들어오, 세요]",KR,이동
4,UL-MV-01-06,"[네, 저희, 지금]",KR,이동


In [8]:
# new_script = []
# for i, row in raw_data.iterrows() :
#     temp = [word for word in row.Script if word in word_vocab]
#     new_script.append(temp)  

In [9]:
# with open('temp.pkl', 'wb') as f :
#     pickle.dump(new_script, f)

In [10]:
with open('temp.pkl', 'rb') as f :
    new_script = pickle.load(f)

In [11]:
raw_data.Script = new_script

In [12]:
raw_data['temp_code'] = raw_data.Script_num.map(lambda x : '-'.join(x.split('-')[:3]))

In [13]:
temp_code = 'UL-MV-01'
temp_script = []
temp_category = []
temp = ''
temp_codes = []
for i, row in raw_data.iterrows() :
    if temp_code == row.temp_code :
        temp += ' ' + ' '.join(row.Script)
    else :
        temp_codes.append(temp_code)
        temp_code = row.temp_code
        temp_script.append(temp.strip())
        temp_category.append(row.Category)
        temp = ''

In [34]:
len(word_vocab)

879129

In [33]:
len(set(' '.join(temp_script).split()))

2606

In [15]:
word_vocab.index('분류')

8

In [16]:
word_mean = []
temp = []
for text in temp_script :
        for word in text :
            try :
                temp.append(ftmodel[word])
            except :
                pass
        if len(temp) == 0 :
            word_mean.append(np.zeros(300).tolist())
        else :
            word_mean.append(np.array(temp).mean(axis = 0).tolist())
        temp = []

In [17]:
word_mean = np.array(word_mean)

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
lbl = LabelEncoder()

In [20]:
temp_category = lbl.fit_transform(temp_category)

In [21]:
for vector, label, filename in zip(word_mean, temp_category, temp_codes) :
    inputs = {'mean_data' : vector, 'label' : label}
    with open('../LSTM/data/{}'.format(filename), 'wb') as f :
        pickle.dump(inputs, f)

In [22]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
import collections
import pickle
import datetime
import re
from tqdm import trange
import argparse
import pprint
from sklearn.metrics import classification_report


def _weight_variable(layer_name, shape):
    init = tf.random_normal_initializer(mean=0.0, stddev=0.01)
    return tf.get_variable(
        layer_name + "_w", shape=shape,
        initializer=init)

def _bias_variable(layer_name, shape):
    init = tf.constant_initializer(value=shape)
    return tf.get_variable(
        layer_name + "_bias", shape=shape,
        initializer=init)

#model = params.model

batch_size = 256
total_epoch = 300

is_training = True
mean_data = word_mean
labels = temp_category
train_x_placeholder = tf.placeholder(tf.float32, shape = [None, 1, 300])
labels_placeholder = tf.placeholder(tf.int64, shape = [None, 1])
#seqlen_placeholder = tf.placeholder(tf.int32, shape = [None])

max_step = int(len(mean_data) / batch_size)

train_index = np.random.choice(len(mean_data), int(len(mean_data) * 0.7), replace=False)
valid_index = np.arange(len(mean_data))
valid_index = np.delete(valid_index, train_index)

if is_training :
    cell = tf.contrib.rnn.DropoutWrapper(tf.nn.rnn_cell.LSTMCell(64), output_keep_prob = 1)
# create a RNN cell composed sequentially of a number of RNNCells
else :
    cell = tf.nn.rnn_cell.LSTMCell(64)

# 'outputs' is a tensor of shape [batch_size, max_time, 256]
# 'state' is a N-tuple where N is the number of LSTMCells containing a
# tf.contrib.rnn.LSTMStateTuple for each cell
outputs, state = tf.nn.dynamic_rnn(cell = cell,
                                   inputs = train_x_placeholder,
#                                   sequence_length = seqlen_placeholder,
                                   time_major = False,
                                   dtype = tf.float32)

layer_name = 'dense_1'
with tf.variable_scope(layer_name, reuse=tf.AUTO_REUSE):
    w_f1 = _weight_variable(layer_name, [64, 32])
    b_f1 = _bias_variable(layer_name, [32])
    outputs = tf.nn.bias_add(tf.matmul(outputs[:, -1], w_f1), b_f1)
    outputs = tf.nn.sigmoid(outputs)
    if is_training:
        outputs = tf.nn.dropout(outputs, keep_prob=1)

layer_name = 'dense_2'
with tf.variable_scope(layer_name, reuse=tf.AUTO_REUSE):
    w_f2 = _weight_variable(layer_name, [32, 16])
    b_f2 = _bias_variable(layer_name, [16])
    outputs = tf.nn.bias_add(tf.matmul(outputs, w_f2), b_f2)
    outputs = tf.nn.sigmoid(outputs)
    if is_training:
        outputs = tf.nn.dropout(outputs, keep_prob=1)

outputs = tf.layers.dense(inputs = outputs, units = 11, name = 'dense_3', activation = tf.nn.sigmoid)
prediction = tf.argmax(outputs, 1)
accuracy = tf.reduce_mean(tf.cast(tf.equal(labels_placeholder, prediction), tf.float32))
probablity = tf.nn.softmax(outputs, name = 'softmax_tensor')
# outputs = tf.reshape(outputs, [-1, 64])
# outputs = tf.contrib.layers.fully_connected(outputs, 18, activation_fn=None)
# outputs = tf.reshape(outputs, [params.batch_size, 24, 18])

#outputs = tf.contrib.layers.fully_connected(outputs, 24, activation_fn = None)
# biases = tf.Variable(tf.random_normal([18]))

strat_lr = 0.01
global_step = tf.train.get_or_create_global_step()
learning_rate = tf.train.exponential_decay(strat_lr, global_step,
                                   20, 0.99, staircase = True)

# sequence_loss = tf.contrib.seq2seq.sequence_loss(logits = outputs, targets = labels_placeholder, weights = weights)
loss = tf.losses.sparse_softmax_cross_entropy(labels = labels_placeholder, logits = outputs)
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate)
train_op = optimizer.minimize(loss, global_step = global_step)

In [28]:
train_x = mean_data[train_index]
valid_x = mean_data[valid_index].reshape(-1, 1, 300)
train_label = temp_category[train_index]
valid_label = temp_category[valid_index]
t = trange(total_epoch)
best_saver = tf.train.Saver(max_to_keep=1)
with tf.Session() as sess :
    sess.run(tf.global_variables_initializer())
    best_acc = 0
    for epoch in t :
        for step in range(max_step) :
            if step == max_step :
                batch_x = train_x[batch_size * step : len(train_x)]
                batch_labels = train_label[batch_size * step : len(train_x)]
               # batch_length = train_length[batch_size * step : len(train_x)]
            else :
                batch_x = train_x[batch_size * step : batch_size * (step + 1)]
                batch_labels = train_label[batch_size * step : batch_size * (step + 1)]
                #batch_length = train_length[batch_size * step : batch_size * (step + 1)]

            batch_x = batch_x.reshape(-1, 1, 300)
            batch_labels = batch_labels.reshape(-1, 1)

            if is_training :
                _, loss_val = sess.run([train_op, loss],
                        feed_dict = {train_x_placeholder : batch_x,
                                labels_placeholder : batch_labels,
                                #seqlen_placeholder : batch_length
                                    })
            acc_val, yhat = sess.run([accuracy, prediction],
                feed_dict = {train_x_placeholder : valid_x,
                        labels_placeholder : valid_label.reshape(-1, 1),
                        #seqlen_placeholder : valid_length
                            })
            if acc_val > best_acc :
                best_acc = acc_val
                best_saver.save(sess, os.path.join(os.getcwd(), 'model', 'mean'))
            t.set_postfix(best_acc = best_acc, loss = loss_val)

    print(classification_report(valid_label, yhat))


  0%|          | 0/300 [00:00<?, ?it/s][A
100%|██████████| 300/300 [00:13<00:00, 21.64it/s, best_acc=0.198, loss=0]   


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        47
          1       0.00      0.00      0.00        80
          2       0.00      0.00      0.00        28
          3       0.20      1.00      0.33       196
          4       0.00      0.00      0.00        91
          5       0.00      0.00      0.00        72
          6       0.00      0.00      0.00        72
          7       0.00      0.00      0.00       152
          8       0.00      0.00      0.00       137
          9       0.00      0.00      0.00        86
         10       0.00      0.00      0.00        29

avg / total       0.04      0.20      0.07       990



  'precision', 'predicted', average, warn_for)
