In [1]:
import pandas as pd
import numpy as np
import os
import collections
import pickle
import datetime
import re
from tqdm import trange
import argparse
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold

In [108]:
from sklearn.preprocessing import LabelEncoder

In [109]:
with open('../LSTM/train_kr_all.pkl', 'rb') as f :
    train_kr = pickle.load(f)

In [110]:
train_kr['temp_code'] = train_kr.Script_num.map(lambda x : '-'.join(x.split('-')[:3]))

In [111]:
temp_code = 'UL-MV-01'
temp_script = []
temp_category = []
temp = ''
for i, row in train_kr.iterrows() :
    if temp_code == row.temp_code :
        temp += ' ' + ' '.join(row.Script)
    else :
        temp_code = row.temp_code
        temp_script.append(temp.strip())
        temp_category.append(row.Category)
        temp = ''

In [112]:
temp_category = np.array(temp_category)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [115]:
count_vectorizer = CountVectorizer(min_df = 10)

In [116]:
raw_data = pd.DataFrame(count_vectorizer.fit_transform(temp_script).toarray())

In [119]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

In [120]:
cv = KFold(n_splits = 5, shuffle = True, random_state = 7)

In [121]:
gnb = GaussianNB()

In [122]:
for train_index, test_index in cv.split(raw_data) :
    gnb.fit(raw_data.iloc[train_index], temp_category[train_index])
    yhat = gnb.predict(raw_data.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.39      0.40      0.40        47
          선박       0.37      0.43      0.40        53
          양묘       0.08      0.69      0.14        16
          육상       0.54      0.17      0.25       133
          이동       0.16      0.24      0.19        49
          입항       0.23      0.31      0.26        54
          접안       0.23      0.32      0.27        53
          출항       0.58      0.21      0.30        92
          통과       0.37      0.11      0.17       100
          투묘       0.31      0.25      0.28        40
          횡단       0.27      0.52      0.36        23

   micro avg       0.26      0.26      0.26       660
   macro avg       0.32      0.33      0.27       660
weighted avg       0.38      0.26      0.27       660

              precision    recall  f1-score   support

          도선       0.33      0.43      0.38        35
          선박       0.34      0.29      0.31        55
          양묘       0.05 

In [123]:
bnb = BernoulliNB()

In [124]:
for train_index, test_index in cv.split(raw_data) :
    bnb.fit(raw_data.iloc[train_index], temp_category[train_index])
    yhat = bnb.predict(raw_data.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.75      0.77      0.76        47
          선박       0.69      0.81      0.75        53
          양묘       0.55      0.38      0.44        16
          육상       0.76      0.68      0.71       133
          이동       0.62      0.71      0.67        49
          입항       0.81      0.72      0.76        54
          접안       0.65      0.85      0.74        53
          출항       0.96      0.80      0.88        92
          통과       0.75      0.72      0.73       100
          투묘       0.62      0.65      0.63        40
          횡단       0.56      0.78      0.65        23

   micro avg       0.73      0.73      0.73       660
   macro avg       0.70      0.72      0.70       660
weighted avg       0.74      0.73      0.73       660

              precision    recall  f1-score   support

          도선       0.53      0.71      0.61        35
          선박       0.62      0.76      0.68        55
          양묘       0.58 

In [125]:
mnb = MultinomialNB()

In [126]:
for train_index, test_index in cv.split(raw_data) :
    mnb.fit(raw_data.iloc[train_index], temp_category[train_index])
    yhat = mnb.predict(raw_data.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.78      0.74      0.76        47
          선박       0.83      0.75      0.79        53
          양묘       0.73      0.50      0.59        16
          육상       0.76      0.69      0.72       133
          이동       0.58      0.71      0.64        49
          입항       0.82      0.74      0.78        54
          접안       0.66      0.81      0.73        53
          출항       0.88      0.83      0.85        92
          통과       0.72      0.71      0.72       100
          투묘       0.61      0.68      0.64        40
          횡단       0.52      0.74      0.61        23

   micro avg       0.73      0.73      0.73       660
   macro avg       0.72      0.72      0.71       660
weighted avg       0.74      0.73      0.74       660

              precision    recall  f1-score   support

          도선       0.56      0.69      0.62        35
          선박       0.78      0.64      0.70        55
          양묘       0.80 

### Using tf-idf

In [127]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [132]:
tf = TfidfVectorizer(vocabulary = count_vectorizer.vocabulary_.keys())

In [133]:
raw_data2 = pd.DataFrame(tf.fit_transform(temp_script).toarray())

In [134]:
import xgboost

In [146]:
xgbc = xgboost.XGBClassifier(max_depth = 4, n_jobs = 12, n_estimators = 500)

In [147]:
for train_index, test_index in cv.split(raw_data) :
    xgbc.fit(raw_data.iloc[train_index], temp_category[train_index])
    yhat = xgbc.predict(raw_data.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.76      0.66      0.70        47
          선박       0.79      0.83      0.81        53
          양묘       0.60      0.75      0.67        16
          육상       0.73      0.72      0.73       133
          이동       0.76      0.71      0.74        49
          입항       0.80      0.74      0.77        54
          접안       0.85      0.89      0.87        53
          출항       0.80      0.85      0.83        92
          통과       0.80      0.72      0.76       100
          투묘       0.69      0.78      0.73        40
          횡단       0.62      0.78      0.69        23

   micro avg       0.76      0.76      0.76       660
   macro avg       0.75      0.77      0.75       660
weighted avg       0.77      0.76      0.76       660

              precision    recall  f1-score   support

          도선       0.61      0.71      0.66        35
          선박       0.83      0.71      0.76        55
          양묘       0.75 

In [148]:
xgbc2 = xgboost.XGBClassifier(max_depth = 4, n_jobs = 12, n_estimators = 500)

In [149]:
for train_index, test_index in cv.split(raw_data2) :
    xgbc2.fit(raw_data2.iloc[train_index], temp_category[train_index])
    yhat = xgbc2.predict(raw_data2.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.78      0.66      0.71        47
          선박       0.73      0.83      0.78        53
          양묘       0.56      0.62      0.59        16
          육상       0.69      0.74      0.71       133
          이동       0.71      0.71      0.71        49
          입항       0.88      0.78      0.82        54
          접안       0.90      0.85      0.87        53
          출항       0.82      0.82      0.82        92
          통과       0.73      0.68      0.70       100
          투묘       0.69      0.72      0.71        40
          횡단       0.62      0.70      0.65        23

   micro avg       0.75      0.75      0.75       660
   macro avg       0.74      0.74      0.73       660
weighted avg       0.75      0.75      0.75       660

              precision    recall  f1-score   support

          도선       0.70      0.74      0.72        35
          선박       0.82      0.76      0.79        55
          양묘       0.67 

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [150]:
rfc = RandomForestClassifier(max_depth = 15, n_jobs = 12, n_estimators = 1000)

In [151]:
for train_index, test_index in cv.split(raw_data2) :
    rfc.fit(raw_data2.iloc[train_index], temp_category[train_index])
    yhat = rfc.predict(raw_data2.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.77      0.57      0.66        47
          선박       0.94      0.55      0.69        53
          양묘       0.71      0.31      0.43        16
          육상       0.59      0.87      0.70       133
          이동       0.80      0.71      0.75        49
          입항       0.88      0.70      0.78        54
          접안       0.85      0.85      0.85        53
          출항       0.81      0.85      0.83        92
          통과       0.75      0.76      0.75       100
          투묘       0.80      0.60      0.69        40
          횡단       0.76      0.70      0.73        23

   micro avg       0.74      0.74      0.74       660
   macro avg       0.79      0.68      0.72       660
weighted avg       0.77      0.74      0.74       660

              precision    recall  f1-score   support

          도선       0.72      0.60      0.66        35
          선박       0.85      0.40      0.54        55
          양묘       0.62 

In [156]:
rfc2 = RandomForestClassifier(max_depth = None, n_jobs = 12, n_estimators = 1000)

In [157]:
for train_index, test_index in cv.split(raw_data) :
    rfc2.fit(raw_data.iloc[train_index], temp_category[train_index])
    yhat = rfc2.predict(raw_data.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.78      0.68      0.73        47
          선박       0.76      0.72      0.74        53
          양묘       0.62      0.50      0.55        16
          육상       0.71      0.76      0.73       133
          이동       0.73      0.78      0.75        49
          입항       0.89      0.74      0.81        54
          접안       0.82      0.89      0.85        53
          출항       0.83      0.87      0.85        92
          통과       0.75      0.77      0.76       100
          투묘       0.69      0.62      0.66        40
          횡단       0.71      0.74      0.72        23

   micro avg       0.76      0.76      0.76       660
   macro avg       0.75      0.73      0.74       660
weighted avg       0.76      0.76      0.76       660

              precision    recall  f1-score   support

          도선       0.67      0.69      0.68        35
          선박       0.76      0.58      0.66        55
          양묘       0.65 

In [158]:
from sklearn.linear_model import LogisticRegression

In [159]:
lrc = LogisticRegression(C = 0.8, max_iter = 500, n_jobs = 12, solver = 'sag')

In [160]:
for train_index, test_index in cv.split(raw_data2) :
    lrc.fit(raw_data2.iloc[train_index], temp_category[train_index])
    yhat = lrc.predict(raw_data2.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))



              precision    recall  f1-score   support

          도선       0.79      0.66      0.72        47
          선박       0.90      0.72      0.80        53
          양묘       0.75      0.56      0.64        16
          육상       0.65      0.83      0.73       133
          이동       0.73      0.67      0.70        49
          입항       0.87      0.76      0.81        54
          접안       0.91      0.81      0.86        53
          출항       0.80      0.83      0.81        92
          통과       0.72      0.75      0.74       100
          투묘       0.64      0.62      0.63        40
          횡단       0.74      0.61      0.67        23

   micro avg       0.75      0.75      0.75       660
   macro avg       0.77      0.71      0.74       660
weighted avg       0.76      0.75      0.75       660





              precision    recall  f1-score   support

          도선       0.75      0.60      0.67        35
          선박       0.81      0.62      0.70        55
          양묘       0.67      0.38      0.48        21
          육상       0.54      0.80      0.65       125
          이동       0.67      0.67      0.67        57
          입항       0.81      0.74      0.78        47
          접안       0.98      0.88      0.93        51
          출항       0.75      0.84      0.79        98
          통과       0.80      0.74      0.77        89
          투묘       0.81      0.63      0.71        54
          횡단       0.93      0.46      0.62        28

   micro avg       0.72      0.72      0.72       660
   macro avg       0.77      0.67      0.71       660
weighted avg       0.75      0.72      0.72       660





              precision    recall  f1-score   support

          도선       0.53      0.48      0.50        21
          선박       0.79      0.50      0.61        52
          양묘       0.79      0.50      0.61        30
          육상       0.59      0.83      0.69       132
          이동       0.78      0.68      0.73        66
          입항       0.81      0.70      0.75        54
          접안       0.89      0.83      0.86        47
          출항       0.74      0.75      0.74        95
          통과       0.70      0.75      0.73        93
          투묘       0.76      0.70      0.73        50
          횡단       0.83      0.50      0.62        20

   micro avg       0.71      0.71      0.71       660
   macro avg       0.75      0.66      0.69       660
weighted avg       0.73      0.71      0.71       660





              precision    recall  f1-score   support

          도선       0.88      0.70      0.78        43
          선박       0.82      0.56      0.67        55
          양묘       0.60      0.21      0.32        14
          육상       0.53      0.85      0.65       123
          이동       0.82      0.83      0.82        59
          입항       0.82      0.62      0.71        45
          접안       0.95      0.84      0.89        50
          출항       0.81      0.79      0.80        91
          통과       0.79      0.77      0.78        96
          투묘       0.78      0.67      0.72        58
          횡단       0.88      0.54      0.67        26

   micro avg       0.74      0.74      0.74       660
   macro avg       0.79      0.67      0.71       660
weighted avg       0.77      0.74      0.74       660





              precision    recall  f1-score   support

          도선       0.76      0.53      0.63        30
          선박       0.76      0.59      0.67        49
          양묘       0.89      0.57      0.70        14
          육상       0.60      0.86      0.71       128
          이동       0.81      0.73      0.77        64
          입항       0.84      0.65      0.74        49
          접안       0.93      0.75      0.83        53
          출항       0.87      0.81      0.84        97
          통과       0.74      0.85      0.79       113
          투묘       0.82      0.70      0.76        47
          횡단       1.00      0.56      0.72        16

   micro avg       0.76      0.76      0.76       660
   macro avg       0.82      0.69      0.74       660
weighted avg       0.78      0.76      0.76       660



In [161]:
lrc2 = LogisticRegression(C = 0.8, max_iter = 500, n_jobs = 12, solver = 'sag')

In [163]:
for train_index, test_index in cv.split(raw_data) :
    lrc2.fit(raw_data.iloc[train_index], temp_category[train_index])
    yhat = lrc2.predict(raw_data.iloc[test_index])
    print(classification_report(temp_category[test_index], yhat))



              precision    recall  f1-score   support

          도선       0.77      0.64      0.70        47
          선박       0.76      0.85      0.80        53
          양묘       0.65      0.69      0.67        16
          육상       0.68      0.69      0.69       133
          이동       0.73      0.67      0.70        49
          입항       0.84      0.78      0.81        54
          접안       0.86      0.83      0.85        53
          출항       0.83      0.83      0.83        92
          통과       0.69      0.74      0.71       100
          투묘       0.64      0.68      0.66        40
          횡단       0.65      0.65      0.65        23

   micro avg       0.74      0.74      0.74       660
   macro avg       0.74      0.73      0.73       660
weighted avg       0.74      0.74      0.74       660





              precision    recall  f1-score   support

          도선       0.69      0.69      0.69        35
          선박       0.79      0.67      0.73        55
          양묘       0.75      0.57      0.65        21
          육상       0.55      0.70      0.62       125
          이동       0.69      0.67      0.68        57
          입항       0.72      0.77      0.74        47
          접안       0.90      0.88      0.89        51
          출항       0.81      0.82      0.81        98
          통과       0.77      0.74      0.75        89
          투묘       0.83      0.65      0.73        54
          횡단       0.71      0.54      0.61        28

   micro avg       0.72      0.72      0.72       660
   macro avg       0.75      0.70      0.72       660
weighted avg       0.73      0.72      0.72       660





              precision    recall  f1-score   support

          도선       0.56      0.48      0.51        21
          선박       0.83      0.73      0.78        52
          양묘       0.78      0.60      0.68        30
          육상       0.63      0.73      0.68       132
          이동       0.77      0.67      0.72        66
          입항       0.78      0.74      0.76        54
          접안       0.88      0.89      0.88        47
          출항       0.73      0.78      0.76        95
          통과       0.67      0.69      0.68        93
          투묘       0.71      0.72      0.71        50
          횡단       0.71      0.50      0.59        20

   micro avg       0.72      0.72      0.72       660
   macro avg       0.73      0.68      0.70       660
weighted avg       0.72      0.72      0.72       660





              precision    recall  f1-score   support

          도선       0.78      0.74      0.76        43
          선박       0.78      0.78      0.78        55
          양묘       0.45      0.36      0.40        14
          육상       0.60      0.76      0.67       123
          이동       0.78      0.80      0.79        59
          입항       0.71      0.67      0.69        45
          접안       0.93      0.84      0.88        50
          출항       0.80      0.77      0.79        91
          통과       0.78      0.75      0.77        96
          투묘       0.75      0.69      0.72        58
          횡단       0.83      0.58      0.68        26

   micro avg       0.74      0.74      0.74       660
   macro avg       0.75      0.70      0.72       660
weighted avg       0.75      0.74      0.74       660





              precision    recall  f1-score   support

          도선       0.72      0.60      0.65        30
          선박       0.79      0.69      0.74        49
          양묘       0.75      0.64      0.69        14
          육상       0.63      0.77      0.69       128
          이동       0.84      0.73      0.78        64
          입항       0.74      0.69      0.72        49
          접안       0.88      0.79      0.83        53
          출항       0.86      0.85      0.85        97
          통과       0.72      0.81      0.77       113
          투묘       0.82      0.70      0.76        47
          횡단       0.82      0.56      0.67        16

   micro avg       0.76      0.76      0.76       660
   macro avg       0.78      0.71      0.74       660
weighted avg       0.77      0.76      0.76       660



In [164]:
import lightgbm

In [171]:
lgbmc = lightgbm.LGBMClassifier(n_jobs = 12, n_estimators = 500, max_depth = -1)

In [172]:
for train_index, test_index in cv.split(raw_data2) :
    lgbmc.fit(raw_data2.iloc[train_index].values, temp_category[train_index])
    yhat = lgbmc.predict(raw_data2.iloc[test_index].values)
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.80      0.60      0.68        47
          선박       0.70      0.75      0.73        53
          양묘       0.56      0.62      0.59        16
          육상       0.67      0.71      0.69       133
          이동       0.70      0.65      0.67        49
          입항       0.84      0.76      0.80        54
          접안       0.87      0.85      0.86        53
          출항       0.78      0.85      0.81        92
          통과       0.74      0.67      0.70       100
          투묘       0.60      0.70      0.64        40
          횡단       0.52      0.57      0.54        23

   micro avg       0.72      0.72      0.72       660
   macro avg       0.71      0.70      0.70       660
weighted avg       0.73      0.72      0.72       660

              precision    recall  f1-score   support

          도선       0.64      0.71      0.68        35
          선박       0.80      0.75      0.77        55
          양묘       0.57 

In [169]:
lgbmc2 = lightgbm.LGBMClassifier(n_jobs = 12, n_estimators = 500, max_depth = -1)

In [170]:
for train_index, test_index in cv.split(raw_data) :
    lgbmc2.fit(raw_data.iloc[train_index].values, temp_category[train_index])
    yhat = lgbmc2.predict(raw_data.iloc[test_index].values)
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.76      0.60      0.67        47
          선박       0.71      0.75      0.73        53
          양묘       0.55      0.69      0.61        16
          육상       0.70      0.71      0.70       133
          이동       0.70      0.71      0.71        49
          입항       0.80      0.76      0.78        54
          접안       0.85      0.87      0.86        53
          출항       0.78      0.86      0.82        92
          통과       0.81      0.72      0.76       100
          투묘       0.65      0.70      0.67        40
          횡단       0.62      0.65      0.64        23

   micro avg       0.74      0.74      0.74       660
   macro avg       0.72      0.73      0.72       660
weighted avg       0.74      0.74      0.74       660

              precision    recall  f1-score   support

          도선       0.60      0.69      0.64        35
          선박       0.71      0.76      0.74        55
          양묘       0.48 

In [173]:
from sklearn.svm import LinearSVC

In [188]:
lsvc = LinearSVC(C = 0.7)

In [189]:
for train_index, test_index in cv.split(raw_data2) :
    lsvc.fit(raw_data2.iloc[train_index].values, temp_category[train_index])
    yhat = lsvc.predict(raw_data2.iloc[test_index].values)
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.78      0.66      0.71        47
          선박       0.78      0.81      0.80        53
          양묘       0.75      0.75      0.75        16
          육상       0.71      0.71      0.71       133
          이동       0.71      0.65      0.68        49
          입항       0.84      0.78      0.81        54
          접안       0.83      0.83      0.83        53
          출항       0.81      0.85      0.83        92
          통과       0.70      0.74      0.72       100
          투묘       0.69      0.72      0.71        40
          횡단       0.67      0.70      0.68        23

   micro avg       0.75      0.75      0.75       660
   macro avg       0.75      0.75      0.75       660
weighted avg       0.75      0.75      0.75       660

              precision    recall  f1-score   support

          도선       0.72      0.74      0.73        35
          선박       0.74      0.67      0.70        55
          양묘       0.76 

In [190]:
lsvc2 = LinearSVC(C = 0.7)

In [191]:
for train_index, test_index in cv.split(raw_data) :
    lsvc2.fit(raw_data.iloc[train_index].values, temp_category[train_index])
    yhat = lsvc2.predict(raw_data.iloc[test_index].values)
    print(classification_report(temp_category[test_index], yhat))



              precision    recall  f1-score   support

          도선       0.70      0.66      0.68        47
          선박       0.78      0.87      0.82        53
          양묘       0.60      0.75      0.67        16
          육상       0.72      0.63      0.67       133
          이동       0.65      0.63      0.64        49
          입항       0.76      0.76      0.76        54
          접안       0.80      0.85      0.83        53
          출항       0.79      0.83      0.81        92
          통과       0.69      0.66      0.67       100
          투묘       0.63      0.72      0.67        40
          횡단       0.71      0.74      0.72        23

   micro avg       0.72      0.72      0.72       660
   macro avg       0.71      0.74      0.72       660
weighted avg       0.72      0.72      0.72       660





              precision    recall  f1-score   support

          도선       0.61      0.71      0.66        35
          선박       0.75      0.69      0.72        55
          양묘       0.55      0.52      0.54        21
          육상       0.59      0.63      0.61       125
          이동       0.69      0.67      0.68        57
          입항       0.65      0.72      0.69        47
          접안       0.90      0.90      0.90        51
          출항       0.80      0.80      0.80        98
          통과       0.72      0.73      0.73        89
          투묘       0.76      0.59      0.67        54
          횡단       0.68      0.61      0.64        28

   micro avg       0.70      0.70      0.70       660
   macro avg       0.70      0.69      0.69       660
weighted avg       0.71      0.70      0.70       660





              precision    recall  f1-score   support

          도선       0.48      0.48      0.48        21
          선박       0.75      0.63      0.69        52
          양묘       0.72      0.60      0.65        30
          육상       0.63      0.70      0.66       132
          이동       0.72      0.67      0.69        66
          입항       0.75      0.74      0.75        54
          접안       0.90      0.91      0.91        47
          출항       0.77      0.78      0.77        95
          통과       0.64      0.62      0.63        93
          투묘       0.64      0.74      0.69        50
          횡단       0.56      0.45      0.50        20

   micro avg       0.69      0.69      0.69       660
   macro avg       0.69      0.67      0.67       660
weighted avg       0.70      0.69      0.69       660





              precision    recall  f1-score   support

          도선       0.77      0.70      0.73        43
          선박       0.73      0.67      0.70        55
          양묘       0.40      0.43      0.41        14
          육상       0.53      0.67      0.59       123
          이동       0.69      0.71      0.70        59
          입항       0.71      0.67      0.69        45
          접안       0.93      0.86      0.90        50
          출항       0.74      0.70      0.72        91
          통과       0.77      0.72      0.74        96
          투묘       0.75      0.69      0.72        58
          횡단       0.76      0.62      0.68        26

   micro avg       0.70      0.70      0.70       660
   macro avg       0.71      0.68      0.69       660
weighted avg       0.71      0.70      0.70       660

              precision    recall  f1-score   support

          도선       0.67      0.60      0.63        30
          선박       0.85      0.69      0.76        49
          양묘       0.64 



### Word Embedding으로 calssification 해보면?

In [192]:
from gensim.models import Word2Vec

In [193]:
embedding_model = Word2Vec.load('../LSTM/kr_word2vec_all.model')

In [194]:
def get_mean_vector(sentence) :
    words = sentence.split()
    result = []
    for word in words :
        try :
            result.append(embedding_model.wv.get_vector(word))
        except :
            continue
    result = np.array(result)
    if result.sum() == 0 :
        result = np.zeros((1, 300))
    result = result.mean(axis = 0)
    return result
    

In [195]:
embedding_data = np.array(None)
for sentence in temp_script :
    embedding_data = np.append(embedding_data, get_mean_vector(sentence))

In [196]:
embedding_data = np.delete(embedding_data, 0)

In [197]:
embedding_data = embedding_data.reshape(-1, 300)

In [198]:
xgbc3 = xgboost.XGBClassifier(max_depth = 4, n_jobs = 12, n_estimators = 500, random_state = 777)

In [199]:
for train_index, test_index in cv.split(embedding_data) :
    xgbc3.fit(embedding_data[train_index], temp_category[train_index])
    yhat = xgbc3.predict(embedding_data[test_index])
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.73      0.68      0.70        47
          선박       0.71      0.66      0.69        53
          양묘       0.50      0.19      0.27        16
          육상       0.66      0.77      0.71       133
          이동       0.64      0.61      0.62        49
          입항       0.81      0.72      0.76        54
          접안       0.87      0.77      0.82        53
          출항       0.76      0.83      0.79        92
          통과       0.74      0.73      0.73       100
          투묘       0.59      0.65      0.62        40
          횡단       0.60      0.52      0.56        23

   micro avg       0.71      0.71      0.71       660
   macro avg       0.69      0.65      0.66       660
weighted avg       0.71      0.71      0.71       660

              precision    recall  f1-score   support

          도선       0.59      0.77      0.67        35
          선박       0.79      0.62      0.69        55
          양묘       0.57 

In [192]:
# lrc2 = LogisticRegression(C = 0.8, max_iter = 500, n_jobs = 12, solver = 'lbfgs')

In [193]:
# for train_index, test_index in cv.split(embedding_data) :
#     lrc2.fit(embedding_data[train_index], temp_category[train_index])
#     yhat = lrc2.predict(embedding_data[test_index])
#     print(classification_report(temp_category[test_index], yhat))

logistic regression은 성능이 너무 안나와서 코드 실행 x

### N-gram 사용

In [200]:
with open('../LSTM/data_for_classification.pickle', 'rb') as f :
    raw_data = pickle.load(f)

In [201]:
raw_data = raw_data[raw_data.Language == 'KR']

In [202]:
raw_data['temp_code'] = raw_data.Script_num.map(lambda x : '-'.join(x.split('-')[:3]))

In [203]:
raw_data.head()

Unnamed: 0,Script_num,Script,Language,Category,temp_code
0,UL-MV-01-02,아 울산VTS [SN] 감도 있습니까?,KR,이동,UL-MV-01
1,UL-MV-01-03,네 [SN],KR,이동,UL-MV-01
2,UL-MV-01-04,네 수고 많으십니다 본선 델타라인 통과해서 본선 1부두 접안하러 이동하겠습니다,KR,이동,UL-MV-01
3,UL-MV-01-05,들어오세요,KR,이동,UL-MV-01
4,UL-MV-01-06,네 저희 지금…,KR,이동,UL-MV-01


In [204]:
temp_code2 = 'UL-MV-01'
temp_script2 = []
temp_category2 = []
temp2 = ''
for i, row in raw_data.iterrows() :
    if temp_code2 == row.temp_code :
        temp2 += ' ' + ''.join(row.Script)
    else :
        temp_code2 = row.temp_code
        temp_script2.append(temp2.strip())
        temp_category2.append(row.Category)
        temp2 = ''

In [205]:
from sklearn.feature_extraction.text import CountVectorizer

In [216]:
ngram_vectorizer = CountVectorizer(ngram_range = (1, 2), min_df = 10)

In [217]:
ngram = ngram_vectorizer.fit_transform(temp_script2)

In [218]:
ngram = ngram.toarray()

In [219]:
ngram.shape

(3300, 989)

In [226]:
xgbc3 = xgboost.XGBClassifier(max_depth = 3, n_jobs = 12, n_estimators = 500)

In [227]:
for train_index, test_index in cv.split(ngram) :
    xgbc3.fit(ngram[train_index], temp_category[train_index])
    yhat = xgbc3.predict(ngram[test_index])
    print(classification_report(temp_category[test_index], yhat))

              precision    recall  f1-score   support

          도선       0.81      0.74      0.78        47
          선박       0.79      0.79      0.79        53
          양묘       0.64      0.56      0.60        16
          육상       0.66      0.78      0.72       133
          이동       0.69      0.63      0.66        49
          입항       0.89      0.76      0.82        54
          접안       0.85      0.85      0.85        53
          출항       0.76      0.77      0.76        92
          통과       0.77      0.75      0.76       100
          투묘       0.62      0.60      0.61        40
          횡단       0.74      0.61      0.67        23

   micro avg       0.74      0.74      0.74       660
   macro avg       0.75      0.71      0.73       660
weighted avg       0.75      0.74      0.74       660

              precision    recall  f1-score   support

          도선       0.71      0.71      0.71        35
          선박       0.68      0.75      0.71        55
          양묘       0.56 

### Time information 반영

In [None]:
temp_code = 'UL-MV-01'
temp_script2 = []
temp_category2 = []
temp2 = ''
step = []
temp_step = 0
for i, row in train_kr.iterrows() :
    temp_step += 1
    if temp_code == row.temp_code :
        temp2 += ' ' + ' '.join(row.Script)
        temp_script2.append(temp2)
        temp_category2.append(row.Category)
        step.append(temp_step)
    else :
        temp_code = row.temp_code
        temp_script2.append(temp2)
        temp_category2.append(row.Category)
        temp2 = ''
        step.append(temp_step)
        temp_step = 0

In [None]:
temp_df = pd.DataFrame(tf.transform(temp_script2).toarray(), columns = list(dictionary.keys()))

In [None]:
temp_df['category'] = temp_category2
temp_df['step'] = step

상위 3개 예측 category에 실제 label이 들어가면 정답

In [None]:
for i in range(1, 25) :
    data = temp_df[temp_df.step == i].drop(['category', 'step'], axis = 1)
    yhat = xgbc.predict(data)
    print(classification_report(temp_df[temp_df.step == i].category.values, yhat), i)