In [40]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
import collections
import pickle
import datetime
import re

In [41]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from konlpy.tag import Kkma, Okt
from konlpy.utils import pprint

In [42]:
with open('data_for_classification.pickle', 'rb') as f :
    raw_data = pickle.load(f)

In [43]:
raw_data.groupby('Category')['Language'].count()

Category
도선    1788
선박    1710
양묘     684
육상    4141
이동    1534
입항    1596
접안    1311
출항    2674
통과    2932
투묘    2942
횡단     583
Name: Language, dtype: int64

In [44]:
script_en = raw_data[raw_data.Language == 'EN'].Script
script_kor = raw_data[raw_data.Language == 'KR'].Script

In [45]:
text_en = script_en.values
text_kor = script_kor.values

In [46]:
def clean_str(s):
    # only include alphanumerics
    s = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", s)
    
    # insert spaces in words with apostrophes
    s = re.sub(r"\'s", " \'s", s)
    s = re.sub(r"\'ve", " \'ve", s)
    s = re.sub(r"n\'t", " n\'t", s)
    s = re.sub(r"\'re", " \'re", s)
    s = re.sub(r"\'d", " \'d", s)
    s = re.sub(r"\'ll", " \'ll", s)
    
    # 숫자라는 정보만 가져오기 위해 모든 숫자 0으로 통일
    s = re.sub(r'[0-9]+', '0', s)
    
    
    # insert spaces in special characters
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\(", " \( ", s)
    s = re.sub(r"\)", " \) ", s)
    s = re.sub(r"\?", " \? ", s)
    
    # only include alphanumerics again
    s = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", s)
    
    # reduce multiple spaces to single spaces
    s = re.sub(r"\s{2,}", " ", s)
    return s.strip().lower()

In [47]:
def stopword_stem(sentence):
    stemmer = SnowballStemmer("english")
    stopword_set = set(stopwords.words('english'))
    
    split_sentence = sentence.split()
    stem_sentence = []
    for word in split_sentence:
        stem_word = stemmer.stem(word)
        stem_sentence.append(stem_word)
    filtered_sentence = [w for w in stem_sentence if not w in stopword_set]  
    return " ".join(filtered_sentence)

In [48]:
text_en = np.reshape(text_en, [-1])
text_en = text_en.tolist()
clean_en = clean_data = [clean_str(x) for x in text_en]

In [49]:
clean_en = [stopword_stem(x) for x in clean_en]

In [50]:
vocab = []
for text in clean_en:
    split_text = text.split()
    vocab.extend(split_text)
vocab = set(vocab)    
vocab_size = len(vocab)
print("vocab size is %d"%vocab_size)

vocab size is 943


In [51]:
def clean_str(s):
    # only include alphanumerics and Korean
    s = re.sub(r"[^A-Za-z0-9가-힣(),!?\'\`]", " ", s)
    
    # insert spaces in words with apostrophes
    s = re.sub(r"\'s", " \'s", s)
    s = re.sub(r"\'ve", " \'ve", s)
    s = re.sub(r"n\'t", " n\'t", s)
    s = re.sub(r"\'re", " \'re", s)
    s = re.sub(r"\'d", " \'d", s)
    s = re.sub(r"\'ll", " \'ll", s)
    
    # 숫자라는 정보만 가져오기 위해 모든 숫자 0으로 통일
    s = re.sub(r'[0-9]+', '0', s)
    
    # insert spaces in special characters
    s = re.sub(r",", " , ", s)
    s = re.sub(r"!", " ! ", s)
    s = re.sub(r"\(", " \( ", s)
    s = re.sub(r"\)", " \) ", s)
    s = re.sub(r"\?", " \? ", s)
    
    # only include alphanumerics and Korean again
    s = re.sub(r"[^A-Za-z0-9가-힣(),!?\'\`]", " ", s)
    
    # reduce multiple spaces to single spaces
    s = re.sub(r"\s{2,}", " ", s)
    return s.strip().lower()

In [52]:
clean_kor = [clean_str(x) for x in text_kor]

In [53]:
kkma = Kkma()

In [54]:
kor_pos = [kkma.pos(x) for x in clean_kor]

In [55]:
kor_token = [kkma.morphs(x) for x in clean_kor]

In [56]:
kor_nouns = [kkma.nouns(x) for x in clean_kor]

In [57]:
from gensim.models import Word2Vec

In [84]:
#명사만
embedding_model = Word2Vec(kor_nouns, size=300, window = 10, alpha = 0.025, min_count = 10, workers=8, iter=500, sg=1, compute_loss = True)
wordmodelname = 'kr_word2vec_noun.model'
filename = 'train_kr_noun.pkl'
docmodelname = 'kr_doc2vec_noun.model'

#raw data
# embedding_model = Word2Vec(list(map(lambda x : x.split(), clean_kor)), size=300, window = 10, min_count = 10, workers=8, iter=500, sg=1, compute_loss = True)
# wordmodelname = 'kr_word2vec_raw.model'
# filename = 'train_kr_raw.pkl'
# docmodelname = 'kr_doc2vec_raw.model'

#모든 token
# embedding_model = Word2Vec(kor_token, size=300, window = 10, min_count = 10, workers=8, iter=500, sg=1, compute_loss = True)
# wordmodelname = 'kr_word2vec_all.model'
# filename = 'train_kr_all.pkl'
# docmodelname = 'kr_doc2vec_all.model'

In [59]:
embedding_model.init_sims(replace=True)

In [60]:
kor_vectors = embedding_model.wv.vectors

In [61]:
kor_vectors.shape

(792, 300)

In [62]:
kor_vocab = embedding_model.wv.index2word

In [63]:
kor_embedding = [kor_vocab, kor_vectors]

In [86]:
train_kr = raw_data[raw_data.Language == 'KR']

In [87]:
train_kr.Script = train_kr.Script.map(clean_str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [88]:
train_kr.Script = train_kr.Script.map(kkma.nouns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [68]:
embedding_model.save(wordmodelname)

In [69]:
embedding_model = Word2Vec.load(wordmodelname)

In [70]:
embedding_model.wv.most_similar('조심')

[('히', 0.42756569385528564),
 ('주의', 0.2500979006290436),
 ('잘', 0.23382820188999176),
 ('안전', 0.2296481430530548),
 ('건너가', 0.22516773641109467),
 ('정보', 0.22209471464157104),
 ('ㅂ시오', 0.21655811369419098),
 ('곳', 0.21358218789100647),
 ('calling', 0.20614121854305267),
 ('세요', 0.20552211999893188)]

In [89]:
with open(filename, 'wb') as f :
    pickle.dump(train_kr, f)

In [73]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [74]:
tagged_kr = [TaggedDocument(train_kr.Script.iloc[i], tags = [str(i)]) for i in range(len(train_kr))]

In [75]:
max_epochs = 300
vec_size = 300
alpha = 0.025

In [76]:
model = Doc2Vec(vector_size = vec_size,
                alpha = alpha,
                min_count = 10)

In [77]:
model.build_vocab(tagged_kr)

In [81]:
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_kr,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    if epoch * 10 == 0 :
        model.alpha *= 0.001

# model.save("d2v.model")
# print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [82]:
model.save(docmodelname)

In [83]:
model = Doc2Vec.load('kr_doc2vec_noun.model')