# 4. word embedding

In [None]:
# Sequence length passed as input_length to the Embedding layers in this section.
max_len = 100
# NOTE(review): no fit_on_texts call is visible between creating the Tokenizer and
# reading word_index, so `vocabulary` is presumably empty here — confirm that a
# fitted tokenizer is meant to be used instead.
tokenizer = Tokenizer()
vocabulary = tokenizer.word_index

## (1) Keras Embedding Layer

In [None]:
from keras.models import Sequential
from keras.layers import Embedding
# Sequential model whose only layer is an Embedding with 128-dimensional output;
# no pretrained weights are passed to it.
# NOTE(review): in the visible source vocab_size is first assigned in section 5 (2) —
# confirm the intended cell run order.
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length = max_len))

## (2) word2vec

In [None]:
import gensim
# Load the pretrained GoogleNews vectors (binary word2vec format) as gensim KeyedVectors.
word2vec = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/ESAA-2/GoogleNews-vectors-negative300.bin.gz', binary = True)

In [None]:
# Embedding matrix with one 300-dimensional row per vocabulary index.
embedding_matrix = np.zeros((vocab_size, 300))

# `vocabulary` maps word -> integer index, so each vector is written to the row
# given by the word's own index rather than by its position in the dict.
# Words that word2vec does not know keep their all-zero row; the loop must not
# stop at the first such word, otherwise every later row would stay empty too.
oov_count = 0
for word, index in vocabulary.items():
    if word in word2vec:  # the token has a pretrained vector
        embedding_matrix[index] = word2vec[word]
    else:
        oov_count += 1

# Report the number of out-of-vocabulary words once instead of once per word.
if oov_count:
    print("word2vec에 없는 단어입니다.", oov_count)

In [None]:
# Sequential model whose only layer is a 300-dimensional Embedding that receives
# embedding_matrix as its weights.
model = Sequential()
model.add(Embedding(vocab_size, 300,weights = [embedding_matrix], input_length = max_len))

## (3) glove

In [None]:
import numpy as np

# Load the whole GloVe embedding into memory as a {word: float32 vector} dict.
glove = dict()
# The context manager closes the file even when a line fails to parse, and the
# explicit encoding keeps decoding independent of the platform default.
with open('/content/drive/MyDrive/ESAA-2/glove.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:  # a blank line has no word to index
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove[word] = vector

In [None]:
# Embedding matrix with one 300-dimensional row per vocabulary index.
embedding_matrix = np.zeros((vocab_size, 300))

# `vocabulary` maps word -> integer index, so each vector is written to the row
# given by the word's own index rather than by its position in the dict.
# Words missing from GloVe keep their all-zero row; the loop must not stop at
# the first such word, otherwise every later row would stay empty too.
oov_count = 0
for word, index in vocabulary.items():
    if word in glove:  # the token has a pretrained GloVe vector
        embedding_matrix[index] = glove[word]
    else:
        oov_count += 1

# Report the number of out-of-vocabulary words once instead of once per word.
if oov_count:
    print("glove 없는 단어입니다.", oov_count)

In [None]:
# Sequential model whose only layer is a 300-dimensional Embedding that receives
# the GloVe-based embedding_matrix as its weights.
model = Sequential()
model.add(Embedding(vocab_size, 300,weights = [embedding_matrix], input_length = max_len))

## (4) Fasttext

In [None]:
from gensim.models.keyedvectors import KeyedVectors
# fastText `.vec` files are the plain-text word2vec format (a header line, then one
# word and its vector per line), so they must be read with binary=False.
# NOTE(review): assumes this file really is a text .vec export — confirm.
FastText = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/ESAA-2/fasttext.vec', binary = False, unicode_errors='ignore')

In [None]:
# Embedding matrix with one 300-dimensional row per vocabulary index.
embedding_matrix = np.zeros((vocab_size, 300))

# Look words up in the FastText vectors loaded for this section (not in word2vec).
# `vocabulary` maps word -> integer index, so each vector is written to the row
# given by the word's own index; words without a vector keep their all-zero row.
for word, index in vocabulary.items():
    if word in FastText:  # the token has a pretrained FastText vector
        embedding_matrix[index] = FastText[word]

In [None]:
# Sequential model whose only layer is a 300-dimensional Embedding that receives
# the FastText-section embedding_matrix as its weights.
model = Sequential()
model.add(Embedding(vocab_size, 300,weights = [embedding_matrix], input_length = max_len))

# 5. Modeling

In [None]:
# Read the training CSV into a DataFrame; the cells below use its 'text' and 'author' columns.
train = pd.read_csv('/content/drive/MyDrive/ESAA-2/0913_train.csv')

## (1) 간단한 전처리 + 형태소 분석

In [None]:
from konlpy.tag import Okt
import re
import tqdm 

def text_preprocessing(text_list):
    """Clean and tokenize each text with the Okt morphological analyzer.

    Each text is lower-cased, every character other than Hangul syllables and
    the letters a-z is replaced by a space, the result is split into morphemes,
    and stopwords are dropped.

    Args:
        text_list: Iterable of raw texts. Entries that are not strings (for
            example a float NaN from an empty CSV cell) yield an empty token
            list instead of raising.

    Returns:
        A ``(token_list, tokenizer)`` tuple: one list of tokens per input text,
        and the Okt instance that produced them so it can be reused later.
    """
    stopwords = {'을', '를', '이', '가', '은', '는', 'null'}  # tokens to drop
    tokenizer = Okt()  # morphological analyzer
    unwanted_chars = re.compile('[^가-힣a-z]')  # compiled once, reused for every text
    token_list = []

    for text in tqdm.tqdm(text_list):
        if not isinstance(text, str):
            # Missing values have nothing to tokenize.
            token_list.append([])
            continue
        # Lower-case first: the pattern only keeps a-z, so without this the
        # capital letters would be deleted ("He" would become "e").
        txt = unwanted_chars.sub(' ', text.lower())
        # Keep only morphemes that are not stopwords. The previous condition
        # `... or type(t) != float` was always true and filtered nothing.
        token = [t for t in tokenizer.morphs(txt) if t not in stopwords]
        token_list.append(token)

    return token_list, tokenizer

# The morphological analyzer is kept in its own variable because it is needed again later when preprocessing the test data.
train['token'], okt = text_preprocessing(train['text'])

100%|██████████| 54879/54879 [01:05<00:00, 843.89it/s] 


In [None]:
# Preview the first rows of the DataFrame, including the new 'token' column.
train.head()

Unnamed: 0,index,text,author,token
0,0,"He was almost choking. There was so much, so m...",3,"[e, was, almost, choking, here, was, so, much,..."
1,1,"“Your sister asked for it, I suppose?”",2,"[our, sister, asked, for, it, suppose]"
2,2,"She was engaged one day as she walked, in per...",1,"[he, was, engaged, one, day, as, she, walked, ..."
3,3,"The captain was in the porch, keeping himself ...",4,"[he, captain, was, in, the, porch, keeping, hi..."
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3,"[ave, mercy, gentlemen, odin, flung, up, his, ..."


## (2) vectorization

In [None]:
def text2sequence(train_text, max_len=1000):
    """Convert texts to padded integer sequences.

    A new Tokenizer is fitted on ``train_text``, the texts are converted to
    index sequences, the vocabulary size is printed, and the sequences are
    padded with ``pad_sequences(..., maxlen=max_len)``.

    Args:
        train_text: Texts to fit the tokenizer on and to convert.
        max_len: Value passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        A ``(X_train, vocab_size, tokenizer)`` tuple, where ``vocab_size`` is
        ``len(tokenizer.word_index) + 1``.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(train_text)
    sequences = fitted.texts_to_sequences(train_text)

    # One more than the number of indexed words.
    vocab_size = len(fitted.word_index) + 1
    print('vocab_size : ', vocab_size)

    return pad_sequences(sequences, maxlen=max_len), vocab_size, fitted

# Labels come from the 'author' column; features are the token lists converted to
# padded sequences with max_len = 100. The fitted tokenizer is kept as `vectorizer`.
train_y = train['author']
train_X, vocab_size, vectorizer = text2sequence(train['token'], max_len = 100)
print(train_X.shape, train_y.shape)

vocab_size :  36342
(54879, 100) (54879,)


## (3) Embedding

In [None]:
# Load the pretrained GoogleNews vectors (binary word2vec format).
word2vec = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/ESAA-2/GoogleNews-vectors-negative300.bin.gz', binary = True)
# Embedding matrix with one 300-dimensional row per vocabulary index.
embedding_matrix = np.zeros((vocab_size, 300))

# Use the word index of `vectorizer`, the tokenizer that was fitted on the training
# tokens and from which vocab_size was computed; `vocabulary` comes from a tokenizer
# that was never fitted. Each vector goes to the row given by the word's own index.
# Words that word2vec does not know keep their all-zero row; the loop must not
# stop at the first such word, otherwise every later row would stay empty too.
oov_count = 0
for word, index in vectorizer.word_index.items():
    if word in word2vec:
        embedding_matrix[index] = word2vec[word]
    else:
        oov_count += 1

# Report the number of out-of-vocabulary words once instead of once per word.
if oov_count:
    print("word2vec에 없는 단어입니다.", oov_count)

## (4) Modeling

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding

def LSTM(vocab_size, max_len=1000, num_classes=5):
    """Build and compile an LSTM classifier on top of the pretrained embedding.

    The model stacks an Embedding layer that receives the module-level
    ``embedding_matrix`` as its weights, spatial dropout, a 64-unit LSTM layer,
    dropout, an L2-regularized dense layer and a softmax output layer. The
    model summary is printed before the model is returned.

    Args:
        vocab_size: Number of rows of the embedding (input_dim of the
            Embedding layer).
        max_len: Length of the padded input sequences.
        num_classes: Number of output classes.
            NOTE(review): the default of 5 assumes integer author labels 0-4;
            the visible data preview only shows labels 1-4 — confirm.

    Returns:
        The compiled Sequential model.
    """
    # This function is itself named LSTM, so the Keras layer has to be imported
    # under another name; calling LSTM(64) here would call this function again.
    from keras import regularizers
    from keras.layers import LSTM as LSTMLayer
    from keras.layers import Dropout, SpatialDropout1D

    model = Sequential()
    # Apply the pretrained embedding weights.
    model.add(Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_len))
    model.add(SpatialDropout1D(0.3))
    model.add(LSTMLayer(64))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
    # The author label takes more than two integer values, so a single sigmoid
    # unit with binary cross-entropy cannot model it; use one softmax unit per
    # class with the sparse (integer-label) categorical loss instead.
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model