In [1]:
# Mount Google Drive so project files under /content/drive are reachable from the notebook.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 4.6 MB/s 
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.4.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (453 kB)
[K     |████████████████████████████████| 453 kB 49.4 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.0 konlpy-0.6.0


### >> 사용할 library

In [9]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

## library for Tokenization & Padding
# Best-effort request for TensorFlow 2.x: the magic below is Colab-specific, so any
# exception it raises is deliberately swallowed and the cell carries on to the imports.
try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass
  
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from konlpy.tag import Okt 

In [4]:
# Load the labelled answers and keep only the two columns used for modelling.
RAW_CSV_PATH = '/content/drive/MyDrive/보아즈_프로젝트/스토커/데이터/labeling5.csv'

df = (
    pd.read_csv(RAW_CSV_PATH, encoding='utf-8')
    .iloc[:, [0, 6]]                        # positional pick: 1st and 7th columns
    .rename(columns={'나현': 'label'})       # this column serves as the label
    .drop_duplicates(['answer', 'label'])   # remove repeated (answer, label) pairs
)

---
# 한국어 텍스트 데이터 전처리
---

In [8]:
# Keep only Hangul (jamo and syllables) and spaces.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text uncleaned.
df['nor_answer'] = df['answer'].str.replace(r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)

# Collapse every run of two or more spaces into a single space. A literal
# "  " -> " " replacement only halves each run, so runs of three or more
# spaces would still contain double spaces afterwards.
df['nor_answer'] = df['nor_answer'].str.replace(r" {2,}", " ", regex=True)

# Preview a few cleaned rows.
df[56:61]

Unnamed: 0,answer,label,nor_answer
60,ㅠㅠㅠㅠ왜 요즈음힘든일있어?,0.3,ㅠㅠㅠㅠ왜 요즈음힘든일있어
62,왜 무슨 일이야!,0.3,왜 무슨 일이야
63,엥 근데 잘어울린다 너랑,1.0,엥 근데 잘어울린다 너랑
64,오 무슨색?,0.7,오 무슨색
65,잘어울린다!,1.0,잘어울린다


---
# 토큰화
---
*   말뭉치를 주어진 단위(token)로 나누는 과정
*   형태소 분석기 : Okt 사용

In [12]:
okt = Okt()  # Okt morphological analyzer; .morphs() splits a sentence into morphemes

# Stop words are read from the first column of the stop-word file.
stop_words = pd.read_csv('/content/drive/MyDrive/보아즈_프로젝트/스토커/bul.txt',header = None)[0].values
# A set gives constant-time membership tests inside the per-token filter below.
stop_word_set = set(stop_words)

tokenized = []
for answer in tqdm(df['nor_answer']):
    # pd.isna also catches None / float('nan') / pd.NA, which the identity test
    # `answer is not np.nan` would let through to okt.morphs.
    if pd.isna(answer):
        continue  # rows without text contribute no token list
    morphemes = okt.morphs(answer)
    # Drop stop words, keeping the remaining morphemes in their original order.
    tokenized.append([m for m in morphemes if m not in stop_word_set])

  0%|          | 0/1095 [00:00<?, ?it/s]

In [15]:
# Drop every row that has a missing value in any column, then renumber rows from 0.
# reset_index(drop=False) keeps the previous row labels in a new 'index' column.
# NOTE(review): `tokenized` is built before this step and skips only rows whose
# `nor_answer` is missing; confirm no row is dropped here for a missing label alone,
# otherwise the token lists and the labels would no longer line up.
df = df.dropna(axis='index', how='any').reset_index(drop=False)

# Single-column frames: cleaned text on one side, label score on the other.
X_data = df['nor_answer'].to_frame()
y_data = df['label'].to_frame()

In [16]:
import pickle

# Upper bound on the vocabulary handed to the Keras tokenizer (num_words).
vocab_size = 10000

# Build the word index from the morpheme lists.
token = Tokenizer(num_words=vocab_size)
token.fit_on_texts(tokenized)

# Persist the fitted tokenizer so the same word index can be reloaded later.
TOKENIZER_PATH = '/content/drive/MyDrive/보아즈_프로젝트/스토커/tokenizer.pickle'
with open(TOKENIZER_PATH, 'wb') as fh:
    pickle.dump(token, fh, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# Convert every token list into a list of integer word indices.
X_label_data = token.texts_to_sequences(tokenized)

# Print the first 25 entries of the word index (indices start at 1). The range is
# capped at the vocabulary size so that a corpus with fewer than 25 distinct words
# prints what exists instead of raising KeyError.
top_n = min(25, len(token.index_word))
for rank in range(1, top_n + 1):
    print(rank, token.index_word[rank])

1 생각
2 잘
3 말
4 내
5 기분
6 은
7 한다
8 다
9 는
10 너무
11 안
12 뭐
13 해
14 좋다
15 관심
16 한
17 도
18 할
19 투자
20 열심히
21 의심
22 근데
23 거
24 마음
25 염색


---
# 패딩
---

In [20]:
# Padding configuration: sequences shorter than max_length are zero-padded at the
# end, longer ones are cut at the end, so every row ends up with max_length entries.
max_length = 75
padding_type = 'post'
trunc_type = 'post'

X_data_p = pad_sequences(
    X_label_data,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

---
# 레이블 인코딩
---

In [21]:
# One-hot encode the label score: below 0.5 -> [1, 0], everything else -> [0, 1].
y_label = np.array([
    [1, 0] if score < 0.5 else [0, 1]
    for score in y_data['label'].values
])

In [22]:
# train,test set 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(X_data_p), y_label, test_size = 0.2, random_state = 1234)

---
# Modeling
---

*   Bi_LSTM



In [23]:
# Bidirectional-LSTM classifier over the padded word-index sequences.

# Seed TensorFlow's random generators (weight initialisation, dropout) so that
# repeated runs are comparable; the train/test split above is seeded the same way.
# GPU kernels may still introduce small run-to-run differences.
tf.random.set_seed(1234)

embedding_dim = 10
model_lstm = tf.keras.Sequential([
    # input_dim follows the tokenizer's vocab_size instead of a separate hard-coded
    # 10000, so the embedding table always covers every index the tokenizer can emit.
    Embedding(vocab_size, embedding_dim),
    # First recurrent layer returns the full sequence so the second one can consume it.
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # Two output units match the two-column one-hot labels.
    Dense(2, activation='softmax'),
])

model_lstm.summary()
model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_history = model_lstm.fit(X_train, y_train, epochs=5, batch_size=64)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 10)          100000    
                                                                 
 bidirectional (Bidirectiona  (None, None, 256)        142336    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              164352    
 nal)                                                            
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 2)                 1

In [24]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is the
# first metric (accuracy in this notebook's compile call). Report it as a percentage.
test_metrics = model_lstm.evaluate(X_test, y_test)
test_accuracy = test_metrics[1]
print("\n테스트정확도: {:.2f}%".format(test_accuracy * 100))


테스트정확도: 74.89%


In [25]:
# 모델 저장하기
from keras.models import load_model
model_lstm.save('/content/drive/MyDrive/보아즈_프로젝트/스토커/bilstm_model.h5')

In [28]:
# Test set
pred = model_lstm.predict(X_test.values)
pred_label = np.argmax(pred, axis = 1)
orig_label = np.argmax(y_test, axis = 1)

X_test_indices = X_test.index
df_for_check = df.copy()
#df_for_check = df_for_check[df_for_check.index.isin(X_test_indices)]
df_for_check = df_for_check.loc[X_test_indices]
df_for_check['orig'] = orig_label
df_for_check['pred'] = pred_label

df_for_check[['answer','orig','pred']]

Unnamed: 0,answer,orig,pred
60,잘어울린다!,1,1
330,관심가져주고 질문많이 해줘서 고마움,1,1
1070,무조건..?이라고..? 흠 뭔가 의심이 간다,0,0
366,마음에 들지만 경계한다,0,0
76,화이팅,1,1
...,...,...,...
782,와 글쿤요^^! 라고 대답하고 집가서 평소처럼 산다 (연락안함),0,1
315,날 걱정해주다니..고마워 친구..,1,1
156,빚지는건 오바다,0,0
324,그럴 수 있지,1,1
