In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd

### 파일로드

In [None]:
# Load the text columns and the campaign metadata for 2015-2019.
text_dataset = pd.read_csv('text_dataset_2015_2019.csv')
meta_2015_2019 = pd.read_csv('meta_dataset_2015_2019.csv')

# Place the two text columns next to the `state` column
# (axis=1 concat aligns the rows by index label).
ks_text_state = pd.concat(
    [text_dataset[['content', 'risk_challenge']], meta_2015_2019['state']],
    axis=1,
)

In [None]:
# Count the missing values in each column.
ks_text_state.isnull().sum()

In [None]:
# Drop the rows whose `content` is null; their index labels are kept in
# `null_index` so the same rows can be removed from other datasets later.
null_index = ks_text_state.loc[ks_text_state['content'].isna()].index.tolist()
ks_text_state = ks_text_state.drop(index=null_index)

In [None]:
# Drop the rows whose `content` is the literal string "[]" (an empty list).
# A single vectorized comparison replaces the per-row Python loop; the
# resulting list holds the same index labels in the same order.
cont_null_index = ks_text_state.loc[ks_text_state['content'] == "[]"].index.tolist()
ks_text_state = ks_text_state.drop(cont_null_index)

In [None]:
# Record which rows have a null `risk_challenge`, then replace those nulls
# with a single space " ".
risk_isna = ks_text_state.loc[ks_text_state['risk_challenge'].isna()].index.tolist()
ks_text_state['risk_challenge'] = ks_text_state['risk_challenge'].fillna(" ")

In [None]:
# Index labels of every removed row, kept so that the other datasets can be
# aligned with this one.
total_null_index = [*null_index, *cont_null_index]

# Save the list of removed row indices as a pickle.
import pickle
with open('total_null_index2.pkl', 'wb') as out_file:
    pickle.dump(total_null_index, out_file)

In [None]:
# Merge both text fields into one document per row. The explicit ' '
# separator keeps the last token of `content` from fusing with the first
# token of `risk_challenge`.
ks_text_state['all_text'] = ks_text_state['content'] + ' ' + ks_text_state['risk_challenge']

In [None]:
# Model input is the merged text column; the target is the `state` column.
X = ks_text_state['all_text']
y = ks_text_state['state']
X.shape, y.shape

### 텍스트 전처리

In [None]:
# Label-encode y.
# Labeling: the function defined below takes a Series and label-encodes it.
from sklearn.preprocessing import LabelEncoder
# Filled by encoding(): maps a Series name to the classes_ of its LabelEncoder.
col_dict={}

def encoding(x):
    """Label-encode a Series and remember its class labels.

    The classes learned by the encoder are stored in the module-level
    ``col_dict`` under the key ``x.name``.

    Args:
        x: Series of labels; its ``name`` attribute is used as the key.

    Returns:
        The encoded labels produced by ``LabelEncoder``.
    """
    encoder = LabelEncoder()
    # fit_transform learns the label mapping and applies it in one call.
    encoded = encoder.fit_transform(x)
    col_dict[x.name] = encoder.classes_
    return encoded
# Encode the target column.
y_enc = encoding(y)

In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Preprocessing function and its module-level helpers.

# Filled on first use, so defining the function does not yet require the NLTK
# stopword corpus to be available.
_STOPWORD_CACHE = None

# re.escape is required here: unescaped, the "\]" inside string.punctuation is
# parsed as an escaped "]" and the backslash itself is never matched.
_PUNCT_RE = re.compile('[{}]'.format(re.escape(string.punctuation)))


def _get_stopwords():
    """Return the English stopwords plus 'may' as a frozenset, loaded once."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        _STOPWORD_CACHE = frozenset(stopwords.words('english') + ['may'])
    return _STOPWORD_CACHE


def text_preprocessing(document):
    """Normalize one raw document into a space-joined string of stemmed tokens.

    Steps: lowercase, replace the literal escape sequences ``\\xa0`` and
    ``•\\t`` with a space, replace every ``string.punctuation`` character with
    a space, tokenize with NLTK, drop English stopwords (plus 'may'), and
    Porter-stem the remaining tokens.

    Args:
        document: The raw text as a ``str``.

    Returns:
        The processed tokens joined by single spaces.

    Raises:
        AttributeError: If ``document`` is not a string (for example NaN).
    """
    # Lowercase.
    document = document.lower()
    # These patterns are the literal characters backslash + "xa0" / "t",
    # not the actual non-breaking-space or tab characters.
    document = document.replace('\\xa0', ' ')
    document = document.replace('•\\t', ' ')
    # Remove special characters.
    document = _PUNCT_RE.sub(' ', document)
    # Remove stopwords, then stem.
    sw = _get_stopwords()
    stemmer = PorterStemmer()
    result_token = [
        stemmer.stem(word)
        for word in nltk.word_tokenize(document)
        if word not in sw
    ]
    # Convert back to a single string.
    return ' '.join(result_token)
    

In [None]:
# Run the preprocessing over every document.
X_list = list(X.values)
# The tokenizer cell further down reads `preprocessed_all_text`; without this
# assignment that name only exists if the commented-out pickle-loading cell is
# run, so a fresh top-to-bottom run would raise NameError.
preprocessed_all_text = [text_preprocessing(x) for x in X_list]
# `text` is kept as an alias because the pickle-saving cell dumps it under
# this name.
text = preprocessed_all_text

In [None]:
# Save the preprocessed text data and the encoded labels as pickles.
for pkl_path, payload in (
    ('preprocessed_all_text.pkl', text),
    ('y_encoding.pkl', y_enc),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
# #읽기
# import pickle
# with open('y_encoding.pkl','rb') as f:
#     y_enc=pickle.load(f)
# with open('preprocessed_all_text.pkl','rb') as f:
#     preprocessed_all_text=pickle.load(f)

In [None]:
# Settings shared by the tokenizer, the padding step and the model.
max_features = 10000  # passed to Tokenizer(num_words=...) and the Embedding layer
maxlen = 500          # passed to pad_sequences(maxlen=...)
batch_size = 32

# Fit the vocabulary on the preprocessed corpus.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(preprocessed_all_text)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
print('단어 집합의 크기 : %d' % vocab_size)

# Turn every document into an integer sequence and pad it to `maxlen`.
sequences = tokenizer.texts_to_sequences(preprocessed_all_text)
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.array(y_enc)
print(data.shape, labels.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Stratified split on the labels. The fixed random_state makes the split (and
# therefore the reported metrics) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, stratify=labels, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense,LSTM,SimpleRNN,Dropout

embedding_dim = 300

# Embedding -> LSTM -> three (Dense + Dropout) pairs -> one sigmoid output unit.
model = Sequential([
    Embedding(max_features, embedding_dim, input_length=maxlen),
    LSTM(32),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Binary classification set-up: a single sigmoid output trained with
# binary cross-entropy.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC', 'Recall', 'Precision'],
)

# Use the `batch_size` constant from the tokenizer cell (32) instead of a
# second hard-coded copy. validation_split holds out 20% of the training data.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=batch_size,
    validation_split=0.2,
)

# Persist the trained model in HDF5 format.
model.save('lstm_all_text.h5')

In [None]:
# Evaluate the trained model on the held-out test split.
model.evaluate(x=X_test, y=y_test)


In [None]:
## Load the saved model and predict with it

from tensorflow.keras.models import load_model
new_model = load_model('lstm_all_text.h5')

# NOTE(review): `test1` is not defined anywhere in the visible notebook; it is
# presumably an iterable of raw text strings - confirm and define it above.
test1_pre = [text_preprocessing(x) for x in test1]

# Reuse the tokenizer that was fitted on the training corpus. Fitting a fresh
# Tokenizer on the new texts would assign different integer ids to the words
# than the ones the embedding layer was trained on.
# NOTE: as in the original cell, this rebinds `sequences` and `data`.
sequences = tokenizer.texts_to_sequences(test1_pre)
data = pad_sequences(sequences, maxlen=maxlen)

# Sequential.predict_proba / predict_classes were removed in TensorFlow 2.6.
# predict() returns the sigmoid output; thresholding it at 0.5 gives the class
# labels that predict_classes used to return for a single sigmoid unit.
pred_proba = new_model.predict(data)
pred_cls = (pred_proba > 0.5).astype('int32')
