# 기사 댓글 감성분석 LSTM 모델 생성 및 감성분석
#### 마지막 실행일자 : 23.12.04

# 라이브러리 설치

In [None]:
!pip install update konlpy

In [None]:
# Okt is a tagger class that ships inside konlpy (`from konlpy.tag import Okt`); it is not
# a separate distribution, so there is nothing to install here beyond konlpy itself.
# Asking pip for "Okt" would only pull in whatever unrelated package carries that name.

# 라이브러리 & 데이터 불러오기

In [None]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from konlpy.tag import Okt
from google.colab import drive

In [None]:
# Mount Google Drive so that files under /content/drive are reachable from this runtime.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# Load the labelled train/test sets and the unlabelled news comments to be scored.
# train/test use their first CSV column as the index; the comment file is cp949-encoded.
train = pd.read_csv("/content/drive/MyDrive/Models/train_fin.csv", index_col=0)
test = pd.read_csv("/content/drive/MyDrive/Models/test_fin.csv", index_col=0)
new = pd.read_csv("/content/drive/MyDrive/Models/네이버_뉴스_댓글.csv", encoding='cp949')

# Move each frame's index into an ordinary column and renumber the rows from 0.
for frame in (train, test, new):
    frame.reset_index(inplace=True)

In [None]:
new.tail()

In [None]:
# 데이터 확인
train.tail()

In [None]:
# 데이터 확인
test.tail()

# 데이터 전처리


## 자연어 정제

In [None]:
# Clean the training documents: every character that is not a word character (\w),
# '!' or '?' is replaced by a single space.
# Note that \w keeps letters of any script, digits and '_', not only Hangul.
cleaned_documents = train["document"].str.replace(pat=r'[^\w!?]', repl=r' ', regex=True)
train["document"] = cleaned_documents
train.head()

## 결측치 확인, 처리

In [None]:
# Drop every row of `new` (the comments to be scored) that has a missing value in any
# column, then print how many missing values remain per column.
new.dropna(axis='index', inplace=True)
print(new.isna().sum())

In [None]:
# train set 결측치 확인
print(train.isnull().sum())
print(test.isnull().sum())

In [None]:
# train, test set 행/열 확인
print(train.shape)
print(test.shape)
print(new.shape)

## 데이터 EDA (label의 분포 확인)

In [None]:
# train의 label 분포
train["label"].value_counts().plot(kind="bar")

In [None]:
# test의 label 분포
test["label"].value_counts().plot(kind="bar")

# 자연어 처리
https://somjang.tistory.com/entry/Keras%EA%B8%B0%EC%82%AC-%EC%A0%9C%EB%AA%A9%EC%9D%84-%EA%B0%80%EC%A7%80%EA%B3%A0-%EA%B8%8D%EC%A0%95-%EB%B6%80%EC%A0%95-%EC%A4%91%EB%A6%BD-%EB%B6%84%EB%A5%98%ED%95%98%EB%8A%94-%EB%AA%A8%EB%8D%B8-%EB%A7%8C%EB%93%A4%EC%96%B4%EB%B3%B4%EA%B8%B0

In [None]:
# Separate the text column (features) from the label column (targets) for both sets.
X_train, X_test = train['document'], test['document']
y_train, y_test = np.array(train["label"]), np.array(test["label"])

In [None]:
# Stopwords: tokens that are removed from every document after morpheme analysis.
stopwords = [
    '의', '가', '이', '은', '들', '는',
    '좀', '잘', '걍', '과', '도', '를',
    '으로', '자', '에', '와', '한', '하다',
]

In [None]:
# Tokenisation of the training documents.
# Each document is split into stemmed morphemes with Okt and the stopwords are removed.
import konlpy
from konlpy.tag import Okt

okt = Okt()
X_train = [
    [morpheme for morpheme in okt.morphs(document, stem=True) if morpheme not in stopwords]
    for document in train['document']
]

# NOTE: the test documents are not tokenised in this cell; X_test is left untouched.

In [None]:
# Tokenisation of the new comments.
# The comments get the same character cleaning as the training documents (anything that
# is not a word character, '!' or '?' becomes a space) so that training and inference see
# identically pre-processed text. The cleaning is done on a temporary Series, so the
# `Comments` column of `new` keeps its original text.
cleaned_comments = new['Comments'].str.replace(pat=r'[^\w!?]', repl=r' ', regex=True)

# Stemmed Okt morphemes with the stopwords removed, one token list per comment.
X_new = [
    [word for word in okt.morphs(sentence, stem=True) if word not in stopwords]
    for sentence in cleaned_comments
]

In [None]:
# Integer encoding.
# The vocabulary is fitted on the tokenised training documents only and is then used to
# turn both the training documents and the new comments into sequences of word indices.
from keras.preprocessing.text import Tokenizer

max_words = 35000  # handed to Tokenizer as num_words
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train, X_new = (tokenizer.texts_to_sequences(tokens) for tokens in (X_train, X_new))

In [None]:
# train 데이터 확인
print("댓글의 최대 길이 : ", max(len(l) for l in X_train))
print("댓글의 평균 길이 : ", sum(map(len, X_train))/ len(X_train))
plt.hist([len(s) for s in X_train], bins=50)
plt.xlabel('length of Data')
plt.ylabel('number of Data')
plt.show()

In [None]:
# test 데이터 확인
print("제목의 최대 길이 : ", max(len(l) for l in X_test))
print("제목의 평균 길이 : ", sum(map(len, X_test))/ len(X_test))
plt.hist([len(s) for s in X_test], bins=50)
plt.xlabel('length of Data')
plt.ylabel('number of Data')
plt.show()

In [None]:
# One-hot encoding of the sentiment labels.
import numpy as np


def _one_hot_labels(labels):
    """Return an (n, 3) integer one-hot array for labels drawn from {-1, 0, 1}.

    Row layout: label -1 -> [1, 0, 0], label 0 -> [0, 1, 0], label 1 -> [0, 0, 1].

    Raises:
        ValueError: if any label is not -1, 0 or 1 (including missing values).
            Skipping such rows silently would leave the targets shorter than, and
            misaligned with, the feature rows.
    """
    values = np.asarray(labels)
    recognised = np.isin(values, (-1, 0, 1))
    if not recognised.all():
        unexpected = sorted({repr(value) for value in values[~recognised].tolist()})
        raise ValueError(
            "labels must be -1, 0 or 1; found: " + ", ".join(unexpected)
        )
    # label + 1 is the position of the 1 inside the one-hot row.
    return np.eye(3, dtype=int)[values.astype(int) + 1]


# Encode both splits so that y_test stays usable for evaluation instead of being
# replaced by an empty list.
y_train = _one_hot_labels(train['label'])
y_test = _one_hot_labels(test['label'])

In [None]:
# Keras imports for the model, plus padding of the integer sequences.
from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 20  # every sequence is brought to this length

# The training documents and the new comments are padded to the same length.
X_train, X_new = (pad_sequences(seqs, maxlen=max_len) for seqs in (X_train, X_new))

In [None]:
# Build, compile and train the classifier:
# embedding (max_words x 100) -> LSTM with 128 units -> 3-way softmax.
model = Sequential([
    Embedding(max_words, 100),
    LSTM(128),
    Dense(3, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
# 10 epochs, mini-batches of 10 samples, 10% of the training data held out for validation.
history = model.fit(X_train, y_train, epochs=10, batch_size=10, validation_split=0.1)

In [None]:
# 모델 저장
import joblib  # 모델저장/불러오기
joblib.dump(model,'/content/drive/MyDrive/Models/lstm_model_2023.pkl')  #모델저장

In [None]:
import joblib  # 모델저장/불러오기
model = joblib.load('/content/drive/MyDrive/Models/lstm_model_2023.pkl')   #모델불러오기

In [None]:
# import tensorflow as tf

# 예측
# predict = model.predict(X_test)
# X_new = X_new.astype(np.l)
# X_new=np.asarray(X_new).astype(np.float32)
predict = model.predict(X_new)

In [None]:
import numpy as np
predict_labels = np.argmax(predict, axis=1)
# original_labels = np.argmax(y_test, axis=1)

In [None]:
predict_labels.shape

In [None]:
# Print every new comment next to its predicted class index.
for comment, label in zip(new['Comments'], predict_labels):
    print("기사댓글 : ", comment, "/\t label : ", label)

In [None]:
# Renumber the rows of `new` from 0 so that they line up with the positional index of the
# prediction Series when the two are concatenated column-wise.
# drop=True discards the old index instead of inserting it as one more column: `new`
# already has an 'index' column, so the old index would otherwise be stored as an extra
# 'level_0' column (which ends up in the exported file), and re-running this cell would
# raise because 'level_0' already exists.
new.reset_index(drop=True, inplace=True)

In [None]:
predict_labels_df = pd.Series(predict_labels)
predict_labels_df

In [None]:
predicted_df = pd.concat([new, predict_labels_df], axis=1)
predicted_df

In [None]:
predicted_df.rename(columns = {0 : 'label', 'content':'comment'}, inplace = True)

In [None]:
predicted_df.head()

In [None]:
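# Labels in the dataset: 1 = positive, 0 = negative, -1 = neutral
# Argmax class index from the model: 0 = neutral (-1), 1 = negative (0), 2 = positive (1)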

In [None]:
predicted_df['label'].unique()

In [None]:
predicted_df[predicted_df['label']=='-1'].head(20)

In [None]:
predicted_df['label'] = predicted_df['label'].replace(0, '-1')

In [None]:
predicted_df['label'] = predicted_df['label'].replace(1, '0')

In [None]:
predicted_df['label'] = predicted_df['label'].replace(2, '1')

In [None]:
predicted_df.drop(labels='index', axis=1, inplace=True)

In [None]:
predicted_df

In [None]:
# Test accuracy.
# model.evaluate needs padded integer sequences and one-hot targets, so both are built
# from `test` right here with the same steps used for the training data:
# character cleaning -> stemmed Okt morphemes -> stopword removal -> fitted tokenizer
# -> padding to max_len. This keeps the cell self-contained.
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Rows without a document or a label cannot be scored.
test_eval = test.dropna(subset=['document', 'label'])

test_documents = test_eval['document'].str.replace(pat=r'[^\w!?]', repl=r' ', regex=True)
X_test = [
    [word for word in okt.morphs(sentence, stem=True) if word not in stopwords]
    for sentence in test_documents
]
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)

# One-hot targets in the class order used for training: -1 -> 0, 0 -> 1, 1 -> 2.
test_labels = test_eval['label'].to_numpy()
if not np.isin(test_labels, (-1, 0, 1)).all():
    raise ValueError("test labels must be -1, 0 or 1")
y_test = np.eye(3, dtype=int)[test_labels.astype(int) + 1]

print('테스트 정확도: {:.2f}%'.format(model.evaluate(X_test, y_test)[1]*100))

In [None]:
# Export the comments together with their predicted labels as a CSV file on Google Drive.
# utf-8-sig writes a byte-order mark — presumably so spreadsheet tools detect the encoding.
output_path = '/content/drive/MyDrive/Models/네이버_뉴스_댓글_감성분석추가.csv'
predicted_df.to_csv(output_path, encoding='utf-8-sig')