In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import requests

In [6]:
!pip install konlpy



In [7]:
import numpy as np
import pandas as pd
import re
import json
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

train_data = pd.read_csv('https://github.com/HMinjae/heartfelt_project/raw/main/TF_train.txt', header = 0, delimiter='\t', quoting=3)

In [9]:
#전처리 함수 만들기
def preprocessing(review, okt, remove_stopwords = False, stop_words =[]):
  #함수인자설명
  # review: 전처리할 텍스트
  # okt: okt객체를 반복적으로 생성하지 않고 미리 생성 후 인자로 받음
  # remove_stopword: 불용어를 제거할지 여부 선택. 기본값 False
  # stop_words: 불용어 사전은 사용자가 직접 입력, 기본값 빈 리스트

  # 1. 한글 및 공백 제외한 문자 모두 제거
  review_text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]','',review)

  #2. okt 객체를 활용하여 형태소 단어로 나눔
  word_review = okt.morphs(review_text,stem=True)

  if remove_stopwords:
    #3. 불용어 제거(선택)
    word_review = [token for token in word_review if not token in stop_words]
  return word_review

In [10]:
# 전체 텍스트 전처리
stop_words = ['은','는','이','가','하','아','것','들','의','있','되','수','보','주','등','한']
okt = Okt()
clean_train_review = []

for review in train_data['document']:
  # 리뷰가 문자열인 경우만 전처리 진행
  if type(review) == str:
    clean_train_review.append(preprocessing(review,okt,remove_stopwords=True,stop_words= stop_words))
  else:
    clean_train_review.append([]) #str이 아닌 행은 빈칸으로 놔두기

clean_train_review[:4]

[['오늘', '하루', '도', '너무', '잘', '하다'],
 ['항상', '긍정', '적', '인', '마음', '당신', '을', '빛나다', '해', '요'],
 ['당신', '미소', '주변', '을', '환하다', '만들다'],
 ['매', '순간', '소중하다', '추억', '으로', '남', '을', '거', '예요']]

In [11]:
test_data = pd.read_csv('https://github.com/HMinjae/heartfelt_project/raw/main/TF_train.txt', header = 0, delimiter='\t', quoting=3)

clean_test_review = []
for review in test_data['document']:
  if type(review) == str:
    clean_test_review.append(preprocessing(review, okt, remove_stopwords=True, stop_words=stop_words))
  else:
    clean_test_review.append([])

In [12]:
# 인덱스 벡터 변환 후 일정 길이 넘어가거나 모자라는 리뷰 패딩처리
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_review)
train_sequences = tokenizer.texts_to_sequences(clean_train_review)
test_sequences = tokenizer.texts_to_sequences(clean_test_review)

word_vocab = tokenizer.word_index #단어사전형태
MAX_SEQUENCE_LENGTH = 1000 #문장 최대 길이

#학습 데이터
train_inputs = pad_sequences(train_sequences,maxlen=MAX_SEQUENCE_LENGTH,padding='post')

#학습 데이터 라벨 벡터화
train_labels = np.array(train_data['label'])

#평가 데이터
test_inputs = pad_sequences(test_sequences,maxlen=MAX_SEQUENCE_LENGTH,padding='post')
#평가 데이터 라벨 벡터화
test_labels = np.array(test_data['label'])

In [13]:
# 학습 데이터 불러오기
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import layers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
from tqdm import tqdm

In [14]:
import numpy as np
import requests
from io import BytesIO
from tensorflow.keras.preprocessing.sequence import pad_sequences

# nsmc_train_input.npy 파일 다운로드
response = requests.get('https://github.com/HMinjae/heartfelt_project/raw/main/DATA/CLEAN_DATA/nsmc_train_input.npy')
train_input = np.load(BytesIO(response.content))
train_input = pad_sequences(train_input, maxlen=train_input.shape[1])

# nsmc_train_label.npy 파일 다운로드
response = requests.get('https://github.com/HMinjae/heartfelt_project/raw/main/DATA/CLEAN_DATA/nsmc_train_label.npy')
train_label = np.load(BytesIO(response.content))

# `data_configs.json` 파일 다운로드
response = requests.get('https://github.com/HMinjae/heartfelt_project/raw/main/DATA/CLEAN_DATA/data_configs.json')
prepro_configs = json.loads(response.text)

In [15]:
model_name= 'cnn_classifier_kr'
BATCH_SIZE = 512
NUM_EPOCHS = 10
VALID_SPLIT = 0.1
MAX_LEN = train_input.shape[1]

kargs={'model_name': model_name, 'vocab_size':prepro_configs['vocab_size'],'embbeding_size':128, 'num_filters':100,'dropout_rate':0.5, 'hidden_dimension':250,'output_dimension':1}

In [16]:
class CNNClassifier(tf.keras.Model):

  def __init__(self, **kargs):
    super(CNNClassifier, self).__init__(name=kargs['model_name'])
    self.embedding = layers.Embedding(input_dim=kargs['vocab_size'], output_dim=kargs['embbeding_size'])
    self.conv_list = [layers.Conv1D(filters=kargs['num_filters'], kernel_size=kernel_size, padding='valid',activation = tf.keras.activations.relu,
                                    kernel_constraint = tf.keras.constraints.MaxNorm(max_value=3)) for kernel_size in [3,4,5]]
    self.pooling = layers.GlobalMaxPooling1D()
    self.dropout = layers.Dropout(kargs['dropout_rate'])
    self.fc1 = layers.Dense(units=kargs['hidden_dimension'],
                            activation = tf.keras.activations.relu,
                            kernel_constraint=tf.keras.constraints.MaxNorm(max_value=3.))
    self.fc2 = layers.Dense(units=kargs['output_dimension'],
                            activation=tf.keras.activations.sigmoid,
                            kernel_constraint= tf.keras.constraints.MaxNorm(max_value=3.))


  def call(self,x):
    x = self.embedding(x)
    x = self.dropout(x)
    x = tf.concat([self.pooling(conv(x)) for conv in self.conv_list], axis = 1)
    x = self.fc1(x)
    x = self.fc2(x)
    return x


In [18]:
model = CNNClassifier(**kargs)
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss = tf.keras.losses.BinaryCrossentropy(),
              metrics = [tf.keras.metrics.BinaryAccuracy(name='accuracy')])

In [19]:
INPUT_TEST_DATA = 'nsmc_test_input.npy'
LABEL_TEST_DATA = 'nsmc_test_label.npy'
SAVE_FILE_NM = 'model.keras'


# nsmc_train_input.npy 파일 다운로드
response = requests.get('https://github.com/HMinjae/heartfelt_project/raw/main/DATA/CLEAN_DATA/nsmc_test_input.npy')
test_input = np.load(BytesIO(response.content))
test_input = pad_sequences(test_input, maxlen=test_input.shape[1])

# nsmc_train_label.npy 파일 다운로드
response = requests.get('https://github.com/HMinjae/heartfelt_project/raw/main/DATA/CLEAN_DATA/nsmc_test_label.npy')
test_label_data = np.load(BytesIO(response.content))


In [20]:
import tensorflow as tf
import requests

# 모델 파일 다운로드
url = 'https://github.com/HMinjae/heartfelt_project/raw/main/DATA/my_models/model.keras'
r = requests.get(url)
open('model.keras', 'wb').write(r.content)

# 모델 로드
model = model = CNNClassifier(**kargs)

# 모델 컴파일 (필요시)
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')])

# 평가 실행
model.evaluate(test_input, test_label_data)


[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.1874 - loss: 0.7048


[0.692462146282196, 0.5073694586753845]

In [22]:
import numpy as np
import pandas as pd
import re
import json
import requests
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model
okt = Okt()
tokenizer  = Tokenizer()


urla = 'https://raw.githubusercontent.com/HMinjae/heartfelt_project/main/DATA/CLEAN_DATA/data_configs.json'

# URL에서 데이터를 가져옵니다.
response = requests.get(urla)

# 응답을 JSON으로 파싱합니다.
prepro_configs = json.loads(response.text)
prepro_configs['vocab'] = word_vocab

tokenizer.fit_on_texts(word_vocab)

MAX_LENGTH = 100 #문장최대길이

sentence = input('일기를 입력해주세요.: ')
print(sentence)
sentence = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\\s ]','', sentence)
stopwords = ['은','는','이','가','하','아','것','들','의','있','되','수','보','주','등','한'] # 불용어 추가할 것이 있으면 이곳에 추가
sentence = okt.morphs(sentence, stem=True) # 토큰화
sentence = [word for word in sentence if not word in stopwords] # 불용어 제거
vector  = tokenizer.texts_to_sequences(sentence)
pad_new = pad_sequences(vector, maxlen = MAX_LENGTH) # 패딩

model = model = CNNClassifier(**kargs)
predictions = model.predict(pad_new)
predictions = float(predictions.squeeze(-1)[1])
print(predictions)
if(predictions > 0.8):
  print("{:.2f}% 확률로 기분 매우 좋음입니다.\n".format(predictions * 100))
elif(predictions > 0.6):
  print("{:.2f}% 확률로 기분 좋음입니다.\n".format(predictions * 100))
elif(predictions > 0.4):
  print("{:.2f}% 확률로 기분 안 좋음입니다.\n".format(predictions * 100))
elif(predictions > 0.2):
  print("{:.2f}% 확률로 보통입니다.\n".format(predictions * 100))
else:
  print("{:.2f}% 확률로 기분 매우 안 좋음입니다.\n".format((predictions) * 100))

나는 오늘 치킨을 먹었다
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 149ms/step
0.4879351556301117
48.79% 확률로 기분 안 좋음입니다.



In [23]:
import pandas as pd
import random

#감정답변
df = pd.read_csv('https://github.com/HMinjae/heartfelt_project/raw/main/%EA%B0%90%EC%A0%95%EB%8B%B5%EB%B3%80.csv')  

if predictions >= 0.8:
    print(random.choice(df.iloc[:, 0].dropna().tolist()))
elif predictions >= 0.6:
    print(random.choice(df.iloc[:, 1].dropna().tolist()))
elif predictions >= 0.4:
    print(random.choice(df.iloc[:, 2].dropna().tolist()))
elif predictions >= 0.2:
    print(random.choice(df.iloc[:, 3].dropna().tolist()))
else:
    print(random.choice(df.iloc[:, 4].dropna().tolist()))

평범한 하루지만, 그 속에서도 즐거웠던 일, 보람 있었던 일 있었을 거야.


In [24]:
#노래추천
df = pd.read_csv('노래추천.csv')  

if predictions >= 0.8:
    print(random.choice(df.iloc[:, 0].dropna().tolist()))
elif predictions >= 0.6:
    print(random.choice(df.iloc[:, 1].dropna().tolist()))
elif predictions >= 0.4:
    print(random.choice(df.iloc[:, 2].dropna().tolist()))
elif predictions >= 0.2:
    print(random.choice(df.iloc[:, 3].dropna().tolist()))
else:
    print(random.choice(df.iloc[:, 4].dropna().tolist()))

재지팩트 – 하루종일


In [3]:
#행동추천
df = pd.read_csv('행동추천.csv')  
if predictions >= 0.4:
    print(random.choice(df.iloc[:, 0].dropna().tolist()))
else:
    print(random.choice(df.iloc[:, 4].dropna().tolist()))

Lobonabeat! – 빵댕이(Feat.노윤하)
