In [1]:
import pandas as pd
df = pd.read_csv("../Data/lstm.csv")
df.head()

Unnamed: 0,paragraph,category
0,dishplace is located in sunnyvale downtown the...,food
1,service can be slower during busy hours but ou...,food
2,portions are huge both french toast and their ...,food
3,we started with apps going the chicken and waf...,food
4,the biscuits and gravy was too salty two peopl...,food


In [2]:
df.category.unique()

array(['food', 'sports'], dtype=object)

- 단어하나하나 끊을수 있어야 RNN을 사용할 수 있다.

In [3]:
# 데이터에 사용된 중복 없는 전체 단어 갯수를 파악
# set은 중복값이 들어갈 수 없다.
results = set()
df['paragraph'].str.lower().str.split().apply(results.update)
vocab_size = len(results)
vocab_size

536

In [4]:
# numpy -> list로 변경
paragraphs = df['paragraph'].to_list() # 길이가 틀려서 numpy사용 불가
paragraphs[:5]

['dishplace is located in sunnyvale downtown there is parking around the area but it can be difficult to find during peak business hours my sisters and i came to this place for dinner on a weekday they were really busy so i highly recommended making reservations unless you have the patience to wait',
 'service can be slower during busy hours but our waiter was courteous and help gave some great entree recommendations',
 'portions are huge both french toast and their various omelettes are really good their french toast is probably 1.5x more than other brunch places great place to visit if you are hungry and dont want to wait 1 hour for a table',
 'we started with apps going the chicken and waffle slides and chicken nachos the sliders were amazing and the nachos were good too maybe by themselves the nachos would have scored better but after those sliders they were up against some tough competition',
 'the biscuits and gravy was too salty two people in my group had the gravy and all thoug

In [5]:
from tensorflow import keras
keras.utils.set_random_seed(1) # set seed

In [6]:
# 단어를 숫자로 인코딩 one hot encoding
encoded_paragraphs = [keras.preprocessing.text.one_hot(paragraph, vocab_size) for paragraph in paragraphs]
encoded_paragraphs

[[146,
  149,
  119,
  41,
  257,
  527,
  283,
  149,
  138,
  82,
  168,
  349,
  323,
  222,
  483,
  14,
  173,
  193,
  462,
  505,
  109,
  105,
  88,
  51,
  186,
  381,
  170,
  471,
  193,
  127,
  417,
  520,
  238,
  453,
  216,
  108,
  468,
  205,
  247,
  459,
  178,
  170,
  486,
  259,
  4,
  47,
  287,
  204,
  358,
  168,
  50,
  193,
  533],
 [226,
  483,
  14,
  315,
  505,
  459,
  88,
  323,
  234,
  285,
  153,
  320,
  381,
  291,
  57,
  100,
  439,
  175,
  83],
 [393,
  175,
  402,
  274,
  2,
  403,
  381,
  312,
  530,
  515,
  175,
  247,
  470,
  312,
  2,
  403,
  149,
  504,
  472,
  400,
  452,
  535,
  535,
  166,
  479,
  439,
  417,
  193,
  210,
  8,
  204,
  175,
  523,
  381,
  92,
  128,
  193,
  533,
  472,
  454,
  520,
  216,
  531],
 [436,
  498,
  148,
  422,
  434,
  168,
  226,
  381,
  269,
  343,
  381,
  226,
  259,
  168,
  533,
  205,
  205,
  381,
  168,
  259,
  205,
  470,
  254,
  384,
  383,
  14,
  168,
  259,
  12,
  358,
  18

In [7]:
# 데이터에서 가장 긴 문장의 단어 갯수 확인
max_length = 0
for row in df['paragraph']:
    if len(row.split(" ")) > max_length:
        max_length = len(row.split(" "))
print(max_length)

91


In [8]:
# 문장마다 단어 갯수가 다르므로 sequence padding을 넣어서 문장의 길이 동일하게 만들기 
# 이거하려고 위에서 단어갯수 체크한거임

padding_paragraphs_encoding = keras.preprocessing.sequence.pad_sequences(encoded_paragraphs, maxlen=max_length, padding='post')
padding_paragraphs_encoding

array([[146, 149, 119, ...,   0,   0,   0],
       [226, 483,  14, ...,   0,   0,   0],
       [393, 175, 402, ...,   0,   0,   0],
       ...,
       [414, 151, 520, ...,   0,   0,   0],
       [323, 168, 378, ...,   0,   0,   0],
       [216, 471, 168, ...,   0,   0,   0]], dtype=int32)

In [9]:
# 분류항목(food, sports)를 수치로 변경하기
categories = df['category'].to_list()

def category_encoding(category):
    if category == 'food':
        return [1,0]
    else:
        return [0,1]

In [10]:
encoded_category = [category_encoding(category) for category in categories]
encoded_category

[[1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [1, 0],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1],
 [0, 1]]

In [11]:
# Feature 확인
print(encoded_paragraphs[0])

[146, 149, 119, 41, 257, 527, 283, 149, 138, 82, 168, 349, 323, 222, 483, 14, 173, 193, 462, 505, 109, 105, 88, 51, 186, 381, 170, 471, 193, 127, 417, 520, 238, 453, 216, 108, 468, 205, 247, 459, 178, 170, 486, 259, 4, 47, 287, 204, 358, 168, 50, 193, 533]


In [12]:
len(encoded_paragraphs[19])

73

---
## 주제를 분류하는 모델 구현하기

- Embedding(vocab_size) : 원핫인코딩
- Embedding 레이어는 인덱스를 받아 5차원 백터의 임베딩을 출력합니다.
- 5차원(Hyper Parameter) 벡터의 임베딩? : 영어 5형식 -> 과거 미래 현재 처리를 하나도 안해서 사용한거고 한국어는 없음

In [13]:
model =keras.Sequential()
# 문맥 생성 단계

# 임베딩 레이어는 인덱스를 받아 5차원 백터의 임베딩을 출력한다. 
# 5차원 백터의 임베딩? 영어 5형식 (과거분사 , 현재, 등등)
model.add(keras.layers.Embedding(vocab_size, 5, input_length=max_length))    # 전체 536개, 5는 5차원, 인풋랭스 91개

model.add(keras.layers.LSTM(64)) 
# 64는 내마음대로 넣은 수 , 답이 없다.

# 분류 단계 (여기서는 렐루쓸수있다.)
model.add(keras.layers.Dense(32, activation='relu'))
model.add(keras.layers.Dense(2, activation='softmax'))
# 시그모이드 써도되는데 문장이라 확률값을 구해줘야해.

In [14]:
model.compile(loss='encoded_category', optimizer='adam', metrics='accuracy')

In [15]:
# numpy로 바꿔주기
import numpy as np
train_X = np.array(padding_paragraphs_encoding)
train_Y = np.array(encoded_category)

In [16]:
model.fit(train_X, train_Y, batch_size=10, epochs=50)

Epoch 1/50


2022-07-22 14:27:32.705849: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


ValueError: in user code:

    File "/Users/uyoung/miniforge3/envs/tensorflow/lib/python3.9/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/Users/uyoung/miniforge3/envs/tensorflow/lib/python3.9/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/uyoung/miniforge3/envs/tensorflow/lib/python3.9/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/Users/uyoung/miniforge3/envs/tensorflow/lib/python3.9/site-packages/keras/engine/training.py", line 890, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/uyoung/miniforge3/envs/tensorflow/lib/python3.9/site-packages/keras/engine/training.py", line 948, in compute_loss
        return self.compiled_loss(
    File "/Users/uyoung/miniforge3/envs/tensorflow/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 184, in __call__
        self.build(y_pred)
    File "/Users/uyoung/miniforge3/envs/tensorflow/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 133, in build
        self._losses = tf.nest.map_structure(self._get_loss_object, self._losses)
    File "/Users/uyoung/miniforge3/envs/tensorflow/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 272, in _get_loss_object
        loss = losses_mod.get(loss)
    File "/Users/uyoung/miniforge3/envs/tensorflow/lib/python3.9/site-packages/keras/losses.py", line 2367, in get
        return deserialize(identifier)
    File "/Users/uyoung/miniforge3/envs/tensorflow/lib/python3.9/site-packages/keras/losses.py", line 2322, in deserialize
        return deserialize_keras_object(
    File "/Users/uyoung/miniforge3/envs/tensorflow/lib/python3.9/site-packages/keras/utils/generic_utils.py", line 709, in deserialize_keras_object
        raise ValueError(

    ValueError: Unknown loss function: encoded_category. Please ensure this object is passed to the `custom_objects` argument. See https://www.tensorflow.org/guide/keras/save_and_serialize#registering_the_custom_object for details.
