# **Setup**

In [1]:
# download data
# !wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1nckRRVYRiWG8VWoUY4wwtuQm6DjIbpmo' -O w_review_train.csv

# # download font for matplot
# !wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1DsfYE5uI1ZA_IXDNkezQTv-NQehxgZQy' -O THSarabun.ttf

# install library
!pip install pythainlp
# get tensorflow
# !pip install --upgrade pip
!pip install tensorflow



# Import Library
---

In [2]:
from tensorflow import keras

import pandas as pd
import numpy as np
from pythainlp import word_tokenize
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

---
### Prepare Data {Tokenize Word}
---

In [3]:
# check data
data = pd.read_csv("NLP_Elder_Companion.csv")
print(data.shape)
print(data.head())

# word tokenize
comment = data["Text"].apply(word_tokenize, engine="newmm", keep_whitespace=False)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(comment)
comment = tokenizer.texts_to_sequences(comment)

# adjust sequence to have same size
maxlen = max([len(s) for s in comment])
print("max len", maxlen)
x = pad_sequences(comment, maxlen=maxlen, padding="post")

y = np.array(data["Class"], dtype=np.int32).reshape(-1, 1)

# One-hot encode the target data
num_classes = 10
y = to_categorical(y, num_classes=num_classes)

xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2, random_state=22)

(477, 2)
   Class                        Text
0      0         คนแก่กินอะไรได้บ้าง
1      0                ควรกินอะไรดี
2      0               อาหารที่แนะนำ
3      0              ทานอะไรได้บ้าง
4      0  อาหารที่เหมาะกับผู้สูงอายุ
max len 22


In [4]:
from keras.layers import SimpleRNN, LSTM, Embedding, Flatten, Dense, Bidirectional
from keras.models import Sequential, Model
from keras.losses import CategoricalCrossentropy
from keras.optimizers import Adam

In [5]:
# Tokenizer().word_index = word index
# +1 is from function paramiter
vocabSize = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocabSize,
                   output_dim=2000,
                   input_length=maxlen))
model.add(Bidirectional(LSTM(2000)))
model.add(Dense(10, activation="softmax"))

model.summary()

lost_func = CategoricalCrossentropy()
otm = Adam()
model.compile(optimizer=otm, loss=lost_func, metrics=["accuracy"])

print("x", xTrain.shape, ":", "y", yTrain.shape)
model.fit(xTrain, yTrain, batch_size=46, epochs=2, verbose=1)

y_predict = model.predict(xTest)
print(model.evaluate(xTest, yTest, verbose=1))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 22, 2000)          1190000   
                                                                 
 bidirectional (Bidirection  (None, 4000)              64016000  
 al)                                                             
                                                                 
 dense (Dense)               (None, 10)                40010     
                                                                 
Total params: 65246010 (248.89 MB)
Trainable params: 65246010 (248.89 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
x (381, 22) : y (381, 10)
Epoch 1/2
Epoch 2/2
[1.250520944595337, 0.6458333134651184]


In [6]:
# Tokenizer().word_index = word index
# +1 is from function paramiter
vocabSize = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocabSize,
                   output_dim=2000,
                   input_length=maxlen))
model.add(LSTM(2000))
model.add(Dense(10, activation="softmax"))

model.summary()

lost_func = CategoricalCrossentropy()
otm = Adam()
model.compile(optimizer=otm, loss=lost_func, metrics=["accuracy"])

print("x", xTrain.shape, ":", "y", yTrain.shape)
model.fit(xTrain, yTrain, batch_size=46, epochs=3, verbose=1)

y_predict = model.predict(xTest)
print(model.evaluate(xTest, yTest, verbose=1))

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 22, 2000)          1190000   
                                                                 
 lstm_1 (LSTM)               (None, 2000)              32008000  
                                                                 
 dense_1 (Dense)             (None, 10)                20010     
                                                                 
Total params: 33218010 (126.72 MB)
Trainable params: 33218010 (126.72 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
x (381, 22) : y (381, 10)
Epoch 1/3
Epoch 2/3
Epoch 3/3
[1.9090566635131836, 0.21875]


In [7]:
# Tokenizer().word_index = word index
# +1 is from function paramiter
vocabSize = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocabSize,
                   output_dim=2000,
                   input_length=maxlen))
model.add(SimpleRNN(2000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(500,  activation="relu"))
model.add(Dense(10, activation="softmax"))

model.summary()

lost_func = CategoricalCrossentropy()
otm = Adam()
model.compile(optimizer=otm, loss=lost_func, metrics=["accuracy"])

print("x", xTrain.shape, ":", "y", yTrain.shape)
model.fit(xTrain, yTrain, batch_size=46, epochs=3, verbose=1)

y_predict = model.predict(xTest)
print(model.evaluate(xTest, yTest, verbose=1))

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 22, 2000)          1190000   
                                                                 
 simple_rnn (SimpleRNN)      (None, 2000)              8002000   
                                                                 
 dense_2 (Dense)             (None, 1000)              2001000   
                                                                 
 dense_3 (Dense)             (None, 500)               500500    
                                                                 
 dense_4 (Dense)             (None, 10)                5010      
                                                                 
Total params: 11698510 (44.63 MB)
Trainable params: 11698510 (44.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
x (381, 22) 

In [8]:
# Tokenizer().word_index = word index
# +1 is from function paramiter
vocabSize = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocabSize,
                   output_dim=2000,
                   input_length=maxlen))
model.add(Dense(2000, activation="relu"))
model.add(Dense(2000, activation="relu"))
model.add(Dense(10, activation="softmax"))

model.summary()

lost_func = CategoricalCrossentropy()
otm = Adam()
model.compile(optimizer=otm, loss=lost_func, metrics=["accuracy"])

print("x", xTrain.shape, ":", "y", yTrain.shape)
model.fit(xTrain, yTrain, batch_size=46, epochs=3, verbose=1)

y_predict = model.predict(xTest)
print(model.evaluate(xTest, yTest, verbose=1))

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 22, 2000)          1190000   
                                                                 
 dense_5 (Dense)             (None, 22, 2000)          4002000   
                                                                 
 dense_6 (Dense)             (None, 22, 2000)          4002000   
                                                                 
 dense_7 (Dense)             (None, 22, 10)            20010     
                                                                 
Total params: 9214010 (35.15 MB)
Trainable params: 9214010 (35.15 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
x (381, 22) : y (381, 10)
Epoch 1/3


ValueError: in user code:

    File "C:\Users\Lenovo\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Lenovo\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Lenovo\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Lenovo\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1081, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\Lenovo\anaconda3\lib\site-packages\keras\src\engine\training.py", line 1139, in compute_loss
        return self.compiled_loss(
    File "C:\Users\Lenovo\anaconda3\lib\site-packages\keras\src\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\Lenovo\anaconda3\lib\site-packages\keras\src\losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\Lenovo\anaconda3\lib\site-packages\keras\src\losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\Lenovo\anaconda3\lib\site-packages\keras\src\losses.py", line 2122, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "C:\Users\Lenovo\anaconda3\lib\site-packages\keras\src\backend.py", line 5560, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 10) and (None, 22, 10) are incompatible
