# **Setup**

In [1]:
# download data
# !wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1nckRRVYRiWG8VWoUY4wwtuQm6DjIbpmo' -O w_review_train.csv

# # download font for matplot
# !wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1DsfYE5uI1ZA_IXDNkezQTv-NQehxgZQy' -O THSarabun.ttf

# install library
!pip install pythainlp
# get tensorflow
# !pip install --upgrade pip
!pip install tensorflow



# Import Library
---

In [2]:
from tensorflow import keras

import pandas as pd
import numpy as np
from pythainlp import word_tokenize
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

---
### Prepare Data {Tokenize Word}
---

In [13]:
# check data
data = pd.read_csv("NLP_Elder_Companion.csv")
print(data.shape)
print(data.head())

# word tokenize
comment = data["Text"].apply(word_tokenize, engine="newmm", keep_whitespace=False)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(comment)
comment = tokenizer.texts_to_sequences(comment)

# adjust sequence to have same size
maxlen = max([len(s) for s in comment])
print("max len", maxlen)
x = pad_sequences(comment, maxlen=maxlen, padding="post")

y = np.array(data["Class"], dtype=np.int32).reshape(-1, 1)

# One-hot encode the target data
num_classes = 10
y = to_categorical(y, num_classes=num_classes)

xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2, random_state=10)

(477, 2)
   Class                        Text
0      0         คนแก่กินอะไรได้บ้าง
1      0                ควรกินอะไรดี
2      0               อาหารที่แนะนำ
3      0              ทานอะไรได้บ้าง
4      0  อาหารที่เหมาะกับผู้สูงอายุ
max len 22


In [27]:
from keras.layers import SimpleRNN, LSTM, Embedding, Flatten, Dense, Bidirectional
from keras.models import Sequential, Model
from keras.losses import CategoricalCrossentropy
from keras.optimizers import Adam

---
### Fully Connected | [Sequential](https://keras.io/api/models/sequential/), [Dense](https://keras.io/api/layers/core_layers/dense/), [Embedding](https://keras.io/api/layers/core_layers/embedding/)
---

In [28]:
# Tokenizer().word_index = word index
# +1 is from function paramiter
vocabSize = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocabSize,
                   output_dim=2000,
                   input_length=maxlen))
model.add(Bidirectional(LSTM(2000)))
model.add(Dense(10, activation="softmax"))

model.summary()

lost_func = CategoricalCrossentropy()
otm = Adam()
model.compile(optimizer=otm, loss=lost_func, metrics=["accuracy"])

print("x", xTrain.shape, ":", "y", yTrain.shape)
model.fit(xTrain, yTrain, batch_size=46, epochs=3, verbose=1)

y_predict = model.predict(xTest)
print(model.evaluate(xTest, yTest, verbose=1))

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 22, 2000)          1190000   
                                                                 
 bidirectional (Bidirection  (None, 4000)              64016000  
 al)                                                             
                                                                 
 dense_15 (Dense)            (None, 10)                40010     
                                                                 
Total params: 65246010 (248.89 MB)
Trainable params: 65246010 (248.89 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
x (381, 22) : y (381, 10)
Epoch 1/3
Epoch 2/3
Epoch 3/3
[1.1678630113601685, 0.6666666865348816]


In [29]:
# Tokenizer().word_index = word index
# +1 is from function paramiter
vocabSize = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocabSize,
                   output_dim=2000,
                   input_length=maxlen))
model.add(LSTM(2000))
model.add(Dense(10, activation="softmax"))

model.summary()

lost_func = CategoricalCrossentropy()
otm = Adam()
model.compile(optimizer=otm, loss=lost_func, metrics=["accuracy"])

print("x", xTrain.shape, ":", "y", yTrain.shape)
model.fit(xTrain, yTrain, batch_size=46, epochs=3, verbose=1)

y_predict = model.predict(xTest)
print(model.evaluate(xTest, yTest, verbose=1))

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_15 (Embedding)    (None, 22, 2000)          1190000   
                                                                 
 lstm_15 (LSTM)              (None, 2000)              32008000  
                                                                 
 dense_16 (Dense)            (None, 10)                20010     
                                                                 
Total params: 33218010 (126.72 MB)
Trainable params: 33218010 (126.72 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
x (381, 22) : y (381, 10)
Epoch 1/3
Epoch 2/3
Epoch 3/3
[2.113703727722168, 0.1458333283662796]
