# Environment Setup

### _Create new environment and activate_

```
conda create -n autobot python=3.9
conda activate autobot
```

### _Install Packages (OSX)_

In [9]:
%conda install -c apple tensorflow-deps==2.9.0
%pip install tensorflow-macos==2.9.0
%pip install tensorflow-metal
%pip install pandas
%pip install sklearn

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting scikit-learn
  Using cached scikit_learn-1.1.1-cp39-cp39-macosx_12_0_arm64.whl (7.7 MB)
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting scipy>=1.3.2
  Using cached scipy-1.8.1-cp39-cp39-macosx_12_0_arm64.whl (28.7 MB)
Collecting joblib>=1.0.0
  Using cached joblib-1.1.0-py2.py3-none-any.whl (306 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel 

### _For Macbook M1_

download tensorflow-text package
- Python 3.8
  https://github.com/sun1638650145/Libraries-and-Extensions-for-TensorFlow-for-Apple-Silicon/releases/download/v2.9/tensorflow_text-2.9.0-cp38-cp38-macosx_11_0_arm64.whl
- Python 3.9
  https://github.com/sun1638650145/Libraries-and-Extensions-for-TensorFlow-for-Apple-Silicon/releases/download/v2.9/tensorflow_text-2.9.0-cp39-cp39-macosx_11_0_arm64.whl

and run

```pip install _DOWNLOAD_FILE_```

# Execute Notebook

### _Import Libraries_

In [10]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text # Important: this library is not used but needed

import numpy as np
import pandas as pd
import sklearn

### _Declare Constants_

In [2]:
intent_keyword_csv = 'dataset/dip_keyword_v1.csv'
intent_label_csv = 'dataset/dip_answer_v1.csv'
encoder_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3'

# load encoder model
encoder = hub.load(encoder_url)

def embed_text(text):
    return encoder(text)

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-07-14 21:17:54.172354: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-07-14 21:17:54.172939: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-07-14 21:17:56.744781: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-07-14 21:17:56.750088: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


### _Import Dataset_

In [3]:
df = pd.read_csv(intent_keyword_csv)
print(df.head(10))

class_names = list(set(df['intent_id'].to_list()))
print(f'total class in dataset = {len(class_names)}')

intents = df['intent_id']
keywords = df['keyword']

features = []
labels = []

# embedding keywords
for i, keyword in enumerate(keywords):
    embed = embed_text(keyword)
    # print(i, keyword)
    features.append(embed)
    
    for j, name in enumerate(class_names):
        if (name == intents[i]):
            labels.append(j)

print(keywords[0], labels[0])

    intent_id                            keyword
0  1650358265                          สิทธิบัตร
1  1650358265                    สอบถามสิทธิบัตร
2  1650359775                      DIP e-Service
3  1650359775          แอพพลิเคชัน DIP e-Service
4  1650359775  ดาวน์โหลดแอพลิเคชัน DIP e-Service
5  1650359775                             แอพIOS
6  1650359775                         แอพAndroid
7  1650359775                             AppIOS
8  1650359775                         AppAndroid
9  1650359841                     ส่งคำร้องเรียน
total class in dataset = 403


2022-07-14 21:18:30.852826: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


สิทธิบัตร 401


### _One-Hot Encoding_

In [5]:
features = np.array(features)
labels = np.array(labels)

labels_onehot = np.zeros([labels.size, labels.max() + 1])
labels_onehot[np.arange(labels.size), labels] = 1

print(features.shape)
print(labels_onehot.shape)


(1135, 1, 512)
(1135, 403)


### _Convert to TensorFlow Dataset_

In [6]:
dataset = tf.data.Dataset.from_tensor_slices((features, labels_onehot))

# shuffle_buffer_size >= dataset size
shuffle_buffer_size = 1000
dataset = dataset.shuffle(shuffle_buffer_size)

test_size = 0.1
batch_size = 16

test_samples = round(test_size * len(intents))

train_ds = dataset.skip(test_samples)
test_ds = dataset.take(test_samples)

train_ds = train_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

### _Model Definition_

In [7]:
def create_model():
  model = tf.keras.Sequential([
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True), input_shape=(1, 512)),
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dropout(0.5),
      tf.keras.layers.Dense(len(class_names), activation='softmax')
  ])
  return model

model = create_model()

### _Compile and Define Callbacks_

Adam is usually faster, but SGD is more likely to achieve global minimum

Use callbacks to stop training / save model checkpoints

In [8]:
model.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy']
)

model.summary()

callbacks = []

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=10,
    verbose=1,
    restore_best_weights=True,
    mode='auto'
)

callbacks.append(early_stop)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 1, 256)           656384    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               73984     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 403)               26195     
                                                                 
Total params: 760,723
Trainable params: 760,723
Non-trai

### _Train the Model_

In [9]:
epochs = 50000000 # With callbacks this can be arbitrarily large

history = model.fit(
    train_ds,
    epochs=epochs,
    validation_data=test_ds,
    callbacks=callbacks
)

model.save('model/simple')

Epoch 1/50000000


2022-07-14 21:22:08.590690: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-14 21:22:09.126368: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-14 21:22:09.145346: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-14 21:22:09.297984: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-14 21:22:09.311834: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-14 21:22:09.915749: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-14 21:22:09.936911: I tensorflow/core/grappler/optimizers/cust



2022-07-14 21:22:13.214113: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-14 21:22:13.403673: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-14 21:22:13.413065: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-14 21:22:13.471493: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-07-14 21:22:13.480951: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50000000
Epoch 3/50000000
Epoch 4/50000000
Epoch 5/50000000
Epoch 6/50000000
Epoch 7/50000000
Epoch 8/50000000
Epoch 9/50000000
Epoch 10/50000000
Epoch 11/50000000
Epoch 12/50000000
Epoch 13/50000000
Epoch 14/50000000
Epoch 15/50000000
Epoch 16/50000000
Epoch 17/50000000
Epoch 18/50000000
Epoch 19/50000000
Epoch 20/50000000
Epoch 21/50000000
Epoch 22/50000000
Epoch 23/50000000
Epoch 24/50000000
Epoch 25/50000000
Epoch 26/50000000
Epoch 27/50000000
Epoch 28/50000000
Epoch 29/50000000
Epoch 30/50000000
Epoch 31/50000000
Epoch 32/50000000
Epoch 33/50000000
Epoch 34/50000000
Epoch 35/50000000
Epoch 36/50000000
Epoch 37/50000000
Epoch 38/50000000
Epoch 39/50000000
Epoch 40/50000000
Epoch 41/50000000
Epoch 42/50000000
Epoch 43/50000000
Epoch 44/50000000
Epoch 45/50000000
Epoch 46/50000000
Epoch 47/50000000
Epoch 48/50000000
Epoch 49/50000000
Epoch 50/50000000
Epoch 51/50000000
Epoch 52/50000000
Epoch 53/50000000
Epoch 54/50000000
Epoch 55/50000000
Epoch 56/50000000
Epoch 57/50000000




INFO:tensorflow:Assets written to: model/simple/assets


INFO:tensorflow:Assets written to: model/simple/assets


### _Load Intent Answer_

In [4]:

intent_df = pd.read_csv(intent_label_csv)
match_intent = intent_df.loc[intent_df['intent_id'] == 1650359775]
print(match_intent['message'])

0    สามารถดาวน์โหลดแอพลิเคชัน DIP e-Service ได้ทั้...
Name: message, dtype: object


### _Load Saved Model_

In [None]:
model = tf.keras.models.load_model('save/simple')
model.summary()

### _Testing the Model_

In [14]:
confidence_threshold = 80

raw_input = 'ร้องเรียนได้ที่ไหน'
embed = encoder(raw_input)
embed = tf.reshape(embed, (1, 1, 512))
prediction = model.predict(embed)

top2_ind = prediction.argsort()[:,::-1][:,:4]
print(top2_ind[0])

top_ind = prediction.argmax()
print(top_ind)

predicted_index = np.argmax(prediction)
print(predicted_index)

confidence = prediction[0][predicted_index] * 100
match_intent = intent_df.loc[intent_df['intent_id'] == class_names[predicted_index]]
answer = match_intent.to_numpy()

print(f'Input: {raw_input}')
print(f'Confidence: {confidence:.2f}%')
print(f'Result: {answer[0]}')

if confidence < confidence_threshold:
    predicted_removed = np.delete(prediction, predicted_index, 1)
    second_option = np.argmax(predicted_removed)
    print(second_option)
    print(predicted_removed)
    confidence = predicted_removed[0][second_option] * 100
    print(confidence)
    print('\n')
    print(f'Model also considers {class_names[second_option]}')

[289 285 106  48]
289
289
Input: ร้องเรียนได้ที่ไหน
Confidence: 46.64%
Result: [1657015867 '1. ทางโทรศัพท์สายด่วน 1368 2. ทางโทรศัพท์ 0-2547-4....']
285
[[5.48436283e-08 2.28251604e-21 9.01899111e-11 1.15645103e-22
  1.87099000e-23 5.74841737e-12 2.30756791e-09 1.25097523e-08
  1.00479160e-28 1.06229148e-16 1.83505387e-16 1.31088337e-12
  1.05861934e-11 2.15584641e-23 1.05716362e-16 1.20978599e-14
  2.70987713e-20 4.36574223e-18 2.26612173e-10 7.77097896e-21
  1.67082391e-25 3.32152535e-11 2.33190211e-26 6.36045297e-27
  2.05800731e-07 8.50485833e-08 1.36861611e-25 1.00928865e-22
  1.79311148e-21 3.57703581e-15 1.84617291e-23 8.72090247e-11
  1.00055624e-27 5.29596935e-22 7.65471417e-17 6.49579427e-25
  3.40292898e-11 4.75630585e-35 5.10094269e-20 3.96549397e-29
  1.29938173e-18 2.56237365e-09 2.45494791e-10 2.24470389e-14
  6.05966418e-16 5.17235582e-18 1.05625273e-16 1.88092248e-18
  2.33523780e-03 1.15008163e-07 2.38016778e-22 1.24973477e-25
  3.67933378e-17 1.71549459e-24 3.9400741

In [11]:
from sklearn.preprocessing import LabelEncoder

In [95]:
df = pd.read_csv('dataset/v1/q_data.csv')
y = df['intent_id']

le = LabelEncoder()
le.fit(y)
print(le.classes_)

labels = le.transform(y)
print(labels)

intent_id = '1c0ab913-6a04-4e44-a139-d7b8df8b9172'
label_df = pd.read_csv('dataset/v1/a_data.csv')
match = label_df.loc[label_df["intent_id"] == intent_id]

print(match)

messages = match.to_dict("list")["response"] if not match.empty else None
print(messages)

0        สวัสดี
1    สวัสดีครับ
2     สวัสดีค่ะ
3         hello
4            hi
5      greeting
6        สบายดี
7         ว่าไง
8      ถามหน่อย
9       มีคำถาม
Name: keyword, dtype: object
['42fddf8b-9c83-4489-a4c8-586addeb4498'
 '4fa6ed26-eef8-46b4-b7df-23ec0becf933'
 '9b7420e9-678d-46e8-98cc-5d106b61e71f'
 'f7716d57-1666-423a-be1d-7be24010889b']
[3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 2 2 2 2 2 2 2 2]
Empty DataFrame
Columns: [intent_id, response]
Index: []
None
