# GPT - 01418496
**สมาชิกกลุ่ม**

นายศิวกร ภาสว่าง 6410451423

นางสาว แพรวรุ้ง พุดชะวา 6410451253

นางสาว มารีน่า มิทซุย 6410450222

หมู่ 200

ชุดข้อมูล : Disneyland Reviews

ลิงก์ดาวน์โหลด : https://www.kaggle.com/datasets/arushchillar/disneyland-reviews

In [9]:
import tensorflow as tf
import numpy as np
import os
import kagglehub
import shutil
import pandas as pd

## Setting to execute on Processor (GPU or CPU)

In [3]:
# Pick the execution device: use the GPU(s) when TensorFlow can see any,
# otherwise fall back to the CPU.
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    # Memory growth has to be configured identically on every visible GPU,
    # so enable it on all of them rather than only on the first one.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    print("Execute on GPU")
else:
    print("Execute on CPU")

Execute on GPU


## Download Dataset

In [10]:
# Fetch the Kaggle dataset once; later runs reuse the local ./dataset folder.
if "dataset" in os.listdir("."):
    print("Download Dataset Already")
else:
    # Download through kagglehub, then move the returned folder next to the
    # notebook so the rest of the cells can use a fixed relative path.
    path = kagglehub.dataset_download("arushchillar/disneyland-reviews")
    print("Path to dataset files:", path)
    shutil.move(path, "./dataset")
    print("Download Dataset Complete")

Path to dataset files: /root/.cache/kagglehub/datasets/arushchillar/disneyland-reviews/versions/1
Download Dataset Complete


## Preprocessing

In [28]:
# Location of the reviews CSV inside the downloaded dataset folder.
file_path = "./dataset/DisneylandReviews.csv"
# The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default encoding.
df = pd.read_csv(file_path, encoding="ISO-8859-1")

# Preview the first rows to check which columns are available.
print(df.head())

   Review_ID  Rating Year_Month     Reviewer_Location  \
0  670772142       4     2019-4             Australia   
1  670682799       4     2019-5           Philippines   
2  670623270       4     2019-4  United Arab Emirates   
3  670607911       4     2019-4             Australia   
4  670607296       4     2019-4        United Kingdom   

                                         Review_Text               Branch  
0  If you've ever been to Disneyland anywhere you...  Disneyland_HongKong  
1  Its been a while since d last time we visit HK...  Disneyland_HongKong  
2  Thanks God it wasn   t too hot or too humid wh...  Disneyland_HongKong  
3  HK Disneyland is a great compact park. Unfortu...  Disneyland_HongKong  
4  the location is not in the city, took around 1...  Disneyland_HongKong  


In [33]:
# Keep only the four fields used to build the training text and
# lower-case their column names in the same step.
df_selected = df[
    ["Reviewer_Location", "Branch", "Rating", "Review_Text"]
].rename(columns=str.lower)
print(df_selected.head())

      reviewer_location               branch  rating  \
0             Australia  Disneyland_HongKong       4   
1           Philippines  Disneyland_HongKong       4   
2  United Arab Emirates  Disneyland_HongKong       4   
3             Australia  Disneyland_HongKong       4   
4        United Kingdom  Disneyland_HongKong       4   

                                         review_text  
0  If you've ever been to Disneyland anywhere you...  
1  Its been a while since d last time we visit HK...  
2  Thanks God it wasn   t too hot or too humid wh...  
3  HK Disneyland is a great compact park. Unfortu...  
4  the location is not in the city, took around 1...  


In [36]:
# Convert the frame into a list of dicts, one per review, keyed by column name.
data_list = df_selected.to_dict(orient="records")
# Inspect the first record.
data_list[0]

{'reviewer_location': 'Australia',
 'branch': 'Disneyland_HongKong',
 'rating': 4,
 'review_text': "If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides  its a Small World  is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well. "}

### Sequence construction

In [38]:
# Fields that must all be present for a review to be turned into a training string.
required_fields = ("reviewer_location", "branch", "rating", "review_text")

# Build one training string per review in the form
#   "Disneyland review : <location> : <branch> : <rating> : <text>".
# Missing values coming from pandas are NaN rather than None, so the check
# uses pd.notna (which rejects both); a NaN field would otherwise slip past
# an `is not None` test.
filtered_data = [
    "Disneyland review : "
    f"{x['reviewer_location']} : {x['branch']} : {x['rating']} : {x['review_text']}"
    for x in data_list
    if all(pd.notna(x[field]) for field in required_fields)
]

# Report how many reviews survived the filter.
n_data = len(filtered_data)
print(f"{n_data} reviews loaded")

# Show one formatted example.
example = filtered_data[10]
print(example)

42656 recipes loaded
Disneyland review : United States : Disneyland_HongKong : 5 : Disneyland never cease to amaze me! I've been to Disneyland florida and I thought I have exhausted the kid in me but nope! I still had so much fun in disneyland hong kong. 2 DL off my bucketlist and more to come!     


### Tokenization

In [40]:
import re
import string

def pad_punctuation(s):
    """Surround punctuation with spaces so each mark becomes its own token.

    Any character that is not a word character, whitespace, an apostrophe
    or a hyphen is wrapped in single spaces; runs of spaces are then
    collapsed to one space.

    Args:
        s: The text to pad.

    Returns:
        The padded text.
    """
    # Underscores (matched by \w), apostrophes and hyphens are not split off.
    spaced = re.sub(r"([^\w\s'-])", r" \1 ", s)
    return re.sub(" +", " ", spaced)

# Apply the punctuation padding to every formatted review.
text_data = [pad_punctuation(x) for x in filtered_data]

# Show one padded example.
example_data = text_data[10]
example_data

"Disneyland review : United States : Disneyland_HongKong : 5 : Disneyland never cease to amaze me ! I've been to Disneyland florida and I thought I have exhausted the kid in me but nope ! I still had so much fun in disneyland hong kong . 2 DL off my bucketlist and more to come ! "

In [41]:
import tensorflow as tf
from tensorflow.keras import layers

BATCH_SIZE = 32  # number of reviews per batch
VOCAB_SIZE = 10000  # maximum vocabulary size kept by the vectorizer
MAX_LEN = 80  # length of each model input / target sequence

# Batched, shuffled dataset of the padded review strings.
# NOTE(review): batching happens before shuffling, so shuffle(1000) reorders
# whole batches rather than individual reviews — confirm this is intended.
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)

# Lower-case the text and map each whitespace-separated token to an integer id.
# One extra position (MAX_LEN + 1) is produced so the sequence can later be
# split into an input and a target that is shifted by one token.
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

# Learn the vocabulary from the dataset and keep it for decoding later.
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

# Print the first ten vocabulary entries with their ids.
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")

0: 
1: [UNK]
2: .
3: the
4: :
5: and
6: ,
7: to
8: a
9: of


In [42]:
# Tokenise the padded example into a fixed-length vector of vocabulary ids.
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

[  14   22    4   45   64    4  104    4   39    4   14  195    1    7
 4191  158   19  426   91    7   14  246    5   17  349   17   34 1631
    3  431   11  158   21 4345   19   17  125   44   35   89   98   11
   14  234  235    2   80 1006  205   42    1    5   68    7  221   19
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]


### Training set

In [43]:
def prepare_inputs(text):
    """Tokenise a batch of strings into next-token-prediction pairs.

    Args:
        text: A batch of review strings.

    Returns:
        A tuple ``(x, y)`` of token-id tensors where ``x`` drops the last
        position and ``y`` drops the first, so ``y`` is ``x`` shifted one
        token to the left.
    """
    # Add a trailing axis before handing the strings to the vectorizer.
    column = tf.expand_dims(text, -1)
    tokens = vectorize_layer(column)
    return tokens[:, :-1], tokens[:, 1:]

# Turn each batch of raw strings into (input, target) token tensors.
train_ds = text_ds.map(prepare_inputs)
# Pull a single batch to inspect.
example_input_output = train_ds.take(1).get_single_element()

# First input sequence of the sampled batch.
example_input_output[0][0]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([  14,   22,    4,   45,   64,    4,   54,    4,   39,    4,  170,
         96,   17, 2101,   28,    8,  333,    6,   47,   34, 1159,   35,
        109,  196,  225,  283,   17,   12,   33,   28,    8,  333,    5,
        202,  262,    7,  126,   42,   69,   33,   12,  167,    2,   17,
        211,   84,    8,   58,   72,    7,  417,    8,   30,  728,    6,
         29,   57,    3,  155,  100,   18,   56,   63,    5,   85,    8,
        156,   68,  105,   29,  151,  332,    2,  147,   11,  905,   16,
          8,  151,  802])>

In [44]:
# Matching target sequence: the input above shifted left by one token.
example_input_output[1][0]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([  22,    4,   45,   64,    4,   54,    4,   39,    4,  170,   96,
         17, 2101,   28,    8,  333,    6,   47,   34, 1159,   35,  109,
        196,  225,  283,   17,   12,   33,   28,    8,  333,    5,  202,
        262,    7,  126,   42,   69,   33,   12,  167,    2,   17,  211,
         84,    8,   58,   72,    7,  417,    8,   30,  728,    6,   29,
         57,    3,  155,  100,   18,   56,   63,    5,   85,    8,  156,
         68,  105,   29,  151,  332,    2,  147,   11,  905,   16,    8,
        151,  802,   29])>

### Causal masking

In [45]:
import numpy as np

def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """Build a batch of causal attention masks.

    Entry ``[b, i, j]`` is true when ``i >= j - n_src + n_dest``; for
    ``n_dest == n_src`` this is the lower triangle, i.e. position ``i`` may
    attend to positions ``j <= i`` only.

    Args:
        batch_size: Number of copies of the mask to produce.
        n_dest: Number of destination (query) positions.
        n_src: Number of source (key) positions.
        dtype: Dtype the boolean mask is cast to.

    Returns:
        A tensor of shape ``(batch_size, n_dest, n_src)``.
    """
    dest_idx = tf.range(n_dest)[:, None]
    src_idx = tf.range(n_src)
    allowed = dest_idx >= src_idx - n_src + n_dest
    single_mask = tf.reshape(tf.cast(allowed, dtype), [1, n_dest, n_src])
    # Repeat the single mask along the batch axis only.
    repeats = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(single_mask, repeats)

# Display the transposed mask for a single 10x10 example.
np.transpose(causal_attention_mask(1, 10, 10, dtype=tf.int32)[0])

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int32)

## Create Model

In [None]:
# code to create model
class TransformerBlock(layers.Layer):
    """Transformer block: causal multi-head self-attention plus feed-forward.

    Both sub-layers are followed by dropout, a residual connection and layer
    normalisation.

    Args:
        num_heads: Number of attention heads.
        key_dim: Size of each attention head for query and key.
        embed_dim: Size of the block's input and output feature dimension.
        ff_dim: Width of the hidden feed-forward layer.
        dropout_rate: Dropout rate applied after attention and feed-forward.
        **kwargs: Standard base-layer options (for example ``name``,
            ``dtype``, ``trainable``) forwarded to ``layers.Layer``.
    """

    def __init__(self, num_heads, key_dim, embed_dim, ff_dim, dropout_rate=0.1, **kwargs):
        # get_config() includes the base-layer entries, so the constructor
        # must accept them for the layer to be rebuilt from its own config.
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.key_dim = key_dim
        self.embed_dim = embed_dim
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate
        self.attn = layers.MultiHeadAttention(num_heads, key_dim, output_shape=embed_dim)
        self.dropout_1 = layers.Dropout(self.dropout_rate)
        self.ln_1 = layers.LayerNormalization(epsilon=1e-6)
        self.ffn_1 = layers.Dense(self.ff_dim, activation="relu")
        self.ffn_2 = layers.Dense(self.embed_dim)
        self.dropout_2 = layers.Dropout(self.dropout_rate)
        self.ln_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        """Run the block on ``inputs``.

        Returns:
            A tuple ``(output, attention_scores)``: the transformed sequence
            and the attention scores returned by the attention layer.
        """
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        # Each position may only attend to itself and earlier positions.
        causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        # Self-attention: the same tensor is used as query and value.
        attention_output, attention_scores = self.attn(inputs, inputs, attention_mask=causal_mask, return_attention_scores=True)
        attention_output = self.dropout_1(attention_output)
        # Residual connection + normalisation around the attention sub-layer.
        out1 = self.ln_1(inputs + attention_output)
        ffn_1 = self.ffn_1(out1)
        ffn_2 = self.ffn_2(ffn_1)
        ffn_output = self.dropout_2(ffn_2)
        # Residual connection + normalisation around the feed-forward sub-layer.
        return (self.ln_2(out1 + ffn_output), attention_scores)

    def get_config(self):
        """Return the base-layer config extended with this block's hyperparameters."""
        config = super().get_config()
        config.update(
            {
                "key_dim": self.key_dim,
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "dropout_rate": self.dropout_rate,
            }
        )
        return config

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        max_len: Number of distinct positions in the position embedding table.
        vocab_size: Number of distinct token ids in the token embedding table.
        embed_dim: Size of both embedding vectors.
        **kwargs: Standard base-layer options (for example ``name``,
            ``dtype``, ``trainable``) forwarded to ``layers.Layer``.
    """

    def __init__(self, max_len, vocab_size, embed_dim, **kwargs):
        # get_config() includes the base-layer entries, so the constructor
        # must accept them for the layer to be rebuilt from its own config.
        super().__init__(**kwargs)
        self.max_len = max_len
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        # Learned (GPT-style) position embedding, one vector per position.
        self.pos_emb = layers.Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position."""
        # Positions are generated for the actual sequence length of this input.
        # NOTE(review): the table only holds max_len positions, so inputs are
        # presumably expected to be at most max_len tokens long — confirm.
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the base-layer config extended with this layer's hyperparameters."""
        config = super().get_config()
        config.update(
            {
                "max_len": self.max_len,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [None]:
from tensorflow.keras import models, losses

EMBEDDING_DIM = 256 # size of the token / position embedding vectors
KEY_DIM = 256 # key_dim passed to the attention layer
N_HEADS = 2 # number of attention heads
FEED_FORWARD_DIM = 256 # width of the hidden feed-forward layer

# Variable-length sequences of integer token ids.
inputs = layers.Input(shape=(None,), dtype=tf.int32)
x = TokenAndPositionEmbedding(MAX_LEN, VOCAB_SIZE, EMBEDDING_DIM)(inputs)
# The block returns both the transformed sequence and its attention scores.
x, attention_scores = TransformerBlock(N_HEADS, KEY_DIM, EMBEDDING_DIM, FEED_FORWARD_DIM)(x)
# Softmax over the vocabulary at every position.
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
# The attention scores are exposed as a second model output.
gpt = models.Model(inputs=inputs, outputs=[outputs, attention_scores])
# Only the token predictions carry a loss; the attention-score output gets None.
gpt.compile("adam", loss=[losses.SparseCategoricalCrossentropy(), None])
gpt.summary()

In [None]:
from tensorflow.keras import callbacks

class TextGenerator(callbacks.Callback):
    """Callback that samples and prints a generated review after each epoch.

    Args:
        index_to_word: Vocabulary list; position ``i`` holds the token with id ``i``.
        top_k: Kept for interface compatibility and stored on the instance;
            sampling currently draws from the full distribution and does
            not use it.
    """

    def __init__(self, index_to_word, top_k=10):
        # Initialise the base callback state before adding our own attributes.
        super().__init__()
        self.index_to_word = index_to_word
        self.word_to_index = {word: index for index, word in enumerate(index_to_word)}
        self.top_k = top_k

    def sample_from(self, probs, temperature):
        """Sample a token id from ``probs`` after temperature scaling.

        Returns:
            A tuple ``(token_id, probs)`` with the sampled id and the
            re-normalised, temperature-scaled distribution.
        """
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs # weighted random

    def generate(self, start_prompt, max_tokens, temperature):
        """Extend ``start_prompt`` token by token and print the result.

        Generation stops once ``max_tokens`` tokens exist or token id 0 is
        sampled.

        Args:
            start_prompt: Whitespace-separated seed text; words missing from
                the vocabulary are mapped to id 1.
            max_tokens: Upper bound on the total number of tokens.
            temperature: Sampling temperature passed to ``sample_from``.

        Returns:
            A list with one dict per generated token holding the prompt
            before that token was appended (``"prompt"``), the sampling
            distribution (``"word_probs"``) and the attention scores taken
            at the last position (``"atts"``).
        """
        # Id 1 is the fallback for words that are not in the vocabulary.
        start_tokens = [self.word_to_index.get(x, 1) for x in start_prompt.split()]
        sample_token = None
        info = []

        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            y, att = self.model.predict(x, verbose=0)
            # Only the prediction at the last position decides the next token.
            sample_token, probs = self.sample_from(y[0][-1], temperature)
            info.append(
                {
                    "prompt": start_prompt,
                    "word_probs": probs,
                    "atts": att[0, :, -1, :],
                }
            )

            start_tokens.append(sample_token)
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]

        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        """Print a sample generated from the training-data prefix."""
        # The prompt mirrors the "Disneyland review : ..." prefix of every
        # training string, lower-cased because the vectorizer lower-cases text.
        self.generate("disneyland review", max_tokens=80, temperature=1.0)

# Callback that prints a generated sample after every epoch, built on the adapted vocabulary.
text_generator = TextGenerator(vocab)

EPOCHS = 5 # number of passes over the training data

# Train on the (input, shifted-target) batches.
gpt.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[text_generator],
)

## Visualization

In [None]:
# code to show visualization