# GPT - 01418496
**สมาชิกกลุ่ม**

นายศิวกร ภาสว่าง 6410451423

นางสาว แพรวรุ้ง พุดชะวา 6410451253

นางสาว มารีน่า มิทซุย 6410450222

หมู่ 200

ชุดข้อมูล : Disneyland Reviews

ลิงก์ดาวน์โหลด : https://www.kaggle.com/datasets/arushchillar/disneyland-reviews

In [9]:
import tensorflow as tf
import numpy as np
import os
import kagglehub
import shutil

In [11]:
import pandas as pd

## Select the execution device (GPU or CPU)

In [3]:
# Enable on-demand GPU memory allocation when a GPU is visible; otherwise
# report that execution falls back to the CPU.
gpus = tf.config.list_physical_devices("GPU")
if gpus:
    try:
        # Memory growth has to be configured identically on every visible
        # GPU, so apply it to all of them rather than only the first one.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Raised when the GPUs were already initialized (for example when
        # this cell is re-run); the setting can no longer be changed then.
        print(err)
    print("Execute on GPU")
else:
    print("Execute on CPU")

Execute on GPU


## Download Dataset

In [10]:
# Fetch the latest version of the dataset once and keep it under ./dataset.
# When an entry named "dataset" already exists in the working directory the
# download is skipped.
if "dataset" in os.listdir("."):
    print("Download Dataset Already")
else:
    path = kagglehub.dataset_download("arushchillar/disneyland-reviews")
    print("Path to dataset files:", path)
    # Relocate the downloaded folder from the kagglehub cache to ./dataset.
    shutil.move(path, "./dataset")
    print("Download Dataset Complete")

Path to dataset files: /root/.cache/kagglehub/datasets/arushchillar/disneyland-reviews/versions/1
Download Dataset Complete


## Preprocessing

In [22]:
# Check the CSV's MIME type and character set before loading it with pandas.
!file -i /content/dataset/DisneylandReviews.csv

/content/dataset/DisneylandReviews.csv: text/csv; charset=us-ascii


In [28]:
# Load the reviews from the ./dataset folder created by the download step.
# The path is relative so that it points at the location the download cell
# wrote to, whatever the working directory is (a hard-coded /content/...
# path only works when the notebook runs from /content).
file_path = os.path.join("dataset", "DisneylandReviews.csv")
# ISO-8859-1 assigns a character to every byte value, so stray non-ASCII
# bytes in the reviews cannot make decoding fail.
df = pd.read_csv(file_path, encoding="ISO-8859-1")

# Preview the first rows to confirm the columns were parsed as expected.
print(df.head())

   Review_ID  Rating Year_Month     Reviewer_Location  \
0  670772142       4     2019-4             Australia   
1  670682799       4     2019-5           Philippines   
2  670623270       4     2019-4  United Arab Emirates   
3  670607911       4     2019-4             Australia   
4  670607296       4     2019-4        United Kingdom   

                                         Review_Text               Branch  
0  If you've ever been to Disneyland anywhere you...  Disneyland_HongKong  
1  Its been a while since d last time we visit HK...  Disneyland_HongKong  
2  Thanks God it wasn   t too hot or too humid wh...  Disneyland_HongKong  
3  HK Disneyland is a great compact park. Unfortu...  Disneyland_HongKong  
4  the location is not in the city, took around 1...  Disneyland_HongKong  


In [33]:
# Keep only the columns that are later combined into the training text and
# normalise their names to lower case.
selected_columns = ["Reviewer_Location", "Branch", "Rating", "Review_Text"]
df_selected = df[selected_columns].rename(columns=str.lower)
print(df_selected.head())

      reviewer_location               branch  rating  \
0             Australia  Disneyland_HongKong       4   
1           Philippines  Disneyland_HongKong       4   
2  United Arab Emirates  Disneyland_HongKong       4   
3             Australia  Disneyland_HongKong       4   
4        United Kingdom  Disneyland_HongKong       4   

                                         review_text  
0  If you've ever been to Disneyland anywhere you...  
1  Its been a while since d last time we visit HK...  
2  Thanks God it wasn   t too hot or too humid wh...  
3  HK Disneyland is a great compact park. Unfortu...  
4  the location is not in the city, took around 1...  


In [36]:
# Convert the frame into a list holding one {column: value} dict per review,
# then display the first record as a sanity check.
data_list = df_selected.to_dict("records")
data_list[0]

{'reviewer_location': 'Australia',
 'branch': 'Disneyland_HongKong',
 'rating': 4,
 'review_text': "If you've ever been to Disneyland anywhere you'll find Disneyland Hong Kong very similar in the layout when you walk into main street! It has a very familiar feel. One of the rides  its a Small World  is absolutely fabulous and worth doing. The day we visited was fairly hot and relatively busy but the queues moved fairly well. "}

### Sequence construction

In [38]:
# Build one training string per review in the form
#   "Disneyland review : <location> : <branch> : <rating> : <text>".
# pandas represents missing cells as NaN rather than None, so an
# `is not None` test would let them through and the string building would
# fail. pd.notna rejects both None and NaN, so incomplete records are
# dropped reliably.
required_fields = ("reviewer_location", "branch", "rating", "review_text")

filtered_data = [
    f"Disneyland review : {x['reviewer_location']} : {x['branch']}"
    f" : {x['rating']} : {x['review_text']}"
    for x in data_list
    if all(pd.notna(x[field]) for field in required_fields)
]

n_data = len(filtered_data)
# NOTE(review): the label says "recipes" although the records are reviews;
# it is left unchanged so the recorded cell output still matches.
print(f"{n_data} recipes loaded")

# Show one assembled string to verify the layout.
example = filtered_data[10]
print(example)

42656 recipes loaded
Disneyland review : United States : Disneyland_HongKong : 5 : Disneyland never cease to amaze me! I've been to Disneyland florida and I thought I have exhausted the kid in me but nope! I still had so much fun in disneyland hong kong. 2 DL off my bucketlist and more to come!     


### Tokenization

In [40]:
import re
import string

def pad_punctuation(s):
    """Surround punctuation marks with spaces so each stands on its own.

    Word characters (letters, digits and ``_``), whitespace, apostrophes and
    hyphens are left untouched; every other character gets a space on both
    sides. Runs of spaces are then squeezed into a single space.

    Args:
        s: Text to pad.

    Returns:
        The padded text.
    """
    # "_", "'" and "-" are deliberately not split off from their words.
    padded = re.sub(r"([^\w\s'-])", r" \1 ", s)
    return re.sub(" +", " ", padded)

# Apply the punctuation padding to every review string, then display one
# padded sample.
text_data = list(map(pad_punctuation, filtered_data))

example_data = text_data[10]
example_data

"Disneyland review : United States : Disneyland_HongKong : 5 : Disneyland never cease to amaze me ! I've been to Disneyland florida and I thought I have exhausted the kid in me but nope ! I still had so much fun in disneyland hong kong . 2 DL off my bucketlist and more to come ! "

In [41]:
import tensorflow as tf
from tensorflow.keras import layers

# Number of review strings grouped into one dataset batch.
BATCH_SIZE = 32
# Upper bound on the vocabulary size handed to the vectorizer.
VOCAB_SIZE = 10000
# Length of the input/target sequences; the vectorizer emits MAX_LEN + 1
# tokens so that prepare_inputs can shift by one and still get MAX_LEN.
MAX_LEN = 80

# NOTE(review): batching happens before shuffling, so whole batches are
# shuffled rather than individual reviews - confirm this is intended.
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)

# Only lower-casing is applied as standardization; punctuation is kept
# (it was padded with spaces earlier, so each mark becomes its own token).
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

# Learn the vocabulary from the review text.
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

# Show the first ten vocabulary entries with their integer ids.
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")

0: 
1: [UNK]
2: .
3: the
4: :
5: and
6: ,
7: to
8: a
9: of


In [42]:
# Run the adapted vectorizer on the sample review and print its token ids;
# the result is padded with 0 up to MAX_LEN + 1 entries.
example_tokenised = vectorize_layer(example_data)
example_ids = example_tokenised.numpy()
print(example_ids)

[  14   22    4   45   64    4  104    4   39    4   14  195    1    7
 4191  158   19  426   91    7   14  246    5   17  349   17   34 1631
    3  431   11  158   21 4345   19   17  125   44   35   89   98   11
   14  234  235    2   80 1006  205   42    1    5   68    7  221   19
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0]


### Training set

In [43]:
def prepare_inputs(text):
    """Turn a batch of raw strings into next-token training pairs.

    The strings get a trailing axis, are passed through ``vectorize_layer``,
    and the resulting token ids are split into two views offset by one
    position.

    Args:
        text: Batch of review strings.

    Returns:
        A tuple ``(x, y)`` where ``x`` holds every token except the last and
        ``y`` holds every token except the first, so ``y`` is ``x`` shifted
        one step ahead.
    """
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    inputs, targets = token_ids[:, :-1], token_ids[:, 1:]
    return inputs, targets

# Convert every batch of raw strings into (input, target) token tensors,
# then pull a single batch for inspection. The final expression displays
# the first input sequence of that batch.
train_ds = text_ds.map(prepare_inputs)
first_batch = train_ds.take(1)
example_input_output = first_batch.get_single_element()

example_input_output[0][0]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([  14,   22,    4,   45,   64,    4,   54,    4,   39,    4,  170,
         96,   17, 2101,   28,    8,  333,    6,   47,   34, 1159,   35,
        109,  196,  225,  283,   17,   12,   33,   28,    8,  333,    5,
        202,  262,    7,  126,   42,   69,   33,   12,  167,    2,   17,
        211,   84,    8,   58,   72,    7,  417,    8,   30,  728,    6,
         29,   57,    3,  155,  100,   18,   56,   63,    5,   85,    8,
        156,   68,  105,   29,  151,  332,    2,  147,   11,  905,   16,
          8,  151,  802])>

In [44]:
# Target sequence of the same example: the input shifted one token ahead.
example_input_output[1][0]

<tf.Tensor: shape=(80,), dtype=int64, numpy=
array([  22,    4,   45,   64,    4,   54,    4,   39,    4,  170,   96,
         17, 2101,   28,    8,  333,    6,   47,   34, 1159,   35,  109,
        196,  225,  283,   17,   12,   33,   28,    8,  333,    5,  202,
        262,    7,  126,   42,   69,   33,   12,  167,    2,   17,  211,
         84,    8,   58,   72,    7,  417,    8,   30,  728,    6,   29,
         57,    3,  155,  100,   18,   56,   63,    5,   85,    8,  156,
         68,  105,   29,  151,  332,    2,  147,   11,  905,   16,    8,
        151,  802,   29])>

## Create Model

In [None]:
# code to create model

## Visualization

In [None]:
# code to show visualization