<a href="https://colab.research.google.com/github/Jezreel114/CCDEPLRL_EXERCISES_COM222_ML/blob/main/Exercise6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 6

In [1]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
import numpy as np
import pandas as pd

path = "https://github.com/robitussin/CCDEPLRL_EXERCISES/blob/9b8ac1c5683abecc144f0af47eb7cda0688e12b7/dataset/reviews.json?raw=true"

dataset = pd.read_json(path)

In [3]:
dataset.head(20)

Unnamed: 0,review,rating
0,sir okay armygreen shorts nice,5
1,di pareha yong mga size nila may sobrang liit ...,5
2,super worth it ang ganda Sombra grabi order na...,5
3,ganda po salamat,5
4,maayos pagkadeliver maganda den sya,5
5,ang gnda nang short nagustohan nang binigyan k...,4
6,maganda sya medyo manipis nga lang ..,4
7,,4
8,,4
9,"manipis pla at ska dami himulmol ng sinulid, d...",2


## 1. Tokenize the data

In [4]:
dataset['sentiment'] = dataset['rating'].apply(lambda rating : 1 if rating > 3 else 0)
sentences = dataset['review'].tolist()
labels = dataset['sentiment'].tolist()

# Separate out the sentences and labels into training and test sets
training_size = int(len(sentences) * 0.8)

training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

# Make labels into numpy arrays for use with the network later
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [5]:

vocab_size = 3537
embedding_dim = 16
max_length = 100
trunc_type='post'
padding_type='post'
oov_tok = ""

In [6]:
# answer here

# Import the Tokenizer
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences,maxlen=max_length, padding=padding_type,
                       truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length,
                               padding=padding_type, truncating=trunc_type)

## 2. Sequence the data

## 3. Pad the data

In [7]:
# answer here


In [8]:


# Specify a max length for the padded sequences
padded = pad_sequences(sequences, maxlen=15)
print(padded)

[[   0    0    0 ... 1514  390   66]
 [   0    0    0 ...   89  603 1516]
 [  24    3  100 ...    9  168   38]
 ...
 [ 431   20 3131 ...  218  654   75]
 [   0    0    0 ...  159   38  444]
 [  35 1438    5 ...   87  464   12]]


## 4. Train a sentiment model

In [41]:
# answer here
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),  # Use Global Average Pooling
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [42]:
num_epochs = 200
model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))

Epoch 1/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.5243 - loss: 0.6928 - val_accuracy: 0.2139 - val_loss: 0.7155
Epoch 2/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6128 - loss: 0.6830 - val_accuracy: 0.2139 - val_loss: 0.7398
Epoch 3/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5544 - loss: 0.6774 - val_accuracy: 0.2139 - val_loss: 0.7404
Epoch 4/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6193 - loss: 0.6601 - val_accuracy: 0.2139 - val_loss: 0.7516
Epoch 5/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7054 - loss: 0.6326 - val_accuracy: 0.2139 - val_loss: 0.7326
Epoch 6/200
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7686 - loss: 0.6115 - val_accuracy: 0.2139 - val_loss: 0.7400
Epoch 7/200
[1m25/25[0m [32m━━

<keras.src.callbacks.history.History at 0x7e3ea446c5d0>

## Get files for visualing the network

In [39]:
# answer here
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(3537, 16)


## 5. Predict sentiment with new reviews

In [43]:
fake_reviews = ['super worth it ang ganda nito',
                'ang pangit ng quality nito',
                'putangina mo ang pangit ng nabili ko',
                'pakyu bobo mo',
                'super ganda at mukhang matibay naman yung quality',
                'ang ganda ng product kaso putangina antagal mag reply ni seller',
                'ang ganda tangina',
                'sakto lang']

print(fake_reviews)

# Create the sequences
padding_type = 'post'
sample_sequences = tokenizer.texts_to_sequences(fake_reviews)
fakes_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=max_length)

print('\nHOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!\n')

# Predict sentiment
classes = model.predict(fakes_padded)

# Display each review with a human-readable sentiment
for i in range(len(fake_reviews)):
    # Using a threshold of 0.5 to decide sentiment (Sigmoid output between 0 and 1)
    sentiment = "Positive" if classes[i] >= 0.5 else "Negative"
    print(fake_reviews[i])
    print(f"Sentiment: {sentiment}")
    print(f"Model Output (Probability): {classes[i][0]:.4f}")
    print('\n')

['super worth it ang ganda nito', 'ang pangit ng quality nito', 'putangina mo ang pangit ng nabili ko', 'pakyu bobo mo', 'super ganda at mukhang matibay naman yung quality', 'ang ganda ng product kaso putangina antagal mag reply ni seller', 'ang ganda tangina', 'sakto lang']

HOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!





[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step
super worth it ang ganda nito
Sentiment: Positive
Model Output (Probability): 0.9346


ang pangit ng quality nito
Sentiment: Negative
Model Output (Probability): 0.4225


putangina mo ang pangit ng nabili ko
Sentiment: Negative
Model Output (Probability): 0.3216


pakyu bobo mo
Sentiment: Negative
Model Output (Probability): 0.4913


super ganda at mukhang matibay naman yung quality
Sentiment: Positive
Model Output (Probability): 0.8860


ang ganda ng product kaso putangina antagal mag reply ni seller
Sentiment: Negative
Model Output (Probability): 0.4240


ang ganda tangina
Sentiment: Positive
Model Output (Probability): 0.7627


sakto lang
Sentiment: Negative
Model Output (Probability): 0.4980


