<a href="https://colab.research.google.com/github/marielollage/CCDEPLRL_EXERCISES_COM211-ML/blob/main/Exercise4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 4

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import io

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
dataset = pd.read_json('reviews.json')


In [3]:
dataset.head()

Unnamed: 0,review,rating
0,sir okay armygreen shorts nice,5
1,di pareha yong mga size nila may sobrang liit ...,5
2,super worth it ang ganda Sombra grabi order na...,5
3,ganda po salamat,5
4,maayos pagkadeliver maganda den sya,5


In [4]:
# Convert ratings to binary labels
def convert_to_binary_labels(rating):
    if rating >= 4:
        return 1  # Positive
    else:
        return 0  # Negative

dataset['sentiment'] = dataset['rating'].apply(convert_to_binary_labels)

# Separate out the sentences and labels
sentences = dataset['review'].tolist()
labels = dataset['sentiment'].tolist()

# Split the data into training and test sets
training_size = int(len(sentences) * 0.8)
training_sentences = sentences[:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[:training_size]
testing_labels = labels[training_size:]

# Convert labels to numpy arrays
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

## 1. Tokenize the data

In [5]:
# Tokenize the data
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(training_sentences)

## 2. Sequence the data

In [6]:
# Sequence the data
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

## 3. Pad the data

In [7]:
# Pad the data
max_length = 100
padding_type = 'post'
training_padded = pad_sequences(training_sequences, padding=padding_type, maxlen=max_length)
testing_padded = pad_sequences(testing_sequences, padding=padding_type, maxlen=max_length)

## 4. Train a sentiment model

In [8]:
# Check the balance of the dataset
print(dataset['sentiment'].value_counts())

# If the dataset is imbalanced, use class weights
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(training_labels_final),
    y=training_labels_final
)

class_weights = {i: class_weights[i] for i in range(len(class_weights))}
print("Class weights:", class_weights)

sentiment
0    503
1    498
Name: count, dtype: int64
Class weights: {0: 1.1594202898550725, 1: 0.8791208791208791}


In [9]:
# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=16, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 24)                408       
                                                                 
 dense_1 (Dense)             (None, 1)                 25        
                                                                 
Total params: 160433 (626.69 KB)
Trainable params: 160433 (626.69 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
# Train the model
history = model.fit(training_padded, training_labels_final, epochs=40, validation_data=(testing_padded, testing_labels_final))

# Evaluate the model
loss, accuracy = model.evaluate(testing_padded, testing_labels_final)
print(f"Test Accuracy: {accuracy}")

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test Accuracy: 0.8407959938049316


## Get files for visualing the network

In [11]:
# First get the weights of the embedding layer
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

# Create the reverse word index
word_index = tokenizer.word_index
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

# Write out the embedding vectors and metadata
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

vocab_size = 10000
for word_num in range(1, vocab_size):
    word = reverse_word_index.get(word_num, "<OOV>")
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

out_v.close()
out_m.close()

(10000, 16)


In [12]:
# Download the files
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 5. Predict sentiment with new reviews

In [13]:
# Predict sentiment with new reviews
new_reviews = ['ganda ng produkto', 'hindi maganda', 'sulit na sulit',
               'sayang pera dito', 'Napakabilis ng delivery, at maganda ang kalidad',
               'hindi ako satisfied.',
               'Ang saya ko sa pagbili ko nito, highly recommended!',
               'Hindi maganda ang quality',
               'Napakaganda ng customer service, babalik ako ulit',
               'Walang kwenta ang customer service, hindi helpful.',
               'Maayos ang pagkakagawa, mukhang matibay.']

# Create the sequences
sample_sequences = tokenizer.texts_to_sequences(new_reviews)
new_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=max_length)

print('\nHOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!\n')

# Predict
predictions = model.predict(new_padded)

# Print the results
for i in range(len(new_reviews)):
    score = predictions[i][0]
    sentiment_label = 'Positive' if score >= 0.5 else 'Negative'
    print(f"Review: {new_reviews[i]}")
    print(f"Sentiment: {sentiment_label} (Score: {score})\n")


HOT OFF THE PRESS! HERE ARE SOME NEWLY MINTED, ABSOLUTELY GENUINE REVIEWS!

Review: ganda ng produkto
Sentiment: Positive (Score: 0.763824462890625)

Review: hindi maganda
Sentiment: Negative (Score: 0.4576016962528229)

Review: sulit na sulit
Sentiment: Positive (Score: 0.8730558753013611)

Review: sayang pera dito
Sentiment: Negative (Score: 0.20133762061595917)

Review: Napakabilis ng delivery, at maganda ang kalidad
Sentiment: Positive (Score: 0.8081254363059998)

Review: hindi ako satisfied.
Sentiment: Negative (Score: 0.4917011260986328)

Review: Ang saya ko sa pagbili ko nito, highly recommended!
Sentiment: Positive (Score: 0.8201875686645508)

Review: Hindi maganda ang quality
Sentiment: Negative (Score: 0.42362624406814575)

Review: Napakaganda ng customer service, babalik ako ulit
Sentiment: Positive (Score: 0.6260116696357727)

Review: Walang kwenta ang customer service, hindi helpful.
Sentiment: Negative (Score: 0.08579646050930023)

Review: Maayos ang pagkakagawa, mukhang