In [None]:
def clean_word(token):
    """Normalize a single raw token.

    The token is lower-cased, every character outside ``a-z`` / ``0-9`` is
    removed, and the remainder is passed through the module-level
    ``lemmatizer``.

    Args:
        token (str): Raw token, e.g. one whitespace-separated piece of a sentence.

    Returns:
        str: The cleaned, lemmatized token (may be empty if nothing survives
        the character filter).
    """
    alnum_only = re.sub("[^a-z0-9]*", "", token.lower())
    return lemmatizer.lemmatize(alnum_only)

In [None]:
# Mount Google Drive at /content/drive; paths such as GLOVE_PATH and the saved
# model file used elsewhere in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def tokenize_sentences(sent, max_length, token2id, UNK=1, PAD=0):
    """Convert a sentence into a fixed-length list of token ids.

    The sentence is split on whitespace, each piece is normalized with
    ``clean_word`` and looked up in ``token2id``. The id list is then padded
    with ``PAD`` on the right, or cut off on the right, so that it matches
    ``max_length``.

    Args:
        sent (str): Input sentence.
        max_length (int): Target length of the returned list.
        token2id (dict): Mapping from cleaned token to integer id.
        UNK (int): Id used for tokens missing from ``token2id``.
        PAD (int): Id used to fill short sentences.

    Returns:
        list[int]: Token ids, padded or truncated to ``max_length``.
    """
    ids = []
    for word in sent.split():
        ids.append(token2id.get(clean_word(word), UNK))

    shortfall = max_length - len(ids)
    if shortfall > 0:
        # Too short: right-pad with PAD ids.
        ids += [PAD] * shortfall
    elif shortfall < 0:
        # Too long: keep only the leading part.
        ids = ids[:max_length]

    return ids

In [None]:
# Install the third-party packages this notebook relies on (IPython shell commands).
!pip install datasets
!pip install kagglehub nltk pandas numpy



In [None]:
# Import libraries
from nltk.stem import WordNetLemmatizer
from random import shuffle
import pandas as pd
import numpy as np
import datasets
import nltk
import re
import kagglehub
import tensorflow as tf

# Download the NLTK resources (presumably needed by the WordNet lemmatizer created below)
nltk.download('wordnet')
nltk.download('omw-1.4')

# Define constant parameters
GLOVE_PATH = "/content/drive/MyDrive/glove.6B.50d.txt"  # GloVe text file on the mounted Drive
DATASET_NAME = 'quora'  # name passed to datasets.load_dataset
DATA_LEN = 30000  # number of question pairs kept from the train split
MAX_LEN = 100  # every sentence is padded/truncated to this many token ids
TEST_SIZE = 0.1  # fraction of the examples held out as the test split

# Create variables
token2idx = {'<PAD>': 0, '<UNK>': 1}  # token -> id; ids 0 and 1 are reserved here
lemmatizer = WordNetLemmatizer()  # used by clean_word
counter = 2  # next free token id (0 and 1 are already taken above)
dataset = []  # filled later with (tokens1, tokens2, label) tuples
s1, s2 = [], []  # first and second question of every pair

# Fetch the train split of the configured dataset
data = datasets.load_dataset(DATASET_NAME, split='train', trust_remote_code=True)

# Keep only the first DATA_LEN question pairs and their duplicate flags
sentences = data['questions'][:DATA_LEN]
labels = data['is_duplicate'][:DATA_LEN]

# Each entry stores its two questions under 'text'; route the first one to s1
# and the second one to s2 so both lists stay aligned by position.
for pair in sentences:
    first_question, second_question = pair['text'][0], pair['text'][1]
    s1.append(first_question)
    s2.append(second_question)

# Read the GloVe file (UTF-8 text, one "<word> <v1> ... <vd>" entry per line)
with open(GLOVE_PATH, 'r', encoding='utf-8') as f:
    raw_glove = f.read().strip().split('\n')

# Allocate one embedding row per token id. `counter` is the first free id, so
# the ids reserved before loading (<PAD>, <UNK>) get their own zero rows and
# the matrix covers every id that will be stored in token2idx.
emb_width = len(raw_glove[0].split()) - 1
glove_weights = np.zeros((counter + len(raw_glove), emb_width), dtype=float)

# Register every GloVe word and store its vector in the row that matches its
# token id. The Embedding layer looks up row i for token id i, so writing to
# row `counter` (not to the line number) keeps ids and vectors aligned.
for item in raw_glove:
    fields = item.split()
    if not fields:
        # Skip blank lines instead of failing on them.
        continue
    entity, values = fields[0], fields[1:]
    token2idx[entity] = counter
    glove_weights[counter, :len(values)] = [float(value) for value in values]
    counter += 1

# Turn every question pair into (token ids of q1, token ids of q2, 0/1 label)
for pos, pair in enumerate(zip(s1, s2)):
    label = int(labels[pos] == True)
    encoded = tuple(tokenize_sentences(question, MAX_LEN, token2idx) for question in pair)
    dataset.append(encoded + (label,))

# Randomize the order in place, then cut off the leading TEST_SIZE fraction
# as the test split; everything after it is the train split.
shuffle(dataset)
test_index = int(len(dataset)*TEST_SIZE)
test, train = dataset[:test_index], dataset[test_index:]

# Create x and y of train and test from dataset.
# Each split gets two (n_examples, MAX_LEN) id matrices (one per question)
# and one label vector of length n_examples.
x_train1 = np.zeros((len(train), MAX_LEN), dtype=int)
x_train2 = np.zeros((len(train), MAX_LEN), dtype=int)
y_train = np.zeros((len(train)), dtype=int)
x_test1 = np.zeros((len(test), MAX_LEN), dtype=int)
x_test2 = np.zeros((len(test), MAX_LEN), dtype=int)
y_test = np.zeros((len(test)), dtype=int)

# Fill the train arrays row by row.
for idx, item in enumerate(train):
    x_train1[idx, :] = item[0]
    x_train2[idx, :] = item[1]
    y_train[idx] = item[2]

# Fill the test arrays in their own loop so they are complete regardless of
# how the test split size compares to the train split size.
for idx, item in enumerate(test):
    x_test1[idx, :] = item[0]
    x_test2[idx, :] = item[1]
    y_test[idx] = item[2]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# Model definition, training, and inference

In [None]:
def abs(x):
  """Return the element-wise absolute difference ``|x[0] - x[1]|``.

  Used as the function of a Keras ``Lambda`` layer that receives the two
  sentence encodings as a pair.

  NOTE: this name shadows Python's builtin ``abs`` in this notebook; it is
  kept because the Lambda layer and the ``custom_objects`` mapping used when
  reloading the model refer to it by this name.
  """
  first, second = x[0], x[1]
  return tf.abs(first - second)

In [None]:
from tensorflow.keras.layers import (
    Bidirectional,
    LSTM,
    Dense,
    Embedding,
    Lambda,
    Concatenate,
    Dropout,
    Attention,
)
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.backend import abs as ab
from tensorflow.keras.models import Model
from tensorflow.keras import Input
from tensorflow.keras.optimizers import Adam

(num_vocab, emb_dim) = glove_weights.shape

# Units per LSTM direction; the bidirectional encoder therefore outputs
# 2 * lstm_units features per sentence.
lstm_units = 64
lstm = Bidirectional(LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2))

# Embedding initialised from the GloVe matrix and fine-tuned during training.
# `input_length` is not passed: the sequence length is already fixed by the
# Input layers below, and the argument is deprecated in Keras 3.
embed = Embedding(
    input_dim=num_vocab,
    output_dim=emb_dim,
    weights=[glove_weights],
    trainable=True,  # Fine-tuning embeddings
)

# Siamese encoder: both questions go through the SAME embedding and LSTM
# layers, so the two branches share all their weights.
input1 = Input(shape=(MAX_LEN,))
e1 = embed(input1)
t1 = lstm(e1)

input2 = Input(shape=(MAX_LEN,))
e2 = embed(input2)
t2 = lstm(e2)

# NOTE: an Attention/Concatenate branch was previously built here, but its
# output was never connected to the prediction head, so it was not part of
# the model. It has been removed; the head consumes only |t1 - t2|.
sub_layer = Lambda(function=abs, output_shape=(2 * lstm_units,))([t1, t2])
dense_1 = Dense(64, activation="relu")(sub_layer)
dropout_1 = Dropout(0.5)(dense_1)  # Regularize the first dense layer
dense_2 = Dense(32, activation="relu")(dropout_1)
dropout_2 = Dropout(0.5)(dense_2)  # Regularize the second dense layer
preds = Dense(1, activation="sigmoid")(dropout_2)  # Probability that the pair is a duplicate

model = Model(inputs=[input1, input2], outputs=preds)

# Adam optimizer with binary cross-entropy for the 0/1 duplicate label
optimizer = Adam(learning_rate=0.001)
model.compile(loss=binary_crossentropy, optimizer=optimizer, metrics=["accuracy"])



In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Train for 20 epochs with batches of 256 pairs. The held-out test split is
# passed as validation data, so the val_* metrics are measured on it.
history = model.fit([x_train1, x_train2], y_train, epochs=20, validation_data=([x_test1, x_test2], y_test), batch_size=256)

Epoch 1/20
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 666ms/step - accuracy: 0.6104 - loss: 0.6705 - val_accuracy: 0.6230 - val_loss: 0.6463
Epoch 2/20
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 657ms/step - accuracy: 0.6279 - loss: 0.6524 - val_accuracy: 0.6230 - val_loss: 0.6319
Epoch 3/20
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 656ms/step - accuracy: 0.6277 - loss: 0.6358 - val_accuracy: 0.6230 - val_loss: 0.6100
Epoch 4/20
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 644ms/step - accuracy: 0.6394 - loss: 0.6146 - val_accuracy: 0.6253 - val_loss: 0.6081
Epoch 5/20
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 649ms/step - accuracy: 0.6612 - loss: 0.5928 - val_accuracy: 0.6727 - val_loss: 0.5892
Epoch 6/20
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 665ms/step - accuracy: 0.6932 - loss: 0.5680 - val_accuracy: 0.6863 - val_loss: 0.5628
Epoch 7/20

In [None]:
# Persist the trained model as an .h5 file under the mounted Google Drive folder.
model.save('/content/drive/MyDrive//model.h5')



In [None]:
import numpy as np
def preprocess_text(text1, text2):
    """Prepare a pair of raw texts in the input format expected by the model.

    Each text is tokenized and padded/truncated to ``MAX_LEN`` ids with
    ``tokenize_sentences`` and the module-level ``token2idx`` vocabulary,
    then placed in an integer array holding a single row.

    Args:
        text1 (str): First text.
        text2 (str): Second text.

    Returns:
        Tuple[np.array, np.array]: Two integer arrays of shape
        ``(1, MAX_LEN)`` with the token ids of ``text1`` and ``text2``.
    """
    def _encode(text):
        # One-row batch that carries the padded token ids of `text`.
        row = np.zeros((1, MAX_LEN), dtype=int)
        row[0, :] = tokenize_sentences(text, MAX_LEN, token2idx)
        return row

    first = _encode(text1)
    second = _encode(text2)
    return first, second

In [None]:
# Example new texts (text3 is defined but not used below).
# NOTE(review): these sentences are Indonesian while the vocabulary is built
# from the glove.6B file; presumably most tokens map to <UNK> - confirm this
# is the intended test input.
text1 = 'Ini adalah wilayah sosial-ekonomi yang lebih rendah.'
text2 = 'itu merupakan wilayah hukum-politik yang lebih tinggi.'
text3 = "Zelinsky hanya berteori bahwa tidak ada tiga bilangan bulat"

# Preprocess the texts into (1, MAX_LEN) id arrays
input1, input2 = preprocess_text(text1, text2)

# Predict
prediction = model.predict([input1, input2],)

# Print the prediction result
print(f"Probabilitas duplikat: {prediction[0][0]}")
# If the probability is > 0.5, the pair is considered a duplicate
# Jika probabilitas > 0.5, maka dianggap duplikat

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 713ms/step
Probabilitas duplikat: 0.00782870501279831


In [None]:
import tensorflow as tf
import keras

In [None]:
# Reload the model from the same Drive location it was saved to. Loading from
# './model.h5' failed with FileNotFoundError because the file is not written
# to the working directory. The custom `abs` function used by the Lambda layer
# must be supplied through custom_objects.
model = keras.models.load_model('/content/drive/MyDrive/model.h5', custom_objects={'tf': tf, "abs": abs})

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = './model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
# Print the layer-by-layer summary of the model currently bound to `model`.
model.summary()

In [None]:
# Example new texts (only text4 and text5 are used below; text1-text3 are unused here)
text1 = 'Ini adalah wilayah sosial-ekonomi yang lebih rendah.'
text2 = "Aku akan pergi untuk main bola."
text3 = 'Hariini aku suka bermain bola.'

text4 = "I like dog."
text5 = "She likes cat."

# Preprocess the texts into (1, MAX_LEN) id arrays
input1, input2 = preprocess_text(text4, text5)

# Predict
prediction = model.predict([input1, input2])

# Print the prediction result
print(f"Probabilitas duplikat: {prediction[0][0]}")
# If the probability is > 0.5, the pair is considered a duplicate
# Jika probabilitas > 0.5, maka dianggap duplikat

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step
Probabilitas duplikat: 0.06110258772969246
