In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/test-data/test_data.json
/kaggle/input/glove840b300dtxt/glove.840B.300d.txt
/kaggle/input/training-alta/training.json


In [2]:
!nvidia-smi

Fri Jul  5 15:47:32 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P0             27W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Tahap 1: Download Glove 840B 300D

In [None]:
# import requests
# import zipfile
# import io
# import os

# # URL of the GloVe embeddings
# url = "http://nlp.stanford.edu/data/glove.840B.300d.zip"
# # Name of the zip file
# zip_file_name = "glove.840B.300d.zip"
# # Folder where the embeddings will be saved
# save_folder = "glove_embeddings"

# # Create the folder if it does not exist
# if not os.path.exists(save_folder):
#     os.makedirs(save_folder)

# # Path to the zip file
# zip_file_path = os.path.join(save_folder, zip_file_name)

# # Check if the zip file already exists
# if not os.path.exists(zip_file_path):
#     # Download the zip file if it does not exist
#     print("Downloading GloVe embeddings...")
#     response = requests.get(url)
#     with open(zip_file_path, 'wb') as f:
#         f.write(response.content)
#     print("Download complete.")
# else:
#     print(f"File '{zip_file_path}' already exists, skipping download.")

# # Extract the zip file
# print("Extracting GloVe embeddings...")
# with zipfile.ZipFile(zip_file_path, 'r') as z:
#     z.extractall(save_folder)
# print(f"Extraction complete. Files saved to {save_folder}.")


In [3]:
# Load pre-trained GloVe embeddings
EMBEDDING_DIM = 300
glove_file = '/kaggle/input/glove840b300dtxt/glove.840B.300d.txt'  # GloVe text file inside the attached Kaggle dataset


def load_glove_embeddings(path, dim):
    """Parse a GloVe text file into a ``{token: vector}`` dictionary.

    Every line is expected to hold a token followed by ``dim`` space-separated
    numbers.  The line is split from the right, so only the last ``dim`` fields
    are parsed as numbers and everything before them is kept as the token,
    even when the token itself contains a space.

    Args:
        path: Location of the GloVe ``.txt`` file (read as UTF-8).
        dim: Number of vector components stored per token.

    Returns:
        dict mapping each token to a ``float32`` NumPy array of length ``dim``.

    Raises:
        ValueError: If one of the last ``dim`` fields of a line is not numeric.
    """
    vectors = {}
    skipped = 0
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            fields = line.rstrip('\n').rsplit(' ', dim)
            # A line with fewer than dim + 1 fields cannot hold a full vector;
            # storing it would only fail later when the embedding matrix is filled.
            if len(fields) != dim + 1:
                skipped += 1
                continue
            vectors[fields[0]] = np.asarray(fields[1:], dtype='float32')
    if skipped:
        print(f"Skipped {skipped} line(s) without exactly {dim} vector components.")
    return vectors


embeddings_index = load_glove_embeddings(glove_file, EMBEDDING_DIM)

## Tahap 2: Import Dataset

In [4]:
# The training set is read in JSON Lines mode (lines=True).
df = pd.read_json(
    "/kaggle/input/training-alta/training.json",
    lines=True,
)
df.head()

Unnamed: 0,text,label,id
0,Have you ever heard of the Crusades? A time in...,1,0
1,"The professors, who likely have nearly a decad...",1,1
2,Kemba Walker does a good job of defending Foye...,1,2
3,"Ganias' lawyer, Stanley Twardy, urged the gove...",1,3
4,The Circuit Court of Appeals of New Jersey had...,0,4


## Eksperimen 1: Tidak menggunakan pre-processing

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report 
from tensorflow.keras.metrics import categorical_accuracy

import pandas as pd
import numpy as np

# Load the data from a pandas DataFrame
texts = df.text.values
labels = df.label.values

# Seed the random number generators so repeated runs start from the same
# weight initialisation (GPU kernels may still add some nondeterminism).
tf.keras.utils.set_random_seed(42)

# Split the data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.1, random_state=42)

# Tokenize the text data; the vocabulary is fitted on the training split only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)
word_index = tokenizer.word_index

# Prepare the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i; words missing from GloVe keep an all-zero row.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Pad sequences to have the same length
max_length = max(len(seq) for seq in train_sequences + test_sequences)
train_padded_sequences = pad_sequences(train_sequences, maxlen=max_length)
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_length)

with tf.device("/GPU:0"):
    # Embedding layer initialised from GloVe. It is frozen (trainable=False),
    # so the single instance can be shared by every model below.
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                                input_length=max_length,
                                trainable=False)

    # Recurrent part of each architecture, keyed by the label used in the reports.
    # "Combined" is the plain LSTM -> GRU stack; it used to be built from
    # Bidirectional layers, which made it an exact copy of "Bi-Combined".
    recurrent_stacks = {
        "LSTM": [LSTM(100)],
        "GRU": [GRU(100)],
        "Combined": [LSTM(100, return_sequences=True), GRU(100)],
        "Bi-LSTM": [Bidirectional(LSTM(100))],
        "Bi-GRU": [Bidirectional(GRU(100))],
        "Bi-Combined": [Bidirectional(LSTM(100, return_sequences=True)), Bidirectional(GRU(100))],
    }

    # Every model is: shared embedding -> recurrent stack -> dropout -> sigmoid unit.
    models = {}
    for model_name, stack in recurrent_stacks.items():
        model = Sequential([embedding_layer, *stack, Dropout(0.3), Dense(1, activation='sigmoid')])
        model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
        models[model_name] = model

    # Keep the individual names; later cells export predictions through them.
    lstm_model = models["LSTM"]
    gru_model = models["GRU"]
    combined_model = models["Combined"]
    bi_lstm_model = models["Bi-LSTM"]
    bi_gru_model = models["Bi-GRU"]
    bi_combined_model = models["Bi-Combined"]

    # Train the models, each with its own early-stopping callback.
    # NOTE: the held-out split drives early stopping and is also the split the
    # reports below are computed on, so the reported scores are optimistic.
    for model in models.values():
        early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        model.fit(train_padded_sequences, train_labels, epochs=20, batch_size=64,
                  validation_data=(test_padded_sequences, test_labels), callbacks=[early_stop])

    # Make predictions on the held-out split and round the sigmoid outputs to 0/1
    predictions = {
        model_name: np.round(model.predict(test_padded_sequences))
        for model_name, model in models.items()
    }
    lstm_predictions = predictions["LSTM"]
    gru_predictions = predictions["GRU"]
    combined_predictions = predictions["Combined"]
    bi_lstm_predictions = predictions["Bi-LSTM"]
    bi_gru_predictions = predictions["Bi-GRU"]
    bi_combined_predictions = predictions["Bi-Combined"]

    # Labels are already 0/1 integers, so they are used as-is
    true_labels = test_labels

    # Print classification reports (a blank line separates consecutive reports)
    for position, (model_name, predicted) in enumerate(predictions.items()):
        separator = "" if position == 0 else "\n"
        print(f"{separator}{model_name} Classification Report:")
        print(classification_report(true_labels, predicted))

2024-07-05 15:52:17.386415: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-05 15:52:17.386535: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-05 15:52:17.523173: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Epoch 1/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 16ms/step - accuracy: 0.8653 - loss: 0.3336 - val_accuracy: 0.9200 - val_loss: 0.2196
Epoch 2/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9318 - loss: 0.1800 - val_accuracy: 0.9028 - val_loss: 0.2499
Epoch 3/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.8728 - loss: 0.3169 - val_accuracy: 0.9161 - val_loss: 0.2076
Epoch 4/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9406 - loss: 0.1576 - val_accuracy: 0.9017 - val_loss: 0.2505
Epoch 5/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9285 - loss: 0.1875 - val_accuracy: 0.9217 - val_loss: 0.1866
Epoch 6/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9397 - loss: 0.1678 - val_accuracy: 0.9189 - val_loss: 0.2038
Epoch 7/20
[1m254/25

In [6]:
# The competition test set is read in JSON Lines mode (lines=True).
test_df = pd.read_json(
    "/kaggle/input/test-data/test_data.json",
    lines=True,
)
test_df.head()

Unnamed: 0,id,text
0,0,Investigators are now hamstrung by the inabili...
1,1,"[10] Indeed, the District Court found that pe..."
2,2,"""The second object of this legislation is to p..."
3,3,"It is in vain, in a case of this nature, that ..."
4,4,*4 Mr. Justice WAYNE delivered the opinion of ...


In [7]:
# Encode the raw test texts with the current tokenizer, then pad them to
# max_length so they match the shape of the training inputs.
testing_texts = test_df['text'].to_list()
testing_sequences = tokenizer.texts_to_sequences(testing_texts)
testing_padded_sequences = pad_sequences(
    testing_sequences,
    maxlen=max_length,
)

In [8]:
def predict_to_json(model, xpad, json_name, ids=None):
    """Write the rounded predictions of ``model`` for ``xpad`` as a JSON Lines file.

    Args:
        model: Object exposing ``predict(xpad)`` that returns probabilities.
        xpad: Input passed unchanged to ``model.predict``.
        json_name: Path of the output file; one ``{"id": ..., "label": ...}``
            record is written per line.
        ids: Optional identifiers, one per prediction.  When omitted, rows are
            numbered ``0 .. n-1`` in prediction order.

    Raises:
        ValueError: If ``ids`` is given and its length differs from the number
            of predictions.
    """
    probabilities = np.asarray(model.predict(xpad))
    # reshape(-1) flattens an (n, 1) output and, unlike np.hstack over the rows,
    # also works when there are zero predictions.
    answers = np.round(probabilities).astype(int).reshape(-1)
    if ids is None:
        ids = range(len(answers))
    elif len(ids) != len(answers):
        raise ValueError(
            f"got {len(ids)} ids for {len(answers)} predictions"
        )
    answers_df = pd.DataFrame({"id": list(ids), "label": answers})
    answers_df.to_json(json_name, orient="records", lines=True)

# Write one answer file per trained architecture.
submission_targets = [
    (lstm_model, "/kaggle/working/answer_lstm.json"),
    (gru_model, "/kaggle/working/answer_gru.json"),
    (combined_model, "/kaggle/working/answer_lstmgru.json"),
    (bi_lstm_model, "/kaggle/working/answer_bilstm.json"),
    (bi_gru_model, "/kaggle/working/answer_bigru.json"),
    (bi_combined_model, "/kaggle/working/answer_bilstmbigru.json"),
]
for trained_model, output_path in submission_targets:
    predict_to_json(trained_model, testing_padded_sequences, output_path)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step


## Eksperimen 2: Menggunakan preprocessing Lowercasing, remove number and punctuation

### Preprocessing untuk eksperimen 2 dan 3 

In [9]:
import re 
import string

import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available, then keep the English
# list as a set so membership checks are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# Preprocessing text
def preprocess_text(text):
    """Lowercase ``text``, drop symbols and digits, and return the result.

    Word characters, whitespace and the marks ``? ! , '`` are kept; every
    other character is deleted.  Runs of digits are then deleted as well, and
    the whitespace around them is left untouched.
    """
    lowered = text.lower()
    without_symbols = re.sub(r'[^\w\s?!,\']', '', lowered)
    return re.sub(r'\d+', '', without_symbols)

# Preprocessing text + stopwords
def preprocess_text_stopwords(text, stopword_set=None):
    """Clean ``text`` like ``preprocess_text`` and additionally drop stopwords.

    The text is lowercased, characters other than word characters, whitespace
    and ``? ! , '`` are removed, digits are removed, and the remainder is split
    on whitespace.  A token is dropped when it is a stopword once the retained
    punctuation (``? ! , '``) is stripped from its ends.  Without that
    stripping, tokens such as ``"the,"`` or ``"it?"`` slip through the filter.

    Args:
        text: Raw input string.
        stopword_set: Collection of lowercase stopwords.  Defaults to the
            module-level ``stop_words``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^\w\s?!,\']', '', text)  # Remove punctuation except ? ! , '
    text = re.sub(r'\d+', '', text)  # Remove numbers
    kept_tokens = [
        token for token in text.split()
        if token.strip("?!,'") not in stopword_set
    ]
    return ' '.join(kept_tokens)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def roc_auc(predictions,target):
    '''
    Return the area under the ROC curve for the given scores.

    predictions: predicted scores, passed to roc_curve as y_score
    target: true labels, passed to roc_curve as y_true
    '''
    # Imported locally: the visible import cells pull individual functions out
    # of sklearn.metrics but never bind the name `metrics`, so the previous
    # body raised NameError when called.
    from sklearn import metrics

    fpr, tpr, _ = metrics.roc_curve(target, predictions)
    return metrics.auc(fpr, tpr)

In [11]:
# Derive both cleaned variants of the raw text column for experiments 2 and 3.
for column, cleaner in (
    ('prep_text', preprocess_text),
    ('prep_sw_text', preprocess_text_stopwords),
):
    df[column] = df['text'].apply(cleaner)
df.head()

Unnamed: 0,text,label,id,prep_text,prep_sw_text
0,Have you ever heard of the Crusades? A time in...,1,0,have you ever heard of the crusades? a time in...,ever heard crusades? time christians went year...
1,"The professors, who likely have nearly a decad...",1,1,"the professors, who likely have nearly a decad...","professors, likely nearly decade education eac..."
2,Kemba Walker does a good job of defending Foye...,1,2,kemba walker does a good job of defending foye...,"kemba walker good job defending foye, better o..."
3,"Ganias' lawyer, Stanley Twardy, urged the gove...",1,3,"ganias' lawyer, stanley twardy, urged the gove...","ganias' lawyer, stanley twardy, urged governme..."
4,The Circuit Court of Appeals of New Jersey had...,0,4,the circuit court of appeals of new jersey had...,circuit court appeals new jersey jurisdiction ...


In [12]:
## Preprocessing Test Data
# Read the test set again and add the same two cleaned text columns that the
# training frame received.
df_test = pd.read_json(
    "/kaggle/input/test-data/test_data.json",
    lines=True,
)
for column, cleaner in (
    ('prep_text', preprocess_text),
    ('prep_sw_text', preprocess_text_stopwords),
):
    df_test[column] = df_test['text'].apply(cleaner)
df_test.head()

Unnamed: 0,id,text,prep_text,prep_sw_text
0,0,Investigators are now hamstrung by the inabili...,investigators are now hamstrung by the inabili...,investigators hamstrung inability compel witne...
1,1,"[10] Indeed, the District Court found that pe...","indeed, the district court found that petiti...","indeed, district court found petitioners able ..."
2,2,"""The second object of this legislation is to p...",the second object of this legislation is to pr...,second object legislation protect employees ra...
3,3,"It is in vain, in a case of this nature, that ...","it is in vain, in a case of this nature, that ...","vain, case nature, court look intentions legis..."
4,4,*4 Mr. Justice WAYNE delivered the opinion of ...,mr justice wayne delivered the opinion of the...,mr justice wayne delivered opinion court


In [13]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report 
from tensorflow.keras.metrics import categorical_accuracy

import pandas as pd
import numpy as np

# Load the data from a pandas DataFrame (lowercased text without symbols and digits)
texts = df.prep_text.values
labels = df.label.values

# Seed the random number generators so repeated runs start from the same
# weight initialisation (GPU kernels may still add some nondeterminism).
tf.keras.utils.set_random_seed(42)

# Split the data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.1, random_state=42)

# Tokenize the text data; the vocabulary is fitted on the training split only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)
word_index = tokenizer.word_index

# Prepare the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i; words missing from GloVe keep an all-zero row.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Pad sequences to have the same length
max_length = max(len(seq) for seq in train_sequences + test_sequences)
train_padded_sequences = pad_sequences(train_sequences, maxlen=max_length)
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_length)

with tf.device("/GPU:0"):
    # Embedding layer initialised from GloVe. It is frozen (trainable=False),
    # so the single instance can be shared by every model below.
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                                input_length=max_length,
                                trainable=False)

    # Recurrent part of each architecture, keyed by the label used in the reports.
    # "Combined" is the plain LSTM -> GRU stack; it used to be built from
    # Bidirectional layers, which made it an exact copy of "Bi-Combined".
    recurrent_stacks = {
        "LSTM": [LSTM(100)],
        "GRU": [GRU(100)],
        "Combined": [LSTM(100, return_sequences=True), GRU(100)],
        "Bi-LSTM": [Bidirectional(LSTM(100))],
        "Bi-GRU": [Bidirectional(GRU(100))],
        "Bi-Combined": [Bidirectional(LSTM(100, return_sequences=True)), Bidirectional(GRU(100))],
    }

    # Every model is: shared embedding -> recurrent stack -> dropout -> sigmoid unit.
    models = {}
    for model_name, stack in recurrent_stacks.items():
        model = Sequential([embedding_layer, *stack, Dropout(0.3), Dense(1, activation='sigmoid')])
        model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
        models[model_name] = model

    # Keep the individual names; later cells export predictions through them.
    lstm_model = models["LSTM"]
    gru_model = models["GRU"]
    combined_model = models["Combined"]
    bi_lstm_model = models["Bi-LSTM"]
    bi_gru_model = models["Bi-GRU"]
    bi_combined_model = models["Bi-Combined"]

    # Train the models, each with its own early-stopping callback.
    # NOTE: the held-out split drives early stopping and is also the split the
    # reports below are computed on, so the reported scores are optimistic.
    for model in models.values():
        early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        model.fit(train_padded_sequences, train_labels, epochs=20, batch_size=64,
                  validation_data=(test_padded_sequences, test_labels), callbacks=[early_stop])

    # Make predictions on the held-out split and round the sigmoid outputs to 0/1
    predictions = {
        model_name: np.round(model.predict(test_padded_sequences))
        for model_name, model in models.items()
    }
    lstm_predictions = predictions["LSTM"]
    gru_predictions = predictions["GRU"]
    combined_predictions = predictions["Combined"]
    bi_lstm_predictions = predictions["Bi-LSTM"]
    bi_gru_predictions = predictions["Bi-GRU"]
    bi_combined_predictions = predictions["Bi-Combined"]

    # Labels are already 0/1 integers, so they are used as-is
    true_labels = test_labels

    # Print classification reports (a blank line separates consecutive reports)
    for position, (model_name, predicted) in enumerate(predictions.items()):
        separator = "" if position == 0 else "\n"
        print(f"{separator}{model_name} Classification Report:")
        print(classification_report(true_labels, predicted))

Epoch 1/20




[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - accuracy: 0.8410 - loss: 0.3791 - val_accuracy: 0.9089 - val_loss: 0.2460
Epoch 2/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9140 - loss: 0.2253 - val_accuracy: 0.9194 - val_loss: 0.2044
Epoch 3/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9351 - loss: 0.1681 - val_accuracy: 0.9222 - val_loss: 0.1840
Epoch 4/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9403 - loss: 0.1512 - val_accuracy: 0.9178 - val_loss: 0.2074
Epoch 5/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9509 - loss: 0.1203 - val_accuracy: 0.9328 - val_loss: 0.1756
Epoch 6/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9696 - loss: 0.0860 - val_accuracy: 0.9328 - val_loss: 0.1787
Epoch 7/20
[1m254/254[0m [32m

In [14]:
# Encode the cleaned test texts with the current tokenizer, then pad them to
# max_length so they match the shape of the training inputs.
testing_texts = df_test['prep_text'].to_list()
testing_sequences = tokenizer.texts_to_sequences(testing_texts)
testing_padded_sequences = pad_sequences(
    testing_sequences,
    maxlen=max_length,
)

In [15]:
# NOTE: this repeats the helper of the same name defined in an earlier cell;
# it is kept here so this cell can run on its own.
def predict_to_json(model, xpad, json_name, ids=None):
    """Write the rounded predictions of ``model`` for ``xpad`` as a JSON Lines file.

    Args:
        model: Object exposing ``predict(xpad)`` that returns probabilities.
        xpad: Input passed unchanged to ``model.predict``.
        json_name: Path of the output file; one ``{"id": ..., "label": ...}``
            record is written per line.
        ids: Optional identifiers, one per prediction.  When omitted, rows are
            numbered ``0 .. n-1`` in prediction order.

    Raises:
        ValueError: If ``ids`` is given and its length differs from the number
            of predictions.
    """
    probabilities = np.asarray(model.predict(xpad))
    # reshape(-1) flattens an (n, 1) output and, unlike np.hstack over the rows,
    # also works when there are zero predictions.
    answers = np.round(probabilities).astype(int).reshape(-1)
    if ids is None:
        ids = range(len(answers))
    elif len(ids) != len(answers):
        raise ValueError(
            f"got {len(ids)} ids for {len(answers)} predictions"
        )
    answers_df = pd.DataFrame({"id": list(ids), "label": answers})
    answers_df.to_json(json_name, orient="records", lines=True)

# Write one answer file per architecture trained in this experiment.
submission_targets = [
    (lstm_model, "/kaggle/working/answer_lstm2.json"),
    (gru_model, "/kaggle/working/answer_gru2.json"),
    (combined_model, "/kaggle/working/answer_lstmgru2.json"),
    (bi_lstm_model, "/kaggle/working/answer_bilstm2.json"),
    (bi_gru_model, "/kaggle/working/answer_bigru2.json"),
    (bi_combined_model, "/kaggle/working/answer_bilstmbigru2.json"),
]
for trained_model, output_path in submission_targets:
    predict_to_json(trained_model, testing_padded_sequences, output_path)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step


## Eksperimen 3
- Preprocessing tambahan menggunakan Stopwords

In [16]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report 
from tensorflow.keras.metrics import categorical_accuracy

import pandas as pd
import numpy as np

# Experiment 3: train six recurrent binary classifiers on the `prep_sw_text`
# column and print a classification report for each one.
# Uses `df`, `EMBEDDING_DIM` and `embeddings_index` from earlier cells.

# Load the data from a pandas DataFrame
texts = df.prep_sw_text.values
labels = df.label.values

# Split the data into train and test sets (10% held out, fixed seed)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.1, random_state=42
)

# Tokenize the text data; the vocabulary is fitted on the training split only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)
word_index = tokenizer.word_index

# Prepare the embedding matrix: row i holds the pre-trained vector of the word
# whose tokenizer index is i. Words absent from `embeddings_index` keep an
# all-zero row.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Pad sequences to have the same length (longest sequence across both splits)
max_length = max(len(seq) for seq in train_sequences + test_sequences)
train_padded_sequences = pad_sequences(train_sequences, maxlen=max_length)
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_length)


def _build_rnn_classifier(shared_embedding, recurrent_layers, dropout_rate=0.3, learning_rate=0.001):
    """Build and compile embedding -> recurrent layers -> Dropout -> sigmoid.

    Args:
        shared_embedding: Embedding layer instance used as the first layer.
        recurrent_layers: List of recurrent layers placed after the embedding,
            in order.
        dropout_rate: Rate of the Dropout layer before the output unit.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output, trained
        with binary cross-entropy and tracking accuracy.
    """
    model = Sequential([
        shared_embedding,
        *recurrent_layers,
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model


with tf.device("/GPU:0"):
    # Define the embedding layer with pre-trained GloVe embeddings.
    # It is frozen (trainable=False), so sharing one instance across all six
    # models does not let one model's training alter another's inputs.
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                                input_length=max_length,
                                trainable=False)

    # Unidirectional models
    lstm_model = _build_rnn_classifier(embedding_layer, [LSTM(100)])
    gru_model = _build_rnn_classifier(embedding_layer, [GRU(100)])
    # Plain LSTM -> GRU stack. It must stay unidirectional: wrapping these
    # layers in Bidirectional would make it identical to `bi_combined_model`.
    combined_model = _build_rnn_classifier(
        embedding_layer, [LSTM(100, return_sequences=True), GRU(100)]
    )

    # Bidirectional counterparts
    bi_lstm_model = _build_rnn_classifier(embedding_layer, [Bidirectional(LSTM(100))])
    bi_gru_model = _build_rnn_classifier(embedding_layer, [Bidirectional(GRU(100))])
    bi_combined_model = _build_rnn_classifier(
        embedding_layer,
        [Bidirectional(LSTM(100, return_sequences=True)), Bidirectional(GRU(100))],
    )

    # Train the models, each with its own fresh early-stopping callback.
    # NOTE(review): the hold-out split is used both as early-stopping
    # validation data and for the reports below, so those reports are measured
    # on data that influenced which weights were kept.
    for rnn_model in (lstm_model, gru_model, combined_model,
                      bi_lstm_model, bi_gru_model, bi_combined_model):
        early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        rnn_model.fit(
            train_padded_sequences,
            train_labels,
            epochs=20,
            batch_size=64,
            validation_data=(test_padded_sequences, test_labels),
            callbacks=[early_stop],
        )

    # Make predictions on test data and round the sigmoid outputs to 0/1 labels
    lstm_predictions = np.round(lstm_model.predict(test_padded_sequences))
    gru_predictions = np.round(gru_model.predict(test_padded_sequences))
    combined_predictions = np.round(combined_model.predict(test_padded_sequences))
    bi_lstm_predictions = np.round(bi_lstm_model.predict(test_padded_sequences))
    bi_gru_predictions = np.round(bi_gru_model.predict(test_padded_sequences))
    bi_combined_predictions = np.round(bi_combined_model.predict(test_padded_sequences))

# Get the true class labels (labels are used as-is, not one-hot encoded)
true_labels = test_labels

# Print classification reports, separated by a blank line
report_inputs = (
    ("LSTM", lstm_predictions),
    ("GRU", gru_predictions),
    ("Combined", combined_predictions),
    ("Bi-LSTM", bi_lstm_predictions),
    ("Bi-GRU", bi_gru_predictions),
    ("Bi-Combined", bi_combined_predictions),
)
for report_position, (report_title, predicted_labels) in enumerate(report_inputs):
    separator = "\n" if report_position else ""
    print(f"{separator}{report_title} Classification Report:")
    print(classification_report(true_labels, predicted_labels))

Epoch 1/20




[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.8669 - loss: 0.3314 - val_accuracy: 0.9167 - val_loss: 0.2160
Epoch 2/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9271 - loss: 0.1927 - val_accuracy: 0.9206 - val_loss: 0.1983
Epoch 3/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9412 - loss: 0.1481 - val_accuracy: 0.9267 - val_loss: 0.1897
Epoch 4/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9526 - loss: 0.1251 - val_accuracy: 0.9267 - val_loss: 0.1851
Epoch 5/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9646 - loss: 0.0964 - val_accuracy: 0.9289 - val_loss: 0.1918
Epoch 6/20
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9701 - loss: 0.0796 - val_accuracy: 0.9333 - val_loss: 0.1853
Epoch 7/20
[1m254/254[0m [32m━━━━━

In [17]:
# Encode the `prep_sw_text` column of `df_test` with the already-fitted
# tokenizer, then pad every sequence to `max_length`.
testing_texts = df_test['prep_sw_text'].tolist()
testing_sequences = tokenizer.texts_to_sequences(texts=testing_texts)
testing_padded_sequences = pad_sequences(
    sequences=testing_sequences,
    maxlen=max_length,
)

In [18]:
def predict_to_json(model, xpad, json_name):
    """Predict binary labels for ``xpad`` and write them as a JSON-lines file.

    Every output line is one record ``{"id": ..., "label": ...}`` where ``id``
    counts the predictions from 0 in the order returned by the model and
    ``label`` is the rounded prediction.

    Args:
        model: Object exposing ``predict(xpad)`` that returns an array of
            probabilities (e.g. shape ``(n, 1)`` from a single sigmoid unit).
        xpad: Model input, passed unchanged to ``model.predict``.
        json_name: Path of the JSON-lines file to write.
    """
    probabilities = np.asarray(model.predict(xpad))
    # np.round maps values up to and including 0.5 to 0 and values above 0.5
    # to 1 (ties round to even).
    # reshape(-1) flattens the (n, 1) column to (n,) in one step and, unlike
    # np.hstack over the individual rows, also works when there are no rows.
    answers = np.round(probabilities).astype(int).reshape(-1)
    answers_df = pd.DataFrame({"id": range(len(answers)), "label": answers})
    answers_df.to_json(json_name, orient="records", lines=True)

# Export the predictions of each trained model on the padded testing
# sequences; every model writes to its own JSON-lines file.
for _fitted_model, _answer_path in (
    (lstm_model, "/kaggle/working/answer_lstm3.json"),
    (gru_model, "/kaggle/working/answer_gru3.json"),
    (combined_model, "/kaggle/working/answer_lstmgru3.json"),
    (bi_lstm_model, "/kaggle/working/answer_bilstm3.json"),
    (bi_gru_model, "/kaggle/working/answer_bigru3.json"),
    (bi_combined_model, "/kaggle/working/answer_bilstmbigru3.json"),
):
    predict_to_json(_fitted_model, testing_padded_sequences, _answer_path)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
