# Library

In [None]:
!pip install pandas

In [None]:
!pip install matplotlib

In [None]:
!pip install opendatasets

In [None]:
!pip install scikit-learn

In [5]:
# Library to dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import opendatasets as od

# Library tensorflow
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Library sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# pickle
import pickle


# Dataset

In [None]:
# Load the cleaned text dataset from the working directory and preview
# its first rows.
data = pd.read_csv(filepath_or_buffer='clean_text.csv')
data.head()

# Encoding

In [7]:
# Features: the text column, cast to str so every entry is a string.
X = data['clean_text'].astype(str)
# Targets: the raw class column.
y = data['class']

# Encode the class values as integers.
# fit() returns the encoder itself, so fit + transform can be chained.
encoder = LabelEncoder()
encoded_y = encoder.fit(y).transform(y)

# Test Train Split

In [8]:
# Hold out 20% of the samples as the test set; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    encoded_y,
    test_size=0.2,
    random_state=42,
)

# Data Preprocessing

In [9]:
# Tokenisation / padding settings
vocab_size = 10_000       # value passed to Tokenizer(num_words=...) and to the Embedding layers
max_length = 10_000       # every sequence is padded/truncated to this many tokens
                          # NOTE(review): identical to vocab_size - confirm this length is intended
trunc_type = 'post'       # cut over-long sequences at the end
padding_type = 'post'     # append padding at the end
oov_tok = "<OOV>"         # placeholder token for out-of-vocabulary words

# Build the vocabulary from the training texts only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Shared padding options so train and test are processed identically
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Training texts -> integer sequences -> fixed-length array
training_sequences = tokenizer.texts_to_sequences(X_train)
training_padded = pad_sequences(training_sequences, **pad_options)

# Testing texts -> integer sequences -> fixed-length array
testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(testing_sequences, **pad_options)

# Labels as numpy arrays
training_labels = np.array(y_train)
testing_labels = np.array(y_test)

In [10]:
# Persist the fitted tokenizer so the same vocabulary can be reused later
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

# Persist the word -> index dictionary alongside it
np.save('word_index.npy', word_index)

# Basic Model Embedding Layers

In [None]:
embedding_dim = 16

# Baseline architecture: embedding -> flatten -> three 128-unit ReLU layers
# -> a single sigmoid output unit.
base_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    *[tf.keras.layers.Dense(128, activation='relu') for _ in range(3)],
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(base_layers)

# Binary cross-entropy loss, Adam optimizer, accuracy as the tracked metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Show layer shapes and parameter counts
model.summary()

In [12]:
# Define the callbacks

# Write the model to disk whenever validation accuracy reaches a new maximum
checkpoint_callback = ModelCheckpoint(
    filepath="Basic Model Embeding Layers.h5",
    save_best_only=True,
    monitor="val_accuracy",
    mode="max",
    verbose=1,
)

# Stop after 10 epochs without a better validation accuracy and restore the
# best weights seen
early_stopping_callback = EarlyStopping(
    monitor="val_accuracy",
    mode="max",
    patience=10,
    verbose=1,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.1 after 5 epochs without a lower validation loss
reduce_lr_callback = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.1,
    patience=5,
    verbose=1,
    mode="min",
    min_delta=0.0001,
    cooldown=0,
    min_lr=0,
)

# Callbacks passed to model.fit
callbacks = [
    checkpoint_callback,
    early_stopping_callback,
    reduce_lr_callback,
]

In [None]:
# Upper bound on the number of epochs; the early-stopping callback may end
# training sooner
num_epochs = 100

# Fit the baseline model, validating on the test split after every epoch
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    callbacks=callbacks,
)

In [None]:
# Score the baseline model on the test split: index 0 is the loss,
# index 1 the accuracy metric
results_base = model.evaluate(testing_padded, testing_labels)
base_loss = results_base[0]
base_accuracy = results_base[1]

# Report the results
print(f'Test results - Loss: {base_loss} - Accuracy: {100*base_accuracy}%')

In [None]:
# Plot utility
def plot_graphs(history, string):
    """Plot one training metric together with its validation counterpart.

    Args:
        history: object whose ``.history`` attribute maps metric names to
            sequences of values (here, the value returned by ``model.fit``).
        string: metric name to plot; ``'val_' + string`` is plotted as well.
    """
    val_key = 'val_' + string
    for key in (string, val_key):
        plt.plot(history.history[key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()


# Plot the accuracy and loss curves of the baseline model
for metric in ("accuracy", "loss"):
    plot_graphs(history, metric)

# Model Basic + Global Average Pooling

In [None]:
embedding_dim = 16

# Same dense head as the baseline, but the embedded sequence is reduced with
# global average pooling instead of being flattened.
gap_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    *[tf.keras.layers.Dense(128, activation='relu') for _ in range(3)],
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model_GlobalAveragePooling1D = tf.keras.Sequential(gap_layers)

# Binary cross-entropy loss, Adam optimizer, accuracy as the tracked metric
model_GlobalAveragePooling1D.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Show layer shapes and parameter counts
model_GlobalAveragePooling1D.summary()

In [17]:
# Define the callbacks

# Write the model to disk whenever validation accuracy reaches a new maximum
checkpoint_callback = ModelCheckpoint(
    filepath="Model Basic + Global Average Pooling.h5",
    save_best_only=True,
    monitor="val_accuracy",
    mode="max",
    verbose=1,
)

# Stop after 10 epochs without a better validation accuracy and restore the
# best weights seen
early_stopping_callback = EarlyStopping(
    monitor="val_accuracy",
    mode="max",
    patience=10,
    verbose=1,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.1 after 5 epochs without a lower validation loss
reduce_lr_callback = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.1,
    patience=5,
    verbose=1,
    mode="min",
    min_delta=0.0001,
    cooldown=0,
    min_lr=0,
)

# Callbacks passed to model.fit
callbacks = [
    checkpoint_callback,
    early_stopping_callback,
    reduce_lr_callback,
]

In [None]:
# Upper bound on the number of epochs; the early-stopping callback may end
# training sooner
num_epochs = 100

# Fit the global-average-pooling model, validating on the test split each epoch
history_GlobalAveragePooling1D = model_GlobalAveragePooling1D.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    callbacks=callbacks,
)

In [None]:
# Score the global-average-pooling model on the test split: index 0 is the
# loss, index 1 the accuracy metric
results_GlobalAveragePooling1D = model_GlobalAveragePooling1D.evaluate(testing_padded, testing_labels)
gap_loss = results_GlobalAveragePooling1D[0]
gap_accuracy = results_GlobalAveragePooling1D[1]

# Report the results
print(f'Test results - Loss: {gap_loss} - Accuracy: {100*gap_accuracy}%')

In [None]:
# Plot the accuracy and loss curves of the global-average-pooling model
for metric in ("accuracy", "loss"):
    plot_graphs(history_GlobalAveragePooling1D, metric)

# Model Embedding + Conv1D

In [None]:
# Convolution settings
kernel_size = 5
filters = 128

# Embedding -> 1D convolution -> global max pooling -> the same dense head as
# the other models. embedding_dim is reused from the earlier model cells.
conv_layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    *[tf.keras.layers.Dense(128, activation='relu') for _ in range(3)],
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model_Conv1D = tf.keras.Sequential(conv_layers)

# Binary cross-entropy loss, Adam optimizer, accuracy as the tracked metric
model_Conv1D.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Show layer shapes and parameter counts
model_Conv1D.summary()

In [22]:
# Define the callbacks

# Write the model to disk whenever validation accuracy reaches a new maximum
checkpoint_callback = ModelCheckpoint(
    filepath="Model Embedding + Conv1D.h5",
    save_best_only=True,
    monitor="val_accuracy",
    mode="max",
    verbose=1,
)

# Stop after 10 epochs without a better validation accuracy and restore the
# best weights seen
early_stopping_callback = EarlyStopping(
    monitor="val_accuracy",
    mode="max",
    patience=10,
    verbose=1,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.1 after 5 epochs without a lower validation loss
reduce_lr_callback = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.1,
    patience=5,
    verbose=1,
    mode="min",
    min_delta=0.0001,
    cooldown=0,
    min_lr=0,
)

# Callbacks passed to model.fit
callbacks = [
    checkpoint_callback,
    early_stopping_callback,
    reduce_lr_callback,
]

In [None]:
# Upper bound on the number of epochs; the early-stopping callback may end
# training sooner
num_epochs = 100

# Fit the Conv1D model, validating on the test split after every epoch
history_Conv1D = model_Conv1D.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    callbacks=callbacks,
)

In [None]:
# Score the Conv1D model on the test split: index 0 is the loss, index 1 the
# accuracy metric
results_Conv1D = model_Conv1D.evaluate(testing_padded, testing_labels)
conv_loss = results_Conv1D[0]
conv_accuracy = results_Conv1D[1]

# Report the results
print(f'Test results - Loss: {conv_loss} - Accuracy: {100*conv_accuracy}%')

In [None]:
# Plot the accuracy and loss curves of the Conv1D model
for metric in ("accuracy", "loss"):
    plot_graphs(history_Conv1D, metric)

# Testing Model

In [31]:
# Load the best checkpoint of every model.
# The paths must be exactly the `filepath` values given to the ModelCheckpoint
# callbacks in the training sections. The previous hyphenated names (e.g.
# 'Basic-Model-Embeding-Layers.h5') are never written by this notebook, so
# loading them fails on a fresh Restart & Run All.
model_base = tf.keras.models.load_model('Basic Model Embeding Layers.h5')
model_GlobalAveragePooling1D = tf.keras.models.load_model('Model Basic + Global Average Pooling.h5')
model_Conv1D = tf.keras.models.load_model('Model Embedding + Conv1D.h5')

## Testing Base Model

In [None]:
# Define the tweet and preprocess it exactly like the training data:
# tokenise with the fitted tokenizer, then pad/truncate to max_length.
tweet_text = ['I will kill myself']
twt = pad_sequences(
    tokenizer.texts_to_sequences(tweet_text),
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Predict with the baseline model
prediction_base = model_base.predict(twt)

# The network ends in a single sigmoid unit, so the prediction holds one
# value per sample. np.argmax over a single value is always 0, which made the
# old check print the same answer for every input. Threshold the value at 0.5
# to obtain the predicted class id instead.
predicted_class_base = int(prediction_base[0][0] >= 0.5)

# Print the prediction
# NOTE(review): the id -> message mapping (0 = suicide) is kept from the
# original code; confirm it against encoder.classes_.
if predicted_class_base == 0:
    print("Potential Suicide Post")
else:
    print("Non Suicide Post")

## Testing Basic + Global Average Pooling

In [None]:
# Testing Basic + Global Average Pooling on the already-padded tweet
prediction_Globalave = model_GlobalAveragePooling1D.predict(twt)

# The network ends in a single sigmoid unit, so the prediction holds one
# value per sample. np.argmax over a single value is always 0, which made the
# old check print the same answer for every input. Threshold the value at 0.5
# to obtain the predicted class id instead.
predicted_class_Globalave = int(prediction_Globalave[0][0] >= 0.5)

# Print the prediction
# NOTE(review): the id -> message mapping (0 = suicide) is kept from the
# original code; confirm it against encoder.classes_.
if predicted_class_Globalave == 0:
    print("Potential Suicide Post")
else:
    print("Non Suicide Post")

## Testing Embedding + Conv1D

In [None]:
# Testing Embedding + Conv1D on the already-padded tweet
prediction_EmbedCov1D = model_Conv1D.predict(twt)

# The network ends in a single sigmoid unit, so the prediction holds one
# value per sample. np.argmax over a single value is always 0, which made the
# old check print the same answer for every input. Threshold the value at 0.5
# to obtain the predicted class id instead.
predicted_class_EmbedCov1D = int(prediction_EmbedCov1D[0][0] >= 0.5)

# Print the prediction
# NOTE(review): the id -> message mapping (0 = suicide) is kept from the
# original code; confirm it against encoder.classes_.
if predicted_class_EmbedCov1D == 0:
    print("Potential Suicide Post")
else:
    print("Non Suicide Post")

# Model Comparison

In [None]:
# Compare the models: one row per model with its test accuracy
# (index 1 of each evaluate() result).
results = pd.DataFrame({
    'Model': [
        'Base Model',
        'Base + Global Average Pooling',
        'Base + CNN',
    ],
    'Accuracy Score': [
        results_base[1],
        results_GlobalAveragePooling1D[1],
        results_Conv1D[1],
    ],
})

# Best model first, indexed by model name
result_df = results.sort_values(by='Accuracy Score', ascending=False).set_index('Model')
result_df