In [None]:
!pip install pythainlp
!pip install attacut

In [None]:
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pythainlp.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,  ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Bidirectional, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.utils import set_random_seed

### Load and preprocess the data

In [None]:
# Read the labelled dataset from a CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='Depression_Dataset.csv')

In [None]:
# Show the first ten rows as a quick sanity check of the loaded frame.
data.iloc[:10]

In [None]:
# Separate the input column (`text`) from the target column (`label`).
X, y = data["text"], data["label"]

### Tokenize Thai text

In [None]:
def tokenize(sentence):
  """Segment `sentence` into word tokens with pythainlp's "attacut" engine.

  Returns whatever `word_tokenize` returns for the sentence.
  """
  tokens = word_tokenize(sentence, engine="attacut")
  return tokens

In [None]:
# Segment every text of the corpus into word tokens.
tokenize_list = [tokenize(sent) for sent in X]

# Preview only the first few results; displaying the whole tokenized corpus
# floods the notebook output.
tokenize_list[:5]

In [None]:
# Re-join each token list with single spaces so every segmented word is
# separated by whitespace in the text handed to the Keras Tokenizer.
tokenized_texts = [' '.join(text) for text in tokenize_list]

# Build the tokenizer and fit it on the corpus.
# NOTE(review): the vocabulary is fitted on the full corpus, i.e. before the
# train/test split performed later in this notebook.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_texts)

# Convert each text into a sequence of integer word ids.
sequences = tokenizer.texts_to_sequences(tokenized_texts)
maxlen = max(len(s) for s in sequences)

# Pad explicitly to `maxlen` so the padded width is guaranteed to match the
# `maxlen` value reused by later cells.
padded_sequences = pad_sequences(sequences, padding='post', maxlen=maxlen)

# Print summaries and small previews only; dumping the entire word index and
# every sequence makes the output unreadable.
print("Vocabulary size =", len(tokenizer.word_index))
print("First sequences:", sequences[:3])
print("First padded sequences:", padded_sequences[:3])
print("Max length =", maxlen)
print("Padded Seq shape =", padded_sequences.shape)

### Apply train-test split

In [None]:
# Hold out 30% of the padded sequences as the test set (fixed seed for a repeatable split).
x_train, x_test, y_train, y_test = train_test_split(
    padded_sequences,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Carve 20% of the training portion off as a validation set.
# NOTE: this reassigns x_train / y_train from themselves, so re-running this
# cell on its own splits the already-reduced training set again.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
# One embedding row per known word, plus one extra for index 0, which is the
# value pad_sequences fills with.
vocab_size = 1 + len(tokenizer.word_index)

# Bidirectional RNN

In [None]:
# Seed Python, NumPy and TensorFlow together. np.random.seed alone does not
# seed TensorFlow, so weight initialisation and training would still differ
# from run to run.
set_random_seed(42)

# Embedding followed by four stacked bidirectional SimpleRNN layers of growing
# width; every recurrent layer except the last returns the full sequence so
# the next recurrent layer receives one vector per time step.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=300, input_length=maxlen),
    Bidirectional(SimpleRNN(units=16, activation = 'relu', return_sequences=True)),
    Bidirectional(SimpleRNN(units=32, activation = 'relu', return_sequences=True)),
    Bidirectional(SimpleRNN(units=64, activation = 'relu', return_sequences=True)),
    Bidirectional(SimpleRNN(units=128, activation = 'relu')),
    Dense(64, activation='relu'),
    # Single sigmoid unit: one probability per input for the binary label.
    Dense(1, activation='sigmoid')
])

model.summary()

In [None]:
# Use the Adam class imported from tensorflow.keras, the same namespace the
# model and layers come from, instead of reaching into the standalone `keras`
# package; mixing the two fails when the installed versions differ.
model.compile(
    optimizer=Adam(learning_rate=0.002),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the Bi-RNN for 3 epochs, evaluating on the validation split after each epoch.
hist = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=512,
    epochs=3,
)

In [None]:
# Plot training vs. validation curves for the Bi-RNN: loss on the left,
# accuracy on the right.
fig = plt.figure(figsize=(16, 7))

# Plot against explicit 1-based epoch numbers. Without x values matplotlib
# places the first epoch at x=0, which does not line up with ticks that start
# at 1.
epochs = range(1, len(hist.history["loss"]) + 1)

for position, (metric, title) in enumerate([("loss", "Loss"), ("accuracy", "Accuracy")], start=1):
  ax = fig.add_subplot(1, 2, position)
  ax.plot(epochs, hist.history[metric], 'cornflowerblue', marker='.', label="train")
  ax.plot(epochs, hist.history["val_" + metric], 'orange', marker='.', label="val")
  ax.set(xlabel='epoch', ylabel=metric, title=title)
  # One integer tick per trained epoch.
  ax.set_xticks(list(epochs))
  ax.legend()

plt.show()

In [None]:
# Run the Bi-RNN on the held-out test sequences (raw sigmoid outputs).
y_pred = model.predict(x=x_test)

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 cut-off, then
# report accuracy on the test split rounded to two decimals.
is_positive = y_pred >= 0.5
y_pred = is_positive.astype(int)
test_accuracy = accuracy_score(y_test, y_pred)
print("Bi-RNN Accuracy on test data:", round(test_accuracy, 2))

In [None]:
# Pin the class order to [0, 1] so the display labels below are guaranteed to
# line up with the matrix rows/columns (assumes the labels are encoded 0/1).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Class 1 is the class this notebook reports as "Positive" (sigmoid output
# >= 0.5), so class 0 must be shown as "Negative" and listed first.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negative", "Positive"])
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
# Per-class precision / recall / F1 for the Bi-RNN on the test split.
cf = classification_report(y_true=y_test, y_pred=y_pred)
print(cf)

# Bidirectional GRU

In [None]:
# Seed Python, NumPy and TensorFlow together. np.random.seed alone does not
# seed TensorFlow, so weight initialisation and training would still differ
# from run to run.
set_random_seed(42)

# Embedding followed by four stacked bidirectional GRU layers of growing
# width; every recurrent layer except the last returns the full sequence so
# the next recurrent layer receives one vector per time step.
model2 = Sequential([
    Embedding(input_dim=vocab_size, output_dim=300, input_length=maxlen),
    Bidirectional(GRU(units=16, activation = 'relu', return_sequences=True)),
    Bidirectional(GRU(units=32, activation = 'relu', return_sequences=True)),
    Bidirectional(GRU(units=64, activation = 'relu', return_sequences=True)),
    Bidirectional(GRU(units=128, activation = 'relu')),
    Dense(64, activation='relu'),
    # Single sigmoid unit: one probability per input for the binary label.
    Dense(1, activation='sigmoid')
])

model2.summary()

In [None]:
# Use the Adam class imported from tensorflow.keras, the same namespace the
# model and layers come from, instead of reaching into the standalone `keras`
# package; mixing the two fails when the installed versions differ.
model2.compile(
    optimizer=Adam(learning_rate=0.0025),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the Bi-GRU for 3 epochs, evaluating on the validation split after each epoch.
hist2 = model2.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=512,
    epochs=3,
)

In [None]:
# Plot training vs. validation curves for the Bi-GRU: loss on the left,
# accuracy on the right.
fig = plt.figure(figsize=(16, 7))

# Plot against explicit 1-based epoch numbers. Without x values matplotlib
# places the first epoch at x=0, which does not line up with ticks that start
# at 1.
epochs = range(1, len(hist2.history["loss"]) + 1)

for position, (metric, title) in enumerate([("loss", "Loss"), ("accuracy", "Accuracy")], start=1):
  ax = fig.add_subplot(1, 2, position)
  ax.plot(epochs, hist2.history[metric], 'cornflowerblue', marker='.', label="train")
  ax.plot(epochs, hist2.history["val_" + metric], 'orange', marker='.', label="val")
  ax.set(xlabel='epoch', ylabel=metric, title=title)
  # One integer tick per trained epoch.
  ax.set_xticks(list(epochs))
  ax.legend()

plt.show()

In [None]:
# Run the Bi-GRU on the held-out test sequences (raw sigmoid outputs).
y_pred = model2.predict(x=x_test)

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 cut-off, then
# report accuracy on the test split rounded to two decimals.
is_positive = y_pred >= 0.5
y_pred = is_positive.astype(int)
test_accuracy = accuracy_score(y_test, y_pred)
print("Bi-GRU Accuracy on test data:", round(test_accuracy, 2))

In [None]:
# Pin the class order to [0, 1] so the display labels below are guaranteed to
# line up with the matrix rows/columns (assumes the labels are encoded 0/1).
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Class 1 is the class this notebook reports as "Positive" (sigmoid output
# >= 0.5), so class 0 must be shown as "Negative" and listed first.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negative", "Positive"])
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
# Per-class precision / recall / F1 for the Bi-GRU on the test split.
cf = classification_report(y_true=y_test, y_pred=y_pred)
print(cf)

## Test With New Sample

In [None]:
# Predict custom sentences
sample_sentences = [ "ช่วงนี้นอนไม่หลับ เบื่ออาหาร รู้สึกหมดแรงและไม่อยากคุยกับใครเลย",
    "วันนี้อารมณ์ดีมาก ไปออกกำลังกายและทำงานบ้านเรียบร้อย",
    "รู้สึกกังวลบ่อย ๆ สมาธิไม่ค่อยอยู่กับเนื้อกับตัว แต่พยายามทำงานต่อ",
    "เมื่อคืนหัวเราะกับเพื่อน ๆ สนุกดี ไม่มีอะไรให้เครียดเท่าไหร่",
    "บางวันไม่อยากลุกจากเตียงเลย เหนื่อยล้าแบบไม่มีเหตุผล",]

# Segment the new sentences with the same word tokenizer used for the corpus.
X_new = [tokenize(x) for x in sample_sentences]

# Encode through the fitted Keras tokenizer exactly as the training texts were
# encoded (space-joined tokens -> texts_to_sequences). A manual word_index
# lookup skips the tokenizer's own normalisation and inserts 0 (the padding
# id) in the middle of a sequence for whitespace/unknown tokens, whereas the
# training pipeline drops such tokens.
x_new_tokenize = tokenizer.texts_to_sequences([' '.join(tokens) for tokens in X_new])

# Pad to the width the models were built for.
x_new_tokenize_padded = pad_sequences(x_new_tokenize, padding='post', maxlen = maxlen)
y_pred_sample1 = model.predict(x_new_tokenize_padded)
y_pred_sample2 = model2.predict(x_new_tokenize_padded)

In [None]:
# Print each sample sentence followed by both models' verdicts; a sigmoid
# output of at least 0.5 is reported as Positive, anything lower as Negative.
for sentence, prob_rnn, prob_gru in zip(sample_sentences, y_pred_sample1, y_pred_sample2):
  rnn_label = "Positive" if prob_rnn >= 0.5 else "Negative"
  gru_label = "Positive" if prob_gru >= 0.5 else "Negative"
  print(sentence, ":", "Bi-RNN -> " + rnn_label + ", Bi-GRU -> " + gru_label)
