<a href="https://colab.research.google.com/github/MOOwuttichai/BSC_DPDM2023/blob/main/LSTM_eng.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')
import os

# Make sure the NLTK "punkt" tokenizer data is available (no-op if already downloaded).
nltk.download('punkt')

# Load the labelled comments from the CSV file.
data = pd.read_csv("Data_model_eng.csv")

# Separate the raw texts from their labels; both columns are coerced to str.
texts = data["comments"].astype(str).tolist()
labels = data["label"].astype(str).tolist()

# Word-level tokenization with NLTK.
tokenized_texts = [word_tokenize(text) for text in texts]

# Convert tokens to integer ids and bring every sequence to exactly max_len
# entries (padding is appended at the end of short sequences).
max_words = 5000
max_len = 20
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(tokenized_texts)
sequences = tokenizer.texts_to_sequences(tokenized_texts)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

# Encode labels as integers.
# The labels are sorted so the label -> index mapping is the same on every
# run. Iteration order of a set of strings can change between interpreter
# runs, which would silently mismatch the class indices of a model that is
# reloaded from disk later in this script.
unique_labels = sorted(set(labels))
label_map = {label: idx for idx, label in enumerate(unique_labels)}
y = np.array([label_map[label] for label in labels])

# Split into train/test (70/30) with a fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.3, random_state=42)

# Reuse a previously saved model when one exists on disk; otherwise build a new one.
model_path = "lstm_text_classification.h5"
if os.path.exists(model_path):
    print("Loading pre-trained model...")
    model = load_model(model_path)
else:
    print("Creating new model...")
    # Embedding -> two stacked LSTMs (dropout in between) -> softmax over the label set.
    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
    model.add(LSTM(64, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(32))
    model.add(Dense(len(unique_labels), activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model (this continues training when a saved model was loaded).
# The test split doubles as the validation data reported after each epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=2,
    epochs=10,
)

# Persist the (re)trained model to the same path.
model.save(model_path)

# Predict on the test split; argmax over axis 1 turns the per-class scores
# into predicted class indices.
y_pred = np.argmax(model.predict(X_test), axis=1)

# Compute accuracy plus weighted precision, recall and F1.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

# Pass the full list of class indices explicitly so target_names always lines
# up with the classes, even when some class is missing from both y_test and
# y_pred (without `labels`, classification_report rejects the mismatch).
class_report = classification_report(
    y_test,
    y_pred,
    labels=list(range(len(unique_labels))),
    target_names=unique_labels,
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:\n", class_report)

# ฟังก์ชันสำหรับทำนายประเภทของข้อความ
def predict_category(text):
    """Return the predicted label for a single piece of text.

    The text is word-tokenized with NLTK, converted to integer ids with the
    module-level ``tokenizer``, and post-padded to ``max_len`` before being
    passed to ``model.predict``. The entry of ``unique_labels`` at the
    position of the highest model output is returned.
    """
    tokens = word_tokenize(text)
    encoded = tokenizer.texts_to_sequences([tokens])
    model_input = pad_sequences(encoded, padding='post', maxlen=max_len)
    scores = model.predict(model_input)
    best_index = np.argmax(scores)
    return unique_labels[best_index]

# Try the predictor on two sample sentences.
# The first should come out as an "experience" comment, the second as a "question".
for sample_text in (
    "I went to Japan, it was amazing!",
    "Does anyone know how to make pancakes?",
):
    print(predict_category(sample_text))


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Creating new model...
Epoch 1/10




[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 25ms/step - accuracy: 0.6200 - loss: 0.7732 - val_accuracy: 0.6307 - val_loss: 0.7099
Epoch 2/10
[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 23ms/step - accuracy: 0.7561 - loss: 0.5640 - val_accuracy: 0.6383 - val_loss: 0.7526
Epoch 3/10
[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step - accuracy: 0.8755 - loss: 0.3465 - val_accuracy: 0.6080 - val_loss: 0.8960
Epoch 4/10
[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step - accuracy: 0.9409 - loss: 0.1951 - val_accuracy: 0.6222 - val_loss: 1.2008
Epoch 5/10
[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 23ms/step - accuracy: 0.9724 - loss: 0.1017 - val_accuracy: 0.6155 - val_loss: 1.4682
Epoch 6/10
[1m1232/1232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 23ms/step - accuracy: 0.9810 - loss: 0.0657 - val_accuracy: 0.6155 - val_loss: 1.5461
Epoch 7/10
[1m



[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step
Accuracy: 0.6004
Precision: 0.6023
Recall: 0.6004
F1 Score: 0.6009

Classification Report:
                                               precision    recall  f1-score   support

ไม่มีประโยชน์/ไม่สำคัญ (useless/unimportant)       0.68      0.67      0.68       622
            เล่าประสบการณ์ (tell experience)       0.49      0.51      0.50       373
                            คำถาม (Question)       0.49      0.39      0.44        61

                                    accuracy                           0.60      1056
                                   macro avg       0.55      0.53      0.54      1056
                                weighted avg       0.60      0.60      0.60      1056

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 477ms/step
ไม่มีประโยชน์/ไม่สำคัญ (useless/unimportant)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
คำถาม (Question)
