# Train

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization, Bidirectional, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
# Load the augmented dataset and show how many samples each class contains,
# as a quick check of the class balance before training.
df = pd.read_csv('cleaned_dataset_augmented11.csv')
label_counts = df['label'].value_counts()
print(label_counts)

In [None]:
# Load the data
df = pd.read_csv('/content/cleaned_dataset_augmented11.csv')

# Tokenization & padding: turn every text into a sequence of integer word ids,
# then bring all sequences to max_length (shorter ones are zero-padded at the
# end because of padding='post').
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])

vocab_size = len(tokenizer.word_index) + 1  # word ids start at 1; 0 is the padding id
max_length = 50
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Encode labels as one-hot vectors whose column order follows label_order.
label_order = ['minimum', 'mild', 'moderate', 'severe']

# A label outside label_order (including NaN) would become an all-zero one-hot
# row, which argmax later reads as class 0 ('minimum'). Fail loudly instead of
# silently training and evaluating on corrupted targets.
unknown_labels = set(df['label'].unique()) - set(label_order)
if unknown_labels:
    raise ValueError(
        f"Unexpected values in 'label' column: {sorted(unknown_labels, key=str)}; "
        f"expected only {label_order}"
    )

labels = pd.Categorical(df['label'], categories=label_order, ordered=True)
# Request float targets explicitly: recent pandas versions return boolean
# dummy columns by default.
labels = pd.get_dummies(labels, dtype='float32').values

# Train/test split. The test part is kept strictly for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Carve a validation set out of the training data. Early stopping (with
# restore_best_weights) and the learning-rate schedule pick the model based on
# this data; using the test set for that would make the final report
# optimistically biased.
x_fit, x_val, y_fit, y_val = train_test_split(x_train, y_train, test_size=0.1, random_state=42)

# Build the LSTM model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
    SpatialDropout1D(0.3),  # reduce overfitting
    Bidirectional(LSTM(64, return_sequences=True, kernel_regularizer=l1_l2(0.0005, 0.001))),
    BatchNormalization(),
    Dropout(0.4),
    LSTM(32, return_sequences=False, kernel_regularizer=l1_l2(0.0005, 0.001)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(24, activation='relu', kernel_regularizer=l1_l2(0.0005, 0.002)),
    Dropout(0.5),
    Dense(labels.shape[1], activation='softmax')  # one output unit per class
])

# Compile the model
optimizer = Adam(learning_rate=0.0005, clipnorm=1.0)  # gradient clipping
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Callbacks: halve the learning rate after 2 epochs without val_loss
# improvement, and stop after 4 such epochs, restoring the best weights.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True, verbose=1)

# Train the model (validation metrics come from the held-out validation split,
# never from the test set).
history = model.fit(
    x_fit, y_fit,
    epochs=50, batch_size=32,
    validation_data=(x_val, y_val),
    callbacks=[reduce_lr, early_stopping]
)

# Evaluate on the untouched test set
y_pred = model.predict(x_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

# Pin the class ids so the report always has one row per entry of label_order,
# even if some class never occurs in the test targets or predictions.
print(classification_report(
    y_true_classes, y_pred_classes,
    labels=list(range(len(label_order))),
    target_names=label_order,
))


In [None]:
# Persist the trained model to 'model.h5' in the current working directory.
model.save(filepath='model.h5')

In [None]:
import pickle

# Serialize the fitted tokenizer next to the model so the exact same
# word-to-id mapping can be reloaded later.
with open('tokenizer.pkl', mode='wb') as tokenizer_file:
    pickler = pickle.Pickler(tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)
    pickler.dump(tokenizer)


In [None]:
# Compute the confusion matrix.
# The class ids are pinned explicitly so the matrix always has one row/column
# per entry of label_order; otherwise a class missing from both the true and
# the predicted labels would shrink the matrix and misalign the tick labels.
class_ids = list(range(len(label_order)))
cm = confusion_matrix(y_true_classes, y_pred_classes, labels=class_ids)

# Show the result as a heatmap (rows: true class, columns: predicted class)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_order, yticklabels=label_order)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# Build a DataFrame of the test set so the original texts can be shown next to
# the predictions.
# train_test_split shuffles the rows, so the test samples are NOT the last
# rows of df. Reproduce the split on row positions instead: with the same
# number of rows, test_size and random_state as the split that produced
# x_test, it selects the same rows in the same order.
# NOTE: test_size / random_state here must mirror that earlier split call.
_, test_positions = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)
if len(test_positions) != len(y_pred_classes):
    raise ValueError(
        f"Recovered {len(test_positions)} test rows but have "
        f"{len(y_pred_classes)} predictions; df no longer matches the data "
        "the model was evaluated on"
    )

df_test = df.iloc[test_positions].copy()
df_test = df_test.reset_index(drop=True)
df_test['predicted_label'] = [label_order[i] for i in y_pred_classes]

# Print up to five example texts predicted as each class
for label in label_order:
    print(f"\n🔹 ตัวอย่างข้อความที่ถูกทำนายเป็น '{label}':")
    examples = df_test[df_test['predicted_label'] == label]['text'].head(5).tolist()
    for i, text in enumerate(examples, 1):
        print(f"{i}. {text}")