# Thai Text Classification Model

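This notebook fine-tunes a pretrained transformer model to classify Thai-language text. It covers data preparation, model setup, training, evaluation, and publishing the trained model to the Hugging Face Hub.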

In [None]:
# Install required packages
!pip install transformers datasets evaluate pythainlp sentencepiece torch pandas sklearn

In [None]:
# Import necessary libraries
import sys
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Make the shared helper module importable as `utils.thai_nlp_utils`.
import urllib.request

UTILS_DIR = os.path.join('..', 'utils')
UTILS_FILE = os.path.join(UTILS_DIR, 'thai_nlp_utils.py')
UTILS_URL = 'https://raw.githubusercontent.com/username/repo/main/utils/thai_nlp_utils.py'

# Check for the module file itself: the directory may already exist without it.
if not os.path.exists(UTILS_FILE):
    # NOTE(review): this downloads Python code and imports it right below. The URL
    # is a placeholder on a moving branch -- point it at a trusted repository and
    # pin a commit before running.
    os.makedirs(UTILS_DIR, exist_ok=True)
    # urlretrieve raises on HTTP errors instead of leaving an empty file behind.
    urllib.request.urlretrieve(UTILS_URL, UTILS_FILE)

# Re-running the cell must not keep appending the same entry.
if '..' not in sys.path:
    sys.path.append('..')
from utils.thai_nlp_utils import clean_thai_text, prepare_dataset_splits, save_model_to_hf

## 1. Data Preparation

In [None]:
# Sample code to load data (replace with your dataset)
# For example, loading a CSV with text and labels
def load_dataset(file_path):
    """Read a CSV file and verify it has the columns this notebook relies on.

    Args:
        file_path: Location of the CSV, passed unchanged to ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If the ``text`` or ``label`` column is absent. The message
            names the first missing column, checking ``text`` before ``label``.
    """
    frame = pd.read_csv(file_path)
    # Find the first required column that is absent, preserving the check order
    first_missing = next(
        (name for name in ('text', 'label') if name not in frame.columns),
        None,
    )
    if first_missing is not None:
        raise ValueError(f"Required column '{first_missing}' not found in dataset")
    return frame

# Load your Thai text classification dataset
# sample_path = 'your_dataset_path.csv'
# df = load_dataset(sample_path)

# Tiny in-memory stand-in for a real corpus so the notebook can run end to end.
# Each row is (sentence, label id); label ids: 0=entertainment, 1=food, 2=news
sample_rows = [
    ('ฉันชอบดูหนัง', 0),
    ('อาหารร้านนี้อร่อยมาก', 1),
    ('การเมืองไทยวุ่นวายเหลือเกิน', 2),
    ('ราคาน้ำมันสูงขึ้นทุกวัน', 2),
    ('ฉันรักประเทศไทย', 0),
]
sample_data = {
    'text': [sentence for sentence, _ in sample_rows],
    'label': [label_id for _, label_id in sample_rows],
}
df = pd.DataFrame(sample_data)

# Summarise the dataset, then preview the first rows as the cell output
label_counts = df['label'].value_counts()
print(f"Dataset shape: {df.shape}")
print(f"Label distribution:\n{label_counts}")
df.head()

In [None]:
# Normalise every raw sentence with the project helper and keep the result
# next to the original text in a new column
cleaned_text = df['text'].apply(clean_thai_text)
df['text_cleaned'] = cleaned_text

# Partition into train / validation / test frames using the project helper,
# which is told which columns hold the text and the label
dataset_splits = prepare_dataset_splits(df, 'text_cleaned', 'label')
train_df, val_df, test_df = dataset_splits

# Report the size of each split
print(f"Train set: {train_df.shape}")
print(f"Validation set: {val_df.shape}")
print(f"Test set: {test_df.shape}")

## 2. Model Preparation

In [None]:
# Model configuration
MODEL_NAME = "airesearch/wangchanberta-base-att-spm-uncased"  # Thai BERT model
MAX_LENGTH = 128  # token length later handed to the dataset class
distinct_labels = df['label'].unique()
NUM_LABELS = len(distinct_labels)  # one output class per distinct label value

# Instantiate the classifier (sized to NUM_LABELS) and its matching tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Create dataset class
class ThaiTextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Sequence of input texts (list, NumPy array, pandas Series, ...).
        labels: Sequence of labels, aligned one-to-one with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, ...)`` that returns a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every encoded item is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Materialise as plain lists so __getitem__ is always positional. A pandas
        # Series with a non-default index would otherwise be looked up by index
        # label and fail (or return the wrong row) for positions 0..len-1.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return the tensors for one training item.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` flattened to 1-D,
            and ``'labels'`` as a ``torch.long`` tensor.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer output carries a leading batch dimension; drop it so
            # the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Wrap each split in the tokenizing dataset; all three share the same
# tokenizer and MAX_LENGTH, so build them through one helper
def _to_torch_dataset(split_df):
    """Build a ThaiTextClassificationDataset from one split DataFrame."""
    return ThaiTextClassificationDataset(
        split_df['text_cleaned'].values,
        split_df['label'].values,
        tokenizer,
        MAX_LENGTH,
    )

train_dataset = _to_torch_dataset(train_df)
val_dataset = _to_torch_dataset(val_df)
test_dataset = _to_torch_dataset(test_df)

## 3. Model Training

In [None]:
# Define metrics computation function
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores, or a tuple whose first element is those scores).

    Returns:
        Dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    labels = pred.label_ids
    # Some models return extra outputs next to the logits; the scores come first.
    scores = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    preds = scores.argmax(-1)
    # zero_division=0 keeps the score at 0 for classes that are never predicted
    # (same value as the default) without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Define training arguments
# transformers renamed `evaluation_strategy` to `eval_strategy` and newer releases
# reject the old keyword, so use whichever name the installed version declares.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # NOTE(review): the 5-row demo dataset trains for far fewer than 500 steps,
    # so the learning rate presumably never leaves warm-up -- tune for real data.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and checkpoint on the same schedule; required for
    # load_best_model_at_end and for early stopping to have metrics to compare.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU-only machines
    # instead of failing when the arguments are constructed.
    fp16=torch.cuda.is_available(),
    **{eval_strategy_kwarg: "epoch"},
)

# Initialize trainer; training stops early after 3 evaluations without improvement
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
# Fine-tune the model with the trainer configured above; the call returns a
# result object that is kept for inspection
train_results = trainer.train()

# Print the returned training result as-is
print(f"Training metrics: {train_results}")

## 4. Model Evaluation

In [None]:
# Evaluate on test set
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

test_results = trainer.evaluate(test_dataset)
print(f"Test metrics: {test_results}")

# Detailed analysis on test set
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)

# Use the full label set from the dataset so the matrix and the report keep one
# row/column per class even when a class is absent from the (small) test split.
class_ids = sorted(df['label'].unique())

# Create a confusion matrix
cm = confusion_matrix(test_df['label'], preds, labels=class_ids)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_ids,  # show real class ids rather than matrix positions
    yticklabels=class_ids,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

# Print classification report (zero_division=0 reports 0 for classes with no
# predictions or no support instead of emitting warnings)
print(classification_report(test_df['label'], preds, labels=class_ids, zero_division=0))

## 5. Save Model to Hugging Face

In [None]:
# Save model to Hugging Face Hub
import getpass
from huggingface_hub import login

# Replace with your desired repository name
MODEL_REPO_NAME = "your-username/thai-text-classification"

# Never store the token in the notebook itself: read it from the HF_TOKEN
# environment variable and only prompt (without echo) when it is not set.
# Get a token from https://huggingface.co/settings/tokens
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")

# Log in with the same token that is passed to the upload helper, so the cell
# does not depend on an interactive login widget
login(token=HF_TOKEN)

# Save model and tokenizer
save_model_to_hf(model, tokenizer, MODEL_REPO_NAME, HF_TOKEN)

In [None]:
# Smoke-test the published model: load it back by repository name and
# classify a couple of sentences
from transformers import pipeline

# Both the model weights and the tokenizer come from the same Hub repository
classifier = pipeline(
    "text-classification",
    model=MODEL_REPO_NAME,
    tokenizer=MODEL_REPO_NAME,
)

# Example inputs
sample_texts = [
    "หนังเรื่องนี้สนุกมากๆ",
    "อาหารที่ร้านนี้รสชาติดีมาก",
]

# Classify one sentence at a time and show the raw pipeline output for each
for sentence in sample_texts:
    prediction = classifier(sentence)
    print(f"Text: {sentence}")
    print(f"Prediction: {prediction}\n")