In [2]:
!unzip /content/thai-dialect-classification-central-northern.zip

Archive:  /content/thai-dialect-classification-central-northern.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [1]:
import numpy as np
import pandas as pd
from joblib import dump
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [4]:
# Load the train and test splits from the CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [5]:
# Sanity-check the training data: overall shape, class balance, first rows.
# One print call with a newline separator emits the three sections in order.
print(
    f"Training data shape: {train_df.shape}",
    train_df['Class'].value_counts(),
    train_df.head(),
    sep="\n",
)

Training data shape: (2000, 3)
Class
Northern    1000
Central     1000
Name: count, dtype: int64
   id                                           Sentence     Class
0   0  ตึ้ง บ่า มี เวียก มี ก๋าน หยัง เยียะ กั๋น เน้า...  Northern
1   1  ของ ชิ้น นี้ ฉัน เป็น คน ซื้อ มา เธอ จะ มา ทึก...   Central
2   2  เป๋น แม่ ญิง หื้อ หมั่น หลับ เดิ้ก ลุก เจ๊า หื...  Northern
3   3  เมื่อ คืน วาน ฝน ตก หนัก พา ยุ เข้า ลม ไป พัด ...   Central
4   4  น้ำ บ่อ บ้าน เปิ้น เลิ้ก เก้า ข้อ ปอ เถิง หน้า...  Northern


In [6]:
# Hold out 20% of the labelled sentences for validation. Stratifying on the
# label keeps the class proportions the same in both parts, and the fixed
# seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['Sentence'], train_df['Class'],
    stratify=train_df['Class'],
    test_size=0.2, random_state=42,
)

In [7]:
# Text-classification pipeline: character n-gram TF-IDF features fed into a
# linear SVM. The ngram_range / max_features values here are starting
# defaults; the grid search overrides them with the values in param_grid.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        analyzer='char',        # character-level n-grams rather than word tokens
        ngram_range=(1, 5),
        max_features=50000,
        sublinear_tf=True       # use 1 + log(tf) instead of raw term frequency
    )),
    # random_state pins the data shuffling done by LinearSVC's dual
    # coordinate-descent solver, so repeated runs give the same model
    # (same seed value as the train/validation split).
    ('classifier', LinearSVC(max_iter=10000, random_state=42))
])

In [8]:
# Hyperparameter search space. Keys use the `<step>__<param>` form to address
# parameters of the pipeline steps named 'tfidf' and 'classifier'.
param_grid = dict(
    tfidf__ngram_range=[(1, upper) for upper in (3, 4, 5)],
    tfidf__max_features=[30000, 50000],
    classifier__C=[0.1, 1, 10],
)

In [9]:
print("Performing grid search for hyperparameter tuning...")
# Configure (but do not yet run) an exhaustive search over param_grid:
# 5-fold cross-validation, scored by accuracy, parallelised over all cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

Performing grid search for hyperparameter tuning...


In [10]:
# Run the search on the training part only; the validation part stays unseen.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [11]:
# Report the winning hyperparameter combination and its mean CV accuracy.
best_params, best_cv_score = grid_search.best_params_, grid_search.best_score_
print("Best parameters:", best_params)
print("Best cross-validation score:", best_cv_score)

Best parameters: {'classifier__C': 0.1, 'tfidf__max_features': 30000, 'tfidf__ngram_range': (1, 4)}
Best cross-validation score: 0.99875


In [12]:
# Evaluate the best pipeline found by the search on the held-out validation part.
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)

# Overall accuracy first, then the per-class precision / recall / F1 breakdown.
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"Validation accuracy: {val_accuracy:.4f}")
print(classification_report(y_true=y_val, y_pred=y_val_pred))

Validation accuracy: 1.0000
              precision    recall  f1-score   support

     Central       1.00      1.00      1.00       200
    Northern       1.00      1.00      1.00       200

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



In [13]:
print("Training final model on the entire training set...")
# Fit a fresh, unfitted copy of the winning pipeline instead of calling
# .fit() on grid_search.best_estimator_ itself. Refitting in place would
# overwrite the cross-validated model, so re-running the validation cell
# afterwards would score a model that has already seen the validation rows.
best_model = clone(grid_search.best_estimator_)
best_model.fit(train_df['Sentence'], train_df['Class'])

Training final model on the entire training set...


In [14]:
print("Making predictions on test set...")
# Predict a class label for every sentence in the test file.
test_predictions = best_model.predict(X=test_df['Sentence'])

Making predictions on test set...


In [17]:
# Build the submission table: one row per test sentence with its predicted class.
# Take ids from the test file's own 'id' column when it has one (train.csv
# does); the positional index only matches if ids are 0..n-1 in file order.
test_ids = test_df['id'] if 'id' in test_df.columns else test_df.index
submission = pd.DataFrame({
    'id': test_ids,
    'Class': test_predictions
})
# index=False keeps the DataFrame's row index out of the CSV.
submission.to_csv('submission.csv', index=False)
print("Submission file created!")

Submission file created!


In [16]:
# Persist the fitted pipeline (vectorizer + classifier) to disk so it can be
# reloaded later without retraining. `dump` is imported in the top import cell.
dump(best_model, 'thai_dialect_classifier_model.joblib')
print("Model saved!")

Model saved!
