In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load Lao sentences from file.
# Lines are stripped and blank lines dropped so that empty strings never end up
# as "Lao" samples; this matches how the Vietnamese file is read below.
with open("laosdata.txt", "r", encoding="utf-8") as f:
    lao_sentences = [line.strip() for line in f if line.strip()]

# Load Vietnamese sentences from file.
# Blank lines and lines starting with "{" or "#" are skipped.
viet_sentences = []
with open("vietdata.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line and not line.startswith(("{", "#")):
            viet_sentences.append(line)

# Assign a label to each dataset
df_lao = pd.DataFrame({"text": lao_sentences, "label": "Lao"})
df_viet = pd.DataFrame({"text": viet_sentences, "label": "Vie"})

# Merge into one dataset and save to CSV
df = pd.concat([df_lao, df_viet], ignore_index=True)
df.to_csv("viet_lao_dataset.csv", index=False, encoding="utf-8")
df.head()

Unnamed: 0,text,label
0,"ສາຍB (ອາຍຸ14 – 17 ປີ ) ,",Lao
1,ສາຍ C (ອາຍຸ18–25ປີ).,Lao
2,ໃນເວລາທີ່ພວກເຂົາພົບກັບການວາດພາບທາງເລຂາຄະນິດແບບ...,Lao
3,cortexສາຍຕາເບື້ອງຕົ້ນຈະ ຖືກເປີດໃຊ້ງານ.,Lao
4,ທ່ານຮອງນາຍົກລັດຖະມົນຕີສະເໜີບັນດາອົງການຕິດຕາມກວ...,Lao


In [3]:
# Split into texts and labels
X_text = df["text"]
y_labels = df["label"]

# Encode labels into numeric values
encoder = LabelEncoder()
y = encoder.fit_transform(y_labels)

# Split the RAW texts into training and testing sets first, so that the
# vectorizer below never sees the test sentences (avoids data leakage through
# the learned vocabulary and the min_df document-frequency pruning).
X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, stratify=y, random_state=42
)

# Convert characters to n-gram count vectors.
# The vocabulary is fitted on the training split only; the test split is
# projected onto that fixed vocabulary.
vectorizer = CountVectorizer(analyzer="char", ngram_range=(2, 5), min_df=2)
X_train = vectorizer.fit_transform(X_text_train)
X_test = vectorizer.transform(X_text_test)

In [4]:
# Densify the sparse feature matrices and turn them into float32 tensors
X_train_tensor, X_test_tensor = (
    torch.tensor(features.toarray(), dtype=torch.float32)
    for features in (X_train, X_test)
)

# Turn the encoded labels into int64 tensors (the dtype CrossEntropyLoss expects)
y_train_tensor, y_test_tensor = (
    torch.tensor(targets, dtype=torch.long)
    for targets in (y_train, y_test)
)

# Wrap the training tensors in a dataset and serve them in shuffled mini-batches
train_ds = TensorDataset(X_train_tensor, y_train_tensor)
train_dl = DataLoader(train_ds, shuffle=True, batch_size=32)

In [5]:
# Feed-forward classifier: one hidden layer followed by an output layer
class LangDetector(nn.Module):
    """Two-layer fully connected network that returns one raw score per class.

    Args:
        input_dim: Number of input features per example.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output scores.
    """

    def __init__(self, input_dim, hidden_dim=100, output_dim=2):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Apply fc1 -> ReLU -> fc2 and return the unnormalised outputs."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

# Seed PyTorch's global RNG so that weight initialization and the DataLoader's
# shuffling order (and therefore the reported metrics) are reproducible when
# the notebook is restarted and run from the top.
torch.manual_seed(42)

# Initialize model, loss function, and optimizer
model = LangDetector(input_dim=X_train.shape[1], hidden_dim=100, output_dim=2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop: 10 passes over the shuffled mini-batches
model.train()  # make training mode explicit
for epoch in range(10):
    for xb, yb in train_dl:
        optimizer.zero_grad()        # clear gradients left over from the previous step
        preds = model(xb)            # raw class scores for the batch
        loss = criterion(preds, yb)  # cross-entropy against the integer labels
        loss.backward()
        optimizer.step()

In [6]:
# Evaluate model on test set
# Switch to inference mode first; this is a no-op for the current Linear/ReLU
# layers but keeps the metric correct if dropout or batch-norm is ever added.
model.eval()
with torch.no_grad():  # no gradients are needed for evaluation
    preds = model(X_test_tensor)
    predicted = torch.argmax(preds, dim=1)  # index of the highest score per row
    acc = (predicted == y_test_tensor).float().mean()  # fraction of correct predictions
    print("Test Accuracy:", acc.item())

Test Accuracy: 0.9993190169334412


In [10]:
# Convert the prediction and target tensors to NumPy arrays for scikit-learn
y_pred = predicted.numpy()
y_true = y_test_tensor.numpy()

# Print precision, recall and F1-score for each class
print("\nClassification Report:")
print(
    classification_report(
        y_true,
        y_pred,
        digits=4,
        target_names=encoder.classes_,
    )
)



Classification Report:
              precision    recall  f1-score   support

         Lao     0.9987    1.0000    0.9993      1500
         Vie     1.0000    0.9986    0.9993      1437

    accuracy                         0.9993      2937
   macro avg     0.9993    0.9993    0.9993      2937
weighted avg     0.9993    0.9993    0.9993      2937



In [None]:
# Hand-written example sentences for a quick sanity check of the trained model.
# The trailing comment on each entry is the expected language; some entries
# deliberately mix in Latin-script names, digits, or foreign words.
test_sentences = [
    "Máy tính cao như con chó", # Vietnamese
    "ເຈົ້າກິນເຂົ້າແລ້ວບໍ?", # Lao
    "Kết bạn trên Facebook đi, Charles", # Vietnamese (contains foreign names)
    "ໃຫ້ຂ້ອຍສັ່ງ n2 Timmy", # Lao (contains Latin-script tokens)
    "Gọi 113 đi Đoàn ơi", # Vietnamese (contains digits)
    "ຂອບໃຈຫຼາຍໆ", # Lao
]

# Score a single sentence and report the probability of each language
def predict_language(text, model, vectorizer, encoder):
    """Return a mapping from each class label to its predicted probability.

    Args:
        text: The sentence to classify.
        model: Callable that maps a float32 feature tensor to per-class scores.
        vectorizer: Object whose ``transform`` turns a list of texts into a
            matrix exposing ``toarray()``.
        encoder: Object whose ``classes_`` lists the labels in output order.

    Returns:
        dict: ``{label: probability}`` with probabilities as Python floats.
    """
    features = torch.tensor(
        vectorizer.transform([text]).toarray(), dtype=torch.float32
    )
    with torch.no_grad():
        distribution = F.softmax(model(features), dim=1)
    first_row = distribution.numpy()[0]  # only one sentence was passed in
    return {name: float(first_row[idx]) for idx, name in enumerate(encoder.classes_)}

# Classify every example sentence and print the per-language confidence
for sentence in test_sentences:
    language_probs = predict_language(sentence, model, vectorizer, encoder)
    print(f"\nInput: {sentence}")
    for lang, p in language_probs.items():
        print(f"{lang}: {p*100:.2f}%")


Input: Máy tính cao như con chó
Lao: 0.00%
Vie: 100.00%

Input: ເຈົ້າກິນເຂົ້າແລ້ວບໍ?
Lao: 100.00%
Vie: 0.00%

Input: Kết bạn trên Facebook đi, Charles
Lao: 0.00%
Vie: 100.00%

Input: ໃຫ້ຂ້ອຍສັ່ງ n2 Timmy
Lao: 100.00%
Vie: 0.00%

Input: Gọi 113 đi Đoàn ơi
Lao: 0.00%
Vie: 100.00%

Input: ຂອບໃຈຫຼາຍໆ
Lao: 100.00%
Vie: 0.00%
