In [21]:
import os
from torch.optim import AdamW
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import torch

In [2]:
def file2list(fname):
    with open(fname, encoding='utf-8') as fp:  # Specify the encoding
        lines = fp.readlines()
        return [line.strip() for line in lines]

In [3]:
pos_text = file2list("Downloads/asp_pos (1).txt")
neg_text = file2list("Downloads/asp_neg.txt")

In [4]:
pos_df = pd.DataFrame(pos_text,columns=['review'])
pos_df['label'] = "positive"

neg_df = pd.DataFrame(neg_text,columns=['review'])
neg_df['label'] = "negative"
neg_df

data = pd.concat([pos_df, neg_df])
data.sample(100)

Unnamed: 0,review,label
553,ได้รับถูกต้องตามที่สั่ง จัดส่งไม่เร็วและไม่ช้า...,positive
296,กดสบายและเสียงคมชัดพอใจมาก แรกๆ ไม่เข้าใจปุ่มห...,positive
311,ส่งไว แพ็กมาดี ตรงตามที่สั่งรถสชาติอร่อย มี 5 ...,positive
560,ทุกคนฟังดิฉันนะคะ คือเสื้อมันผ้าดีมากมันเริ่ดด...,positive
230,ได้รับสินค้าแล้วนะคะทางร้านจัดส่งรวดเร็วทันใจบ...,positive
...,...,...
508,เป็นครีมที่มีรีวิวเยอะมากมีกลิ่นหอมอ่อนๆไม่ฟุ้...,positive
140,สินค้าชำรุดที่ฝานิดหน่อยคับ แต่ทางร้านส่งฝามาใ...,negative
563,คุณภาพสมราคา สะพายถ่ายรูปชิคๆได้ เเต่ไม่น่าจะท...,positive
102,สินค้าดี ราคาาไม่แพงง ขนส่งงบริการรดี คุ้มมค่า...,positive


In [5]:
category_counts = data['label'].value_counts()
category_counts

label
positive    630
negative    235
Name: count, dtype: int64

In [6]:
!pip install pythainlp





In [7]:
from pythainlp.tokenize import word_tokenize
import re

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

def thai_clean_text(text):
    st = ""

    text = deEmojify(text)
    text = text.replace("\n"," ")
    for w in word_tokenize(text):
        st = st + w + " "

    return  re.sub(' +', ' ', st)

data['review'] = data.review.apply(thai_clean_text)
data.sample(5)

Unnamed: 0,review,label
176,สินค้า ใช้ ระยะเวลา จัดส่ง นาน มาก ๆ ค่ะ เพราะ...,negative
622,ร้านค้า จัด ส่งสินค้า ให้ ไว มาก กก กก กก การ ...,positive
4,สินค้า ตรง ปก ไม่ จก ตา ราคา จับต้อง ได้ ใช้ โ...,positive
220,เนื้อผ้า ดี หนา มาก คุ้มค่า คุ้ม ราคา ดีไซน์ ด...,positive
156,สินค้า ตรง ตาม ปก ร้านค้า บริการ ดี บริการ รวด...,positive


In [8]:
from sklearn.preprocessing import LabelEncoder
reviews = data['review'].values
labels = data['label'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

In [9]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify=encoded_labels, test_size=0.3, random_state=42)

In [10]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [11]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device).long()  # Convert to Long
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [12]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions, zero_division=1)

In [13]:
 bert_model_name = 'bert-base-uncased'
 num_classes = 2
 max_length = 128
 batch_size = 16
 num_epochs = 10
 learning_rate = 2e-5

In [14]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

In [15]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_sentences, train_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TextClassificationDataset(test_sentences, test_labels, tokenizer, max_length)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [23]:
model = BERTClassifier(bert_model_name, num_classes).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [24]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [25]:
for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train(model, train_dataloader, optimizer, scheduler, device)
        accuracy, report = evaluate(model, val_dataloader, device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(report)

Epoch 1/10
Validation Accuracy: 0.7115
              precision    recall  f1-score   support

           0       0.25      0.03      0.05        71
           1       0.73      0.97      0.83       189

    accuracy                           0.71       260
   macro avg       0.49      0.50      0.44       260
weighted avg       0.60      0.71      0.62       260

Epoch 2/10
Validation Accuracy: 0.7269
              precision    recall  f1-score   support

           0       1.00      0.00      0.00        71
           1       0.73      1.00      0.84       189

    accuracy                           0.73       260
   macro avg       0.86      0.50      0.42       260
weighted avg       0.80      0.73      0.61       260

Epoch 3/10
Validation Accuracy: 0.7269
              precision    recall  f1-score   support

           0       1.00      0.00      0.00        71
           1       0.73      1.00      0.84       189

    accuracy                           0.73       260
   macro av

In [None]:
torch.save(model.state_dict(), "bert_classifier.pt")

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
    return "positive" if preds.item() == 1 else "negative"

In [None]:
test_text = thai_clean_text("สินค้าเยี่ยมมาก")
sentiment = predict_sentiment(test_text, model, tokenizer, device)
print(f"Predicted sentiment: {sentiment}")


In [None]:
tokenized_text = word_tokenize(thai_clean_text(test_text), keep_whitespace=False)
tokenized_text

In [None]:
for i in tokenized_text:
    print(i)
    test_text = thai_clean_text(i)
    sentiment = predict_sentiment(test_text, model, tokenizer, device)
    print(f"Predicted sentiment: {sentiment}")
    print("----------------------")