In [None]:
# Install dependencies
# Libraries used by the cells below: joblib (persists the label encoder),
# underthesea (Vietnamese word tokenisation), emoji (emoji replacement),
# torch, scikit-learn and tqdm. Versions are not pinned in this cell.
!pip install joblib underthesea emoji torch scikit-learn tqdm

In [None]:
# Optional clean-up, left disabled: uninstalls the three packages before the
# pinned install on the next line.
#!pip uninstall -y transformers accelerate peft
# Pin the Hugging Face stack to fixed versions.
# NOTE(review): peft is installed here but not imported in the visible cells — confirm it is still needed.
!pip install transformers==4.36.2 accelerate==0.23.0 peft==0.7.1


In [None]:
# Mount Google Drive
# The dataset, the stop-word list and every artefact saved by this notebook live
# under /content/drive/MyDrive, so this cell has to run before the later ones.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import re
import emoji
from underthesea import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib
import torch
from tqdm import tqdm
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
import torch
def load_and_preprocess_data(file_path, stop_words_path):
    """Load the labelled comments, normalise the text and split into train/test.

    Args:
        file_path: Path to a CSV file with at least the columns ``comment``
            and ``label``.
        stop_words_path: Path to a UTF-8 text file holding one stop word per
            line. Multi-syllable entries may be joined with either spaces or
            underscores; both forms are normalised to underscores.

    Returns:
        tuple: ``(train_df, test_df)``, two DataFrames with the columns
        ``comment`` (cleaned text whose multi-syllable words are joined with
        underscores) and ``label``. The split is 80/20, stratified on
        ``label`` with ``random_state=42``.
    """
    df = pd.read_csv(file_path)
    # Discard leftover index columns, which pandas names "Unnamed: N".
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    # .copy() makes the column assignment further down write to an independent frame.
    df = df.dropna(subset=['comment', 'label']).copy()

    # Read the list as plain text rather than through the CSV parser, which
    # would split entries on commas and coerce strings such as "nan" to NaN.
    # Entries are normalised to the same lower-case, underscore-joined form
    # that the tokens are compared in below.
    with open(stop_words_path, encoding='utf-8-sig') as handle:
        stop_words = {
            entry.strip().lower().replace(' ', '_')
            for entry in handle
            if entry.strip()
        }

    def process_sentence(sentence):
        """Turn one raw comment into a space-separated string of word tokens."""
        s = str(sentence).lower()
        # URLs, mentions and hashtags are replaced before numbers so that digits
        # inside them (e.g. "@user123") do not split the placeholder in two.
        s = re.sub(r'(https?://\S+|www\.\S+)', '<URL>', s)
        s = re.sub(r'@\w+', '<USER>', s)
        s = re.sub(r'#\w+', '<HASHTAG>', s)
        # Matches runs of two or more digits, optionally with '.' or ',' inside.
        s = re.sub(r'\d[\d\.,]*\d', '<NUMBER>', s)
        s = emoji.replace_emoji(s, replace="<EMOJI>")
        # Keep word characters, Vietnamese letters, '<' '>' and basic punctuation.
        s = re.sub(r'[^\wÀ-ỹ0-9<>\?\!\.,;:\- ]+', ' ', s)
        s = re.sub(r'\s+', ' ', s).strip()
        # word_tokenize yields multi-syllable words with inner spaces; joining
        # the syllables with '_' lets them match the underscore-joined stop
        # words and keeps the word boundaries after the final " ".join.
        tokens = [w.replace(' ', '_') for w in word_tokenize(s)]
        # Drop stop words and single-character tokens.
        tokens = [w for w in tokens if w not in stop_words and len(w) > 1]
        return " ".join(tokens)

    tqdm.pandas(desc="Preprocessing")
    df['comment'] = df['comment'].progress_apply(process_sentence)
    # NOTE(review): rows whose comment is empty after cleaning are kept —
    # consider filtering them out before splitting.
    train_df, test_df = train_test_split(
        df[['comment', 'label']], test_size=0.2,
        stratify=df['label'], random_state=42
    )
    return train_df, test_df

# Build the cleaned train/test frames from the raw CSV and the stop-word list on Drive.
train_df, test_df = load_and_preprocess_data(
    file_path="/content/drive/MyDrive/cuoi_ky/train_model/Data/data - data.csv",
    stop_words_path="/content/drive/MyDrive/cuoi_ky/train_model/Data/vietnamese-stopwords-dash.txt",
)

In [None]:
# Display the training split returned by load_and_preprocess_data.
train_df

In [None]:
# Encode labels
# The encoder is fitted on the training labels only; the test labels reuse that mapping.
encoder = LabelEncoder()
train_labels = encoder.fit_transform(train_df['label'])
test_labels = encoder.transform(test_df['label'])
# Persist the fitted encoder so predictions can be mapped back to label names later.
joblib.dump(encoder, '/content/drive/MyDrive/cuoi_ky/train_model/Model/label_encoder.pkl')

# Tokenize
model_name = "wonrax/phobert-base-vietnamese-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Both splits go through the tokenizer with the same settings: sequences are
# truncated to at most 64 tokens and padded to a common length per split.
train_encodings, test_encodings = (
    tokenizer(split['comment'].tolist(), truncation=True, padding=True, max_length=64)
    for split in (train_df, test_df)
)
# Prepare dataset for Trainer
torch.manual_seed(42)
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with integer class labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            that holds one entry per example.
        labels: Sequence of labels, indexed in step with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a ``labels`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenised splits so the Trainer can index them.
train_dataset = SentimentDataset(train_encodings, train_labels)
test_dataset = SentimentDataset(test_encodings, test_labels)

# Load model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(encoder.classes_),
    # Record the encoder's own class names in the model config so the saved
    # model reports this dataset's labels rather than the checkpoint's.
    id2label={idx: str(name) for idx, name in enumerate(encoder.classes_)},
    label2id={str(name): idx for idx, name in enumerate(encoder.classes_)},
    # The checkpoint is already a sentiment classifier; if its head has a
    # different number of classes than this dataset, re-initialise the head
    # instead of failing with a size-mismatch error. No effect when sizes match.
    ignore_mismatched_sizes=True,
)
# Assign the result so the cell does not echo the full module repr as output.
model = model.to(device)

In [None]:
# TrainingArguments using epoch-based eval/save + early stopping
training_args = TrainingArguments(
    # Checkpoints are written here, one per epoch (save_strategy='epoch').
    output_dir='/content/drive/MyDrive/cuoi_ky/train_model/Model',
    do_train=True,
    do_eval=True,
    # Evaluation and saving must use the same schedule for load_best_model_at_end.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # 'accuracy' is the key returned by compute_metrics; higher is better.
    metric_for_best_model='accuracy',
    greater_is_better=True,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    # Mixed precision needs a CUDA device; the notebook falls back to CPU when
    # none is available, and fp16=True would make this constructor raise there.
    fp16=torch.cuda.is_available()
)
# Compute metrics
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the true class indices).

    Returns:
        dict: The keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    expected = pred.label_ids
    # The predicted class is the index of the highest score in each row.
    predicted = np.argmax(pred.predictions, axis=1)
    # The fourth element (support) is not reported.
    weighted = precision_recall_fscore_support(expected, predicted, average='weighted')
    return {
        'accuracy': accuracy_score(expected, predicted),
        'precision': weighted[0],
        'recall': weighted[1],
        'f1': weighted[2],
    }

In [None]:
# Trainer with EarlyStoppingCallback
# NOTE(review): the held-out test split is also the evaluation set here, so
# early stopping and best-checkpoint selection are tuned on the same data that
# trainer.evaluate() later reports on — consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    # Stop after 2 consecutive evaluations without improvement in the monitored metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [None]:
# Train & evaluate
# Fine-tune the model, then run one final evaluation pass; as the last
# expression in the cell, the resulting metrics are shown as its output.
trainer.train()
trainer.evaluate()

In [None]:
# Save the model and tokenizer
# They are written to two separate directories, so both paths are needed when reloading.
model.save_pretrained("/content/drive/MyDrive/cuoi_ky/train_model/Model/phobert_model")
tokenizer.save_pretrained("/content/drive/MyDrive/cuoi_ky/train_model/Model/phobert_tokenizer")
