In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
from sklearn.model_selection import StratifiedKFold # Sử dụng StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import numpy as np # Để tính trung bình các độ đo

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Đọc dữ liệu từ 4 file CHEAT dataset
file_paths = {
    "init": "dataset/CHEAT-main/data/ieee-init.xlsx",
    "generation": "dataset/CHEAT-main/data/ieee-chatgpt-generation.xlsx",
    "polish": "dataset/CHEAT-main/data/ieee-chatgpt-polish.xlsx",
    "fusion": "dataset/CHEAT-main/data/ieee-chatgpt-fusion.xlsx"
}

In [3]:
dataframes = []
for label, file_path in enumerate(file_paths.values()):
    df = pd.read_excel(file_path)
    df["label"] = label  # Thêm cột nhãn
    dataframes.append(df)
# Gộp toàn bộ dataset
df = pd.concat(dataframes, ignore_index=True)
df = df[['abstract', 'label']].dropna()  # Xóa dòng thiếu dữ liệu

# Chuẩn bị dữ liệu và nhãn
texts = df["abstract"].tolist()
labels = df["label"].tolist()

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [4]:
def tokenize_texts(texts, tokenizer, max_length=128):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
# Tokenize toàn bộ dữ liệu
encodings = tokenize_texts(texts, tokenizer)

In [5]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item
dataset = TextDataset(encodings, labels)

In [6]:
# Khởi tạo mô hình
class RoBERTa_LSTM(nn.Module):
    def __init__(self, roberta_model_name="roberta-base", lstm_hidden_size=128, num_labels=4):
        super(RoBERTa_LSTM, self).__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)  # Load RoBERTa
        self.lstm = nn.LSTM(input_size=768, hidden_size=lstm_hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(lstm_hidden_size * 2, num_labels)  # 2 * hidden_size vì LSTM là bidirectional
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask):
        roberta_output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        lstm_input = roberta_output.last_hidden_state  # Lấy output từ RoBERTa
        lstm_output, _ = self.lstm(lstm_input)
        lstm_output = self.dropout(lstm_output[:, -1, :])  # Lấy output cuối của LSTM
        output = self.fc(lstm_output)
        return output

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Đang sử dụng:", device)
model = RoBERTa_LSTM().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

Đang sử dụng: cuda


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def train_epoch(model, dataloader, criterion, optimizer, device, epoch, num_epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    return total_loss / len(dataloader)

In [9]:

def evaluate_epoch(model, dataloader, criterion, device, epoch, num_epochs):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Eval]")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    return total_loss / len(dataloader), all_labels, all_preds

In [10]:
# Thực hiện kiểm tra chéo
num_folds = 5 # Số fold
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42) # Khởi tạo StratifiedKFold
all_fold_metrics = {
    "accuracy": [],
    "precision": [],
    "recall": [],
    "f1": [],
}

In [11]:
for fold, (train_index, val_index) in enumerate(skf.split(np.zeros(len(labels)), labels)): # Chia dữ liệu theo fold
    print(f"Fold {fold + 1}/{num_folds}")
    train_dataset = torch.utils.data.Subset(dataset, train_index) # Tạo tập train
    val_dataset = torch.utils.data.Subset(dataset, val_index) # Tạo tập val
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True) # Tạo dataloader
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    # Huấn luyện mô hình trên fold hiện tại
    num_epochs = 3
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device, epoch, num_epochs)
        val_loss, val_labels, val_preds = evaluate_epoch(model, val_loader, criterion, device, epoch, num_epochs)
        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")

    # Đánh giá mô hình trên fold hiện tại
    accuracy = accuracy_score(val_labels, val_preds)
    precision = precision_score(val_labels, val_preds, average="weighted")
    recall = recall_score(val_labels, val_preds, average="weighted")
    f1 = f1_score(val_labels, val_preds, average="weighted")

    print(f"Fold {fold + 1} Metrics:")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

    all_fold_metrics["accuracy"].append(accuracy)
    all_fold_metrics["precision"].append(precision)
    all_fold_metrics["recall"].append(recall)
    all_fold_metrics["f1"].append(f1)

Fold 1/5


Epoch 1/3 [Train]: 100%|██████████| 2535/2535 [09:13<00:00,  4.58it/s, loss=0.786] 
Epoch 1/3 [Eval]: 100%|██████████| 634/634 [00:37<00:00, 16.86it/s]


Epoch 1: Train Loss = 0.4600, Val Loss = 0.4142


Epoch 2/3 [Train]: 100%|██████████| 2535/2535 [09:14<00:00,  4.57it/s, loss=0.145] 
Epoch 2/3 [Eval]: 100%|██████████| 634/634 [00:34<00:00, 18.16it/s]


Epoch 2: Train Loss = 0.3018, Val Loss = 0.3693


Epoch 3/3 [Train]: 100%|██████████| 2535/2535 [08:54<00:00,  4.74it/s, loss=0.169]  
Epoch 3/3 [Eval]: 100%|██████████| 634/634 [00:34<00:00, 18.14it/s]


Epoch 3: Train Loss = 0.2368, Val Loss = 0.4127
Fold 1 Metrics:
Accuracy: 0.8639, Precision: 0.8559, Recall: 0.8639, F1-score: 0.8573
Fold 2/5


Epoch 1/3 [Train]: 100%|██████████| 2535/2535 [08:57<00:00,  4.72it/s, loss=0.223]  
Epoch 1/3 [Eval]: 100%|██████████| 634/634 [00:35<00:00, 17.81it/s]


Epoch 1: Train Loss = 0.2247, Val Loss = 0.4056


Epoch 2/3 [Train]: 100%|██████████| 2535/2535 [09:02<00:00,  4.67it/s, loss=0.232]  
Epoch 2/3 [Eval]: 100%|██████████| 634/634 [00:35<00:00, 17.76it/s]


Epoch 2: Train Loss = 0.1717, Val Loss = 0.3047


Epoch 3/3 [Train]: 100%|██████████| 2535/2535 [09:01<00:00,  4.68it/s, loss=0.0664] 
Epoch 3/3 [Eval]: 100%|██████████| 634/634 [00:35<00:00, 17.80it/s]


Epoch 3: Train Loss = 0.1453, Val Loss = 0.5234
Fold 2 Metrics:
Accuracy: 0.8391, Precision: 0.8603, Recall: 0.8391, F1-score: 0.8415
Fold 3/5


Epoch 1/3 [Train]: 100%|██████████| 2535/2535 [09:02<00:00,  4.68it/s, loss=0.314]  
Epoch 1/3 [Eval]: 100%|██████████| 634/634 [00:35<00:00, 17.85it/s]


Epoch 1: Train Loss = 0.1458, Val Loss = 0.2350


Epoch 2/3 [Train]: 100%|██████████| 2535/2535 [09:00<00:00,  4.69it/s, loss=0.129]  
Epoch 2/3 [Eval]: 100%|██████████| 634/634 [00:35<00:00, 17.92it/s]


Epoch 2: Train Loss = 0.1206, Val Loss = 0.2276


Epoch 3/3 [Train]: 100%|██████████| 2535/2535 [09:01<00:00,  4.69it/s, loss=0.186]  
Epoch 3/3 [Eval]: 100%|██████████| 634/634 [00:35<00:00, 17.73it/s]


Epoch 3: Train Loss = 0.1073, Val Loss = 0.2956
Fold 3 Metrics:
Accuracy: 0.9070, Precision: 0.9093, Recall: 0.9070, F1-score: 0.9064
Fold 4/5


Epoch 1/3 [Train]: 100%|██████████| 2535/2535 [08:56<00:00,  4.73it/s, loss=0.021]  
Epoch 1/3 [Eval]: 100%|██████████| 634/634 [00:34<00:00, 18.17it/s]


Epoch 1: Train Loss = 0.1144, Val Loss = 0.1688


Epoch 2/3 [Train]: 100%|██████████| 2535/2535 [08:57<00:00,  4.72it/s, loss=0.0114] 
Epoch 2/3 [Eval]: 100%|██████████| 634/634 [00:35<00:00, 18.07it/s]


Epoch 2: Train Loss = 0.0946, Val Loss = 0.1237


Epoch 3/3 [Train]: 100%|██████████| 2535/2535 [08:57<00:00,  4.72it/s, loss=0.0568] 
Epoch 3/3 [Eval]: 100%|██████████| 634/634 [00:34<00:00, 18.13it/s]


Epoch 3: Train Loss = 0.0809, Val Loss = 0.2781
Fold 4 Metrics:
Accuracy: 0.9131, Precision: 0.9260, Recall: 0.9131, F1-score: 0.9156
Fold 5/5


Epoch 1/3 [Train]: 100%|██████████| 2535/2535 [08:55<00:00,  4.74it/s, loss=0.0167] 
Epoch 1/3 [Eval]: 100%|██████████| 634/634 [00:34<00:00, 18.19it/s]


Epoch 1: Train Loss = 0.0905, Val Loss = 0.1272


Epoch 2/3 [Train]: 100%|██████████| 2535/2535 [08:57<00:00,  4.72it/s, loss=0.209]   
Epoch 2/3 [Eval]: 100%|██████████| 634/634 [00:35<00:00, 17.93it/s]


Epoch 2: Train Loss = 0.0746, Val Loss = 0.3896


Epoch 3/3 [Train]: 100%|██████████| 2535/2535 [08:55<00:00,  4.74it/s, loss=0.0826]  
Epoch 3/3 [Eval]: 100%|██████████| 634/634 [00:34<00:00, 18.18it/s]

Epoch 3: Train Loss = 0.0689, Val Loss = 0.2154
Fold 5 Metrics:
Accuracy: 0.9381, Precision: 0.9418, Recall: 0.9381, F1-score: 0.9386





In [12]:
# Tính trung bình các độ đo trên các fold
print("✅✅✅ Average Metrics Over All Folds:")
print(f"Accuracy: {np.mean(all_fold_metrics['accuracy']):.4f}")
print(f"Precision: {np.mean(all_fold_metrics['precision']):.4f}")
print(f"Recall: {np.mean(all_fold_metrics['recall']):.4f}")
print(f"F1-score: {np.mean(all_fold_metrics['f1']):.4f}")

✅✅✅ Average Metrics Over All Folds:
Accuracy: 0.8922
Precision: 0.8987
Recall: 0.8922
F1-score: 0.8919


In [13]:
model_path = "roberta_lstm_model_cheo.pth"  # Chọn đường dẫn và tên file để lưu
torch.save(model.state_dict(), model_path) # Lưu state_dict (các tham số học được) của mô hình
print(f"✅ Mô hình đã được lưu tại {model_path}")

✅ Mô hình đã được lưu tại roberta_lstm_model_cheo.pth
