In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Đọc dữ liệu từ 4 file CHEAT dataset
file_paths = {
    "init": "D:/ALL_Python/EXAMPLE_DOAN/NLP_NANG_CAO/model_nlp_final/dataset/CHEAT-main/data/ieee-init.xlsx",
    "generation": "D:/ALL_Python/EXAMPLE_DOAN/NLP_NANG_CAO/model_nlp_final/dataset/CHEAT-main/data/ieee-chatgpt-generation.xlsx",
    "polish": "D:/ALL_Python/EXAMPLE_DOAN/NLP_NANG_CAO/model_nlp_final/dataset/CHEAT-main/data/ieee-chatgpt-polish.xlsx",
    "fusion": "D:/ALL_Python/EXAMPLE_DOAN/NLP_NANG_CAO/model_nlp_final/dataset/CHEAT-main/data/ieee-chatgpt-fusion.xlsx"
}

In [3]:
dataframes = []
for label, file_path in enumerate(file_paths.values()):
    df = pd.read_excel(file_path)
    df["label"] = label  # Thêm cột nhãn
    dataframes.append(df)

In [4]:
# Gộp toàn bộ dataset
df = pd.concat(dataframes, ignore_index=True)
df = df[['abstract', 'label']].dropna()  # Xóa dòng thiếu dữ liệu

# Chia dữ liệu thành train và test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["abstract"].tolist(),
    df["label"].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_texts(texts, tokenizer, max_length=128):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

# Tokenize dữ liệu train và test
train_encodings = tokenize_texts(train_texts, tokenizer)
test_encodings = tokenize_texts(test_texts, tokenizer)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [7]:
class BERT_LSTM(nn.Module):
    def __init__(self, bert_model_name="bert-base-uncased", lstm_hidden_size=128, num_labels=4):
        super(BERT_LSTM, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)  # Load BERT
        self.lstm = nn.LSTM(input_size=768, hidden_size=lstm_hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(lstm_hidden_size * 2, num_labels)  # 2 * hidden_size vì LSTM là bidirectional
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask):
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        lstm_input = bert_output.last_hidden_state  # Lấy output từ BERT
        lstm_output, _ = self.lstm(lstm_input)
        lstm_output = self.dropout(lstm_output[:, -1, :])  # Lấy output cuối của LSTM
        output = self.fc(lstm_output)
        return output


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Đang sử dụng:", device)

Đang sử dụng: cuda


In [9]:
# Khởi tạo mô hình
model = BERT_LSTM()
model.to(device)


BERT_LSTM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [10]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

In [11]:
num_epochs = 3  # Số epoch
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    
    for batch in progress_bar:
        optimizer.zero_grad()
        
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    print(f"🔥 Epoch {epoch+1}: Loss = {total_loss / len(train_loader):.4f}")

Epoch 1/3: 100%|██████████| 2535/2535 [08:43<00:00,  4.85it/s, loss=0.221] 


🔥 Epoch 1: Loss = 0.4959


Epoch 2/3: 100%|██████████| 2535/2535 [08:58<00:00,  4.71it/s, loss=0.425] 


🔥 Epoch 2: Loss = 0.3139


Epoch 3/3: 100%|██████████| 2535/2535 [08:49<00:00,  4.79it/s, loss=0.365] 

🔥 Epoch 3: Loss = 0.2236





In [12]:
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, 1)

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# In các độ đo
print("Accuracy:", accuracy_score(all_labels, all_preds))
print("Precision:", precision_score(all_labels, all_preds, average="weighted"))
print("Recall:", recall_score(all_labels, all_preds, average="weighted"))
print(" F1-score:", f1_score(all_labels, all_preds, average="weighted"))


Accuracy: 0.7872781065088758
Precision: 0.7867694263440398
Recall: 0.7872781065088758
 F1-score: 0.7809133678486463
