In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import torch.nn as nn
from transformers import BertModel

In [5]:
# 1. Load data
# Both files are read as header-less CSVs (header=None), so the column names are supplied here.
yahoo_columns = ["label", "title", "content", "answer"]
train_df = pd.read_csv("../data/yahoo_answers/train.csv", names=yahoo_columns, header=None)
test_df = pd.read_csv("../data/yahoo_answers/test.csv", names=yahoo_columns, header=None)


In [6]:
# Preview the first five raw training rows (head() defaults to n=5).
print(train_df.head())

   label                                              title  \
0      5  why doesn't an optical mouse work on a glass t...   
1      6       What is the best off-road motorcycle trail ?   
2      3             What is Trans Fat? How to reduce that?   
3      7                         How many planes Fedex has?   
4      7  In the san francisco bay area, does it make se...   

                                             content  \
0                          or even on some surfaces?   
1                  long-distance trail throughout CA   
2  I heard that tras fat is bad for the body.  Wh...   
3  I heard that it is the largest airline in the ...   
4  the prices of rent and the price of buying doe...   

                                              answer  
0  Optical mice use an LED and a camera to rapidl...  
1  i hear that the mojave road is amazing!<br />\...  
2  Trans fats occur in manufactured foods during ...  
3  according to the www.fedex.com web site:\nAir ...  
4  rentin

In [7]:
def combine_text(row):
    """Join the title, content and answer of one row into a single string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the keys
            'title', 'content' and 'answer'.

    Returns:
        The present fields converted to ``str`` and separated by single
        spaces, in the order title, content, answer.

    Missing fields (NaN / None / pd.NA) are skipped. pandas reads empty CSV
    cells as NaN, and formatting NaN directly would inject the literal word
    "nan" into the text fed to the tokenizer. Rows with all three fields
    present produce exactly the same string as plain f-string formatting.
    """
    fields = (row["title"], row["content"], row["answer"])
    return " ".join(str(value) for value in fields if not pd.isna(value))

# Build the model input: one combined string per row, stored in a new "text" column.
train_df["text"] = train_df.apply(combine_text, axis="columns")
test_df["text"] = test_df.apply(combine_text, axis="columns")

# Shift every label down by one so the smallest label value moves from 1 to 0.
# NOTE(review): this is not idempotent -- re-running the cell shifts the labels again.
train_df["label"] = train_df["label"].sub(1)
test_df["label"] = test_df["label"].sub(1)

In [8]:
# Preview the first five rows again, now with the shifted labels and the new "text" column.
print(train_df.head(n=5))

   label                                              title  \
0      4  why doesn't an optical mouse work on a glass t...   
1      5       What is the best off-road motorcycle trail ?   
2      2             What is Trans Fat? How to reduce that?   
3      6                         How many planes Fedex has?   
4      6  In the san francisco bay area, does it make se...   

                                             content  \
0                          or even on some surfaces?   
1                  long-distance trail throughout CA   
2  I heard that tras fat is bad for the body.  Wh...   
3  I heard that it is the largest airline in the ...   
4  the prices of rent and the price of buying doe...   

                                              answer  \
0  Optical mice use an LED and a camera to rapidl...   
1  i hear that the mojave road is amazing!<br />\...   
2  Trans fats occur in manufactured foods during ...   
3  according to the www.fedex.com web site:\nAir ...   
4  r

In [9]:
# Keep a seeded 10% random subset of each split and renumber the rows from 0.
# NOTE(review): both frames are overwritten, so re-running this cell samples
# 10% of the already-sampled data.
sample_kwargs = {"frac": 0.1, "random_state": 42}
train_df = train_df.sample(**sample_kwargs).reset_index(drop=True)
test_df = test_df.sample(**sample_kwargs).reset_index(drop=True)

In [10]:
# 2. Tokenizer & Dataset
# Tokenizer loaded from the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Token length passed to the datasets below as their max_len.
max_len = 256

class YahooDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: Indexable sequence of raw texts (each item is passed through ``str``).
        labels: Indexable sequence of integer class labels, aligned with ``texts``.
        tokenizer: Callable Hugging Face-style tokenizer; it is invoked with
            ``return_tensors='pt'`` and must return a mapping containing
            'input_ids' and 'attention_mask'.
        max_len: Length every example is truncated / padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail fast: a mismatch would otherwise surface as an IndexError deep
        # inside a DataLoader worker, or silently ignore trailing labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with 1-D 'input_ids' and 'attention_mask' tensors and a scalar
            ``torch.long`` 'labels' tensor.
        """
        text = str(self.texts[idx])

        # Calling the tokenizer directly replaces the deprecated encode_plus();
        # the keyword arguments are unchanged.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch axis of size 1; drop it so
        # the DataLoader can stack examples into a batch.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
# Wrap each split's text/label columns in a YahooDataset.
train_dataset = YahooDataset(
    texts=train_df["text"].tolist(),
    labels=train_df["label"].tolist(),
    tokenizer=tokenizer,
    max_len=max_len,
)
test_dataset = YahooDataset(
    texts=test_df["text"].tolist(),
    labels=test_df["label"].tolist(),
    tokenizer=tokenizer,
    max_len=max_len,
)

# Both loaders share batch size and worker count; only the training loader shuffles.
loader_options = {"batch_size": 16, "num_workers": 2}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

In [12]:
# 4. Build the custom model (BERT-base with dropout and a linear classifier)
class CustomBertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_labels: Number of output classes (width of the returned logits).
        dropout: Dropout probability applied to the [CLS] embedding.
        model_name: Checkpoint name or path given to ``BertModel.from_pretrained``.
            Defaults to "bert-base-uncased", the checkpoint previously hard-coded.
    """

    def __init__(self, num_labels=10, dropout=0.1, model_name="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's config rather than hard-coding 768,
        # so the class also works with checkpoints of a different hidden size.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return unnormalised class logits for a batch.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.
            token_type_ids: Optional segment ids passed to the encoder.

        Returns:
            Tensor of logits with ``num_labels`` entries per example.
        """
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            return_dict=True)
        # Use the hidden state at position 0 (the [CLS] token) as the sequence summary.
        cls_embedding = outputs.last_hidden_state[:, 0]
        cls_embedding = self.dropout(cls_embedding)
        return self.classifier(cls_embedding)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Build the 10-class classifier and move its parameters to that device.
model = CustomBertClassifier(num_labels=10, dropout=0.1).to(device)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [14]:
# 5. Optimizer with layer-wise learning-rate decay
no_decay = ["bias", "LayerNorm.weight"]  # name fragments exempt from weight decay
base_lr = 2e-5
layer_decay = 0.95

# Order the blocks from the last encoder layer down to the embeddings, so the
# last encoder layer gets base_lr and each earlier block gets 0.95x the previous rate.
layers = list(model.bert.encoder.layer)[::-1] + [model.bert.embeddings]

optimizer_grouped_parameters = []
lr = base_lr
for layer in layers:
    decayed, exempt = [], []
    for name, param in layer.named_parameters():
        is_exempt = any(fragment in name for fragment in no_decay)
        (exempt if is_exempt else decayed).append(param)
    # Two groups per block: weight-decayed parameters first, then the exempt ones.
    optimizer_grouped_parameters.extend([
        {"params": decayed, "weight_decay": 0.01, "lr": lr},
        {"params": exempt, "weight_decay": 0.0, "lr": lr},
    ])
    lr *= layer_decay

In [15]:
# Classifier head: uses the full base learning rate, split by weight decay
# in the same way as the encoder blocks.
# NOTE(review): this appends to the existing list, so re-running this cell
# without rebuilding optimizer_grouped_parameters adds the head groups twice.
head_decayed, head_exempt = [], []
for name, param in model.classifier.named_parameters():
    if any(fragment in name for fragment in no_decay):
        head_exempt.append(param)
    else:
        head_decayed.append(param)
optimizer_grouped_parameters += [
    {"params": head_decayed, "weight_decay": 0.01, "lr": base_lr},
    {"params": head_exempt, "weight_decay": 0.0, "lr": base_lr},
]

optimizer = AdamW(optimizer_grouped_parameters, eps=1e-8)

# Total optimizer steps = batches per epoch x epochs; the factor 4 must equal
# the number of epochs run by the training loop. The first 10% of steps are warmup.
total_steps = 4 * len(train_loader)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)


In [16]:
# 6. Train
num_epochs = 4  # keep in sync with the epoch factor used for the scheduler's total_steps
best_acc = 0.0
early_stop_count = 0
patience = 2  # stop after this many consecutive epochs without an accuracy improvement
criterion = nn.CrossEntropyLoss()  # built once instead of once per batch

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    # The description is constant for the epoch, so set it when the bar is created.
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=True)

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients before the forward pass so gradients left over from an
        # interrupted earlier run of this cell cannot leak into the first step.
        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch

        batch_loss = loss.item()
        train_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

    # Report the epoch's mean batch loss (previously accumulated but never shown).
    print(f"Epoch {epoch+1} mean training loss: {train_loss / max(len(train_loader), 1):.4f}")

    # Validation
    # NOTE(review): accuracy is measured on test_loader and also drives checkpoint
    # selection and early stopping, so best_acc is an optimistic estimate.
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)

            predictions.extend(torch.argmax(logits, dim=1).tolist())
            # Labels are only compared on the host, so they never need to visit the device.
            true_labels.extend(batch['labels'].tolist())

    acc = accuracy_score(true_labels, predictions)
    print(f"Validation Accuracy: {acc}")

    if acc > best_acc:
        # New best: reset the patience counter and checkpoint the weights.
        best_acc = acc
        early_stop_count = 0
        torch.save(model.state_dict(), "best_model_sampled_custom.bin")
    else:
        early_stop_count += 1
        if early_stop_count >= patience:
            print("Early stopping triggered!")
            break

print("Training complete!")

  0%|          | 0/17500 [00:00<?, ?it/s]

: 

In [None]:
import matplotlib.pyplot as plt

# Report how many rows each split holds.
print("Kích thước tập train:", len(train_df))
print("Kích thước tập test:", len(test_df))

# Inspect the label distribution: one bar per label, ordered by label value.
label_counts = train_df['label'].value_counts().sort_index()
ax = label_counts.plot.bar(title='Train Data Label Distribution')
ax.set_xlabel("Label")
ax.set_ylabel("Số lượng")
plt.show()

In [None]:
# Topic names listed directly in the code; a topic's list position is its index (0-9).
classes = [
    "Society & Culture", "Science & Mathematics",
    "Health", "Education & Reference",
    "Computers & Internet", "Sports",
    "Business & Finance", "Entertainment & Music",
    "Family & Relationships", "Politics & Government",
]

# Print one "index: topic" line per entry.
print("Các topic (chỉ số 0-9):")
print("\n".join(f"{idx}: {topic}" for idx, topic in enumerate(classes)))

# Load the fine-tuned model (CustomBertClassifier)
import torch
import torch.nn as nn
from transformers import BertModel, AutoTokenizer

# NOTE(review): this class is also defined earlier in the notebook; the later
# definition shadows the earlier one. Presumably it is repeated so this
# inference cell can run in a fresh kernel -- consider moving it to a shared module.
class CustomBertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_labels: Number of output classes (width of the returned logits).
        dropout: Dropout probability applied to the [CLS] embedding.
        model_name: Checkpoint name or path given to ``BertModel.from_pretrained``.
            Defaults to "bert-base-uncased", the checkpoint previously hard-coded.
    """

    def __init__(self, num_labels=10, dropout=0.1, model_name="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's config rather than hard-coding 768,
        # so the class also works with checkpoints of a different hidden size.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return unnormalised class logits for a batch.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.
            token_type_ids: Optional segment ids passed to the encoder.

        Returns:
            Tensor of logits with ``num_labels`` entries per example.
        """
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            return_dict=True)
        # Use the hidden state at position 0 (the [CLS] token) as the sequence summary.
        cls_embedding = outputs.last_hidden_state[:, 0]
        cls_embedding = self.dropout(cls_embedding)
        return self.classifier(cls_embedding)

# Rebuild the architecture, then restore the checkpoint written by the training loop.
# The weights are mapped onto the CPU and the model is switched to eval mode.
model = CustomBertClassifier(num_labels=10, dropout=0.1)
checkpoint_state = torch.load("best_model_sampled_custom.bin", map_location=torch.device('cpu'))
model.load_state_dict(checkpoint_state)
model.eval()

# Tokenizer from the same "bert-base-uncased" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def predict_top_two_topics(text):
    """Return the two most probable topics for ``text``.

    Uses the module-level ``tokenizer``, ``model`` and ``classes``.

    Args:
        text: Raw input string to classify.

    Returns:
        A list of two ``(topic_name, probability)`` tuples, most probable first.
    """
    # Calling the tokenizer directly replaces the deprecated encode_plus();
    # the keyword arguments are unchanged.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=256,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Run on whatever device the model lives on, so this also works if the
    # model has been moved to a GPU.
    device = next(model.parameters()).device

    with torch.no_grad():
        logits = model(
            input_ids=encoding['input_ids'].to(device),
            attention_mask=encoding['attention_mask'].to(device),
        )
        probs = torch.softmax(logits, dim=1).flatten()

    # A single topk call yields both the indices and their probabilities.
    top2 = torch.topk(probs, 2)
    return [
        (classes[index], prob)
        for index, prob in zip(top2.indices.tolist(), top2.values.tolist())
    ]

# Try the predictor on an example sentence and print the two best topics
# with their probabilities to four decimal places.
text_example = "How does quantum computing work?"
top2_predictions = predict_top_two_topics(text_example)
report_lines = [f"Văn bản: {text_example}", "Top 2 topics dự đoán:"]
report_lines += [f"{topic} (xác suất: {prob:.4f})" for topic, prob in top2_predictions]
print("\n".join(report_lines))
