In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup

In [None]:
# 1. Load data
# Both CSV files are read without a header row, so the column names are supplied here.
_columns = ["label", "title", "content", "answer"]
train_df = pd.read_csv("../data/yahoo_answers/train.csv", header=None, names=_columns)
test_df = pd.read_csv("../data/yahoo_answers/test.csv", header=None, names=_columns)


In [None]:
# Preview the first rows; as the cell's last expression the frame is rendered
# with the notebook's rich table display instead of plain printed text.
train_df.head(5)

In [None]:
def combine_text(row):
    """Join a row's title, content and answer into one space-separated string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            'title', 'content' and 'answer'.

    Returns:
        The present fields joined by single spaces. Missing fields (None / NaN)
        are skipped, so they no longer end up in the text as the literal "nan".
    """
    parts = (row['title'], row['content'], row['answer'])
    return " ".join(str(part) for part in parts if not pd.isna(part))

# Build the combined "text" column on both splits.
for frame in (train_df, test_df):
    frame["text"] = frame.apply(combine_text, axis=1)

# Shift every label down by one (presumably 1-based -> 0-based; confirm against the raw CSV).
# NOTE: re-running this cell shifts the labels again.
for frame in (train_df, test_df):
    frame["label"] = frame["label"].sub(1)

In [None]:
# Preview the frame after adding "text" and shifting the labels; as the cell's last
# expression it is rendered with the notebook's rich table display.
train_df.head(5)

In [None]:
# Keep a seeded 10% sample of each split and renumber the rows from 0.
# NOTE: this overwrites train_df/test_df, so re-running the cell samples 10% of the
# already-sampled frames.
train_df, test_df = (
    frame.sample(frac=0.1, random_state=42).reset_index(drop=True)
    for frame in (train_df, test_df)
)

In [None]:
# 2. Tokenizer & Dataset
# Tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Token length passed to the dataset, which pads/truncates every example to it.
max_len = 256

class YahooDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Indexable sequence of input texts.
        labels: Indexable sequence of integer class labels, aligned with ``texts``.
        tokenizer: Callable Hugging Face tokenizer used to encode each text.
        max_len: Length every example is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail fast: a mismatch would otherwise surface as an IndexError mid-epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'labels' tensors for item ``idx``."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Call the tokenizer directly; `encode_plus` is deprecated in favour of `__call__`.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' adds a leading batch dimension; flatten it away so the
            # DataLoader can stack the items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
def _make_dataset(frame):
    """Wrap a frame's "text" and "label" columns in a YahooDataset."""
    return YahooDataset(frame["text"].tolist(), frame["label"].tolist(), tokenizer, max_len)


train_dataset = _make_dataset(train_df)
test_dataset = _make_dataset(test_df)

# Settings shared by both loaders; only the training loader shuffles.
_loader_kwargs = dict(batch_size=16, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

In [None]:
# 3. Model
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=10).to(device)

# 4. Optimizer and scheduler (the loss is taken from the model outputs in the training loop)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
# One scheduler step per batch over 3 passes of the training loader.
total_steps = len(train_loader) * 3
# Warm up over the first 10% of the steps.
warmup_steps = int(0.1*total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
# 5. Train
# NOTE(review): accuracy on test_loader is used below for checkpoint selection and early
# stopping, so the reported number is not an unbiased test estimate — consider holding
# out a separate validation split.
best_acc = 0.0
early_stop_count = 0
patience = 2

# The epoch count must stay in sync with the `* 3` used to compute total_steps for the scheduler.
for epoch in range(3):
    model.train()
    train_loss = 0.0
    # Set the progress-bar description once per epoch instead of on every batch.
    loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch+1}")

    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model compute the classification loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # Read the scalar once; each .item() call forces a host/device sync.
        batch_loss = loss.item()
        train_loss += batch_loss

        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances per batch, not per epoch
        optimizer.zero_grad()

        loop.set_postfix(loss=batch_loss)

    # Report the epoch's mean loss (it was accumulated before but never shown).
    avg_train_loss = train_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1} average training loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            predictions.extend(preds.cpu().tolist())
            # Labels are only compared on the host, so they never need to visit the device.
            true_labels.extend(batch['labels'].tolist())

    acc = accuracy_score(true_labels, predictions)
    print(f"Validation Accuracy: {acc}")

    # Early Stopping: keep the best checkpoint, stop after `patience` epochs without improvement.
    if acc > best_acc:
        best_acc = acc
        early_stop_count = 0
        torch.save(model.state_dict(), "best_model.bin")
    else:
        early_stop_count += 1
        if early_stop_count >= patience:
            print("Early stopping triggered!")
            break

print("Training complete!")


In [None]:
# Explore the sampled (1/10) data.
# matplotlib.pyplot is imported as `plt` in the notebook's import cell.
print("Kích thước tập train:", len(train_df))
print("Kích thước tập test:", len(test_df))

# Label distribution of the training sample, drawn on an explicit Axes object
# rather than through pyplot's implicit "current axes" state.
ax = train_df['label'].value_counts().sort_index().plot(kind='bar', title='Train Data Label Distribution')
ax.set_xlabel("Label")
ax.set_ylabel("Số lượng")
plt.show()

In [None]:
# Topic names, indexed by the class id the model predicts (0-9).
classes = [
    "Society & Culture",
    "Science & Mathematics",
    "Health",
    "Education & Reference",
    "Computers & Internet",
    "Sports",
    "Business & Finance",
    "Entertainment & Music",
    "Family & Relationships",
    "Politics & Government"
]

print("Các topic (chỉ số 0-9):")
for idx, topic in enumerate(classes):
    print(f"{idx}: {topic}")

# Load the fine-tuned model.
# NOTE(review): these imports repeat the notebook's import cell; presumably kept so this
# cell can run in a fresh kernel without re-running training — confirm before removing.
from transformers import BertForSequenceClassification, AutoTokenizer
import torch

# Rebuild the architecture, then restore the fine-tuned weights on the CPU.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=10)
# Read the checkpoint from the same relative path the training loop saves to
# ("best_model.bin"); the former absolute /kaggle/working path only resolved when the
# working directory happened to be /kaggle/working.
# NOTE: torch.load unpickles the file — only load checkpoints you produced yourself.
model.load_state_dict(torch.load("best_model.bin", map_location=torch.device('cpu')))
model.eval()

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def predict_topic(text):
    """Return the name of the most likely topic for ``text``.

    Uses the module-level ``tokenizer``, ``model`` and ``classes``.

    Args:
        text: Input text to classify.

    Returns:
        The entry of ``classes`` at the index of the highest-scoring logit.
    """
    # Tokenize by calling the tokenizer directly (`encode_plus` is deprecated).
    # A single sequence needs no padding, so it is only truncated to 256 tokens; this
    # avoids running the model over hundreds of padding positions.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=256,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(
            input_ids=encoding['input_ids'],
            attention_mask=encoding['attention_mask']
        )

    # Softmax is monotonic, so the argmax over raw logits picks the same class.
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return classes[predicted_class]


In [None]:
# Example: predict the topic of a single sentence.
text_example = "How does quantum computing work?"
predicted = predict_topic(text_example)
# One print call, one line per message.
print(f"Văn bản: {text_example}", f"Topic dự đoán: {predicted}", sep="\n")

In [None]:
def predict_top_two_topics(text, k=2):
    """Return the ``k`` most likely topics for ``text`` with their probabilities.

    Uses the module-level ``tokenizer``, ``model`` and ``classes``.

    Args:
        text: Input text to classify.
        k: Number of topics to return (default 2). Values larger than the number of
            model outputs are clamped to it.

    Returns:
        A list of ``(topic_name, probability)`` tuples, most likely first.
    """
    # Tokenize by calling the tokenizer directly (`encode_plus` is deprecated).
    # A single sequence needs no padding, so it is only truncated to 256 tokens.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=256,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(
            input_ids=encoding['input_ids'],
            attention_mask=encoding['attention_mask']
        )
        probs = torch.softmax(outputs.logits, dim=1).flatten()

    # Run topk once and reuse both of its results.
    top = torch.topk(probs, min(k, probs.numel()))
    return [(classes[i], p) for i, p in zip(top.indices.tolist(), top.values.tolist())]


In [None]:
# Example: show the two most likely topics for one sentence.
text_example = "he concept of black holes fascinates me because of their ability to warp space and time in a way that defies our understanding of physics"
top2_predictions = predict_top_two_topics(text_example)
# Header lines in one print call, then one line per predicted topic.
print(f"Văn bản: {text_example}", "Top 2 topics dự đoán:", sep="\n")
for topic, prob in top2_predictions:
    print(f"{topic} (xác suất: {prob:.4f})")