In [5]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import torch

In [6]:
from datasets import load_dataset

data_down_dir = "/root/storage/nas/JH_server/2025/Synthetic_data/0_dataset"

dataset = load_dataset(f"{data_down_dir}/dbpedia_14")
# 예시 출력
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 560000
    })
    test: Dataset({
        features: ['label', 'title', 'content'],
        num_rows: 70000
    })
})
{'label': 0, 'title': 'E. D. Abbott Ltd', 'content': ' Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.'}


In [3]:
from collections import defaultdict

label_counter = defaultdict(int)

for row in dataset['train']:
    label_counter[row['label']] += 1

In [None]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir='/root/storage/nas/JH_server/cache/')

def preprocess(inputs):
    return tokenizer(f'{inputs["title"]} {inputs["content"]}', truncation=True, padding="max_length", max_length=512)

train_dataset = dataset["train"].map(preprocess, batched=False)
test_dataset = dataset["test"].map(preprocess, batched=False)

num_labels =  len(set(dataset["test"]["label"]))
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, cache_dir='/root/storage/nas/JH_server/cache/')

accuracy = evaluate.load("accuracy")

def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=1)
    acc = accuracy.compute(predictions=preds, references=labels)

    return acc

args = TrainingArguments(
    eval_strategy="steps",
    eval_steps=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    save_strategy="no"
)

trainer = Trainer(
    model = model,
    args=args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

                                    

Map:   0%|          | 0/560000 [00:00<?, ? examples/s]

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy
100,No log,0.07932,0.987686
200,No log,0.0704,0.9859
300,No log,0.097064,0.975914
400,No log,0.060383,0.9886
500,0.225500,0.057735,0.988
600,0.225500,0.069454,0.985
