In [None]:
MODEL_ID="Qwen/Qwen3-0.6B"
HF_TOKEN=""

TRAIN_PATH = "E:/WIPS/workspace/dmc-review/train.xlsx"  # 200행
TEST_PATH = "E:/WIPS/workspace/dmc-review/test.xlsx"    # 50행

In [36]:
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForSequenceClassification

import os
import torch

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=HF_TOKEN)
tokenizer.pad_token=tokenizer.eos_token
tokenizer.padding_side='right'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype='float16',
    bnb_4bit_use_double_quant=True
)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=5,
    device_map='auto',
    quantization_config=bnb_config,
    token=HF_TOKEN  # 토큰 추가
)

model.config.use_cache = False
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split

train_df = pd.read_excel(TRAIN_PATH)
test_df = pd.read_excel(TEST_PATH)

df = pd.concat([train_df, test_df], ignore_index=True)

text_columns = ['발명의 명칭', '요약', '전체청구항', '대표청구항']

def combine_text_columns(row):
    """4개 텍스트 칼럼을 하나로 합치기"""
    texts = []
    for col in text_columns:
        if pd.notna(row[col]) and str(row[col]).strip():  # null이 아니고 빈 문자열이 아닌 경우
            texts.append(f"{col}: {str(row[col]).strip()}")
    return " | ".join(texts)

# 텍스트 칼럼 합치기
df['text'] = df.apply(combine_text_columns, axis=1)

df = pd.DataFrame({"text": df["text"], "labels": df["사용자태그"], "patent_id": df["출원번호"]})

labels_list = sorted(df["labels"].unique())
label2id = {l: i for i, l in enumerate(labels_list)}
id2label = {i: l for l, i in label2id.items()}

df["label_id"] = df["labels"].map(label2id)

print(df.head(5))

def chunk_text(text, tokenizer, max_length=512, stride=50):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_length, len(tokens))
        chunks.append(tokenizer.decode(tokens[start:end], skip_special_tokens=True))
        if end == len(tokens):
            break
        start += max_length - stride
    return chunks

chunked_rows = []
for _, row in df.iterrows():
    chunks = chunk_text(row["text"], tokenizer, max_length=512, stride=256)
    for chunk in chunks:
        chunked_rows.append({
            "text": chunk,
            "labels": row["labels"],
            "label_id": row["label_id"],
            "patent_id": row["patent_id"]
        })

df_chunked = pd.DataFrame(chunked_rows)

print(df_chunked.columns)
print(df_chunked.head())
print(df_chunked["label_id"].isna().sum())

train_df, eval_df = train_test_split(df_chunked, test_size=0.2, stratify=df_chunked['label_id'], random_state=42)
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

def tokenize_fn(example):
    inputs = tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)
    inputs["labels"] = example["label_id"]
    return inputs

train_dataset = train_dataset.map(tokenize_fn, remove_columns=["labels", "label_id", "patent_id"])
eval_dataset = eval_dataset.map(tokenize_fn, remove_columns=["labels", "label_id", "patent_id"])

                                                text    labels  patent_id  \
0  발명의 명칭: Method of producing zeolite film | 요약:...  CPC_C01B  16/762921   
1  발명의 명칭: Process for preparing an IZM-2 zeolite...  CPC_C01B  17/032093   
2  발명의 명칭: Liquid hydrogen storage material | 요약:...  CPC_C01B  16/193627   
3  발명의 명칭: Preparation method of trifluoroamine o...  CPC_C01B  16/624752   
4  발명의 명칭: Synthetic, multifaceted halogenated, f...  CPC_C01B  16/946892   

   label_id  
0         0  
1         0  
2         0  
3         0  
4         0  
Index(['text', 'labels', 'label_id', 'patent_id'], dtype='object')
                                                text    labels  label_id  \
0  발명의 명칭: Method of producing zeolite film | 요약:...  CPC_C01B         0   
1   gel for growing the fine crystals;a third ste...  CPC_C01B         0   
2  발명의 명칭: Process for preparing an IZM-2 zeolite...  CPC_C01B         0   
3   Process for preparing an IZM-2 zeolite, compr...  CPC_C01B         0   
4  ,BF

Map: 100%|██████████| 998/998 [00:01<00:00, 662.78 examples/s]
Map: 100%|██████████| 250/250 [00:00<00:00, 671.49 examples/s]


In [39]:
from transformers import DataCollatorWithPadding
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics import classification_report
import torch.nn.functional as F
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
def compute_metrics(pred):
    labels=pred.label_ids
    preds=pred.predictions.argmax(-1)
    precision, recall, f1,_ =precision_recall_fscore_support(labels, preds, average='macro')
    acc=accuracy_score(labels,preds)
    print("\n Classification Report")
    print(classification_report(labels, preds, digits=2))
    logits_tensor=torch.tensor(pred.predictions)
    labels_tensor=torch.tensor(pred.label_ids)
    loss=F.cross_entropy(logits_tensor,labels_tensor).item()
    return{
        'accuracy':acc,
        'f1':f1,
        'precision':precision,
        'recall':recall,
        'eval_loss':loss
    }

In [40]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
peft_config=LoraConfig(
    lora_alpha=128,
    lora_dropout=0.1,
    r=64,
    bias='none',
    task_type='SEQ_CLS',
    target_modules=['k_proj','gate_proj','v_proj','up_proj','q_proj','o_proj','down_proj']
)
model=prepare_model_for_kbit_training(model)
model=get_peft_model(model,peft_config)

In [41]:
from trl import SFTConfig
output_dir= "../output"
training_arguments=SFTConfig(
    output_dir=output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    optim='paged_adamw_32bit',
    lr_scheduler_type='cosine',
    num_train_epochs=2,
    warmup_steps=50,
    logging_steps=10,
    fp16=True,
    gradient_checkpointing=True,
    dataset_text_field='text',
    max_length=512,
    label_names=['labels']
)

In [None]:
import os
from trl import SFTTrainer
trainer=SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    args=training_arguments,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    peft_config=peft_config
)
trainer.train()
print(trainer.evaluate())
trainer.model.save_pretrained('Qwen3-0.6B-QLoRA2')

Truncating train dataset: 100%|██████████| 998/998 [00:04<00:00, 234.16 examples/s]
Truncating eval dataset: 100%|██████████| 250/250 [00:00<00:00, 39026.95 examples/s]
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Step,Training Loss
10,5.8082
20,6.8136
30,5.1591
40,4.8916
50,4.3339
60,4.5283
70,3.4759
80,3.4514
90,3.6544
100,3.7916


In [21]:
from transformers import AutoModelForSequenceClassification
from peft import PeftModel

# -------------------------------
# QLoRA SEQ_CLS 어댑터 머지
# -------------------------------

print("=== QLoRA Adapter Merge ===")

# 1. 베이스 모델을 SEQ_CLS로 직접 로드 (어댑터 훈련 시와 동일)
base_model = AutoModelForSequenceClassification.from_pretrained(
    "Qwen/Qwen3-0.6B",
    num_labels=5,
    ignore_mismatched_sizes=False
)

print("Base model structure:")
for name, _ in base_model.named_modules():
    if 'score' in name or 'classifier' in name:
        print(f"  Found: {name}")

# 2. 어댑터 로드 및 머지
adapter_path = "Qwen3-0.6B-QLoRA2"

model = PeftModel.from_pretrained(
    base_model,
    adapter_path,
    ignore_mismatched_sizes=False,
    device_map='auto'
)

# 3. 머지 및 저장
merged_model = model.merge_and_unload()
merged_model.save_pretrained("C:/wips/output/Qwen3_merged_method3")
print("머지 완료!")

=== QLoRA Adapter Merge ===


Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model structure:
  Found: score
머지 완료!


In [24]:
print(f"CUDA 사용 가능: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU 이름: {torch.cuda.get_device_name(0)}")
    print(f"GPU 메모리: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
    print(f"현재 GPU 메모리 사용량: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")

# 토크나이저 로드
print("토크나이저 로딩 중...")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print("토크나이저 로딩 완료!")

# 모델 로드 (수정된 부분)
print("모델 로딩 중...")
model = AutoModelForSequenceClassification.from_pretrained(
    r"C:/wips/output/Qwen3_merged_method3",
    num_labels=5,  # 동적으로 계산된 라벨 수 사용
    torch_dtype=torch.float16,
    device_map="auto",  # 자동 디바이스 배치만 사용
    low_cpu_mem_usage=True
)
print("모델 로딩 완료!")

# pad_token_id 설정
model.config.pad_token_id = tokenizer.pad_token_id

# 모델 정보 출력
print(f"모델 디바이스: {next(model.parameters()).device}")
print(f"모델 dtype: {next(model.parameters()).dtype}")

# GPU 메모리 사용량 확인
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(f"GPU 메모리 사용량: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")

CUDA 사용 가능: True
GPU 이름: NVIDIA GeForce GTX 1660 Ti
GPU 메모리: 6.00 GB
현재 GPU 메모리 사용량: 1.08 GB
토크나이저 로딩 중...


`torch_dtype` is deprecated! Use `dtype` instead!


토크나이저 로딩 완료!
모델 로딩 중...
모델 로딩 완료!
모델 디바이스: cuda:0
모델 dtype: torch.float16
GPU 메모리 사용량: 2.19 GB


In [None]:
import torch
import json
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from sklearn.metrics import classification_report

function_schema = {
    "name": "classify_patent_claim",
    "description": "특허 청구항을 CPC 카테고리로 분류합니다.",
    "parameters": {
        "type": "object",
        "properties": {
            "pred_label": {"type": "string", "description": "예측된 CPC 코드"},
            "reason": {"type": "object", "description": "예측 확률"}
        },
        "required": ["pred_label", "reason"]
    }
}

train_df = pd.read_excel(TRAIN_PATH)
test_df = pd.read_excel(TEST_PATH)

df = pd.concat([train_df, test_df], ignore_index=True)

text_columns = ['발명의 명칭', '요약', '전체청구항', '대표청구항']

def combine_text_columns(row):
    """4개 텍스트 칼럼을 하나로 합치기"""
    texts = []
    for col in text_columns:
        if pd.notna(row[col]) and str(row[col]).strip():  # null이 아니고 빈 문자열이 아닌 경우
            texts.append(f"{col}: {str(row[col]).strip()}")
    return " | ".join(texts)

# 텍스트 칼럼 합치기
df['text'] = df.apply(combine_text_columns, axis=1)

df = pd.DataFrame({"text": df["text"], "labels": df["사용자태그"], "patent_id": df["출원번호"]})

labels_list = sorted(df["labels"].unique())
label2id = {l: i for i, l in enumerate(labels_list)}
id2label = {i: l for l, i in label2id.items()}

df["label_id"] = df["labels"].map(label2id)

print(df.head(5))

def chunk_text(text, tokenizer, max_length=512, stride=50):
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_length, len(tokens))
        chunks.append(tokenizer.decode(tokens[start:end], skip_special_tokens=True))
        if end == len(tokens):
            break
        start += max_length - stride
    return chunks

chunked_rows = []
for _, row in df.iterrows():
    chunks = chunk_text(row["text"], tokenizer, max_length=512, stride=256)
    for chunk in chunks:
        chunked_rows.append({
            "text": chunk,
            "labels": row["labels"],
            "label_id": row["label_id"],
            "patent_id": row["patent_id"]
        })

df_chunked = pd.DataFrame(chunked_rows)

print(df_chunked.columns)
print(df_chunked.head())
print(df_chunked["label_id"].isna().sum())

train_df, eval_df = train_test_split(df_chunked, test_size=0.2, stratify=df_chunked['label_id'], random_state=42)
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

def tokenize_fn(example):
    enc = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
    enc["labels"] = example["label_id"]
    return enc

train_dataset = train_dataset.map(tokenize_fn, remove_columns=["labels", "label_id", "patent_id"])
remove_cols = [c for c in ["text", "patent_id"] if c in eval_dataset.column_names]
eval_dataset = eval_dataset.map(tokenize_fn, remove_columns=remove_cols)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
test_loader = DataLoader(eval_dataset, batch_size=8, collate_fn=data_collator)

# 2. 모델 예측 → probs 만들기
model.eval()
all_logits = []

print("테스트셋 로짓 계산 중...")
with torch.no_grad():
    for batch in test_loader:
        print(batch.keys())
        print(batch['input_ids'].shape)
        print(batch['labels'].shape)
        batch = {k: v.to(model.device) for k, v in batch.items() if k in ["input_ids", "attention_mask"]}
        outputs = model(**batch)
        all_logits.append(outputs.logits.cpu())

logits = torch.cat(all_logits, dim=0)
probs = torch.softmax(logits, dim=-1).numpy()

print("logits/probs 계산 완료:", probs.shape)

# 3. 이후 코드는 기존과 동일
eval_df = eval_df.reset_index(drop=True)
eval_df['chunk_index'] = range(len(eval_df))
eval_df['chunk_len'] = eval_df['text'].apply(lambda x: len(tokenizer.encode(x, add_special_tokens=False)))

patent_results = []
for patent_id, group in eval_df.groupby('patent_id'):
    indices = group['chunk_index'].tolist()
    weights = group['chunk_len'].values
    
    chunk_probs = probs[indices]
    mean_prob = chunk_probs.max(axis=0)
    
    pred_idx = mean_prob.argmax()
    pred_label = id2label[pred_idx]
    reason = {id2label[i]: float(mean_prob[i]) for i in range(len(mean_prob))}
    
    function_call = {
        "name": function_schema["name"],
        "arguments": json.dumps({
            "pred_label": pred_label,
            "reason": reason
        })
    }
    args = json.loads(function_call["arguments"])
    
    patent_results.append({
        "patent_id": patent_id,
        "pred_label": args["pred_label"],
        "reason": args["reason"],
        "true_label": group['labels'].iloc[0]
    })

results_df = pd.DataFrame(patent_results)

# 4. 평가
true_labels = results_df["true_label"].tolist()
pred_labels = results_df["pred_label"].tolist()

print("== Start evaluation ==")
print(classification_report(true_labels, pred_labels, digits=4))

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, pred_labels, average="weighted")
acc = accuracy_score(true_labels, pred_labels)
print(f"Accuracy: {acc:.4f}, F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")


                                                text    labels  patent_id  \
0  발명의 명칭: Method of producing zeolite film | 요약:...  CPC_C01B  16/762921   
1  발명의 명칭: Process for preparing an IZM-2 zeolite...  CPC_C01B  17/032093   
2  발명의 명칭: Liquid hydrogen storage material | 요약:...  CPC_C01B  16/193627   
3  발명의 명칭: Preparation method of trifluoroamine o...  CPC_C01B  16/624752   
4  발명의 명칭: Synthetic, multifaceted halogenated, f...  CPC_C01B  16/946892   

   label_id  
0         0  
1         0  
2         0  
3         0  
4         0  
Index(['text', 'labels', 'label_id', 'patent_id'], dtype='object')
                                                text    labels  label_id  \
0  발명의 명칭: Method of producing zeolite film | 요약:...  CPC_C01B         0   
1  olite film is sequentially taken out of the co...  CPC_C01B         0   
2  발명의 명칭: Process for preparing an IZM-2 zeolite...  CPC_C01B         0   
3  O2 between 1 and 100,R(OH)2/XO2 between 0.006 ...  CPC_C01B         0   
4  5. 

Map: 100%|██████████| 693/693 [00:00<00:00, 698.48 examples/s]
Map: 100%|██████████| 174/174 [00:00<00:00, 746.14 examples/s]


테스트셋 로짓 계산 중...
KeysView({'labels': tensor([2, 3, 3, 4, 4, 3, 4, 0]), 'label_id': tensor([2, 3, 3, 4, 4, 3, 4, 0]), '__index_level_0__': tensor([263, 826, 494, 622, 592, 813, 586,  71]), 'input_ids': tensor([[  6554,  44204,   4269,  ...,    264,  14409,   4982],
        [   632,     11,    323,  ..., 151643, 151643, 151643],
        [     8,    525,  11691,  ..., 151643, 151643, 151643],
        ...,
        [ 35741,  32237,  34619,  ...,  32237,  34619,  83343],
        [126835,  79632,  20401,  ...,   3717,    220,     19],
        [   220,     19,     13,  ...,     15,     15,  11616]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])})
torch.Size([8, 512])
torch.Size([8])
KeysView({'labels': tensor([3, 1, 2, 2, 1, 0, 2, 4]), 'label_id': tensor([3, 1, 2, 2, 1, 0, 2, 4]), '__index_level_0__': tenso