In [1]:
import gc
import torch

def cleanup_globals(vars_to_keep: list):
    global_vars = list(globals().keys())
    protected_vars = ['In', 'Out', 'get_ipython', 'exit', 'quit', 'gc', 'torch', 'cleanup_globals']

    for var in global_vars:
        if var not in vars_to_keep and not var.startswith('_') and var not in protected_vars:
            try:
                del globals()[var]
                print(f"{var} 삭제됨")
            except:
                continue
    torch.cuda.empty_cache()
    gc.collect()

In [3]:
import boto3
import pandas as pd
import io
from datasets import Dataset

try:
    credentials_df = pd.read_csv('./ganghyun-dev_accessKeys.csv')

    if not credentials_df.empty:
        aws_access_key_id = credentials_df['Access key ID'].iloc[0].strip()
        aws_secret_access_key = credentials_df['Secret access key'].iloc[0].strip()
    else:
        print("Error: 'aws_credentials.csv' is empty.")
        exit()

except FileNotFoundError:
    print("Error: 'aws_credentials.csv' not found in Drive.")
    print("Please create a file named 'aws_credentials.csv' in your Google Drive with your AWS credentials.")
    exit()
except KeyError:
    print("Error: 'Access key ID' or 'Secret access key' column not found in 'aws_credentials.csv'.")
    print("Please ensure your CSV file has these columns.")
    exit()
except Exception as e:
    print(f"Error loading AWS credentials from CSV: {e}")
    exit()

bucket_name = "dr.hong-s3"

file_key = "dataset/origin_template_classification_dataset.xlsx"

s3_client = boto3.client('s3',
                         aws_access_key_id=aws_access_key_id,
                         aws_secret_access_key=aws_secret_access_key)

try:
    file_content = s3_client.get_object(Bucket=bucket_name, Key=file_key)['Body'].read()
    print("파일을 성공적으로 메모리로 불러왔습니다.")

except Exception as e:
    print(f"S3에서 파일을 불러오는 중 오류가 발생했습니다: {e}")
    exit()

# 엑셀 파일을 pandas datafrome 으로 변환
print("파일을 pandas dataframe 로 변환")
df = pd.read_excel(io.BytesIO(file_content))

print("원본 데이터 상위 5개")
print(df.head)
print()

print("Hugging Face Dataset 으로 변환")
cls_dataset = Dataset.from_pandas(df)

cls_train_test_dataset = cls_dataset.train_test_split(test_size=0.2, shuffle=True)
cls_train_dataset = cls_train_test_dataset["train"]
cls_test_dataset = cls_train_test_dataset["test"]

print("최종 분할된 데이터 셋")
print(cls_train_dataset)
print(cls_test_dataset)

cleanup_globals(["cls_train_dataset", "cls_test_dataset"])

파일을 성공적으로 메모리로 불러왔습니다.
파일을 pandas dataframe 로 변환
원본 데이터 상위 5개
<bound method NDFrame.head of                                                template  is_approved  \
0     {"title": "회사소개서 발송", "text": "안녕하세요 #{수신자명}님,...            1   
1     {"title": "서비스 소개서 발송", "text": "안녕하세요 #{수신자명}...            1   
2     {"title": "(전용) 강의 일정 안내 / 화케터", "text": "안녕하세...            1   
3     {"title": "(공용) 후기 작성 요청_이미지형_01", "text": "[템...            1   
4     {"title": "(공용) 인보이스 알림_이미지형_01", "text": "■ #...            1   
...                                                 ...          ...   
1261  {"title": "(공용) 적립금 소멸 안내_이미지형_01", "text": "안...            0   
1262  {"title": "(전용) AS 안내_기본형_01 / 라벨르", "text": "...            0   
1263  {"title": "(공용) 문서 도착 알림_이미지형_08", "text": "■ ...            0   
1264  {"title": "(공용) 링크_ 문서 도착 알림_이미지형_01", "text":...            0   
1265  {"title": "(공용) 링크_문서 도착 알림_이미지형_01", "text": ...            0   

     reject_reason  
0              NaN  
1

In [4]:
import os
import gc
import torch
from huggingface_hub import snapshot_download
from huggingface_hub.utils import RepositoryNotFoundError

def download_model_snapshot(model_id: str, local_dir: str) -> str:
    print(f"'{model_id}' 모델을 '{local_dir}' 경로에 다운로드합니다...")
    try:
        # snapshot_download는 알아서 기존 파일을 체크하고 필요한 것만 다운로드합니다.
        model_path = snapshot_download(
            repo_id=model_id,
            local_dir=local_dir
            # resume_download=True, # 기본값이 True이므로 명시하지 않아도 됨
        )
        print("✅ 모델 준비 완료!")
        return model_path
    except RepositoryNotFoundError:
        print(f"❌ 오류: 모델 ID '{model_id}'를 찾을 수 없습니다.")
        return None
    except Exception as e:
        print(f"❌ 다운로드 중 오류가 발생했습니다: {e}")
        return None

# 실행
checkpoint = "klue/bert-base" # CC-BY-SA-4.0 라이선스 제약 있음
model_path = download_model_snapshot(checkpoint, "./downloaded_model/" + checkpoint.replace("/", "--"))

cleanup_globals(["cls_train_dataset", "cls_test_dataset", "model_path"])

'klue/bert-base' 모델을 './downloaded_model/klue--bert-base' 경로에 다운로드합니다...


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

.gitattributes:   0%|          | 0.00/744 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

✅ 모델 준비 완료!
os 삭제됨
snapshot_download 삭제됨
RepositoryNotFoundError 삭제됨
download_model_snapshot 삭제됨
checkpoint 삭제됨


In [11]:
# 데이터 셋 전처리
from transformers import AutoTokenizer
from datasets import DatasetDict
import torch
import pandas as pd

# 토크나이저 디스크에서 메모리로 로드
def load_tokenizer_from_local(model_path: str):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return tokenizer

tokenizer = load_tokenizer_from_local(model_path)

def tokenize_function(elements):
    batch_tokenized = {'input_ids': [], 'attention_mask': [], 'labels': []}
    
    for i in range(len(elements['template'])):
        template = elements['template'][i]
        reason = elements['reject_reason'][i]
        
        if pd.isna(reason):
            # reason이 없는 경우: [CLS] template [SEP]
            tokenized = tokenizer(
                text=template,
                padding=False,
                max_length=512,
                truncation=True
            )
        else:
            # reason이 있는 경우: [CLS] template [SEP] reason [SEP]
            tokenized = tokenizer(
                text=template,
                text_pair=str(reason),
                padding=False,
                max_length=512,
                truncation=True
            )
        
        batch_tokenized['input_ids'].append(tokenized['input_ids'])
        batch_tokenized['attention_mask'].append(tokenized['attention_mask'])
        batch_tokenized['labels'].append(elements['is_approved'][i])
    
    return batch_tokenized

# 데이터셋에 토큰화 함수 적용
print("\nApplying tokenization function to the dataset...")
tokenized_cls_train_datasets = cls_train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=cls_train_dataset.column_names
)
tokenized_cls_eval_datasets = cls_test_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=cls_test_dataset.column_names
)

tokenized_cls_datasets = DatasetDict({
    "train": tokenized_cls_train_datasets,
    "eval": tokenized_cls_eval_datasets
})

print("Tokenized train dataset features:", tokenized_cls_train_datasets.features)
print("\nTokenized test dataset features:", tokenized_cls_eval_datasets.features)
print(f"\nTrain dataset size: {len(tokenized_cls_train_datasets)}")
print(f"Test dataset size: {len(tokenized_cls_eval_datasets)}")

tokenized_path = "./tokenized_datasets/" + "klue/bert-base".replace("/", "--")
tokenized_cls_datasets.save_to_disk(tokenized_path)

cleanup_globals(["model_path", "tokenizer", "tokenized_path"])



Applying tokenization function to the dataset...


Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/254 [00:00<?, ? examples/s]

Tokenized train dataset features: {'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8')), 'labels': Value('int64')}

Tokenized test dataset features: {'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8')), 'labels': Value('int64')}

Train dataset size: 1012
Test dataset size: 254


Saving the dataset (0/1 shards):   0%|          | 0/1012 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/254 [00:00<?, ? examples/s]

cls_train_dataset 삭제됨
cls_test_dataset 삭제됨
AutoTokenizer 삭제됨
DatasetDict 삭제됨
load_tokenizer_from_local 삭제됨
tokenize_function 삭제됨
pd 삭제됨
tokenized_cls_train_datasets 삭제됨
tokenized_cls_eval_datasets 삭제됨
tokenized_cls_datasets 삭제됨


In [12]:
import torch

# GPU가 사용 가능한지 확인
if torch.cuda.is_available():
    # 현재 사용 중인 메모리 (바이트)
    allocated_bytes = torch.cuda.memory_allocated(device=0)
    # 캐시된 메모리 (바이트)
    reserved_bytes = torch.cuda.memory_reserved(device=0)

    # GB 단위로 변환
    gb_factor = 1024 * 1024 * 1024
    allocated_gb = allocated_bytes / gb_factor
    reserved_gb = reserved_bytes / gb_factor

    print(f"현재 사용 중인 GPU 메모리: {allocated_gb:.2f} GB")
    print(f"현재 캐시된 GPU 메모리: {reserved_gb:.2f} GB")

else:
    print("GPU를 사용할 수 없습니다.")

현재 사용 중인 GPU 메모리: 0.00 GB
현재 캐시된 GPU 메모리: 0.00 GB


In [13]:
cleanup_globals([])

model_path 삭제됨
tokenizer 삭제됨
tokenized_path 삭제됨
allocated_bytes 삭제됨
reserved_bytes 삭제됨
gb_factor 삭제됨
allocated_gb 삭제됨
reserved_gb 삭제됨


In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_path = "./downloaded_model/klue--bert-base"

def load_model_and_tokenizer_from_local(model_path: str):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        local_files_only=True,
        # classifier_dropout=0.2, # 과적합 발생하면 시도
        num_labels=2,
        id2label={0: "Not Approved", 1: "Approved"},
        label2id={"Not Approved": 0, "Approved": 1},
        problem_type="single_label_classification"
    )
    return tokenizer, model

tokenizer, model = load_model_and_tokenizer_from_local(model_path)

cleanup_globals(["tokenizer", "model"])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./downloaded_model/klue--bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AutoTokenizer 삭제됨
AutoModelForSequenceClassification 삭제됨
model_path 삭제됨
load_model_and_tokenizer_from_local 삭제됨


In [16]:
from datasets import load_from_disk
from transformers import DataCollatorWithPadding

tokenized_path = "./tokenized_datasets/" + "klue/bert-base".replace("/", "--")
tokenized_cls_datasets = load_from_disk(tokenized_path)
tokenized_cls_train_datasets = tokenized_cls_datasets['train']
tokenized_cls_eval_datasets = tokenized_cls_datasets['eval']

# DataCollator 정의 - 텍스트 분류는 DataCollatorWithPadding 사용
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding='longest', # 배치 내에서 가장 긴 시퀀스에 맞춰 패딩
    pad_to_multiple_of=8, # 학습 속도를 약간 높여줌
)

In [20]:
import torch
import wandb
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments

# wandb 초기화
wandb.init(
    entity="dr-hong",
    project="dr-hong",
    name="klue-bert-base",
    config={
        "learning_rate": 3e-5,
        "epochs": 3,
        "batch_size": 32,
        "model_name": "klue/bert-base",
    }
)

# compute_metrics 함수 정의. bert 에 적합한 평가지표 선택: accuracy, f1, precision, recall
def compute_metrics(eval_pred, batch_eval_metrics=False):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# 학습 인자 정의
training_args = TrainingArguments(
    output_dir="./results/" + "klue/bert-base".replace("/", "--"),
    do_train=True,
    do_eval=True,
    eval_strategy="steps",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    eval_delay=32,
    learning_rate=3e-5, # differential learning rate 적용 고려해 볼 수 있음
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_dir="./logs/" + "klue/bert-base".replace("/", "--"),
    logging_strategy="steps",
    logging_steps=5,
    save_strategy="steps",
    save_steps=16,
    save_total_limit=1,
    bf16=True,
    tf32=True,
    eval_steps=8,
    dataloader_num_workers=4,
    disable_tqdm=False,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="all",
    dataloader_persistent_workers=True,
    resume_from_checkpoint="./results/" + "klue/bert-base".replace("/", "--"),
    gradient_checkpointing=True,
    auto_find_batch_size=True,
    torchdynamo="inductor",
    torch_compile=True,
    torch_compile_backend="inductor",
    torch_compile_mode="default",
)

# Trainer 설정
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_cls_train_datasets,
    eval_dataset=tokenized_cls_eval_datasets,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

try:
    train_output = trainer.train()

    eval_results = trainer.evaluate()

except Exception as e:
    print(f"학습 중 오류 발생: {str(e)}")
    raise e

# 최종 모델 저장
model.save_pretrained("./finetuned_model/" + "klue/bert-base".replace("/", "--"))

# wandb 종료
wandb.finish()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
32,0.091,0.216205,0.952756,0.947706,0.955168,0.952756
40,0.1057,0.155157,0.96063,0.957276,0.96232,0.96063
48,0.0715,0.231053,0.956693,0.952544,0.958729,0.956693
56,0.1245,0.193842,0.956693,0.952544,0.958729,0.956693
64,0.0583,0.154294,0.96063,0.957276,0.96232,0.96063
72,0.0439,0.160847,0.96063,0.957276,0.96232,0.96063
80,0.029,0.176957,0.96063,0.957276,0.96232,0.96063
88,0.0083,0.229782,0.96063,0.957276,0.96232,0.96063
96,0.0592,0.231531,0.96063,0.957276,0.96232,0.96063


W0910 01:42:21.333000 3842 torch/fx/experimental/symbolic_shapes.py:6823] [0/1] _maybe_guard_rel() was called on non-relation expression Eq(s16, 1) | Eq(s27, s16)
Online softmax is disabled on the fly since Inductor decides to
split the reduction. Cut an issue to PyTorch if this is an
important use case and you want to speed it up with online
softmax.

W0910 01:43:14.155000 3842 torch/fx/experimental/symbolic_shapes.py:6823] [0/2] _maybe_guard_rel() was called on non-relation expression Eq(s52, s92) | Eq(s92, 1)
W0910 01:43:14.158000 3842 torch/fx/experimental/symbolic_shapes.py:6823] [0/2] _maybe_guard_rel() was called on non-relation expression Eq(s16, 1) | Eq(s27, s16)
Online softmax is disabled on the fly since Inductor decides to
split the reduction. Cut an issue to PyTorch if this is an
important use case and you want to speed it up with online
softmax.

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to 

0,1
eval/accuracy,▁█▅▅█████▁
eval/f1,▁█▅▅█████▁
eval/loss,▇▁█▅▁▂▃██▇
eval/precision,▁█▄▄█████▁
eval/recall,▁█▅▅█████▁
eval/runtime,▇▁▁▁▁▁▁▁▁█
eval/samples_per_second,▁████████▁
eval/steps_per_second,▁████████▁
train/epoch,▁▁▂▂▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇█████
train/global_step,▁▁▂▂▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇█████

0,1
eval/accuracy,0.95276
eval/f1,0.94771
eval/loss,0.2162
eval/precision,0.95517
eval/recall,0.95276
eval/runtime,0.745
eval/samples_per_second,340.923
eval/steps_per_second,10.738
total_flos,322376820579840.0
train/epoch,3


In [24]:
# 추론 테스트
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd

# 학습된 모델 로드
base_model_path = "./downloaded_model/" + "klue/bert-base".replace("/", "--")
finetuned_model_path = "./finetuned_model/" + "klue/bert-base".replace("/", "--")
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
model = AutoModelForSequenceClassification.from_pretrained(finetuned_model_path)
model.eval()

def predict_template(template, reject_reason=None):
    # 토큰화
    if reject_reason and not pd.isna(reject_reason):
        inputs = tokenizer(
            text=template,
            text_pair=str(reject_reason),
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
    else:
        inputs = tokenizer(
            text=template,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
    
    # 추론
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
        confidence = predictions[0][predicted_class].item()
    
    return {
        "prediction": "Approved" if predicted_class == 1 else "Not Approved",
        "confidence": confidence,
        "probabilities": {
            "Not Approved": predictions[0][0].item(),
            "Approved": predictions[0][1].item()
        }
    }

# 테스트 예시
test_template = '{"title": "회사소개서 발송", "text": "안녕하세요 #{수신자명}님, 저희 회사를 소개드립니다."}'
result = predict_template(test_template)
print(f"Template: {test_template}")
print(f"Prediction: {result['prediction']}")
print(f"Confidence: {result['confidence']:.4f}")
print(f"Probabilities: {result['probabilities']}")

Template: {"title": "회사소개서 발송", "text": "안녕하세요 #{수신자명}님, 저희 회사를 소개드립니다."}
Prediction: Approved
Confidence: 0.9935
Probabilities: {'Not Approved': 0.0064859273843467236, 'Approved': 0.9935140609741211}
