In [None]:
import gc
import torch

def cleanup_globals(vars_to_keep: list):
    """Delete module-level globals to release memory between notebook cells.

    Every global name is removed unless it is listed in ``vars_to_keep``,
    starts with an underscore, or belongs to the protected set below
    (IPython built-ins plus the names this function itself needs).
    Each deleted name is printed.

    Args:
        vars_to_keep: Names of globals that must survive the cleanup.
    """
    protected_vars = {'In', 'Out', 'get_ipython', 'exit', 'quit', 'gc', 'torch', 'cleanup_globals'}
    keep = protected_vars.union(vars_to_keep)
    namespace = globals()

    # Iterate over a snapshot of the keys: the dict is mutated inside the loop.
    for var in list(namespace.keys()):
        if var in keep or var.startswith('_'):
            continue
        try:
            del namespace[var]
        except KeyError:
            # The name vanished between the snapshot and the delete; nothing to do.
            continue
        print(f"{var} 삭제됨")

    # Collect first so objects that were only kept alive by reference cycles
    # are destroyed, then hand the now-unused cached CUDA blocks back.
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
import boto3
import pandas as pd
import io
from datasets import Dataset

# Read the AWS access key pair from a local CSV file. The file must contain
# the columns 'Access key ID' and 'Secret access key'; only the first row is used.
# Failures raise instead of calling exit(): inside a notebook exit() shuts the
# kernel down (losing all state), whereas an exception just stops the cell.
credentials_path = './ganghyun-dev_accessKeys.csv'

try:
    credentials_df = pd.read_csv(credentials_path)
except FileNotFoundError as e:
    raise FileNotFoundError(
        f"Error: '{credentials_path}' not found. "
        "Please place the AWS access key CSV next to this notebook."
    ) from e
except Exception as e:
    raise RuntimeError(f"Error loading AWS credentials from CSV: {e}") from e

if credentials_df.empty:
    raise ValueError(f"Error: '{credentials_path}' is empty.")

try:
    aws_access_key_id = credentials_df['Access key ID'].iloc[0].strip()
    aws_secret_access_key = credentials_df['Secret access key'].iloc[0].strip()
except KeyError as e:
    raise KeyError(
        f"Error: 'Access key ID' or 'Secret access key' column not found in '{credentials_path}'. "
        "Please ensure your CSV file has these columns."
    ) from e
except Exception as e:
    raise RuntimeError(f"Error loading AWS credentials from CSV: {e}") from e

# Location of the source spreadsheet in S3.
bucket_name = "dr.hong-s3"

file_key = "dataset/origin_template_classification_dataset.xlsx"

# Build the client from the key pair loaded earlier in this cell.
s3_client = boto3.client('s3',
                         aws_access_key_id=aws_access_key_id,
                         aws_secret_access_key=aws_secret_access_key)

# Pull the whole object into memory as bytes.
try:
    file_content = s3_client.get_object(Bucket=bucket_name, Key=file_key)['Body'].read()
    print("파일을 성공적으로 메모리로 불러왔습니다.")

except Exception as e:
    print(f"S3에서 파일을 불러오는 중 오류가 발생했습니다: {e}")
    # Re-raise rather than exit(): exit() would shut the notebook kernel down,
    # while re-raising stops the cell and keeps the traceback.
    raise

# Convert the Excel bytes into a pandas DataFrame.
# No fillna(None) here: pandas rejects None as a fill value (ValueError).
# NOTE(review): missing cells are expected to surface as None once the frame
# goes through Dataset.from_pandas — confirm on the real data.
print("파일을 pandas dataframe 로 변환")
df = pd.read_excel(io.BytesIO(file_content))

print("원본 데이터 상위 5개")
print(df.head())  # call head(); printing the bare attribute shows the bound method
print()

print("Hugging Face Dataset 으로 변환")
cls_dataset = Dataset.from_pandas(df)

# 80/20 split; the fixed seed keeps the split identical across kernel restarts.
cls_train_test_dataset = cls_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
cls_train_dataset = cls_train_test_dataset["train"]
cls_test_dataset = cls_train_test_dataset["test"]

print("최종 분할된 데이터 셋")
print(cls_train_dataset)
print(cls_test_dataset)

# Keep only the two splits; everything else defined in this cell is dropped.
cleanup_globals(["cls_train_dataset", "cls_test_dataset"])

In [None]:
import os
import gc
import torch
from huggingface_hub import snapshot_download
from huggingface_hub.utils import RepositoryNotFoundError

def download_model_snapshot(model_id: str, local_dir: str) -> "str | None":
    """Download a Hugging Face Hub repository snapshot into a local directory.

    Args:
        model_id: Hub repository id passed to ``snapshot_download`` as ``repo_id``.
        local_dir: Target directory passed to ``snapshot_download`` as ``local_dir``.

    Returns:
        The path returned by ``snapshot_download``, or ``None`` when the
        repository is not found or any other error occurs (the error is
        printed, not raised).
    """
    print(f"'{model_id}' 모델을 '{local_dir}' 경로에 다운로드합니다...")
    try:
        # Per the original author's note, snapshot_download checks existing
        # files and only fetches what is missing, so no resume flag is passed.
        model_path = snapshot_download(
            repo_id=model_id,
            local_dir=local_dir
        )
    except RepositoryNotFoundError:
        print(f"❌ 오류: 모델 ID '{model_id}'를 찾을 수 없습니다.")
        return None
    except Exception as e:
        print(f"❌ 다운로드 중 오류가 발생했습니다: {e}")
        return None

    print("✅ 모델 준비 완료!")
    return model_path

# Run the download
checkpoint = "klue/bert-base"  # subject to CC-BY-SA-4.0 license terms
model_path = download_model_snapshot(checkpoint, "./downloaded_model/" + checkpoint.replace("/", "--"))

# download_model_snapshot reports failures by returning None; stop here so a
# later cell does not fail obscurely when it tries to load from a None path.
if model_path is None:
    raise RuntimeError(f"Model snapshot for '{checkpoint}' could not be prepared.")

cleanup_globals(["cls_train_dataset", "cls_test_dataset", "model_path"])

In [None]:
# 데이터 셋 전처리
from transformers import AutoTokenizer
from datasets import DatasetDict
import torch

# Load the tokenizer from disk into memory
def load_tokenizer_from_local(model_path: str):
    """Return the tokenizer that ``AutoTokenizer.from_pretrained`` builds for ``model_path``."""
    return AutoTokenizer.from_pretrained(model_path)

# Build the module-level tokenizer that tokenize_function below reads as a global.
# model_path is expected to have been set by an earlier cell.
tokenizer = load_tokenizer_from_local(model_path)

def tokenize_function(elements):
    """Tokenize a batch of (template, reason) pairs and attach the labels.

    Args:
        elements: Batch mapping with the columns ``'template'``, ``'reason'``
            and ``'is_approved'``, each a list with one entry per example.

    Returns:
        The tokenizer output for the batch, with ``"labels"`` set to the
        ``'is_approved'`` column.
    """
    # A missing reason (None) is replaced by an empty string so every entry of
    # the batched text_pair list is a string.
    reasons = [reason if reason is not None else "" for reason in elements['reason']]
    tokenized = tokenizer(
        text=elements['template'],
        text_pair=reasons,
        padding=False,  # padding is done later by the DataCollator
        truncation=True,  # without this, max_length is not enforced
        max_length=512,
    )
    tokenized["labels"] = elements["is_approved"]
    return tokenized

# Apply the tokenization function to both splits, dropping the raw columns
print("\nApplying tokenization function to the dataset...")
tokenized_cls_train_datasets = cls_train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=cls_train_dataset.column_names
)
tokenized_cls_eval_datasets = cls_test_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=cls_test_dataset.column_names
)

tokenized_cls_datasets = DatasetDict({
    "train": tokenized_cls_train_datasets,
    "eval": tokenized_cls_eval_datasets
})

print("Tokenized train dataset features:", tokenized_cls_train_datasets.features)
print("\nTokenized test dataset features:", tokenized_cls_eval_datasets.features)
print(f"\nTrain dataset size: {len(tokenized_cls_train_datasets)}")
print(f"Test dataset size: {len(tokenized_cls_eval_datasets)}")

# Persist the DatasetDict built above so a later cell can reload it from disk.
# (The original referenced an undefined name, tokenized_gen_datasets.)
tokenized_path = "./tokenized_datasets/" + "klue/bert-base".replace("/", "--")
tokenized_cls_datasets.save_to_disk(tokenized_path)

cleanup_globals(["model_path", "tokenizer", "tokenized_path"])


In [None]:
import torch

# Report GPU memory usage for device 0, or say that no GPU is usable
if not torch.cuda.is_available():
    print("GPU를 사용할 수 없습니다.")
else:
    # Bytes-to-GB conversion factor
    gb_factor = 1024 ** 3

    # Memory currently allocated / reserved (cached) on device 0, in bytes
    allocated_bytes = torch.cuda.memory_allocated(device=0)
    reserved_bytes = torch.cuda.memory_reserved(device=0)

    allocated_gb = allocated_bytes / gb_factor
    reserved_gb = reserved_bytes / gb_factor

    print(f"현재 사용 중인 GPU 메모리: {allocated_gb:.2f} GB")
    print(f"현재 캐시된 GPU 메모리: {reserved_gb:.2f} GB")

In [None]:
# Keep nothing: every global is removed except the names cleanup_globals
# protects internally and names starting with an underscore.
cleanup_globals([])

In [None]:
# Package name fixed: the original "transtormers" does not exist and fails on import.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Local directory the model snapshot is loaded from.
model_path = "./downloaded_model/klue--bert-base"

def load_model_and_tokenizer_from_local(model_path: str):
    """Load the tokenizer and a 2-label sequence-classification model from ``model_path``.

    The model is loaded with ``local_files_only=True`` and configured for
    single-label classification with the labels "Not Approved" (0) and
    "Approved" (1).

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    id2label = {0: "Not Approved", 1: "Approved"}
    classifier_kwargs = {
        "local_files_only": True,
        # "classifier_dropout": 0.2,  # try this if overfitting shows up
        "num_labels": 2,
        "id2label": id2label,
        "label2id": {name: index for index, name in id2label.items()},
        "problem_type": "single_label_classification",
    }
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
    loaded_model = AutoModelForSequenceClassification.from_pretrained(model_path, **classifier_kwargs)
    return loaded_tokenizer, loaded_model

# Load both objects from the local snapshot directory.
tokenizer, model = load_model_and_tokenizer_from_local(model_path)

# Only tokenizer and model are kept; model_path and the imported classes are dropped.
cleanup_globals(["tokenizer", "model"])


In [None]:
from datasets import load_from_disk
from transformers import DataCollatorWithPadding

# Reload the tokenized DatasetDict from disk and pick out its two splits
tokenized_path = "./tokenized_datasets/" + "--".join("klue/bert-base".split("/"))
tokenized_cls_datasets = load_from_disk(tokenized_path)
tokenized_cls_train_datasets, tokenized_cls_eval_datasets = (
    tokenized_cls_datasets['train'],
    tokenized_cls_datasets['eval'],
)

# Collator for text classification: DataCollatorWithPadding
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=8,  # slightly speeds up training (per the original author's note)
    padding='longest',  # pad to the longest sequence in the batch
)

In [None]:
import torch
import wandb
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments

# Start the wandb run and record the main hyperparameters in its config
wandb.init(
    entity="dr-hong",
    project="dr-hong",
    name="klue-bert-base",
    config=dict(
        learning_rate=3e-5,
        epochs=3,
        batch_size=32,
        model_name="klue/bert-base",
    ),
)

# Metric function for the Trainer: accuracy, f1, precision, recall
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` must
            support ``.argmax(axis=1)``; presumably it holds per-class scores
            with one row per example — TODO confirm against the Trainer output.

    Returns:
        A dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    scores, references = eval_pred
    predicted_classes = scores.argmax(axis=1)

    weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
        references, predicted_classes, average='weighted'
    )

    return dict(
        accuracy=accuracy_score(references, predicted_classes),
        f1=weighted_f1,
        precision=weighted_precision,
        recall=weighted_recall,
    )

# Training arguments
training_args = TrainingArguments(
    output_dir="./results/" + "klue/bert-base".replace("/", "--"),
    do_train=True,
    do_eval=True,
    eval_strategy="steps",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    eval_delay=32,
    learning_rate=3e-5,  # a differential learning rate could be considered
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_dir="./logs/" + "klue/bert-base".replace("/", "--"),
    logging_strategy="steps",
    logging_steps=5,
    save_strategy="steps",
    # load_best_model_at_end requires save_steps to be a round multiple of
    # eval_steps (8 below); the previous value 15 was rejected at construction.
    save_steps=16,
    save_total_limit=1,
    bf16=True,
    tf32=True,
    eval_steps=8,
    dataloader_num_workers=4,
    disable_tqdm=False,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="all",
    dataloader_persistent_workers=True,
    # NOTE(review): per the TrainingArguments docs this value is not consumed by
    # Trainer itself; pass it to trainer.train(...) if resuming is intended.
    resume_from_checkpoint="./results/" + "klue/bert-base".replace("/", "--"),
    gradient_checkpointing=True,
    auto_find_batch_size=True,
    # The deprecated torchdynamo="inductor" was dropped; torch_compile_backend
    # below already selects the same backend.
    torch_compile=True,
    torch_compile_backend="inductor",
    torch_compile_mode="default",
    # batch_eval_metrics is left at its default (False): enabling it makes the
    # Trainer call compute_metrics per batch with an extra compute_result
    # argument, which the single-argument compute_metrics in this notebook
    # does not accept.
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_cls_train_datasets,
    eval_dataset=tokenized_cls_eval_datasets,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

try:
    train_output = trainer.train()

    eval_results = trainer.evaluate()

except Exception as e:
    print(f"학습 중 오류 발생: {str(e)}")
    # Close the wandb run as failed so it is not left open, then re-raise
    # with the original traceback intact.
    wandb.finish(exit_code=1)
    raise

# Save the final model
model.save_pretrained("./final_model/" + "klue/bert-base".replace("/", "--"))

# Close the wandb run
wandb.finish()