In [9]:
import os
import shutil

def recursive_unpack(filename, extract_dir):
    try:
        # 지정된 파일을 압축 해제
        shutil.unpack_archive(filename, extract_dir)
        print(f"Unpacked: {filename} to {extract_dir}")
    except (shutil.ReadError, ValueError) as e:
        print(f"Failed to unpack {filename}: {e}")
        return

    # 해제된 디렉토리 내의 파일을 탐색하여 중첩 압축 파일 처리
    for root, dirs, files in os.walk(extract_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if any(file_path.endswith(ext) for ext in ['.zip', '.tar.gz', '.tgz', '.tar']):
                new_extract_dir = os.path.join(root, file.rsplit(".", 1)[0])
                os.makedirs(new_extract_dir, exist_ok=True)
                recursive_unpack(file_path, new_extract_dir)
                os.remove(file_path)  # 중첩 압축 파일 삭제 (선택 사항)

# Data 폴더 내의 최상위 압축 파일 해제 및 모든 중첩 압축 파일 해제
data_dir = '/home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/Data'
for file in os.listdir(data_dir):
    file_path = os.path.join(data_dir, file)
    if any(file_path.endswith(ext) for ext in ['.zip', '.tar.gz', '.tgz', '.tar']):
        extract_dir = os.path.join(data_dir, file.rsplit(".", 1)[0])
        os.makedirs(extract_dir, exist_ok=True)
        recursive_unpack(file_path, extract_dir)

Unpacked: /home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/Data/RowData.zip to /home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/Data/RowData
Unpacked: /home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/Data/RowData/01.원천데이터/TS_04T_예술경험_02S_초등_저학년.zip to /home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/Data/RowData/01.원천데이터/TS_04T_예술경험_02S_초등_저학년
Unpacked: /home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/Data/RowData/01.원천데이터/TS_01T_의사소통_03S_초등_고학년.zip to /home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/Data/RowData/01.원천데이터/TS_01T_의사소통_03S_초등_고학년
Unpacked: /home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/Data/RowData/01.원천데이터/TS_04T_예술경험_01S_유아.zip to /home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/Data/RowData/01.원천데이터/TS_04T_예술경험_01S_유아
Unpacked: /home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/Data/RowData/

In [1]:
import os
import json
import pandas as pd
from datasets import Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from transformers import BitsAndBytesConfig
import torch
import gc
from torch.nn import CrossEntropyLoss



In [2]:
# 데이터 파일 경로 설정
fairytale_folder = "/home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/Data/RowData/01.원천데이터"
book_folder = "/home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/Data/TL_unscramble"

# Hugging Face 액세스 토큰
# token = "***************************"

# JSON 파일에서 텍스트 추출
texts = []

for root, dirs, files in os.walk(fairytale_folder):
    for file_name in files:
        if file_name.endswith('.json'):
            file_path = os.path.join(root, file_name)
            with open(file_path, 'r', encoding='utf-8-sig') as json_file:
                json_data = json.load(json_file)
                for paragraph in json_data.get("paragraphInfo", []):
                    texts.append(paragraph["srcText"])
for root, dirs, files in os.walk(book_folder):
    for file_name in files:
        if file_name.endswith('.json'):
            file_path = os.path.join(root, file_name)
            with open(file_path, 'r', encoding='utf-8-sig') as json_file:
                json_data = json.load(json_file)
                for paragraph in json_data.get("paragraphs", []):
                    for sentence in paragraph.get("sentences", []):
                        texts.append(sentence["text"])

df = pd.DataFrame(texts, columns=["text"])
dataset = Dataset.from_pandas(df)

In [3]:
# Hugging Face 모델과 토크나이저 로드
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# 토크나이징 함수 정의 (동적 패딩 적용, max_length 없이 설정)
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="longest", truncation=True)

# 청크 저장 폴더 생성
output_dir = "/home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/model/temp_data"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 데이터셋 청크 단위로 토크나이징 및 저장
num_chunks = 50

# 주석된 부분 활성화하여 청크 저장
# for i in range(num_chunks):
#     dataset_chunk = dataset.select(range(start_idx, end_idx))
    
#     tokenized_chunk = dataset_chunk.map(
#         tokenize_function,
#         batched=True,
#         batch_size=8,
#         keep_in_memory=False,
#         writer_batch_size=1
#     )
    
#     chunk_path = os.path.join(output_dir, f"tokenized_chunk_{i}")
#     tokenized_chunk.save_to_disk(chunk_path)
#     del dataset_chunk, tokenized_chunk
#     gc.collect()
#     torch.cuda.empty_cache()

tokenized_datasets = [Dataset.load_from_disk(os.path.join(output_dir, f"tokenized_chunk_{i}")) for i in range(num_chunks)]
final_tokenized_dataset = concatenate_datasets(tokenized_datasets)

# 20% 데이터로 파인튜닝, 80% 데이터로 학습
train_split = final_tokenized_dataset.train_test_split(test_size=0.8, seed=42)
finetune_dataset = train_split["train"]
remaining_data = train_split["test"]

# remaining_data를 train, validation, test로 분할
remaining_split = remaining_data.train_test_split(test_size=0.3, seed=42)  # 70% 학습 데이터, 나머지 30%는 validation과 test로 사용
validation_test_split = remaining_split["test"].train_test_split(test_size=1/3, seed=42)  # 30% 중 1/3은 test, 나머지 2/3은 validation

train_dataset = remaining_split["train"]  # 70%
validation_dataset = validation_test_split["train"]  # 20%
test_dataset = validation_test_split["test"]  # 10%

In [4]:
# 양자화 및 모델 로드 설정
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quantization_config, device_map="auto")

# LoRA 설정 및 적용
lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=16, lora_alpha=32, lora_dropout=0.05)
model = get_peft_model(model, lora_config)

# CustomTrainer 클래스 정의
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs["input_ids"]
        outputs = model(**inputs)
        logits = outputs.get("logits")
        
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        
        return (loss, outputs) if return_outputs else loss

# 학습 파라미터 설정 (평가 생략)
training_args = TrainingArguments(
    output_dir="/home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/model/llama_finetuned_finetune",
    overwrite_output_dir=True,  # 기존 설정 덮어쓰기
    eval_strategy="no",  # 평가를 생략하여 속도 향상
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='/home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/model/logs',
    fp16=True,
)

# # 전체 학습 스텝 계산 후 절반으로 로깅 및 체크포인트 설정
# total_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs
# training_args.logging_steps = total_steps // 2  # 전체 스텝의 절반
# training_args.save_steps = total_steps // 2     # 전체 스텝의 절반

# 파인튜닝 (20% 데이터 사용)
finetune_trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=finetune_dataset,  # 20% 데이터셋
    eval_dataset=validation_dataset
)

checkpoint_dir = "/home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/model/llama_finetuned_finetune"

# 마지막 체크포인트가 있는지 확인 후 이어서 학습 시작
if os.path.exists(checkpoint_dir) and len(os.listdir(checkpoint_dir)) > 0:
    finetune_trainer.train(resume_from_checkpoint=False)
else:
    finetune_trainer.train()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33m1112hoya[0m ([33mGlobals[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,1.2122
1000,1.043
1500,1.026
2000,1.022
2500,1.0234
3000,1.0219
3500,1.0102
4000,1.0112
4500,1.0058
5000,0.9921


0,1
train/epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇██
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇█
train/grad_norm,▁▃▃▃▅▄▅▅▃▇▆▃▄▄▆▂▅▅▆▇▆▆▅▆▄▅▃▇▅▅▆▇█▅▇▄▆▆█▅
train/learning_rate,██▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁
train/loss,█▆▆▅▆▄▄▄▄▄▃▄▃▂▂▃▂▃▂▃▃▂▂▃▂▂▂▂▂▂▂▁▂▂▁▂▂▁▁▁

0,1
total_flos,2.861921908711489e+18
train/epoch,1.0
train/global_step,119457.0
train/grad_norm,0.51817
train/learning_rate,0.0
train/loss,0.9323
train_loss,0.95557
train_runtime,72280.6175
train_samples_per_second,52.886
train_steps_per_second,1.653


In [5]:
finetune_trainer.save_model("/home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/model/llama_finetuned_finetune")

In [None]:
# 파인튜닝된 모델 불러오기 (finetune_model로 설정)
finetune_model = AutoModelForCausalLM.from_pretrained("/home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/model/llama_finetuned_finetune", quantization_config=quantization_config, device_map="auto")
finetune_model = get_peft_model(finetune_model, lora_config)

# 최종 학습 (80% 데이터 사용)
trainer = CustomTrainer(
    model=finetune_model,
    args=training_args,
    train_dataset=train_dataset,  # 80% 데이터셋
    eval_dataset=validation_dataset
)

# 최종 학습 수행 및 모델 저장
trainer.train()
trainer.save_model("/home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/model/llama_finetuned_final")

Step,Training Loss
500,1.2249
1000,1.0444
1500,1.0298
2000,1.0268
2500,1.0231
3000,1.0125
3500,1.0065
4000,1.0027
4500,1.0042
5000,0.9887


In [None]:
# 모델 학습 결과 저장
trainer.save_model("/home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/model/llama_finetuned")
tokenizer.save_pretrained("/home/dnslabs/jupyter-server-workspace/Ryumyungjae_Workspace/Book_ai/model/llama_finetuned")

# 메모리 정리
del model, train_dataset, eval_dataset, final_tokenized_dataset
gc.collect()
torch.cuda.empty_cache()