In [None]:
## 기존 dataset 수정
from transformers import AutoTokenizer
# 1. Load the Hugging Face tokenizer published under this model id.
tokenizer = AutoTokenizer.from_pretrained(
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
)
# End-of-sequence marker; it is appended to every formatted training sample further down.
EOS_TOKEN = tokenizer.eos_token


In [11]:
from datasets import load_dataset
# Load the dataset published as "aripos1/gorani_dataset"; later cells add a "text" column to its splits.
dataset = load_dataset(path="aripos1/gorani_dataset")

In [12]:
import pandas as pd
from datasets import Dataset, DatasetDict


# 3. Read the Excel workbook into a DataFrame.
df = pd.read_excel(io="datasetF.xlsx")  # path to the Excel file
# Normalize the header names automatically: lowercase them, then trim surrounding whitespace.
df.columns = (
    df.columns
    .str.lower()
    .str.strip()
)

# 2. Alpaca-style prompt template with five positional "{}" slots, filled in this order:
#    instruction, input, glossary, target language, response.
# The string is kept byte-for-byte: the first line ends with a trailing space, and the
# target-language header uses two hashes ("##") while the other headers use three.
alpaca_prompt = (
    "Below is an instruction that describes a task, paired with an input that provides further context. \n"
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "{}\n"
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Glossary:\n"
    "{}\n"
    "\n"
    "## target_language:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)

# Repeats the assignment made right after the tokenizer is loaded, so this cell defines the
# end-of-sequence token that formatting_prompts_func appends to every sample.
EOS_TOKEN = tokenizer.eos_token # The EOS token must be appended to each training text
def formatting_prompts_func(examples):
    """Render every example of a batch into one Alpaca-style training string.

    Reads the module-level ``alpaca_prompt`` template and ``EOS_TOKEN``.

    Args:
        examples: Column-oriented batch, i.e. a mapping from column name to a
            sequence of values. Must provide the keys "instruction", "input",
            "metadata", "target_language" and "output".

    Returns:
        dict: ``{"text": [...]}`` holding one formatted prompt per example,
        each terminated by ``EOS_TOKEN``.

    Raises:
        KeyError: If one of the required columns is missing from ``examples``.
    """
    # Columns are passed in the same order as the template's five "{}" slots.
    rows = zip(
        examples["instruction"],
        examples["input"],
        examples["metadata"],  # third slot of the template
        examples["target_language"],
        examples["output"],
    )
    # Must add EOS_TOKEN, otherwise generation will go on forever.
    return {"text": [alpaca_prompt.format(*row) + EOS_TOKEN for row in rows]}




In [13]:
# Add the formatted "text" column to both splits (test first, then train), one batch at a time.
for split_name in ("test", "train"):
    dataset[split_name] = dataset[split_name].map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/5416 [00:00<?, ? examples/s]

Map:   0%|          | 0/21662 [00:00<?, ? examples/s]

In [14]:
# Spot-check one formatted test example: the filled template should end with the EOS token.
print(dataset["test"][1]["text"])

Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request.

### Instruction:
Translate the Korean into target language. Refer to the glossary, and if it is not in the glossary, translate it. Do not provide explanations.

### Input:
원적산의 경치는 정말 아름다워요.

### Glossary:
{'ENG': 'Wonjeok Mountain', 'JPN': '原積算', 'KO': '원적산'}

## target_language:
ENG

### Response:
The scenery of Wonjeok Mountain is truly beautiful.<|eot_id|>


In [15]:
# Display the DatasetDict summary (splits, column names, row counts) as this cell's output.
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'target_language', 'metadata', 'text'],
        num_rows: 21662
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'target_language', 'metadata', 'text'],
        num_rows: 5416
    })
})

In [18]:
import os
from huggingface_hub import login
from dotenv import load_dotenv
load_dotenv()

# 9. Authenticate with Hugging Face and upload the dataset (it is converted to Parquet on upload).
# The token is read from the HFT environment variable rather than hardcoded in the notebook.
hf_token = os.environ.get("HFT")
login(token=hf_token)  # log in to Hugging Face

dataset.push_to_hub(repo_id="aripos1/gorani_dataset")  # upload the dataset

print("✅ Alpaca 템플릿 적용 완료! EOS_TOKEN 추가 완료! Parquet 변환 및 Hugging Face 업로드 완료!")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/22 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

✅ Alpaca 템플릿 적용 완료! EOS_TOKEN 추가 완료! Parquet 변환 및 Hugging Face 업로드 완료!
