# Google Colab

In [None]:
!pip install -q openai pandas tiktoken


In [None]:
import os
import random
import pandas as pd
from openai import OpenAI
import tiktoken
from tqdm import tqdm
from datasets import load_dataset


In [None]:
# Token budget per article: the embedding loop skips any text whose token
# count exceeds this value.
MAX_TOKENS = 8191
# Tokenizer that tiktoken associates with the embedding model; used to count
# tokens before a text is sent for embedding.
enc = tiktoken.encoding_for_model("text-embedding-3-small")

In [None]:
# Load the train split, shuffle it with a fixed seed so the sample is
# reproducible, and keep the first 1000 rows of the shuffled dataset.
ds = (
    load_dataset("lcw99/wikipedia-korean-20221001", split="train")
    .shuffle(seed=42)
    .select(range(1000))
)

In [None]:
def count_tokens(text: str) -> int:
    """Return how many tokens the module-level ``enc`` tokenizer yields for ``text``."""
    token_ids = enc.encode(text)
    return len(token_ids)


In [None]:
from google.colab import userdata

# The API key is fetched from Colab user data instead of being written
# into the notebook.
client = OpenAI(api_key=userdata.get('OPENAI_API_KEY'))


def get_embedding(text: str):
    """Request an embedding for ``text`` from ``text-embedding-3-small``.

    Sends one embeddings request through the module-level ``client`` and
    returns the ``embedding`` attribute of the first entry in the
    response's ``data``.
    """
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    first_item = response.data[0]
    return first_item.embedding

In [None]:
import time

# Embed each sampled article and collect one record per article in `results`.
results = []
for ex in tqdm(ds, total=len(ds)):
    title, text = ex["title"], ex["text"]
    n_tokens = count_tokens(text)
    # Skip empty articles (the embeddings endpoint rejects empty input) and
    # articles whose token count exceeds the MAX_TOKENS budget.
    if n_tokens == 0 or n_tokens > MAX_TOKENS:
        continue
    emb = get_embedding(text)
    time.sleep(0.01)  # brief pause between consecutive API requests
    # Store the vector under a real key; it was previously saved under the
    # empty string "", which made the field unusable by name downstream.
    results.append({"title": title, "text": text, "embedding": emb})

100%|██████████| 1000/1000 [05:03<00:00,  3.29it/s]


In [None]:
import json

# Write `results` as JSON Lines: one JSON object per line, with non-ASCII
# characters kept as-is (ensure_ascii=False) in a UTF-8 encoded file.
with open("/content/drive/MyDrive/ClusterAE/datasets/wikipedia-korean-20221001-embeddings-1k.jsonl", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in results
    )
print("저장 완료")

저장 완료
