In [None]:
from dotenv import load_dotenv
load_dotenv()

from os import getenv
from time import sleep
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from json import dumps
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI

from utils.storage import list_processed_mmd_files, download_plain_text

In [None]:
# Shared OpenAI client used by the upload and batch-submission cells below.
# No arguments are passed; presumably credentials come from the environment
# populated by load_dotenv() in the import cell — TODO confirm.
client = OpenAI()

In [None]:
# Splitter applied to every section before it is written as an embedding request.
# Sizes are measured with len(), i.e. in characters of the Python string, not tokens.
# NOTE(review): the embedding model's limit is token-based; confirm 10000
# characters stays under it for the densest inputs.
splitter = RecursiveCharacterTextSplitter(
	chunk_size=10000,  # upper bound on the length of each chunk
	chunk_overlap=100,  # characters shared between consecutive chunks
	length_function=len,
	is_separator_regex=False,  # separators are treated as literal strings
)

In [None]:
# Identifiers returned by utils.storage; each one is later passed to
# download_plain_text and embedded into the request custom_id.
files = list_processed_mmd_files()
# Bare expression so the notebook displays how many files were found.
len(files)

In [None]:
# Fetch the text of every file concurrently. executor.map yields results in
# the same order as `files`, so plain[k] is the text belonging to files[k].
with ThreadPoolExecutor() as executor:
	downloads = executor.map(download_plain_text, files)
	# tqdm cannot infer a length from the lazy iterator, so pass it explicitly.
	plain = list(tqdm(downloads, total=len(files)))

In [None]:
# Maximum number of JSONL request lines written to one batch file before the
# writer rotates to a new file.
# NOTE(review): presumably chosen to match the Batch API's per-batch request
# limit — confirm against the current OpenAI documentation.
MAX_LINES_PER_FILE = 50000

# Module-level writer state, mutated by getNextBatchFile() and the write loop.
# Re-running this cell resets it, so numbering restarts at 001.
currentFileIndex = 0  # number of the file currently open (0 = none opened yet)
currentLineCount = 0  # lines written to the file currently open
currentFile = None  # open file object, or None before the first rotation

def getNextBatchFile():
    """Close the current batch file (if any) and open the next numbered one.

    Increments the module-level ``currentFileIndex``, resets
    ``currentLineCount`` to zero, and rebinds ``currentFile`` to a freshly
    opened ``output/batch_job_NNN.jsonl``. An existing file with that name is
    truncated.

    Returns:
        The newly opened, writable text file object.

    Raises:
        OSError: If the output directory or the file cannot be created.
    """
    global currentFileIndex, currentLineCount, currentFile
    # Local import: the notebook's import cell only pulls `getenv` out of os.
    import os

    if currentFile:
        currentFile.close()

    currentFileIndex += 1
    currentLineCount = 0

    filename = f"output/batch_job_{currentFileIndex:03d}.jsonl"
    # On a fresh checkout the output directory may not exist yet; without this
    # the open() below raises FileNotFoundError before anything is written.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    currentFile = open(filename, "w", encoding="utf-8")

    return currentFile

# Batch input files are limited by size as well as by request count. Rotate
# before a file would grow past this many bytes, leaving headroom below the
# 200 MB input-file limit documented for the OpenAI Batch API.
MAX_BYTES_PER_FILE = 190_000_000

currentFile = getNextBatchFile()
currentByteCount = 0  # bytes written to the file currently open

try:
    # plain[k] is the downloaded text of files[k]; walk both in lockstep.
    for text, filename in tqdm(zip(plain, files), total=len(files)):
        # Blank-line-separated sections are split independently so a chunk
        # never spans two sections.
        for i, section in enumerate(text.split("\n\n")):
            chunks = splitter.split_text(section)
            for j, chunk in enumerate(chunks):
                # One JSONL request per chunk; custom_id encodes the source
                # file, section index and chunk index so results can be
                # matched back to their origin.
                line = dumps({
                    "custom_id": f"{filename}_{i}_{j}",
                    "method": "POST",
                    "url": "/v1/embeddings",
                    "body": {
                        "model": "text-embedding-3-large",
                        "input": chunk,
                        "encoding_format": "float"
                    }
                }) + "\n"
                # dumps() escapes non-ASCII characters by default, so the
                # string length equals the number of bytes written.
                lineBytes = len(line)

                lineLimitReached = currentLineCount >= MAX_LINES_PER_FILE
                # Never rotate on size when the file is still empty, otherwise
                # an oversized line would produce an endless run of empty files.
                sizeLimitReached = (
                    currentLineCount > 0
                    and currentByteCount + lineBytes > MAX_BYTES_PER_FILE
                )
                if lineLimitReached or sizeLimitReached:
                    currentFile = getNextBatchFile()
                    currentByteCount = 0

                currentFile.write(line)
                currentLineCount += 1
                currentByteCount += lineBytes
finally:
    # Flush and close the last file even if splitting or writing failed.
    if currentFile:
        currentFile.close()

print(f"Created {currentFileIndex} batch files")

In [None]:
# Upload every batch file written above; the returned file objects carry the
# ids needed to create the batch jobs.
batchFiles = []
for i in range(currentFileIndex):
	# Batch files are numbered from 001, hence i+1. The context manager closes
	# each handle as soon as its upload returns instead of leaking one open
	# file descriptor per batch file.
	with open(f"output/batch_job_{i+1:03d}.jsonl", "rb") as batchInput:
		batchFiles.append(client.files.create(
			file=batchInput,
			purpose="batch"
		))

In [None]:
# Submit one embeddings batch job per uploaded file. Jobs are appended one at
# a time so that `jobs` still holds every batch already created if a later
# submission raises.
jobs = []
for batchFile in tqdm(batchFiles):
	submitted = client.batches.create(
		completion_window="24h",
		endpoint="/v1/embeddings",
		input_file_id=batchFile.id,
	)
	jobs.append(submitted)