In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from OpenAIService import OpenAIService
import os

# Shared instance of the project-local OpenAIService class (imported above).
# Its create_line() method is called further down to build one line per submission.
openai_service = OpenAIService()

In [2]:
# Root folder of the processed data; the paths below are built relative to it.
LOCATION_PROCESSED = 'data_processed'
# Parent folder for batch output; each batching session writes into its own sub-folder here.
LOCATION_BATCHES = f"{LOCATION_PROCESSED}/batches"
# Input file, read as JSON Lines (one record per line).
ALL_SUBMISSIONS_FILENAME = f"{LOCATION_PROCESSED}/all_submissions_filtered.jsonl"
# Upper bound on the number of lines placed in a single batch.
MAX_BATCH_SIZE = 50_000
# Alternative value, presumably for quick trial runs with tiny batches:
# MAX_BATCH_SIZE = 20

In [3]:
# Load all submissions from the JSON Lines file into a DataFrame.
df = pd.read_json(ALL_SUBMISSIONS_FILENAME, lines=True)
# Optional: uncomment to work on a fixed-seed random sample of 101 submissions instead.
# df = df.sample(101, random_state=42)

In [9]:
# Build one line per submission via openai_service.create_line().
# The text passed in is the title, followed by the flair and the self-text
# (each on its own line) whenever those fields are not missing.
all_lines = []

for _, submission in df.iterrows():
    # The title always comes first.
    text_parts = [f"{submission['title']}"]

    # Optional fields are appended in this fixed order, skipping missing values.
    for column in ('link_flair_text', 'selftext'):
        value = submission[column]
        if not pd.isna(value):
            text_parts.append(f"{value}")

    # Identifier of the form "<subreddit>_<id>".
    request_id = f"{submission['subreddit']}_{submission['id']}"

    all_lines.append(
        openai_service.create_line(request_id, "\n".join(text_parts))
    )

In [5]:
# Split all_lines into consecutive chunks of at most MAX_BATCH_SIZE lines.
# Each chunk is stored under the key "<session tag>_<start>_to_<end>",
# where <end> is the exclusive end index of the slice.
batches = {}

# Random 4-byte value rendered as hex; it tags every batch of this session.
unique_batching_string = np.random.bytes(4).hex()
print(f"Unique batching string: {unique_batching_string}")

total_lines = len(all_lines)
for start in range(0, total_lines, MAX_BATCH_SIZE):
    # The last chunk may be shorter than MAX_BATCH_SIZE.
    stop = min(start + MAX_BATCH_SIZE, total_lines)
    batch_name = f"{unique_batching_string}_{start}_to_{stop}"
    chunk = all_lines[start:stop]

    print(f"Creating batch {batch_name} with {stop - start} line(s)")
    batches[batch_name] = chunk

Unique batching string: a8019ac6
Creating batch a8019ac6_0_to_50000 with 50000 line(s)
Creating batch a8019ac6_50000_to_100000 with 50000 line(s)
Creating batch a8019ac6_100000_to_150000 with 50000 line(s)
Creating batch a8019ac6_150000_to_200000 with 50000 line(s)
Creating batch a8019ac6_200000_to_250000 with 50000 line(s)
Creating batch a8019ac6_250000_to_260882 with 10882 line(s)


In [6]:
# Save every batch as its own file:
#   <LOCATION_BATCHES>/<unique_batching_string>/<batch_name>.jsonl
# The session folder is identical for all batches, so it is created once up
# front. exist_ok=True makes re-running the cell safe, and makedirs also
# creates LOCATION_BATCHES itself when it does not exist yet.
batch_folder = f"{LOCATION_BATCHES}/{unique_batching_string}"
os.makedirs(batch_folder, exist_ok=True)

for batch_name, lines in batches.items():
    batch_filename = f"{batch_folder}/{batch_name}.jsonl"
    # Explicit UTF-8 so the written bytes do not depend on the platform's
    # default encoding (non-ASCII text would otherwise fail or be
    # encoded differently on some systems).
    with open(batch_filename, 'w', encoding='utf-8') as f:
        # One entry per line, each terminated by a newline.
        f.writelines(f"{line}\n" for line in lines)