In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from OpenAIService import OpenAIService
import os

# Shared OpenAIService instance (project-local module); used further down to
# build one request line per submission via `create_line`.
openai_service = OpenAIService()

In [2]:
# System prompt sent with every submission. Adjacent string literals are
# concatenated by Python into a single one-line prompt (one literal per
# sentence, each ending in the space that separates it from the next).
SUBMISSION_SHOPAHOLIC_SYSTEM_PROMPT = (
    "This GPT is designed to help the user classify whether the author of an advice-seeking Reddit submission is a shopaholic(1) or not(0). "
    "Specifically, the user of the GPT wants to know if the submission's author has any impulsive buying or shopaholic tendencies. "
    "The author may not explicitly indicate/admit their problem, but the clues can be hidden between the lines. "
    "The GPT can make a judgment call even if the user does not explicitly admit their problem. "
    "The GPT should try to be as accurate as possible, and if ever in doubt about submission, it should make its best guess. "
    "The GPT only ever receives submissions pasted directly from reddit as input. "
    "Typically the format would start off with the title, then a flair and finally the selftext. "
    "However, this is not always the case. "
    "The GPT is only ever allowed to respond with either: 1/0"
)

# Folder layout: processed data lives under LOCATION_PROCESSED, generated
# batch files under its "batches" subfolder.
LOCATION_PROCESSED = 'data_processed'
LOCATION_BATCHES = LOCATION_PROCESSED + "/batches"
ALL_SUBMISSIONS_FILENAME = LOCATION_PROCESSED + "/all_submissions_filtered_and_labelled.jsonl"

# Upper bound on the number of request lines per batch file.
# Tip: temporarily lower this (e.g. to 20) for a quick trial run.
MAX_BATCH_SIZE = 50_000

In [7]:
# Read the labelled submissions; the file is JSON Lines (one record per line).
df = pd.read_json(ALL_SUBMISSIONS_FILENAME, lines=True)
print(f"Total submissions: {len(df)}")

# Keep only the rows whose is_advice_seeking flag equals True; the rest of
# the notebook works on this subset.
df = df.loc[df['is_advice_seeking'].eq(True)]
print(f"Advice seeking submissions: {len(df)}")

# Mean of the num_comments column over the remaining rows.
print(f"Average number of comments: {df['num_comments'].mean()}")

Total submissions: 260882
Advice seeking submissions: 130268
Average number of comments: 28.990312279301133


In [4]:
def _build_submission_text(title, flair, selftext):
    """Join a submission's title, flair and selftext into one prompt string.

    The parts are newline-separated in the order title, flair, selftext.
    Flair and selftext are skipped when pandas reports them as missing
    (``pd.isna``); the title is always included.
    """
    parts = [f"{title}"]
    if not pd.isna(flair):
        parts.append(f"{flair}")
    if not pd.isna(selftext):
        parts.append(f"{selftext}")
    return "\n".join(parts)


# Build one request line per submission. Iterating over the needed columns
# with zip avoids `DataFrame.iterrows()`, which constructs a full Series for
# every row (slow on a frame of this size) and yields an index we never used.
# Each request is identified by "<subreddit>_<id>".
all_lines = [
    openai_service.create_line(
        f"{subreddit}_{submission_id}",
        _build_submission_text(title, flair, selftext),
        system_prompt=SUBMISSION_SHOPAHOLIC_SYSTEM_PROMPT,
    )
    for subreddit, submission_id, title, flair, selftext in zip(
        df['subreddit'],
        df['id'],
        df['title'],
        df['link_flair_text'],
        df['selftext'],
    )
]

In [5]:
# Random 4-byte hex tag identifying this batching session; it prefixes every
# batch name produced below.
unique_batching_string = np.random.bytes(4).hex()
print(f"Unique batching string: {unique_batching_string}")

# Cut all_lines into consecutive slices of at most MAX_BATCH_SIZE entries,
# keyed by "<tag>_<start>_to_<end>" (end is exclusive).
batches = {}
total_lines = len(all_lines)
for from_index in range(0, total_lines, MAX_BATCH_SIZE):
    to_index = min(from_index + MAX_BATCH_SIZE, total_lines)
    batch_name = f"{unique_batching_string}_{from_index}_to_{to_index}"

    print(f"Creating batch {batch_name} with {to_index - from_index} line(s)")
    batches[batch_name] = all_lines[from_index:to_index]

Unique batching string: db6ffccf
Creating batch db6ffccf_0_to_50000 with 50000 line(s)
Creating batch db6ffccf_50000_to_100000 with 50000 line(s)
Creating batch db6ffccf_100000_to_130268 with 30268 line(s)


In [6]:
# Save every batch to <LOCATION_BATCHES>/<session tag>/<batch name>.jsonl,
# one line of text per entry.
batch_folder = f"{LOCATION_BATCHES}/{unique_batching_string}"

# makedirs creates any missing parent folders (LOCATION_BATCHES included) and,
# with exist_ok=True, does nothing when the folder is already there - so the
# separate exists() checks are unnecessary and the call only has to run once.
os.makedirs(batch_folder, exist_ok=True)

for batch_name, lines in batches.items():
    batch_filename = f"{batch_folder}/{batch_name}.jsonl"
    # Explicit UTF-8: without an encoding argument open() falls back to the
    # platform's locale encoding, which can fail on or mangle non-ASCII text.
    with open(batch_filename, 'w', encoding='utf-8') as f:
        # NOTE(review): each entry is written via str(); presumably it is
        # already a serialized JSON string - confirm against create_line.
        f.writelines(f"{line}\n" for line in lines)