In [1]:
import pandas as pd
import os

In [2]:
LOCATION_PROCESSED = 'data_processed'
LOCATION_BATCH_OUTPUTS = f"{LOCATION_PROCESSED}/batch_outputs_shopaholic"
ALL_SUBMISSIONS_FILENAME = f"{LOCATION_PROCESSED}/all_submissions_filtered_and_labelled.jsonl"

In [3]:
# load all submissions
df = pd.read_json(ALL_SUBMISSIONS_FILENAME, lines=True)
df.set_index('id', inplace=True)

In [4]:
# load the batch outputs
batch_outputs = []
for filename in os.listdir(LOCATION_BATCH_OUTPUTS):
    if filename.endswith('.jsonl'):
        batch_outputs.append(pd.read_json(f"{LOCATION_BATCH_OUTPUTS}/{filename}", lines=True))

# flatten
batch_outputs = pd.concat(batch_outputs, ignore_index=True)

print(f"Loaded {len(batch_outputs)} batch outputs")

Loaded 130268 batch outputs


In [5]:
# create is_shopaholic column with None
df['is_shopaholic'] = None

# iterate over all batch outputs
for batch_output in batch_outputs.itertuples():
    # get the corresponding submission
    custom_id = batch_output.custom_id
    submission_id = custom_id.split('_')[1]
    
    is_shopaholic = batch_output.response["body"]["choices"][0]["message"]["content"] == "1"

    # set the is_shopaholic column
    df.loc[submission_id, 'is_shopaholic'] = is_shopaholic

In [7]:
df.reset_index(inplace=True)
df.tail()

Unnamed: 0,index,id,archived,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_text_color,author_flair_type,...,gallery_data,is_gallery,collections,call_to_action,poll_data,author_is_blocked,_meta,previous_selftext,is_advice_seeking,is_shopaholic
260877,260877,1fyzu1x,0.0,THELEDISME,,,[],,,text,...,,,,,,0.0,,,True,False
260878,260878,1fz1one,0.0,joastchorton,,,[],,,text,...,,,,,,0.0,,,True,False
260879,260879,1fz744c,0.0,Coach_Front,,,[],,,text,...,,,,,,0.0,,,True,False
260880,260880,1fz8mft,0.0,acidfart0101,,,[],,,text,...,,,,,,0.0,,,True,False
260881,260881,1fz8mhq,0.0,CrapNBAappUser,,,[],,,text,...,,,,,,0.0,,,False,


In [8]:
# print the number of advice seeking and non-advice seeking submissions
print(df['is_shopaholic'].value_counts())

# print percentage of advice seeking submissions
print(df['is_shopaholic'].value_counts(normalize=True))

is_shopaholic
False    125271
True       4997
Name: count, dtype: int64
is_shopaholic
False    0.961641
True     0.038359
Name: proportion, dtype: float64


In [9]:
# save the updated dataframe as new jsonl
df.to_json(f"{LOCATION_PROCESSED}/all_submissions_filtered_and_labelled_shopaholic.jsonl", orient='records', lines=True)