In [1]:
import pandas as pd
import os

In [2]:
# Root directory for the processed data used by this notebook.
LOCATION_PROCESSED = 'data_processed'
# Directory that is scanned for batch output files.
LOCATION_BATCH_OUTPUTS = LOCATION_PROCESSED + "/batch_outputs"
# JSON Lines file holding the filtered submissions.
ALL_SUBMISSIONS_FILENAME = LOCATION_PROCESSED + "/all_submissions_filtered.jsonl"

In [3]:
# Read the submissions file (one JSON record per line) and index the frame by
# the 'id' column.
df = pd.read_json(ALL_SUBMISSIONS_FILENAME, lines=True).set_index('id')

In [4]:
# load the batch outputs
# os.listdir() returns entries in arbitrary order. Sorting the filenames keeps
# the row order of the concatenated frame (and therefore which of several rows
# sharing a custom_id comes last) reproducible across runs and machines.
batch_output_files = sorted(
    filename
    for filename in os.listdir(LOCATION_BATCH_OUTPUTS)
    if filename.endswith('.jsonl')
)
batch_output_frames = [
    pd.read_json(f"{LOCATION_BATCH_OUTPUTS}/{filename}", lines=True)
    for filename in batch_output_files
]

# flatten into a single frame with a fresh 0..n-1 index
# NOTE: pd.concat raises ValueError if no .jsonl file was found.
batch_outputs = pd.concat(batch_output_frames, ignore_index=True)

print(f"Loaded {len(batch_outputs)} batch outputs")

Loaded 260882 batch outputs


In [5]:
# Attach the label derived from the batch outputs to every submission.

# Collect one label per submission id. If an id occurs more than once in the
# batch outputs, the last occurrence wins.
labels = {}
for batch_output in batch_outputs.itertuples():
    # the submission id is the second '_'-separated field of custom_id
    submission_id = batch_output.custom_id.split('_')[1]

    # the label is True only when the response content is exactly "1"
    content = batch_output.response["body"]["choices"][0]["message"]["content"]
    labels[submission_id] = content == "1"

# Assigning through df.loc[<unknown id>, column] would silently append a new,
# otherwise empty row to df. Report such ids instead of adding rows.
unmatched_ids = [submission_id for submission_id in labels if submission_id not in df.index]
if unmatched_ids:
    print(f"Skipped {len(unmatched_ids)} batch outputs without a matching submission")

# Build the column in one pass; submissions without a batch output keep None.
# dtype=object keeps the True/False/None values as they are.
df['is_advice_seeking'] = pd.Series(
    [labels.get(submission_id) for submission_id in df.index],
    index=df.index,
    dtype=object,
)

In [6]:
# Turn the index back into a regular column and preview the first rows.
df = df.reset_index()
df.head()

Unnamed: 0,id,archived,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_text_color,author_flair_type,brand_safe,...,url_overridden_by_dest,gallery_data,is_gallery,collections,call_to_action,poll_data,author_is_blocked,_meta,previous_selftext,is_advice_seeking
0,6avp5,1.0,webmaster8888,,,[],,,text,1.0,...,,,,,,,,,,False
1,6azco,1.0,webmaster8888,,,[],,,text,1.0,...,,,,,,,,,,False
2,6b014,1.0,webmaster8888,,,[],,,text,1.0,...,,,,,,,,,,False
3,6b21z,1.0,[deleted],,,,,dark,,1.0,...,,,,,,,,,,False
4,6b5i2,1.0,webmaster8888,,,[],,,text,1.0,...,,,,,,,,,,False


In [7]:
# Absolute number of submissions per label.
label_counts = df['is_advice_seeking'].value_counts()
print(label_counts)

# The same distribution expressed as proportions.
label_shares = df['is_advice_seeking'].value_counts(normalize=True)
print(label_shares)

is_advice_seeking
False    130614
True     130268
Name: count, dtype: int64
is_advice_seeking
False    0.500663
True     0.499337
Name: proportion, dtype: float64


In [8]:
# Write the labelled submissions to a new JSON Lines file (one record per line).
labelled_submissions_path = f"{LOCATION_PROCESSED}/all_submissions_filtered_and_labelled.jsonl"
df.to_json(labelled_submissions_path, orient='records', lines=True)