# Reddit AITA Huggingface Dataset Creation


2 input files from datafile_filtering:
1. AITA submissions with a score of at least 50
2. Top-level comments with a score of at least 10, posted on the AITA submissions from input 1

1 Output file:
1. CSV/ZST file where each row is an AITA submission (score of at least 50) with one column for each of its up to 10 highest-scoring top-level comments (each with a score of at least 10)

In [None]:
# Install the third-party packages imported in the next cell (run once per environment)
%pip install zstandard
%pip install pandas

In [None]:
import pandas as pd
import zstandard as zstd

## Creation of AITA submissions dataframe

In [None]:
# Read the AITA submissions (score of at least 50) exported by datafile_filtering

submissions_df = pd.read_csv(
    filepath_or_buffer='new_datasets/submissions_2019_to_2022_at_least_50_score.csv'
)

In [None]:
# Keep only the submissions whose flair (link_flair_text) is one of the five
# AITA verdicts used as decision classes in this dataset

submissions_df = submissions_df.loc[
    submissions_df['link_flair_text'].isin([
        'Asshole',
        'Not the A-hole',
        'No A-holes here',
        'Everyone Sucks',
        'Not enough info',
    ])
]

In [None]:
# Give the submission columns descriptive, submission-specific names

submissions_df = submissions_df.rename(
    mapper={
        'id': 'submission_id',
        'link_flair_text': 'decision',
        'score': 'submission_score',
        'title': 'submission_title',
        'selftext': 'submission_text',
        'url': 'submission_url',
    },
    axis='columns',
)

In [None]:
# Preview the submissions dataframe as the cell output
submissions_df

## Creation of AITA comments dataframe

In [None]:
# Read the top-level comments (score of at least 10) exported by datafile_filtering

comments_df = pd.read_csv(
    filepath_or_buffer='new_datasets/top_level_comments_2019_to_2022_at_least_10_comment_score_at_least_50_submission_score.csv'
)

In [None]:
# Strip the leading "t3_" from link_id so it can be matched against submission ids.
# Only an actual "t3_" prefix is removed (anchored regex), so re-running this cell,
# or a value that has no prefix, no longer loses its first three characters.

comments_df['link_id'] = comments_df['link_id'].str.replace(r'^t3_', '', regex=True)

In [None]:
# Give the comment columns descriptive, comment-specific names

comments_df = comments_df.rename(
    mapper={
        'id': 'comment_id',
        'score': 'comment_score',
        'body': 'comment_text',
    },
    axis='columns',
)

## Merging of AITA submission and comments dataframes

In [None]:
# Build a dataframe holding the (up to) 10 highest-scoring comments of each submission

# Inner merge: keeps only comments whose link_id matches a submission in submissions_df
merged_df = submissions_df.merge(comments_df, left_on='submission_id', right_on='link_id')
merged_df = merged_df.drop(columns='link_id')  # same values as submission_id after the merge

# For each submission, list its comment texts ordered by descending comment score (at most 10).
# The two needed columns are selected before apply() so the lambda is not handed the
# grouping column (deprecated since pandas 2.2) or the unrelated submission columns.
top_10_comments = (
    merged_df.groupby('submission_id')[['comment_score', 'comment_text']]
    .apply(lambda comments: comments.nlargest(10, 'comment_score')['comment_text'].tolist())
)

# Expand each list into columns comment_0, comment_1, ... (at most comment_9);
# submissions with fewer comments are padded with missing values
top_10_comments_df = pd.DataFrame(top_10_comments.tolist(), index=top_10_comments.index).add_prefix('comment_')

In [None]:
# Attach the top-comment columns to each submission by joining on submission_id.
# The join is an inner join, so submissions with no row in top_10_comments_df are dropped.

submissions_with_top_10_comments = pd.merge(
    submissions_df,
    top_10_comments_df,
    how='inner',
    on='submission_id',
)

In [None]:
# Drop rows whose submission text or highest-scoring comment (comment_0) is
# missing or is one of the '[deleted]' / '[removed]' placeholders.
# Only comment_0 is checked; the remaining comment columns are left as they are.

submissions_with_top_10_comments = submissions_with_top_10_comments[
    submissions_with_top_10_comments['submission_text'].notna()
    & submissions_with_top_10_comments['comment_0'].notna()
    & ~submissions_with_top_10_comments['submission_text'].isin(['[deleted]', '[removed]'])
    & ~submissions_with_top_10_comments['comment_0'].isin(['[deleted]', '[removed]'])
]

In [None]:
# Interpret created_utc as seconds since the Unix epoch and store it as datetimes

submissions_with_top_10_comments = submissions_with_top_10_comments.assign(
    created_utc=pd.to_datetime(submissions_with_top_10_comments['created_utc'], unit='s')
)


In [None]:
# Rename the timestamp column and switch the comment columns from the
# 0-based comment_0..comment_9 to the 1-based top_comment_1..top_comment_10

submissions_with_top_10_comments = submissions_with_top_10_comments.rename(
    columns={
        'created_utc': 'submission_date',
        **{f'comment_{rank}': f'top_comment_{rank + 1}' for rank in range(10)},
    }
)

In [None]:
# submission_id was only needed as the join key; leave it out of the published dataset

submissions_with_top_10_comments = submissions_with_top_10_comments.drop(columns=['submission_id'])

In [None]:
# Swap the positions of the decision and submission_title columns.
# The column order is rearranged directly instead of swapping the values and
# then swapping the names back; each column keeps its own data and dtype.

column_order = list(submissions_with_top_10_comments.columns)
decision_pos = column_order.index('decision')
title_pos = column_order.index('submission_title')
column_order[decision_pos], column_order[title_pos] = column_order[title_pos], column_order[decision_pos]
submissions_with_top_10_comments = submissions_with_top_10_comments[column_order]


In [None]:
# Swap the positions of the submission_score and submission_text columns.
# The column order is rearranged directly instead of swapping the values and
# then swapping the names back; each column keeps its own data and dtype.

column_order = list(submissions_with_top_10_comments.columns)
score_pos = column_order.index('submission_score')
text_pos = column_order.index('submission_text')
column_order[score_pos], column_order[text_pos] = column_order[text_pos], column_order[score_pos]
submissions_with_top_10_comments = submissions_with_top_10_comments[column_order]

In [None]:
# Preview the final dataframe as the cell output before saving it
submissions_with_top_10_comments

### Saving to output CSV and ZST
- This output is considered the "raw" version of the dataset

In [None]:
# Write the final dataframe to a CSV file, leaving out the dataframe index
output_file = '2019_to_2022_submissions_at_least_50_score_top_10_comments.csv'
submissions_with_top_10_comments.to_csv(path_or_buf=output_file, index=False)

In [None]:
# Produce a Zstandard-compressed copy of the CSV written above

input_file = '2019_to_2022_submissions_at_least_50_score_top_10_comments.csv'
output_file = '2019_to_2022_submissions_at_least_50_score_top_10_comments.zst'

cctx = zstd.ZstdCompressor()  # compressor with the library's default settings
with open(input_file, 'rb') as csv_stream:
    with open(output_file, 'wb') as zst_stream:
        # Stream the CSV bytes through the compressor into the .zst file
        cctx.copy_stream(csv_stream, zst_stream)