# Reddit AITA Dataset Creation


2 Input files created using datafile_filtering.py:
1. AITA submissions .csv file
2. Top-level comments .csv file for the AITA submissions

1 Output file:
1. .csv file where each row is an AITA submission along with its top 10 comments

## Prepare Environment

In [None]:
%pip install zstandard pandas

In [None]:
import pandas as pd
import zstandard as zstd

In [None]:
%pwd

## Creation of AITA submissions dataframe

In [None]:
# Read the filtered AITA submissions CSV into a dataframe

submissions_csv_path = 'aita-datafiles/2022/submissions_2022_score_50.csv'
submissions_df = pd.read_csv(submissions_csv_path)

In [None]:
# Preview the submissions dataframe
submissions_df

In [None]:
# Give the submission columns names that describe what they hold
# (note that 'selftext' becomes 'submission_text')

submission_column_names = {
    'id': 'submission_id',
    'score': 'submission_score',
    'title': 'submission_title',
    'selftext': 'submission_text',
    'url': 'submission_url',
}
submissions_df = submissions_df.rename(columns=submission_column_names)

In [None]:
# Preview the submissions dataframe
submissions_df

## Creation of AITA comments dataframe

In [None]:
# Read the filtered top-level comments CSV into a dataframe

comments_csv_path = 'aita-datafiles/2022/top_level_comments_2022_score_5_submission_score_50.csv'
comments_df = pd.read_csv(comments_csv_path)

In [None]:
# Strip the leading "t3_" from the link_id column so that its values can be
# matched against the submission ids.
# Only an actual "t3_" prefix is removed (rather than blindly dropping the
# first three characters), so re-running this cell leaves already-stripped
# ids untouched instead of truncating them further.

comments_df['link_id'] = comments_df['link_id'].str.replace(r'^t3_', '', regex=True)

In [None]:
# Give the comment columns names that describe what they hold
# (note that 'body' becomes 'comment_text')

comment_column_names = {
    'id': 'comment_id',
    'score': 'comment_score',
    'body': 'comment_text',
}
comments_df = comments_df.rename(columns=comment_column_names)

## Merging of AITA submission and comments dataframes

In [None]:
# Create a dataframe holding, for each submission, the text of its 10
# highest-scoring comments (one row per submission, one column per comment)

# Attach every comment to its parent submission (inner join, so submissions
# without a matching comment are left out), then drop the redundant join key
merged_df = (
    submissions_df
    .merge(comments_df, left_on='submission_id', right_on='link_id')
    .drop(columns='link_id')
)

# Rank all comments by score once and keep the first 10 rows of each
# submission. This avoids a Python-level groupby.apply per submission (slow,
# and deprecated in recent pandas when it touches the grouping column).
# The stable sort keeps equally-scored comments in their original order.
ranked_comments = merged_df.sort_values('comment_score', ascending=False, kind='stable')
top_ranked_comments = ranked_comments.groupby('submission_id').head(10)

# Series indexed by submission_id whose values are lists of comment texts,
# ordered from highest to lowest score
top_10_comments = top_ranked_comments.groupby('submission_id')['comment_text'].agg(list)

# Spread each list over the columns comment_0 ... comment_9. The reindex
# guarantees that all 10 columns exist even when no submission has 10 comments
# (missing comments are NaN), so the output schema does not depend on the data.
top_10_comments_df = (
    pd.DataFrame(top_10_comments.tolist(), index=top_10_comments.index)
    .reindex(columns=range(10))
    .add_prefix('comment_')
)

In [None]:
# Join each submission with its row of top comments, matching on submission_id.
# This is an inner join: only submissions present in both dataframes are kept.

submissions_with_top_10_comments = pd.merge(
    submissions_df,
    top_10_comments_df,
    how='inner',
    on='submission_id',
)

In [None]:
# Convert the created_utc values (interpreted as seconds since the Unix epoch)
# to datetime values

epoch_seconds = submissions_with_top_10_comments['created_utc']
submissions_with_top_10_comments['created_utc'] = pd.to_datetime(epoch_seconds, unit='s')


In [None]:
# Rename the timestamp column and switch the comment columns from 0-based
# (comment_0 ... comment_9) to 1-based (top_comment_1 ... top_comment_10) names

final_column_names = {'created_utc': 'submission_date'}
for position in range(10):
    final_column_names[f'comment_{position}'] = f'top_comment_{position + 1}'

submissions_with_top_10_comments = submissions_with_top_10_comments.rename(columns=final_column_names)

In [None]:
# submission_id was only needed for joining; leave it out of the final dataset

submissions_with_top_10_comments = submissions_with_top_10_comments.drop(columns='submission_id')

In [None]:
# Exchange the positions of the submission_score and submission_text columns
# by swapping their places in the column order

column_order = submissions_with_top_10_comments.columns.tolist()
score_position = column_order.index('submission_score')
text_position = column_order.index('submission_text')
column_order[score_position] = 'submission_text'
column_order[text_position] = 'submission_score'
submissions_with_top_10_comments = submissions_with_top_10_comments[column_order]

In [None]:
# Preview the submissions-with-comments dataframe
submissions_with_top_10_comments

## Saving of Reddit AITA dataset

In [None]:
# Write the dataframe to disk as a CSV file, leaving out the row index
output_file = 'aita-datafiles/2022/Reddit_AITA_2022_Raw.csv'
submissions_with_top_10_comments.to_csv(path_or_buf=output_file, index=False)