# Reddit AITA Dataset Preparation

## Environment preparation

In [1]:
%pip install datasets transformers pandas numpy huggingface_hub ipywidgets

Note: you may need to restart the kernel to use updated packages.


## Loading of inital dataset

In [33]:
import pandas as pd
from datasets import Dataset

# load inital datafile
initial_datafile = 'aita-datafiles/Reddit_AITA_2018_to_2022.csv' 
dataset = Dataset.from_pandas(pd.read_csv(initial_datafile))

## Filter #1: Removal of samples where any of the submission titles, text, or top 10 comments don't exist

In [34]:
# remove samples where any of the top comments are None or empty

def remove_none_comments(example):
    keys_to_check = [
        'submission_title', 'submission_text', 'top_comment_1', 'top_comment_2', 'top_comment_3',
        'top_comment_4', 'top_comment_5', 'top_comment_6', 'top_comment_7', 'top_comment_8',
        'top_comment_9', 'top_comment_10'
    ]
    return all(example.get(key) not in (None, '') for key in keys_to_check)

In [35]:
dataset = dataset.filter(remove_none_comments)

Filter:   0%|          | 0/160557 [00:00<?, ? examples/s]

## Filter #2: Removal of samples where any comments did not contain an AITA decision
- identified using earliest keyword matching according to the phrases for each AITA class in "classes_dictionary"

In [36]:
import re

# Creation of AITA Decision Columns For Each Comment Using Earliest Keyword Matching

classes_dictionary = {
    'NTA': ['not the asshole', 'not the a**hole', 'not the a-hole', 'you would not be the asshole', 'you would not be the a**hole', 'you would not be the a-hole', 'not an asshole', 'not an a**hole', 'not an a-hole', 'you would not be an asshole', 'you would not be an a**hole', 'you would not be an a-hole', 'nta', 'n t a', 'ywnbta', 'y w n b t a'],
    'NAH': ['no assholes here', 'no a**holes here', 'no a-holes here', 'no one is the asshole', 'no one is the a**hole', 'no one is the a-hole', 'no one would be the asshole', 'no one would be the a**hole', 'no one would be the a-hole', 'no one is an asshole', 'no one is an a**hole', 'no one is an a-hole', 'no one would be an asshole', 'no one would be an a**hole', 'no one would be an a-hole', 'nah', 'n a h'],
    'ESH': ['everyone sucks here', 'everyone is the asshole', 'everyone is the a**hole', 'everyone is the a-hole', 'everyone would be the asshole', 'everyone would be the a**hole', 'everyone would be the a-hole', 'everyone is an asshole', 'everyone is an a**hole', 'everyone is an a-hole', 'everyone would be an asshole', 'everyone would be an a**hole', 'everyone would be an a-hole', 'esh', 'e s h'],
    'YTA': ['you\'re the asshole', 'you\'re the a**hole', 'you\'re the a-hole', 'youre the asshole', 'youre the a**hole', 'youre the a-hole', 'you are the asshole', 'you are the a**hole', 'you are the a-hole', 'you would be the asshole', 'you would be the a**hole', 'you would be the a-hole', 'you the asshole', 'you the a**hole', 'you the a-hole', 'you\'re an asshole', 'you\'re an a**hole', 'you\'re an a-hole', 'youre an asshole', 'youre an a**hole', 'youre an a-hole', 'you are an asshole', 'you are an a**hole', 'you are an a-hole', 'you would be an asshole', 'you would be an a**hole', 'you would be an a-hole', 'you an asshole', 'you an a**hole', 'you an a-hole', 'yta', 'y t a', 'ywbta', 'y w b t a']
}

def find_earliest_classification(text):
    '''
    Find the earliest AITA classification in a text.

    Args:
        text (str): The text to search for AITA classifications in.

    Returns:
        str: The earliest classification found in the text.
    '''

    # track earliest match
    earliest_match = None
    earliest_match_pos = float('inf')  # Initially set to infinity

    # convert input text to lowercase
    text = text.lower()

    # go through all classifications and their keywords
    for key, phrases in classes_dictionary.items():
        # Create a regex pattern that includes the classification keywords
        pattern = r'\b(' + '|'.join(map(re.escape, phrases)) + r')\b'

        # Search for any keywords in the input text
        for match in re.finditer(pattern, text, re.IGNORECASE):
            if match.start() < earliest_match_pos:
                # Update the earliest match if this match is earlier
                earliest_match = key
                earliest_match_pos = match.start()

    # return the class that had the earliest match
    return earliest_match

def add_classification(row):
    '''
    Add comment AITA classifications to a row in the dataset.

    Args:
        row (dict): A row from the dataset.

    Returns:
        dict: The row with comment AITA classifications added.
    '''
    # Iterate over top 10 comment keys
    for i in range(1, 11):
        key = f'top_comment_{i}'
        classification_key = key + '_AITA_class_by_keyword'
        if key in row and isinstance(row[key], str): # should be true since we guaranteed that all comments are strings earlier
            # if this row has a top_comment_N key, get the classification and add it to the row
            classification = find_earliest_classification(row[key])
            row[classification_key] = classification
        else:
            # If the top_comment_N key doesn't exist or isn't a string, set the classification key to None
            row[classification_key] = None

    # return the row with the classification added
    return row

In [37]:
dataset = dataset.map(add_classification)

Map:   0%|          | 0/130255 [00:00<?, ? examples/s]

In [38]:
def remove_comments_with_no_AITA_class_keyword(example):
    if example['top_comment_1_AITA_class_by_keyword'] is None or example['top_comment_2_AITA_class_by_keyword'] is None or example['top_comment_3_AITA_class_by_keyword'] is None or example['top_comment_4_AITA_class_by_keyword'] is None or example['top_comment_5_AITA_class_by_keyword'] is None or example['top_comment_6_AITA_class_by_keyword'] is None or example['top_comment_7_AITA_class_by_keyword'] is None or example['top_comment_8_AITA_class_by_keyword'] is None or example['top_comment_9_AITA_class_by_keyword'] is None or example['top_comment_10_AITA_class_by_keyword'] is None:
        return False
    return True

In [39]:
dataset = dataset.filter(remove_comments_with_no_AITA_class_keyword)
dataset = dataset.remove_columns('__index_level_0__')

Filter:   0%|          | 0/130255 [00:00<?, ? examples/s]

# Filter #3: Removal of samples where the submission title or text have been removed or deleted
- Comments are not of concern since any samples that have comment(s) which have been deleted or removed have already been filtered out due to having comment(s) that don't contain an AITA class keyword.

In [40]:
# get the most popular submission titles and texts to determine what Reddit uses for posts that are deleted or removed
## indicates that...
###  for submission titles this is done by either [deleted by user] or [ Removed by Reddit ]
###  for submission text this is done by either [deleted], [removed], or '.' (a single period)

df = dataset.to_pandas()
print(f'MOST POPULAR SUBMISSION TITLES\n {df["submission_title"].value_counts()[:20]}')
print()
print(f'MOST POPULAR SUBMISSION TEXTS\n {df["submission_text"].value_counts()[:20]}')

MOST POPULAR SUBMISSION TITLES
 submission_title
[deleted by user]                                         855
AITA for not inviting my sister to my wedding?              5
AITA for not shaving my legs?                               4
AITA for not wanting to babysit?                            4
AITA for not wanting my dad to walk me down the aisle?      4
AITA for kicking my sister out of my house?                 4
AITA for grounding my daughter                              4
AITA for moving in with my dad?                             3
[ Removed by Reddit ]                                       3
AITA for telling the truth?                                 3
AITA for changing my name?                                  3
AITA for coming out?                                        3
AITA for getting a tattoo?                                  3
AITA for not changing my wedding date?                      3
AITA for not letting my dad walk me down the aisle?         3
AITA for not wearing 

In [41]:
# remove samples where the submission text is [deleted], [removed], or '.' (a single period) or the submission title is [deleted by user] or [ Removed by Reddit ]

def remove_deleted_removed_posts(example):
    if example['submission_text'] in ['[deleted]', '[removed]', '.'] or example['submission_title'] in ['[deleted by user]', '[ Removed by Reddit ]']:
        return False
    return True

dataset = dataset.filter(remove_deleted_removed_posts)

Filter:   0%|          | 0/47466 [00:00<?, ? examples/s]

## Filter #3: Removal of Edits in both Submission Texts and Top Comments

In [42]:
import re

def remove_edits(text):
  """
  Removes the edits portion of a text

  Parameters:
    text: A string containing the text.

  Returns:
    A string with the edits removed, if present.
  """

  global edits_removed_counter

  if text == None:
    return text

  text = text.lower()

  pattern = r"(edit:|edit -|edit-|eta:|eta -|eta-|edited:|edited -|edited-|edit after:|edit after- |edit after -|edit afterwards:|edit afterwards -|edit afterwards-|edited to add:|edited to add -|edited to add-|update:|update-|update -|updated:|updated-|updated -)"
  match = re.search(pattern, text, flags=re.IGNORECASE)
  if match:
      edits_removed_counter += 1 # increment the edits_removed_counter
      return text[:match.start()].strip() # return the text up to the start of the match

  return text

In [43]:
edits_removed_counter = 0

dataset = dataset.map(lambda x: {"submission_title": remove_edits(x["submission_title"])})
dataset = dataset.map(lambda x: {"submission_text": remove_edits(x["submission_text"])})
for i in range(1, 11):
    dataset = dataset.map(lambda x: {f"top_comment_{i}": remove_edits(x[f"top_comment_{i}"])})

print(f"Number of edits removed: {edits_removed_counter}")

Map:   0%|          | 0/31295 [00:00<?, ? examples/s]

Map:   0%|          | 0/31295 [00:00<?, ? examples/s]

Map:   0%|          | 0/31295 [00:00<?, ? examples/s]

Map:   0%|          | 0/31295 [00:00<?, ? examples/s]

Map:   0%|          | 0/31295 [00:00<?, ? examples/s]

Map:   0%|          | 0/31295 [00:00<?, ? examples/s]

Map:   0%|          | 0/31295 [00:00<?, ? examples/s]

Map:   0%|          | 0/31295 [00:00<?, ? examples/s]

Map:   0%|          | 0/31295 [00:00<?, ? examples/s]

Map:   0%|          | 0/31295 [00:00<?, ? examples/s]

Map:   0%|          | 0/31295 [00:00<?, ? examples/s]

Map:   0%|          | 0/31295 [00:00<?, ? examples/s]

Number of edits removed: 14344


In [45]:
# remove samples where the removal of edits resulted in empty submission titles, texts, or top comments
df = dataset.to_pandas()

columns = [f'top_comment_{i}' for i in range(1, 11)]
columns.append('submission_title')
columns.append('submission_text')
empty_string_indices = df[columns].apply(lambda row: any(cell == '' for cell in row), axis=1).index[df[columns].apply(lambda row: any(cell == '' for cell in row), axis=1)].tolist()

df = df[~df.index.isin(empty_string_indices)]

dataset = Dataset.from_pandas(df)

## Filter #4: Removal of outliers (upper and lower 2.5%)

In [47]:
## remove outlier samples based on submission text length
import numpy as np

submission_text_lengths = np.array([len(text) for text in dataset['submission_text']])
UPPER_BOUND = 97.5
LOWER_BOUND = 2.5
dataset = dataset.filter(lambda x: len(x['submission_text']) >= np.percentile(submission_text_lengths, LOWER_BOUND) and len(x['submission_text']) <= np.percentile(submission_text_lengths, UPPER_BOUND))

Filter:   0%|          | 0/30880 [00:00<?, ? examples/s]

## Train/Test Split
- 10% of each AITA class (NTA, YTA, ESH, NAH) in test dataset

In [None]:
from datasets import DatasetDict, Dataset

# Split the dataset based on the values of 'top_comment_1_AITA_class_by_keyword'
nta_dataset = dataset['train'].filter(lambda x: x['top_comment_1_AITA_class_by_keyword'] == 'NTA')
yta_dataset = dataset['train'].filter(lambda x: x['top_comment_1_AITA_class_by_keyword'] == 'YTA')
esh_dataset = dataset['train'].filter(lambda x: x['top_comment_1_AITA_class_by_keyword'] == 'ESH')
nah_dataset = dataset['train'].filter(lambda x: x['top_comment_1_AITA_class_by_keyword'] == 'NAH')

# Create a new DatasetDict with the split datasets
split_dataset = DatasetDict({
    'train': DatasetDict({
        'NTA': nta_dataset,
        'YTA': yta_dataset,
        'ESH': esh_dataset,
        'NAH': nah_dataset
    }),
})

dataset = split_dataset

# Upload dataset to HuggingFace

In [58]:
dataset.push_to_hub('MattBoraske/Reddit-AITA-2018-to-2022')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/datasets/MattBoraske/Reddit-AITA-2018-to-2022/commit/617912a234efa935f9ef5058f08b2d29ca4f2740', commit_message='Upload dataset', commit_description='', oid='617912a234efa935f9ef5058f08b2d29ca4f2740', pr_url=None, pr_revision=None, pr_num=None)