# AITA Dataset Processing
Creates and saves four datasets to the HuggingFace Hub:
- Multi-class top 50k
- Multi-class top 2k
- Binary (samples with YTA or NTA classification in multi-class top 50k)
- Binary top 2k

## Prepare Environment

In [None]:
# Install third-party packages for this notebook into the active kernel environment
%pip install datasets transformers pandas numpy krippendorff huggingface_hub ipywidgets

In [None]:
# Change this to the path of the dataset_creation folder on your machine
%cd C:\Users\mattb\Documents\Github\Reddit_AITA_Finetuning\dataset_creation
!mkdir cleaning_results

In [None]:
from huggingface_hub import login

# Log in to the HuggingFace Hub; the datasets built below are uploaded with push_to_hub
login()

## Loading of Initial Dataset

In [None]:
import pandas as pd
from datasets import Dataset

# Path to the raw CSV export; change this to the location of the data file
initial_datafile = '2019_to_2022_submissions_at_least_50_score_top_10_comments.csv'

# Read the CSV with pandas first, then wrap the DataFrame as a HuggingFace Dataset
initial_dataframe = pd.read_csv(initial_datafile)
dataset = Dataset.from_pandas(initial_dataframe)

## Removal of Samples where Top Comment Doesn't Begin with an AITA Classification

In [None]:
import re

# regex pattern that matches text starting with 'nta', 'yta', 'esh', 'info', or 'nah' (case-insensitive)
regex = re.compile(r'^(nta|yta|esh|info|nah)', re.IGNORECASE)


def filter_rows(example):
    """
    Decides whether a row's top comment begins with an AITA classification.

    Parameters:
      example: A dataset row (mapping) with a 'top_comment_1' entry.

    Returns:
      True if 'top_comment_1' is a string that starts with one of the
      prefixes matched by `regex`, otherwise False.
    """
    top_comment = example['top_comment_1']

    # A missing comment (None / NaN) is not a string; regex.match would raise
    # TypeError on it, so such rows are rejected instead of crashing the filter
    if not isinstance(top_comment, str):
        return False

    return bool(regex.match(top_comment))

# filter the dataset using the regex pattern; the result is stored under a new name
# so that the next cell can still compare row counts against the unfiltered `dataset`
filtered_dataset = dataset.filter(filter_rows)

In [None]:
import json

# Summarise the effect of the AITA classification prefix filter and store it as JSON

rows_removed = dataset.num_rows - filtered_dataset.num_rows

# (after - before) relative to before, so removing rows yields a negative percentage
percent_change = (filtered_dataset.num_rows - dataset.num_rows) / dataset.num_rows * 100

AITA_class_prefix_filtering_results = dict([
    ("number of samples before filtering", dataset.num_rows),
    ("number of samples after filtering", filtered_dataset.num_rows),
    ("number of samples removed", rows_removed),
    ("percent change in number of samples", percent_change),
])

output_file = "processing_results/AITA_prefix_filtering_results.json"

serialized_results = json.dumps(AITA_class_prefix_filtering_results)
with open(output_file, "w") as f:
    f.write(serialized_results)

In [None]:
# from here on, `dataset` refers to the prefix-filtered rows only
dataset = filtered_dataset

## Removal of Edits in both Submission Texts and Top Comments

In [None]:
import re

# Markers that introduce an edit/update note: a keyword followed by ':', ' -' or '-'.
# The lookbehind rejects a marker that is glued to a preceding letter, so words such
# as "credit:" or "beta:" are not mistaken for "edit:" / "eta:".
_EDIT_MARKER_PATTERN = re.compile(
    r"(?<![a-z])"
    r"(?:edit|eta|edited|edit after|edit afterwards|edited to add|update|updated)"
    r"(?::| -|-)",
    flags=re.IGNORECASE,
)


def remove_edits(text):
    """
    Removes the edits portion of a text

    Everything from the first edit/update marker onwards is dropped and the
    remainder is stripped of surrounding whitespace. Each removal increments
    the module-level `edits_removed_counter` (created on first use if the
    caller has not initialised it).

    Note that the returned text is always lowercased, whether or not an edit
    marker was found.

    Parameters:
      text: A string containing the text. Non-string values (e.g. None) are
        returned unchanged.

    Returns:
      The lowercased text with the edits removed, if present.
    """

    global edits_removed_counter

    # missing values cannot contain edits and have no .lower(); pass them through
    if not isinstance(text, str):
        return text

    text = text.lower()

    match = _EDIT_MARKER_PATTERN.search(text)
    if match:
        # tolerate the counter not having been initialised by the calling cell
        edits_removed_counter = globals().get("edits_removed_counter", 0) + 1
        return text[:match.start()].strip()  # keep only the text before the marker

    return text

In [None]:
def get_avg_length(strings):
    """
    Computes the mean character length of the non-None entries of a list.

    Args:
      strings (list): Strings to measure; None entries are ignored.

    Returns:
      float: The mean length, or 0 when there is nothing to measure.
    """

    lengths = [len(entry) for entry in strings if entry is not None]
    if not lengths:
        return 0
    return sum(lengths) / len(lengths)

In [None]:
from collections import defaultdict

# Build the results dictionary: one defaultdict(list) per text column,
# the submission texts first, followed by top comments 1 through 10

edits_removal_results = {
    column_key: defaultdict(list)
    for column_key in ['submission_texts', *(f'top_comment_{rank}' for rank in range(1, 11))]
}

In [None]:
# Record the average length of every text column before any edits are stripped

texts_with_potential_edits = {
    'submission_texts': dataset["submission_text"],
    **{f'top_comment_{rank}': dataset[f"top_comment_{rank}"] for rank in range(1, 11)},
}

for key in texts_with_potential_edits:
    texts = texts_with_potential_edits[key]
    edits_removal_results[key]['avg_length_before_removing_edits'] = get_avg_length(texts)

In [None]:
# Strip edits from the submission text and from each of the ten top comments.
# remove_edits bumps the global edits_removed_counter, so the counter is reset
# before each column and read back right after that column has been mapped.

# submission texts
edits_removed_counter = 0
dataset = dataset.map(lambda x: {"submission_text": remove_edits(x["submission_text"])})
edits_removal_results['submission_texts']['edits_removed'] = edits_removed_counter

# comments
for i in range(1, 11):
    comment_column = f"top_comment_{i}"
    edits_removed_counter = 0
    dataset = dataset.map(lambda x: {comment_column: remove_edits(x[comment_column])})
    edits_removal_results[comment_column]['edits_removed'] = edits_removed_counter

In [None]:
# Record the average length of every text column after the edits have been stripped

texts_with_potential_edits = {
    'submission_texts': dataset["submission_text"],
    **{f'top_comment_{rank}': dataset[f"top_comment_{rank}"] for rank in range(1, 11)},
}

for key in texts_with_potential_edits:
    texts = texts_with_potential_edits[key]
    edits_removal_results[key]['avg_length_after_removing_edits'] = get_avg_length(texts)

In [None]:
# add percent changes in average lengths from removing edits

def calculate_percent_change(before, after):
    """
    Returns the percent change from `before` to `after`.

    A `before` of 0 has no meaningful relative change, so 0 is returned
    instead of dividing by zero.
    """
    return 0 if before == 0 else ((after - before) / before) * 100

# For every text column, derive the percent change between the two recorded averages
for key, column_results in edits_removal_results.items():
    before = column_results['avg_length_before_removing_edits']
    after = column_results['avg_length_after_removing_edits']
    percent_change = calculate_percent_change(before, after)
    column_results['avg_length_percent_change'] = percent_change

In [None]:
# Write the per-column edits removal statistics to disk as indented JSON

output_file = "processing_results/edits_removal_results.json"

serialized_results = json.dumps(edits_removal_results, indent=4)
with open(output_file, 'w') as file:
    file.write(serialized_results)

## Removal of Upper Extreme Outliers
Two-step filtering process:
1. Removal of samples whose submission text is in the top 5% by Flan-T5 token count
2. Removal of samples whose #1 comment is in the top 5% by Flan-T5 token count


In [None]:
from transformers import PreTrainedTokenizer, AutoTokenizer
from datasets import Dataset

def add_token_counts_to_dataset(dataset: Dataset, column: str, tokenizer: PreTrainedTokenizer, new_column_name: str) -> Dataset:
    """
    Adds a new column to a dataset holding the number of tokens in each row of a specified column.

    Parameters:
      dataset (Dataset): A Hugging Face dataset object.
      column (str): The name of the text column to tokenize.
      tokenizer: A Hugging Face transformers pretrained tokenizer
      new_column_name (str): The name of the new column to be added to the dataset.

    Returns:
      Dataset: The modified dataset with an additional column for token counts.
      Rows whose text is None get a count of 0.
    """

    def count_tokens(row):
        text = row[column]

        # a missing text has no tokens; passing None to the tokenizer would raise
        if text is None:
            return {new_column_name: 0}

        # Without return_tensors the tokenizer hands back plain Python lists, so no
        # tensor backend is needed just to measure the length of the encoding
        encoding = tokenizer(text, padding=False, truncation=False)
        return {new_column_name: len(encoding['input_ids'])}

    return dataset.map(count_tokens)

# Load the Flan-T5 tokenizer once, then attach a '<column>_token_count' column
# for the submission text and for the top comment
flanT5_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl", trust_remote_code=True)

for text_column in ('submission_text', 'top_comment_1'):
    dataset = add_token_counts_to_dataset(dataset, text_column, flanT5_tokenizer, f'{text_column}_token_count')

In [None]:
import numpy as np

OUTLIER_PERCENTILE_THRESHOLD = 95

# Token counts of every row, read before any outlier is dropped
submission_text_token_counts = dataset['submission_text_token_count']
top_comment_token_counts = dataset['top_comment_1_token_count']

# Both cut-offs are derived from the not-yet-filtered dataset
submission_text_length_threshold = np.percentile(submission_text_token_counts, OUTLIER_PERCENTILE_THRESHOLD)
top_comment_length_threshold = np.percentile(top_comment_token_counts, OUTLIER_PERCENTILE_THRESHOLD)


def filter_submission_upper_outliers(example):
    """Keep a row whose submission token count does not exceed the submission cut-off."""
    token_count = example['submission_text_token_count']
    return token_count <= submission_text_length_threshold


def filter_comment_upper_outliers(example):
    """Keep a row whose top comment token count does not exceed the comment cut-off."""
    token_count = example['top_comment_1_token_count']
    return token_count <= top_comment_length_threshold


# Apply the submission filter first, then the comment filter
rows_before = dataset.num_rows
for keep_row in (filter_submission_upper_outliers, filter_comment_upper_outliers):
    dataset = dataset.filter(keep_row)
rows_after = dataset.num_rows

percent_change = (rows_after - rows_before) / rows_before * 100

# Report of what the two filters removed
outlier_filtering_results = dict([
    ("number of samples before filtering", rows_before),
    ("number of samples after filtering", rows_after),
    ("number of samples removed", rows_before - rows_after),
    ("percent change in number of samples", percent_change),
])

output_file = "processing_results/outlier_filtering_results.json"

serialized_results = json.dumps(outlier_filtering_results)
with open(output_file, "w") as f:
    f.write(serialized_results)

# The token count columns were only needed for this filtering step
dataset = dataset.remove_columns(['submission_text_token_count', 'top_comment_1_token_count'])

## Adding of Top Comment AITA Classifications and Ambiguity Scores

In [None]:
import re
import numpy as np

def _compile_classification_patterns():
    '''
    Build one compiled, word-bounded regex per AITA class from its keywords.

    Every phrase containing "asshole" is also accepted in its censored
    spellings, both plain ("a**hole") and with backslash-escaped asterisks
    ("a\\*\\*hole").

    Returns:
        dict: Maps each class label to a compiled pattern over lowercase text.
    '''
    # classifications mapped to their (lowercase) keywords
    classes_dictionary = {
        'NTA': ['not the asshole', 'nta', 'you would not be the asshole', 'ywnbta', 'n t a', 'y w n b t a'],
        'NAH': ['no assholes here', 'nah', 'n a h'],
        'ESH': ['everyone sucks here', 'esh', 'e s h'],
        'INFO': ['more information needed', 'more info needed', 'more information required', 'more info required', 'info'],
        'YTA': ["you're the asshole", 'youre the asshole', 'yta', 'you would be the asshole', 'ywbta', 'y t a', 'y w b t a'],
    }

    patterns = {}
    for label, phrases in classes_dictionary.items():
        expanded = []
        for phrase in phrases:
            expanded.append(phrase)
            if 'asshole' in phrase:
                expanded.append(phrase.replace('asshole', 'a**hole'))
                expanded.append(phrase.replace('asshole', r'a\*\*hole'))
        # re.escape makes the asterisks/backslashes literal; \b keeps matches on word boundaries
        patterns[label] = re.compile(r'\b(?:' + '|'.join(map(re.escape, expanded)) + r')\b')
    return patterns


# compiled once instead of on every call
_CLASSIFICATION_PATTERNS = _compile_classification_patterns()


def find_earliest_classification(text):
    '''
    Find the earliest AITA classification in a text.

    The text is lowercased and searched for the keywords of each class
    (NTA, NAH, ESH, INFO, YTA); the class whose keyword starts at the
    smallest position wins.

    Args:
        text (str): The text to search for AITA classifications in.

    Returns:
        str: The earliest classification found in the text, or None if the
        text contains no classification keyword.
    '''

    # track earliest match
    earliest_match = None
    earliest_match_pos = float('inf')  # Initially set to infinity

    # patterns are written in lowercase, so compare against lowercase text
    text = text.lower()

    for label, pattern in _CLASSIFICATION_PATTERNS.items():
        # search() already yields the leftmost occurrence for this class
        match = pattern.search(text)
        if match is not None and match.start() < earliest_match_pos:
            earliest_match = label
            earliest_match_pos = match.start()

    # return the class that had the earliest match
    return earliest_match

def add_classification(row):
    '''
    Add comment AITA classifications to a row in the dataset.

    For each of the ten top comment columns a 'top_comment_N_classification'
    entry is written: the earliest classification found in the comment when
    the comment is a string, otherwise None.

    Args:
        row (dict): A row from the dataset.

    Returns:
        dict: The row with comment AITA classifications added.
    '''
    for comment_rank in range(1, 11):
        comment_key = f'top_comment_{comment_rank}'
        comment = row[comment_key] if comment_key in row else None

        # absent or non-string comments still get a classification entry, set to None
        row[comment_key + '_classification'] = (
            find_earliest_classification(comment) if isinstance(comment, str) else None
        )

    return row

def calculate_ambiguity(classifications):
    '''
    Calculate the ambiguity score for a list of classifications.

    Classifications are placed on a 1-5 scale (YTA=1, ESH=2, INFO=3, NAH=4,
    NTA=5). The score is the standard deviation of those values weighted by
    how close their mean is to the centre of the scale, divided by 8.0.

    Args:
        classifications (list): A list of classifications; None entries are skipped.

    Returns:
        float: The ambiguity score.
    '''
    classification_values = dict(YTA=1, ESH=2, INFO=3, NAH=4, NTA=5)

    # numeric representation of every classification that is present
    numeric_values = [classification_values[label] for label in classifications if label is not None]

    mean = np.mean(numeric_values)
    std_dev = np.std(numeric_values)

    # weight is 0 when the mean sits at 1 or 5 and largest (4) when it sits at 3
    centrality_weight = (2 - abs(3 - mean)) ** 2
    ambiguity_score = std_dev * centrality_weight

    # scale by the score range: 0 when std dev is 0, 8.0 for an even YTA/NTA split
    min_score, max_score = 0, 8.0
    return (ambiguity_score - min_score) / (max_score - min_score)

def add_ambiguity_score(row):
    '''
    Add an 'ambiguity_score' entry to a row based on its comment classifications.

    Args:
        row (dict): A row holding 'top_comment_N_classification' entries (N = 1..10).

    Returns:
        dict: The row with 'ambiguity_score' set, or set to None when no
        comment has a classification.
    '''
    classification_keys = (f'top_comment_{rank}_classification' for rank in range(1, 11))

    # collect only the classifications that exist and are non-empty
    classifications = [row[key] for key in classification_keys if key in row and row[key]]

    row['ambiguity_score'] = calculate_ambiguity(classifications) if classifications else None
    return row

In [None]:
# add top comment classifications to dataset
dataset = dataset.map(add_classification)

# convert dataset to dataframe for null filtering
df = dataset.to_pandas()

# remove the rows where the top 1 comment classification is None
rows_before = df.shape[0]
df_filtered = df[df['top_comment_1_classification'].notnull()]
rows_after = df_filtered.shape[0]
rows_removed = rows_before - rows_after

# signed the same way as the other filtering reports in this notebook,
# (after - before) / before, so a reduction in samples is a negative percent change
percent_change = (rows_after - rows_before) / rows_before * 100

# save results of filtering out rows with null top comment classifications
top_comment_classification_null_filtering_results = {
    "number of samples before filtering": rows_before,
    "number of samples after filtering": rows_after,
    "number of samples removed": rows_removed,
    "percent change in number of samples": percent_change,
}

output_file = "processing_results/top_comment_classification_null_filtering_results.json"
with open(output_file, "w") as f:
    json.dump(top_comment_classification_null_filtering_results, f)

# convert dataframe back to a dataset
dataset = Dataset.from_pandas(df_filtered)

# add ambiguity scores to dataset
dataset = dataset.map(add_ambiguity_score)

## Train/Test Split

In [None]:
# keep a string copy of 'decision' as 'AITA_decision'; the original 'decision'
# column is converted to an integer class label and removed in the next cell
dataset = dataset.map(lambda example: {'AITA_decision': example['decision']})

In [None]:
from datasets import ClassLabel

# Give every distinct decision string an integer id, assigned in sorted order
unique_labels = sorted(set(dataset['decision']))
label_to_id = dict(zip(unique_labels, range(len(unique_labels))))


def add_decision_class_label(example):
    """Attach the integer id of the row's 'decision' as 'decision_class_label'."""
    decision = example['decision']
    example['decision_class_label'] = label_to_id[decision]
    return example


# Add the 'decision_class_label' column to the dataset
dataset = dataset.map(add_decision_class_label)

# Declare the new column as a ClassLabel feature named after the decision strings
new_features = dataset.features.copy()
new_features['decision_class_label'] = ClassLabel(names=unique_labels)
dataset = dataset.cast(new_features)

# The string 'decision' column is no longer needed
dataset = dataset.remove_columns('decision')

In [None]:
def update_aita_decision(sample):
    """Rewrite an 'AITA_decision' of 'Asshole' to 'A-hole'; other rows are returned as they are."""
    needs_rename = sample['AITA_decision'] == 'Asshole'
    if not needs_rename:
        return sample

    sample['AITA_decision'] = 'A-hole'
    return sample

# apply the 'Asshole' -> 'A-hole' rewrite to every row
dataset = dataset.map(update_aita_decision)

In [None]:
# 80/20 train/test split, stratified on the integer decision label;
# the fixed seed keeps the split the same across runs
dataset = dataset.train_test_split(
    test_size = 0.2,
    stratify_by_column='decision_class_label',
    seed=42 
)

## Adding of Flan-T5 and Llama-2 Instructions

In [None]:
import sys
sys.path.append('..')
from AITA_instruction import AITA_Instruction

In [None]:
# adding of Flan-T5 multiclass instructions to both splits

for split_name in ("train", "test"):
    dataset[split_name] = dataset[split_name].map(
        lambda sample: AITA_Instruction.get_flanT5_instruction(sample, instruction_type="multiclass"),
        batched=False
    )

# adding of Llama 2 multiclass instructions; each split passes its own partition name

for split_name, partition_name in (("train", "training"), ("test", "testing")):
    dataset[split_name] = dataset[split_name].map(
        lambda sample: AITA_Instruction.get_llama2_training_instruction(sample, instruction_type="multiclass", partition=partition_name),
        batched=False
    )

In [None]:
# Drop the leftover index column and the integer label that was used for the stratified split
for split_name in ("train", "test"):
    dataset[split_name] = dataset[split_name].remove_columns(["__index_level_0__", "decision_class_label"])

## Saving of Multi-class Datasets to HuggingFace Hub

### Main dataset (Top 50k)
- top 50k by submission score
- 80/20 train/test split

In [None]:
from datasets import DatasetDict

# Sort the 'train' subset by 'submission_score' and select the top 40000 rows
# (or every row, if the subset is smaller than that)
sorted_train = dataset['train'].sort('submission_score', reverse=True)
top_train = sorted_train.select(range(min(40000, sorted_train.num_rows)))

# Sort the 'test' subset by 'submission_score' and select the top 10000 rows
# (or every row, if the subset is smaller than that)
sorted_test = dataset['test'].sort('submission_score', reverse=True)
top_test = sorted_test.select(range(min(10000, sorted_test.num_rows)))

# Shuffle so the published splits are not ordered by submission score
shuffled_train = top_train.shuffle(seed=42)
shuffled_test = top_test.shuffle(seed=42)

# Create a new DatasetDict from the shuffled selections (these were previously
# computed but left unused, so the score-sorted splits were uploaded instead)
dataset = DatasetDict({"train": shuffled_train, "test": shuffled_test})
dataset = dataset.remove_columns(["AITA_decision"])

dataset.push_to_hub('MattBoraske/reddit-AITA-submissions-and-comments-multiclass')

### Top 2k Dataset
- 80/20 train/test split
- Equal representation for each of the five AITA classes
    - Top 320/80 training/testing for each class

In [None]:
from datasets import Dataset, DatasetDict

def filter_top_samples_df(dataset, top_n):
    """
    Keeps the `top_n` highest-scoring rows of each top comment classification.

    Parameters:
      dataset: A dataset with 'top_comment_1_classification' and 'submission_score' columns.
      top_n: How many rows to keep per classification.

    Returns:
      A Dataset built from the selected rows.
    """

    def _highest_scoring(group):
        return group.nlargest(top_n, 'submission_score')

    # pandas is used for the per-class selection
    frame = dataset.to_pandas()

    # group by the top comment classification and take the top N of each group by 'submission_score'
    selected_rows = frame.groupby('top_comment_1_classification', group_keys=False).apply(_highest_scoring)
    return Dataset.from_pandas(selected_rows)

# Keep the 320 / 80 highest-scoring samples per class for train / test
# (despite the `_df` suffix, filter_top_samples_df returns Dataset objects)
filtered_train_df = filter_top_samples_df(dataset['train'], 320)
filtered_test_df = filter_top_samples_df(dataset['test'], 80)

shuffled_train = filtered_train_df.shuffle(seed=42)
shuffled_test = filtered_test_df.shuffle(seed=42)

# Bundle the shuffled splits
samples_2000_dataset = DatasetDict({'train': shuffled_train, 'test': shuffled_test})

# Drop the index column from each split before uploading
for split_name in ('train', 'test'):
    samples_2000_dataset[split_name] = samples_2000_dataset[split_name].remove_columns(["__index_level_0__"])

samples_2000_dataset.push_to_hub('MattBoraske/reddit-AITA-submissions-and-comments-multiclass-top-2k')

## Saving of Binary Classification Datasets (NTA/YTA only)

### Main Dataset (NTA/YTA in Multi-class 50k)

In [None]:
from datasets import load_dataset
# NOTE(review): the multi-class top 50k dataset is pushed earlier in this notebook as
# 'MattBoraske/reddit-AITA-submissions-and-comments-multiclass', while this loads the
# repo name without the '-multiclass' suffix — confirm this is the intended source
dataset = load_dataset('MattBoraske/reddit-AITA-submissions-and-comments')
dataset = dataset.remove_columns(["flanT5_instruction", "llama2_instruction"]) # removing of Flan-T5 and Llama 2 multiclass instructions


In [None]:
# get AITA_Instruction custom class to generate binary classification flan-t5/llama-2 instructions

import sys
sys.path.append('..')
from AITA_instruction import AITA_Instruction 

# adding of Flan-T5 binary classification instructions to both splits

for split_name in ("train", "test"):
    dataset[split_name] = dataset[split_name].map(
        lambda sample: AITA_Instruction.get_flanT5_instruction(sample, instruction_type="binary"),
        batched=False
    )

# adding of Llama 2 binary classification instructions; each split passes its own partition name

for split_name, partition_name in (("train", "training"), ("test", "testing")):
    dataset[split_name] = dataset[split_name].map(
        lambda sample: AITA_Instruction.get_llama2_training_instruction(sample, instruction_type="binary", partition=partition_name),
        batched=False
    )

In [None]:
# filter for rows that either have a top comment classification of 'YTA' or 'NTA' and save dataset to HF hub

def filter_rows(example):
    """Return True when the row's top comment classification is 'YTA' or 'NTA'."""
    classification = example['top_comment_1_classification']
    return classification in ['YTA', 'NTA']

# Filter every split down to its YTA/NTA rows, shuffle, and upload
binary_splits = {}
for split, ds in dataset.items():
    binary_splits[split] = ds.filter(filter_rows)

dataset = DatasetDict(binary_splits).shuffle(seed=42)
dataset.push_to_hub('MattBoraske/reddit-AITA-submissions-and-comments-binary')

### Top 2k dataset
- 80/20 train/test split
- Equal representation of NTA and YTA classes
    - Top 800/200 training/testing for each class

In [None]:
def filter_top_samples_df(dataset, top_n):
    """
    Keeps the `top_n` highest-scoring rows of each top comment classification.

    Parameters:
      dataset: A dataset with 'top_comment_1_classification' and 'submission_score' columns.
      top_n: How many rows to keep per classification.

    Returns:
      A Dataset built from the selected rows.
    """

    def _highest_scoring(group):
        return group.nlargest(top_n, 'submission_score')

    # pandas is used for the per-class selection
    frame = dataset.to_pandas()

    # group by the top comment classification and take the top N of each group by 'submission_score'
    selected_rows = frame.groupby('top_comment_1_classification', group_keys=False).apply(_highest_scoring)
    return Dataset.from_pandas(selected_rows)

# Keep the 800 / 200 highest-scoring samples per class for train / test
# (despite the `_df` suffix, filter_top_samples_df returns Dataset objects)
filtered_train_df = filter_top_samples_df(dataset['train'], 800)
filtered_test_df = filter_top_samples_df(dataset['test'], 200)

shuffled_train = filtered_train_df.shuffle(seed=42)
shuffled_test = filtered_test_df.shuffle(seed=42)

# Bundle the shuffled splits
samples_2000_dataset = DatasetDict({'train': shuffled_train, 'test': shuffled_test})

# Drop the index column from each split before uploading
for split_name in ('train', 'test'):
    samples_2000_dataset[split_name] = samples_2000_dataset[split_name].remove_columns(["__index_level_0__"])

samples_2000_dataset.push_to_hub('MattBoraske/reddit-AITA-submissions-and-comments-binary-top-2k')