# Stuff that belongs in another notebook

- ambiguity score analysis for both main and high-quality datasets
- measures of agreement for both main and high-quality datasets
- token histograms for main and high quality datasets (regular ones for flan-t5, llama2_text ones for llama2)


In [None]:
# Create the output folders used by the analysis cells below.
# os.makedirs is used instead of the `!mkdir` shell magic so the cell works on
# any OS (the backslash-separated paths only worked on Windows) and can be
# re-run without failing when the folders already exist.
import os

os.makedirs("results/Decision_Ambiguity_Analysis", exist_ok=True)
os.makedirs("results/Comment_Agreement_Analysis", exist_ok=True)

## Histogram Plotting Code

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def plot_histogram(data, plot_title, plot_xlabel, plot_ylabel, plot_file, bin_counts_file, log_y=False):
  '''
  Plots a 20-bin histogram (with a KDE overlay) of data, shows it, and exports
  both the figure and a table of per-bin counts.

  Parameters:
    data (array-like): The values to plot; passed to numpy and seaborn as-is.
    plot_title (str): The title of the plot.
    plot_xlabel (str): The label for the x-axis.
    plot_ylabel (str): The label for the y-axis.
    plot_file (str): The file path to save the plot.
    bin_counts_file (str): The file path to save the bin counts (CSV).
    log_y (bool): Whether to set the y-axis to logarithmic scale.

  Returns:
    None - The plot and bin counts are saved to the specified files, and the
    bin counts table is also printed.
  '''

  # Compute the bin edges once so the plotted bars and the exported counts
  # are guaranteed to describe exactly the same bins
  bin_edges = np.histogram_bin_edges(data, bins=20)
  counts, _ = np.histogram(data, bins=bin_edges)

  # Draw the histogram
  plt.figure(figsize=(12, 6))
  sns.histplot(data, bins=bin_edges, kde=True, color='blue')
  plt.xlabel(plot_xlabel, fontsize=14)
  plt.ylabel(plot_ylabel, fontsize=14)

  if log_y:
    plt.yscale('log')  # Set y-axis to logarithmic scale

  plt.title(plot_title, fontsize=16)
  # Save before show(): some backends clear the figure once it has been shown
  plt.savefig(plot_file)
  plt.show()

  # One "lower - upper" label per bin, pairing each edge with the next one
  bin_labels = [f"{lower:.2f} - {upper:.2f}" for lower, upper in zip(bin_edges[:-1], bin_edges[1:])]
  counts_df = pd.DataFrame({'Bin Ranges': bin_labels, 'Counts': counts})

  # Pretty print the counts, then persist them alongside the plot
  print(counts_df.to_string(index=False))
  counts_df.to_csv(bin_counts_file, index=False)

## Token Analysis Code

In [None]:
from transformers import PreTrainedTokenizer

def get_token_counts(dataset: dict, partition: str, column: str, tokenizer: PreTrainedTokenizer) -> list:
    """
    Counts the number of tokens in each row of a specified column in a dataset partition.

    Each row is tokenized on its own with padding and truncation disabled, so the
    count is the full length of whatever the tokenizer emits for that row.

    Parameters:
      dataset (dict): A huggingface dataset object, indexable as dataset[partition][column].
      partition (str): The specific partition of the dataset to analyze (e.g., 'train').
      column (str): The name of the column in the dataset partition to process.
      tokenizer: A huggingface transformers pretrained tokenizer

    Returns:
      tokens_counts: A list of integers, where each integer represents the number of tokens in the
      corresponding row of the specified column.
    """

    tokens_counts = []

    for row in dataset[partition][column]:
        encoded = tokenizer(row, padding=False, truncation=False, return_tensors="pt")

        # With return_tensors="pt" the ids come back with a leading batch axis;
        # [0] selects the single encoded row, and its length is the token count.
        # Taking len() directly avoids converting every token id to a Python int.
        tokens_counts.append(len(encoded['input_ids'][0]))

    return tokens_counts

In [None]:
from transformers import PreTrainedTokenizer
from datasets import Dataset

def add_token_counts_to_dataset(dataset: Dataset, partition: str, column: str, tokenizer: PreTrainedTokenizer, new_column_name: str) -> Dataset:
    """
    Adds a token-count column to one partition of a dataset.

    Parameters:
      dataset (Dataset): A Hugging Face dataset object, indexable by partition name.
      partition (str): The specific partition of the dataset to analyze (e.g., 'train').
      column (str): The name of the column in the dataset partition to process.
      tokenizer: A Hugging Face transformers pretrained tokenizer
      new_column_name (str): The name of the new column to be added.

    Returns:
      Dataset: The result of mapping over dataset[partition] only, with the extra
      token-count column. The other partitions are not part of the return value.
    """

    def count_tokens(row):
        # Tokenize without padding/truncation so the full length is measured
        encoded = tokenizer(row[column], padding=False, truncation=False, return_tensors="pt")
        # [0] drops the batch axis added by return_tensors="pt"; len() of that
        # row is the token count (no need to unpack every id into Python ints)
        return {new_column_name: len(encoded['input_ids'][0])}

    # map() merges the returned dict into each row as a new column
    return dataset[partition].map(count_tokens)

In [None]:
from datasets import Dataset

def filter_by_token_count(dataset: Dataset, max_tokens: int, token_count_column: str = 'token_count') -> Dataset:
    """
    Keeps only the rows whose token count does not exceed a specified maximum.

    Parameters:
    dataset (Dataset): A Hugging Face dataset object.
    max_tokens (int): The maximum allowed number of tokens (inclusive).
    token_count_column (str): The name of the column containing the token counts. Default is 'token_count'.

    Returns:
    Dataset: The dataset returned by dataset.filter, containing the rows with
    token count <= max_tokens.
    """

    def is_within_max_tokens(row):
        # Rows exactly at the limit are kept
        return row[token_count_column] <= max_tokens

    return dataset.filter(is_within_max_tokens)

### Ambiguity Score Analysis

In [None]:
# Plot the ambiguity scores on a log-y histogram, then save the plot and its bin counts

TITLE = "Ambiguity Scores for Reddit AITA Dataset"

plot_histogram(
    data = dataset["ambiguity_score"],
    plot_title = TITLE,
    plot_xlabel = "Score",
    plot_ylabel = "Frequency",
    plot_file = "results/Decision_Ambiguity_Analysis/Ambiguity Scores.png",
    bin_counts_file = "results/Decision_Ambiguity_Analysis/Ambiguity Scores.csv",
    log_y = True
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Compute every integer percentile (0 through 100) of the ambiguity scores,
# save the table to CSV, then plot score against percentile rank

percentile_ranks = np.arange(0, 101, 1)
percentiles = np.percentile(dataset["ambiguity_score"], percentile_ranks)

percentile_data = pd.DataFrame({
    "Percentile": percentile_ranks,
    "Ambiguity Score": percentiles,
})
percentile_data.to_csv(
    "results/Decision_Ambiguity_Analysis/Ambiguity Score Percentiles.csv",
    index=False,
)

# Score goes on the x-axis and percentile rank on the y-axis
plt.figure(figsize=(10, 6))  # Adjust the figure size as needed
plt.plot(percentiles, percentile_ranks)
plt.title("Ambiguity Score Percentiles for Reddit AITA Dataset")
plt.xlabel("Ambiguity Score")
plt.ylabel("Percentile")
plt.savefig("results/Decision_Ambiguity_Analysis/Ambiguity Score Percentiles.png")
plt.show()

In [None]:

# json is imported here because this cell runs before the later cell that
# imports it; without this the json.dump call below raises NameError
import json

# Filter the dataset to include only samples with an ambiguity score of 0
zero_ambiguity_dataset = dataset.filter(lambda x: x['ambiguity_score'] == 0)

# Count the zero-ambiguity samples and the full dataset
zero_ambiguity_count = len(zero_ambiguity_dataset)
total_count = len(dataset)

# Share of zero-ambiguity samples, as a percentage rounded to 3 decimals
zero_ambiguity_percentage = round((zero_ambiguity_count / total_count) * 100, 3)

# Store results in a dict and save it as JSON
zero_ambiguity_results = {
    'Number of Samples with Zero Ambiguity': [zero_ambiguity_count],
    'Percentage of Samples with Zero Ambiguity': [zero_ambiguity_percentage]
}

output_file = "results/Decision_Ambiguity_Analysis/zero_ambiguity_samples_results.json"

with open(output_file, 'w') as file:
    json.dump(zero_ambiguity_results, file, indent=4)

## Measures of Agreement Between Top Comments
- Overall: Krippendorff's Alpha
  - Key aspects
    - "Krippendorff's alpha coefficient,[1] named after academic Klaus Krippendorff, is a statistical measure of the agreement achieved when coding a set of units of analysis. Since the 1970s, alpha has been used in content analysis where textual units are categorized by trained readers, in counseling and survey research where experts code open-ended interview data into analyzable terms, in psychological testing where alternative tests of the same phenomena need to be compared, or in observational studies where unstructured happenings are recorded for subsequent analysis."
    - "Krippendorff's alpha generalizes several known statistics, often called measures of inter-coder agreement, inter-rater reliability, reliability of coding given sets of units (as distinct from unitizing) but it also distinguishes itself from statistics that are called reliability coefficients but are unsuitable to the particulars of coding data generated for subsequent analysis."
    - "Krippendorff's alpha is applicable to any number of coders, each assigning one value to one unit of analysis, to incomplete (missing) data, to any number of values available for coding a variable, to binary, nominal, ordinal, interval, ratio, polar, and circular metrics (note that this is not a metric in the mathematical sense, but often the square of a mathematical metric, see levels of measurement), and it adjusts itself to small sample sizes of the reliability data. The virtue of a single coefficient with these variations is that computed reliabilities are comparable across any numbers of coders, values, different metrics, and unequal sample sizes."
  - [Wiki](https://en.wikipedia.org/wiki/Krippendorff%27s_alpha)
  - [Lecture by Krippendorff on Calculation](https://www.asc.upenn.edu/sites/default/files/2021-03/Computing%20Krippendorff%27s%20Alpha-Reliability.pdf)
  - [Article Explanation](https://www.surgehq.ai/blog/inter-rater-reliability-metrics-an-introduction-to-krippendorffs-alpha)
    - Ranges from -1 to 1: 1 indicates complete agreement, 0 indicates agreement no better than random choice, and -1 indicates complete disagreement.
    - A value of 0.8 or above is commonly taken to indicate significant agreement.

- Pairwise: Cohen's Kappa
  - [Wiki](https://en.wikipedia.org/wiki/Cohen%27s_kappa)
  - [Article Explanation](https://towardsdatascience.com/multi-class-metrics-made-simple-the-kappa-score-aka-cohens-kappa-coefficient-bdea137af09c)

In [None]:
import numpy as np
import krippendorff
from sklearn.metrics import cohen_kappa_score
from itertools import combinations

def get_encoded_classifications(dataset):
    """
    Turns the top-10 comment classifications of every sample into numeric codes.

    Parameters:
    dataset (list of dictionaries): A huggingface dataset; each sample is read
    through .get() on the keys 'top_comment_1_classification' through
    'top_comment_10_classification'.

    Returns:
    list[list]: Ten lists (one per comment rank), each holding one entry per
    sample: the numeric code of the classification, or None when the key is
    missing, the value is None, or the label is not a known classification.
    """

    # Numeric code assigned to each AITA verdict
    label_to_code = {'YTA': 1, 'ESH': 2,
                     'INFO': 3, 'NAH': 4,
                     'NTA': 5}

    # Column keys for comment ranks 1..10, built once up front
    rank_keys = [f'top_comment_{rank}_classification' for rank in range(1, 11)]

    # One output list per rank, filled in a single pass over the samples
    encoded_by_rank = [[] for _ in rank_keys]
    for sample in dataset:
        for encoded, key in zip(encoded_by_rank, rank_keys):
            # Both lookups fall back to None, so missing keys and unknown
            # labels end up as None in the output
            encoded.append(label_to_code.get(sample.get(key)))

    return encoded_by_rank


def calculate_krippendorffs_alpha(dataset, level_of_measurement=None):
  """
  Calculates Krippendorff's alpha over the top-comment classifications of a dataset.

  Each comment rank (top comment 1..10) is one row of the reliability matrix
  and each sample is one column; missing classifications are passed as NaN.

  Parameters:
  dataset (list of dictionaries): A huggingface dataset.
  level_of_measurement (str, optional): Forwarded to krippendorff.alpha when
  given (e.g. 'nominal', 'ordinal', 'interval'). When None, the call is made
  without it, so the library's own default metric applies, exactly as before.
  NOTE(review): the labels are encoded as the integers 1-5, so a numeric
  metric treats the gaps between codes as distances; confirm that is intended
  for these categories, or pass an explicit level here.

  Returns:
  float: Krippendorff's alpha score.
  """

  # Encode top comment classifications
  top_comments_encoded = get_encoded_classifications(dataset)

  # The library works on a numeric array, so None (no classification) becomes NaN
  data = np.array(
      [[np.nan if value is None else value for value in votes] for votes in top_comments_encoded],
      dtype=float,
  )

  if level_of_measurement is None:
    return krippendorff.alpha(data)
  return krippendorff.alpha(data, level_of_measurement=level_of_measurement)


def calculate_cohen_kappa(dataset):
  """
  Calculates Cohen's Kappa score for every pair of top-comment ranks in a dataset.

  For each pair, only the samples where both comments have a classification
  are compared.

  Parameters:
  dataset (list of dictionaries): A huggingface dataset.

  Returns:
  dict: Maps ("top_comment_i", "top_comment_j") with i < j to the Cohen's
  Kappa score of that pair.
  """

  # encode top comment classifications
  top_comments_encoded = get_encoded_classifications(dataset)

  scores = {}
  # Pair the lists together with their positions. Looking the position up
  # afterwards with list.index() compares by equality, so two ranks with
  # identical contents (e.g. both entirely None) would both resolve to the
  # first one, mislabelling the pair and overwriting another pair's score.
  for (index1, list1), (index2, list2) in combinations(enumerate(top_comments_encoded), 2):
    # Keep only the samples that both comments classified
    shared = [(first, second) for first, second in zip(list1, list2)
              if first is not None and second is not None]
    filtered_list1 = [first for first, _ in shared]
    filtered_list2 = [second for _, second in shared]

    key = (f"top_comment_{index1 + 1}", f"top_comment_{index2 + 1}")
    scores[key] = cohen_kappa_score(filtered_list1, filtered_list2)
  return scores


def save_cohen_kappa_scores(cohen_kappa_scores, output_file):
  """
  Writes Cohen's Kappa scores to a CSV file as a 10x10 upper-triangular table.

  Parameters:
  cohen_kappa_scores (dict): Maps (row_label, column_label) pairs such as
  ("top_comment_1", "top_comment_2") to a Cohen's Kappa score.
  output_file (str): The path to the output CSV file.
  """

  # Row and column labels: top_comment_1 .. top_comment_10
  labels = ["top_comment_" + str(rank) for rank in range(1, 11)]

  # Start from an empty table and drop in each score, rounded to 3 decimals
  table = pd.DataFrame(index=labels, columns=labels)
  for pair, kappa in cohen_kappa_scores.items():
    row_label, column_label = pair
    table.at[row_label, column_label] = round(kappa, 3)

  # Blank out the diagonal and everything below it, one row at a time
  for row in range(len(table)):
    table.iloc[row, : row + 1] = np.nan

  # Write the table, keeping the row labels
  table.to_csv(output_file, index=True)

In [None]:
import json

# Calculate Krippendorff's alpha for the dataset, then save it to JSON
krippendorffs_alpha = {'krippendorffs alpha': calculate_krippendorffs_alpha(dataset)}

output_file = "results/Comment_Agreement_Analysis/krippendorffs_alpha.json"

with open(output_file, "w") as f:
    json.dump(krippendorffs_alpha, f)

# Calculate the pairwise Cohen's Kappa scores for the dataset and save them to JSON.
# The scores are keyed by tuples of comment labels; JSON object keys must be
# strings, so each tuple key is converted with str() before dumping.
cohen_kappa_scores = calculate_cohen_kappa(dataset)
cohen_kappa_scores = {str(key): value for key, value in cohen_kappa_scores.items()}

output_file = "results/Comment_Agreement_Analysis/cohen_kappa_scores.json"

with open(output_file, "w") as f:
    json.dump(cohen_kappa_scores, f)