# Plotting
This notebook produces plots of word-count statistics for the arXiv-abstracts-2021 and ChatGPT-Research-Abstracts datasets.

In [1]:
import random

from datasets import load_dataset
from data_manipulation.data_processing import count_and_reformat
from data_manipulation.data_processing import sample_uniform_subset
from data_manipulation.data_analysis import plot_distribution, plot_histogram, plot_scatter, plot_loss_curves

#### Load datasets

In [2]:
# Take the 'train' split of the arXiv abstracts dataset and pass it through
# count_and_reformat, counting on the 'abstract' column and retaining 'title' and 'abstract'.
# NOTE(review): later cells read a 'word_count' column from `arxiv`; presumably
# count_and_reformat adds it -- confirm against data_manipulation.data_processing.
arxiv = count_and_reformat(
    dataset=load_dataset("gfissore/arxiv-abstracts-2021")['train'],
    count_column='abstract',
    retain_columns=['title', 'abstract'],
)

Found cached dataset json (/Users/nicolaisivesind/.cache/huggingface/datasets/gfissore___json/gfissore--arxiv-abstracts-2021-23556c248bdbe0fc/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

 Counting words: 100%

#### Plotting

In [2]:
# Raw and final ("clean") ChatGPT-Research-Abstracts CSV files, each loaded as the 'train' split.
# Paths are relative to this notebook's working directory.
chatgpt_abstracts_raw = load_dataset(
    'csv',
    data_files='../../datasets/origins/research-abstracts/research_abstracts-raw.csv',
)['train']
chatgpt_abstracts_clean = load_dataset(
    'csv',
    data_files='../../datasets/origins/ChatGPT-Research-Abstracts/research_abstracts-final.csv',
)['train']

Found cached dataset csv (/Users/nicolaisivesind/.cache/huggingface/datasets/csv/default-b7975ec30e73b117/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset csv (/Users/nicolaisivesind/.cache/huggingface/datasets/csv/default-6a0f094caf40a32a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Draw the random 10k subset from a private, seeded generator so that the subset (and every
# figure built from it) is identical after Restart Kernel -> Run All. A dedicated
# random.Random instance is used so the global `random` state is left untouched.
SUBSET_SEED = 42
arxiv_10k = random.Random(SUBSET_SEED).sample(arxiv, k=10000)

# NOTE(review): arguments are presumably (dataset, column, sample size, min, max) -- confirm
# against sample_uniform_subset's signature. If it draws from the global `random` module
# internally, it is still unseeded here.
uniform = sample_uniform_subset(arxiv, 'word_count', 10000, 50, 600)

 Sorting into lists: 99%
 Sampling data points: 100%

In [None]:
# Word-count distributions of four series: the full arXiv set, its random 10k subset, and
# the real / generated word-count columns of the raw ChatGPT-Research-Abstracts data.
# NOTE(review): start/end presumably bound the plotted word-count range and 'mode' toggles a
# per-series marker -- confirm against plot_distribution.
plot_distribution(
    plots=[
        {
            'dataset': arxiv,
            'column_name': 'word_count',
            'color': 'darkmagenta',
            'alpha': 0.6,
            'display': 'arXiv-abstracts-2021 (~2m)',
            'mode': False,
        },
        {
            'dataset': arxiv_10k,
            'column_name': 'word_count',
            'color': 'red',
            'alpha': 0.6,
            'display': 'arXiv-abstracts-2021 random subset (10k)',
            'mode': True,
        },
        {
            'dataset': chatgpt_abstracts_raw,
            'column_name': 'real_word_count',
            'color': 'blue',
            'alpha': 0.6,
            'display': 'ChatGPT-Research-Abstracts, real (10k)',
            'mode': False,
        },
        {
            'dataset': chatgpt_abstracts_raw,
            'column_name': 'generated_word_count',
            'color': 'limegreen',
            'alpha': 0.8,
            'display': 'ChatGPT-Research-Abstracts, generated (10k)',
            'mode': True,
        },
    ],
    h_lines=[
        {'value': 18, 'color': 'grey', 'alpha': 0.8, 'text': 'IASS_10k ≈ 18', 'offset': (400, 1)},
    ],
    v_lines=[
        {'value': 360, 'color': 'grey', 'alpha': 0.8, 'text': 'WC = 360', 'offset': (5, 50)},
    ],
    start=50,
    end=600,
    x_label='WC (length of data points in words)',
    y_label='n (number of data points)',
    y_lim=(0, 75),
    legend_offset=1.02,
)

In [5]:
# Histogram of word counts for the arXiv set and for the real / generated columns of the
# cleaned ChatGPT-Research-Abstracts data.
# NOTE(review): start/end presumably restrict the histogram to word counts 350-600 --
# confirm against plot_histogram.
plot_histogram(
    plots=[
        {
            'dataset': arxiv,
            'column_name': 'word_count',
            'color': 'red',
            'alpha': 0.6,
            'display': 'arXiv-abstracts-2021 (~2m)',
        },
        {
            'dataset': chatgpt_abstracts_clean,
            'column_name': 'real_word_count',
            'color': 'blue',
            'alpha': 0.4,
            'display': 'ChatGPT-Research-Abstracts, real (10k)',
        },
        {
            'dataset': chatgpt_abstracts_clean,
            'column_name': 'generated_word_count',
            'color': 'limegreen',
            'alpha': 0.8,
            'display': 'ChatGPT-Research-Abstracts, generated (10k)',
        },
    ],
    start=350,
    end=600,
    x_label="WC (length of data points in words)",
    y_label="n (number of data points)",
    y_lim=(0, 52),
)

In [6]:
# Scatter of generated vs. real abstract word count on the raw ChatGPT-Research-Abstracts
# data, with vertical guides at x=325 and x=420 and a diagonal reference line.
# NOTE(review): each `correlations` entry presumably reports a correlation over its x
# 'interval', with 'spaces' controlling label padding -- confirm against plot_scatter.
plot_scatter(
    plots=[
        {
            'dataset': chatgpt_abstracts_raw,
            'x': 'real_word_count',
            'y': 'generated_word_count',
            'color': 'blue',
            'alpha': 0.1,
            'display': 'Data point in ChatGPT-Research-Abstracts',
        },
    ],
    correlations=[
        {'interval': (50, 325), 'spaces': (2, 2, 1), 'positioning': (400, 160), 'color': 'magenta', 'alpha': 0.8},
        {'interval': (325, 420), 'spaces': (0, 2, 1), 'positioning': (400, 120), 'color': 'green', 'alpha': 0.8},
        {'interval': (420, 600), 'spaces': (0, 4, 1), 'positioning': (400, 80), 'color': 'darkblue', 'alpha': 0.8},
        {'interval': (50, 600), 'spaces': (2, 0, 1), 'positioning': (400, 40), 'color': 'black', 'alpha': 0.95},
    ],
    d_lines=[
        {
            'start': (0, 0),
            'increment': (1, 1),
            'color': 'orange',
            'alpha': 0.8,
            'display': 'Perfect correlation',
            'offset': (0, 0),
        },
    ],
    v_lines=[
        {'value': 325, 'color': 'grey', 'alpha': 0.8, 'text': 'x=325', 'offset': (5, 520)},
        {'value': 420, 'color': 'grey', 'alpha': 0.8, 'text': 'x=420', 'offset': (5, 520)},
    ],
    x_label='x: Real abstract word count',
    y_label='y: Generated abstract word count',
    y_lim=(0, 600),
    legend_offset=(0.43, 0.95),
    average_curve={
        'color': 'red',
        'alpha': 0.8,
        'display': 'Average word count correlation',
        'offset': (10, 10),
    },
    sigma=2,
)

In [10]:
# Deviation curves of generated word count ('predicted') against real word count
# ('benchmark') on the raw ChatGPT-Research-Abstracts data, with guides at x=325 and x=420.
# The trailing spaces inside the *-text labels are deliberate padding and must be kept.
plot_loss_curves(
    plots=[
        {
            'dataset': chatgpt_abstracts_raw,
            'benchmark': 'real_word_count',
            'predicted': 'generated_word_count',
            'positive-color': 'blue',
            'negative-color': 'red',
            'alpha': 0.6,
            'positive-display': 'Mean absolute positive deviation (MAPD)',
            'negative-display': 'Mean absolute negative deviation (MAND)',
            'mean-abs-display': 'Mean absolute total deviation (MATD)',
            'mean-abs-color': 'purple',
        },
    ],
    deviations=[
        {'zero-text': 'Non-deviates:          ', 'positioning': (465, 30), 'color': 'black', 'alpha': 0.7},
        {'positive-text': 'Positive deviates:    ', 'positioning': (465, 20), 'color': 'blue', 'alpha': 0.8},
        {'negative-text': 'Negative deviates:  ', 'positioning': (465, 10), 'color': 'red', 'alpha': 0.8},
    ],
    v_lines=[
        {'value': 325, 'color': 'grey', 'alpha': 0.8, 'text': 'x=325', 'offset': (5, 175)},
        {'value': 420, 'color': 'grey', 'alpha': 0.8, 'text': 'x=420', 'offset': (5, 175)},
    ],
    x_label='Target word count',
    y_label='Average absolute deviation',
    legend_offset=(0.43, 0.95),
    sigma=5,
)

In [None]:
from scipy.ndimage import gaussian_filter1d
import numpy as np
import matplotlib.pyplot as plt

def calculate_mean_y_per_x(x_values, y_values):
    """Average the y values that share the same x value.

    Pairs are formed with ``zip``, so surplus items in the longer input are ignored.

    Returns:
        A pair of lists ``(unique_x, mean_y)``: the distinct x values in order of first
        appearance, and the mean of the y values seen for each of them.
    """
    totals = {}
    occurrences = {}
    for x_val, y_val in zip(x_values, y_values):
        totals[x_val] = totals.get(x_val, 0) + y_val
        occurrences[x_val] = occurrences.get(x_val, 0) + 1

    unique_x = list(totals)
    mean_y = [totals[key] / occurrences[key] for key in unique_x]
    return unique_x, mean_y

def plot_newline_frequencies(plots: list[dict], x_label=None, y_label=None, title=None,
                             legend_coords=(20, 0.05), sigma=2, text_coords=(0.53, 0.7), v_lines=None,
                             short_threshold=185):
    """Scatter words-per-paragraph against word count for every entry in ``plots``.

    A paragraph break is a blank line (two consecutive newline characters) in the text
    column; a text with ``b`` breaks is treated as having ``b + 1`` paragraphs. For each
    entry the raw points are drawn as a scatter, and the per-word-count mean, smoothed
    with ``gaussian_filter1d``, is drawn as a line on top.

    Args:
        plots: One dict per series with the keys ``'dataset'`` (iterable of rows indexable
            by column name), ``'column'`` (text column), ``'word_count'`` (word-count
            column), ``'color'``, ``'alpha'``, ``'display'`` (legend label) and
            ``'mean_color'`` (colour of the smoothed mean line).
        x_label: Optional x-axis label.
        y_label: Optional y-axis label.
        title: Optional axes title.
        legend_coords: Passed as ``bbox_to_anchor`` for an ``'upper right'`` legend.
        sigma: Standard deviation handed to ``gaussian_filter1d``.
        text_coords: Position of the summary text in axes coordinates.
        v_lines: Optional list of dicts with ``'value'``, ``'color'``, ``'text'``,
            ``'offset'`` (x shift from the line, absolute y) and an optional ``'alpha'``
            (defaults to 0.8).
        short_threshold: For each series, the number of texts with a word count below this
            value that still contain a paragraph break is printed.

    Raises:
        ZeroDivisionError: If a dataset in ``plots`` is empty.
    """
    with plt.style.context('ggplot'):
        _, ax = plt.subplots(figsize=(10, 6))

        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.3)

        averages = []
        total_paragraph_breaks = []

        for plot in plots:
            text_column = plot['column']
            count_column = plot['word_count']

            # One pass over the dataset gathers everything that is needed below.
            x_values = []
            y_values = []
            total_pb = 0
            short_with_breaks = 0
            for data_point in plot['dataset']:
                word_count = data_point[count_column]
                breaks = data_point[text_column].count('\n\n')
                x_values.append(word_count)
                y_values.append(word_count / (breaks + 1))
                total_pb += breaks
                if word_count < short_threshold and breaks > 0:
                    short_with_breaks += 1

            print(short_with_breaks)

            # Paragraphs = breaks + 1 per text, summed over all texts.
            total_p = total_pb + len(x_values)
            averages.append(sum(x_values) / total_p)
            total_paragraph_breaks.append(total_pb)

            scatter_plot = ax.scatter(x_values, y_values, alpha=plot['alpha'], color=plot['color'])
            scatter_plot.set_label(plot['display'])

            # Sort by x so the mean curve is drawn left to right before smoothing.
            x_unique, y_mean = calculate_mean_y_per_x(x_values, y_values)
            x_unique, y_mean = zip(*sorted(zip(x_unique, y_mean)))
            y_mean_filtered = gaussian_filter1d(y_mean, sigma)

            line_plot, = ax.plot(x_unique, y_mean_filtered, color=plot['mean_color'], alpha=0.8)
            line_plot.set_label(f"Mean {plot['display']}")

        # The summary is worded for a (real, generated) pair, so it is only drawn when at
        # least two series were supplied; fewer used to raise IndexError here.
        if len(averages) >= 2:
            ax.text(text_coords[0], text_coords[1],
                    f"Mean words per paragraph: Real = {averages[0]:.4f}, Generated = {averages[1]:.4f}\n"
                    f"Total paragraph breaks: Real = {total_paragraph_breaks[0]}, "
                    f"Generated = {total_paragraph_breaks[1]}",
                    transform=ax.transAxes, color='darkblue')

        if x_label:
            ax.set_xlabel(x_label)
        if y_label:
            ax.set_ylabel(y_label)
        if title:
            ax.set_title(title)

        if v_lines:
            for v_line in v_lines:
                # Honour the per-line 'alpha' key instead of a fixed 0.8.
                ax.axvline(v_line['value'], color=v_line['color'], linestyle='--',
                           alpha=v_line.get('alpha', 0.8))
                ax.text(v_line['value'] + v_line['offset'][0], v_line['offset'][1], v_line['text'],
                        color=v_line['color'])

        ax.legend(facecolor='white', loc='upper right', bbox_to_anchor=legend_coords)

    plt.show()



# Words per paragraph vs. word count for the real and generated abstracts of the cleaned
# ChatGPT-Research-Abstracts data, with a vertical guide at x=185.
plot_newline_frequencies(
    plots=[
        {
            'dataset': chatgpt_abstracts_clean,
            'column': 'real_abstract',
            'word_count': 'real_word_count',
            'color': 'lavender',
            'alpha': 0.8,
            'display': 'Real abstracts, (CRA-Real)',
            'mean_color': 'blue',
        },
        {
            'dataset': chatgpt_abstracts_clean,
            'column': 'generated_abstract',
            'word_count': 'generated_word_count',
            'color': 'mistyrose',
            'alpha': 0.8,
            'display': 'Generated abstracts (CRA-Generated)',
            'mean_color': 'red',
        },
    ],
    x_label='x: Word Count',
    y_label='y: Number of words per paragraph',
    sigma=4,
    legend_coords=(0.45, 0.95),
    text_coords=(0.039, 0.66),
    v_lines=[
        {'value': 185, 'color': 'lightgrey', 'alpha': 0.8, 'text': 'x=185', 'offset': (5, 280)},
    ],
)

2543
27
