# Plotting
This notebook produces plots of word-count distributions for the arXiv abstracts dataset and the ChatGPT research-abstracts dataset.

In [3]:
import random

from datasets import load_dataset
from data_processing import count_and_reformat

#### Load datasets

In [4]:
# Source corpus. count_and_reformat is a project helper; presumably it adds the
# 'word_count' column that the plotting cells read -- confirm in data_processing.
arxiv = count_and_reformat(
    dataset=load_dataset("gfissore/arxiv-abstracts-2021")['train'],
    count_column='abstract',
    retain_columns=['title', 'abstract'],
)

# Deduplicated research-abstracts CSV, loaded through the generic 'csv' builder.
chatgpt_abstracts = load_dataset(
    'csv',
    data_files='../../datasets/origins/research-abstracts/research_abstracts-deduplicated.csv',
)['train']


#substitutes = substitute_duplicates_uniform(abstracts, arxiv, 'title', 'word_count', 10000, 50, 600, 42)

Found cached dataset json (/Users/nicolaisivesind/.cache/huggingface/datasets/gfissore___json/gfissore--arxiv-abstracts-2021-23556c248bdbe0fc/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

 Counting words: 100%

Found cached dataset csv (/Users/nicolaisivesind/.cache/huggingface/datasets/csv/default-56ca7a3413583396/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

#### Plotting

In [5]:
# Rebinds chatgpt_abstracts to the "uniform" CSV; the plotting cells below read
# its 'real_word_count' and 'generated_word_count' columns.
chatgpt_abstracts = load_dataset(
    'csv',
    data_files='../../datasets/origins/research-abstracts/research_abstracts-uniform.csv',
)['train']

Found cached dataset csv (/Users/nicolaisivesind/.cache/huggingface/datasets/csv/default-702d18da4aab8662/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

In [97]:
#from data_analysis import plot_distribution, plot_histogram
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
from data_processing import sample_uniform_subset

# Prefer the native macOS backend, but do not fail on other platforms:
# force=False makes matplotlib keep its current backend instead of raising
# ImportError when 'MacOSX' cannot be set up.
matplotlib.use('MacOSX', force=False)


def _count_values(dataset, column_name, start, end):
    """Count how often each integer value in [start, end] occurs in ``column_name``.

    Returns an array of length ``end - start + 1`` where position ``v - start``
    holds the number of data points whose ``column_name`` value equals ``v``.
    Values outside [start, end] are ignored.
    """
    counts = np.zeros(end - start + 1)
    for data_point in dataset:
        value = data_point[column_name]
        if start <= value <= end:
            counts[value - start] += 1
    return counts


def plot_distribution(plots: list[dict], start, end, sigma=2, x_label=None, y_label=None, save_to=None, title=None, y_lim=None, h_lines=None, v_lines=None, legend_offset=1.0):
    """Plot Gaussian-smoothed value-count curves for one or more datasets.

    Args:
        plots: One dict per curve with the keys
            'dataset' (iterable of mappings), 'column_name' (key holding the
            integer value to count), 'display' (legend label), 'color' and
            'alpha'. The optional key 'mode' (default False) additionally
            marks the x-position of the curve's maximum with a dashed line
            and a text label.
        start: Smallest value included on the x-axis (inclusive).
        end: Largest value included on the x-axis (inclusive).
        sigma: Standard deviation passed to ``gaussian_filter1d``.
        x_label: Optional x-axis label.
        y_label: Optional y-axis label.
        save_to: Optional path; when given the figure is saved there.
        title: Optional axes title.
        y_lim: Optional y-axis limits passed to ``ax.set_ylim``.
        h_lines: Optional list of dicts describing dashed horizontal lines,
            with keys 'value', 'color', 'alpha', 'text' and 'offset'. The
            text is placed at x = offset[0], y = value + offset[1].
        v_lines: Optional list of dicts describing dashed vertical lines,
            with keys 'value', 'color', 'text' and 'offset', plus an optional
            'alpha' (default 0.8). The text is placed at
            x = value + offset[0], y = offset[1].
        legend_offset: x-coordinate of the legend anchor (``bbox_to_anchor``).
    """
    with plt.style.context('ggplot'):
        fig, ax = plt.subplots(figsize=(10, 6))

        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.3)

        # Set custom y-axis limits if provided
        if y_lim is not None:
            ax.set_ylim(y_lim)

        # Dashed horizontal reference lines with their annotations
        for h_line in h_lines or []:
            ax.axhline(h_line['value'], color=h_line['color'], linestyle='--', alpha=h_line['alpha'])
            ax.text(h_line['offset'][0], h_line['value'] + h_line['offset'][1], h_line['text'], color=h_line['color'])

        # Dashed vertical reference lines with their annotations; honour the
        # caller's 'alpha' when present instead of always using 0.8
        for v_line in v_lines or []:
            ax.axvline(v_line['value'], color=v_line['color'], linestyle='--', alpha=v_line.get('alpha', 0.8))
            ax.text(v_line['value'] + v_line['offset'][0], v_line['offset'][1], v_line['text'], color=v_line['color'])

        # The x-axis is the same for every curve, so build it once
        x_values = np.arange(start, end + 1)

        for plot in plots:
            counts = _count_values(plot['dataset'], plot['column_name'], start, end)

            # Apply the Gaussian filter
            smoothed_counts = gaussian_filter1d(counts, sigma)

            # Plot the smoothed data with a label for the legend
            ax.plot(x_values, smoothed_counts, label=plot['display'], alpha=plot['alpha'], color=plot['color'])

            # 'mode' is optional so that plot dicts without it do not raise KeyError
            if plot.get('mode', False):
                # Find the maximum y-value and its x-position on the smoothed curve
                max_y_index = np.argmax(smoothed_counts)
                max_y_value = smoothed_counts[max_y_index]
                max_x_value = x_values[max_y_index]

                # Draw the dashed marker in data coordinates (y = 0 up to the peak) so
                # its height stays correct when the y-limits do not start at 0 or are
                # rescaled by curves plotted later
                ax.vlines(max_x_value, 0, max_y_value, colors=plot['color'], linestyles='--', alpha=plot['alpha'])

                # Display the x-value at the base of the dashed line
                ax.text(max_x_value + 2 + 3 * len(str(max_x_value)), 0, f"{max_x_value}", color=plot['color'], ha='center', va='bottom')

        # Set labels and title if provided
        if x_label:
            ax.set_xlabel(x_label)
        if y_label:
            ax.set_ylabel(y_label)
        if title:
            ax.set_title(title)

        # Rotate x-axis labels for better readability
        ax.tick_params(axis='x', labelrotation=90)

        # Add a legend with an offset
        ax.legend(facecolor='white', bbox_to_anchor=(legend_offset, 1))

    # Adjust the margins of this figure (not whichever figure pyplot considers current)
    fig.subplots_adjust(left=0.07, bottom=0.143, right=0.93, top=0.943)

    if save_to:
        fig.savefig(save_to)
    plt.show()


def plot_histogram(plots: list[dict], start, end, sigma=2, save_to=None):
    """Draw overlaid histograms of the values in [start, end] for each dataset.

    Each entry of ``plots`` is a dict with the keys 'dataset', 'column_name',
    'alpha', 'display' (legend label) and 'color'. ``sigma`` is accepted but
    not used by this function. When ``save_to`` is given the figure is saved
    to that path before it is shown.
    """
    n_bins = end - start + 1

    with plt.style.context('ggplot'):
        figure, axes = plt.subplots(figsize=(10, 6))

        # Light grey, mostly transparent plot background
        axes.patch.set_facecolor('lightgrey')
        axes.patch.set_alpha(0.3)

        for spec in plots:
            # Keep only the values that fall inside the plotted range
            in_range = [
                value
                for row in spec['dataset']
                if start <= (value := row[spec['column_name']]) <= end
            ]

            axes.hist(
                in_range,
                bins=n_bins,
                range=(start, end),
                alpha=spec['alpha'],
                label=spec['display'],
                color=spec['color'],
            )

        # Axis captions
        axes.set_xlabel('Length of text in words')
        axes.set_ylabel('Number of texts')

        # Vertical tick labels are easier to read on a dense x-axis
        plt.xticks(rotation=90)

        axes.legend(facecolor='white')

    # Margins around the axes
    plt.subplots_adjust(left=0.07, bottom=0.143, right=0.93, top=0.943)

    # Optionally write the figure to disk, then display it
    if save_to:
        plt.savefig(save_to)
    plt.show()


# Draw the random 10k subset from a dedicated, seeded generator so the subset (and
# every plot built from it) is reproducible across kernel restarts, without
# touching the global random state.
arxiv_10k = random.Random(42).sample(arxiv, k=10000)
uniform = sample_uniform_subset(arxiv, 'word_count', 10000, 50, 600)

 Sorting into lists: 99%
 Sampling data points: 100%

In [79]:
# 'mode' is stated explicitly for every curve because plot_distribution reads it
# from each plot dict; none of these curves gets a mode marker.
plot_distribution(plots=[{'dataset': arxiv, 'column_name': 'word_count', 'color': 'red', 'alpha': 0.2,
                          'display': 'Source dataset (2 000 000 data points)', 'mode': False},
                         {'dataset': chatgpt_abstracts, 'column_name': 'real_word_count', 'color': 'blue', 'alpha': 0.6,
                          'display': 'Selected real abstracts (10 000 data points, uniform word count selection)',
                          'mode': False},
                         {'dataset': chatgpt_abstracts, 'column_name': 'generated_word_count', 'color': 'limegreen', 'alpha': 0.8,
                          'display': 'Generated abstracts (10 000 data points\n'
                                     '                                    Instructed to match real abstract word count)',
                          'mode': False}],
                  start=50,
                  end=600)

In [100]:
# Overlay the full arXiv data, its random 10k subset and both word-count columns of
# chatgpt_abstracts; curves with 'mode': True get a dashed marker at their maximum.
# NOTE(review): the vertical line is drawn at 364 while its label reads 'WC = 361' --
# confirm which value is intended.
plot_distribution(
    plots=[
        {
            'dataset': arxiv,
            'column_name': 'word_count',
            'color': 'darkmagenta',
            'alpha': 0.6,
            'display': 'arXiv-abstracts-2021 (~2m)',
            'mode': False,
        },
        {
            'dataset': arxiv_10k,
            'column_name': 'word_count',
            'color': 'red',
            'alpha': 0.6,
            'display': 'arXiv-abstracts-2021 random subset (10k)',
            'mode': True,
        },
        {
            'dataset': chatgpt_abstracts,
            'column_name': 'real_word_count',
            'color': 'blue',
            'alpha': 0.6,
            'display': 'ChatGPT-Research-Abstracts, real (10k)',
            'mode': False,
        },
        {
            'dataset': chatgpt_abstracts,
            'column_name': 'generated_word_count',
            'color': 'limegreen',
            'alpha': 0.8,
            'display': 'ChatGPT-Research-Abstracts, generated (10k)',
            'mode': True,
        },
    ],
    start=50,
    end=600,
    h_lines=[
        {'value': 18, 'color': 'grey', 'alpha': 0.8, 'text': 'IASS_10k ≈ 18', 'offset': (400, 1)},
    ],
    v_lines=[
        {'value': 364, 'color': 'grey', 'alpha': 0.8, 'text': 'WC = 361', 'offset': (5, 50)},
    ],
    x_label='WC (length of data points in words)',
    y_label='n (number of data points)',
    y_lim=(0, 75),
    legend_offset=1.02,
)

In [70]:
# Histogram of the upper word-count range (350-600) for the three series.
# The second plot dict is now closed with '}', which was missing and made the cell
# a SyntaxError.
plot_histogram(plots=[{'dataset': arxiv, 'column_name': 'word_count', 'color': 'red', 'alpha': 0.6,
                       'display': 'Source dataset (2 000 000 data points)'},
                      {'dataset': chatgpt_abstracts, 'column_name': 'real_word_count', 'color': 'blue', 'alpha': 0.4,
                       'display': 'Selected real abstracts (10 000 data points)'},
                      {'dataset': chatgpt_abstracts, 'column_name': 'generated_word_count', 'color': 'limegreen', 'alpha': 0.8,
                       'display': 'Generated abstracts (10 000 data points\n'
                                  'Instructed to match real abstract word counts)'}],
               start=350,
               end=600)