# Plotting
This book is used for producing various plots related to data.

In [3]:
import random

from datasets import load_dataset
from data_processing import count_and_reformat

#### Load datasets

In [4]:
arxiv = count_and_reformat(dataset=load_dataset("gfissore/arxiv-abstracts-2021")['train'],
                           count_column='abstract',
                           retain_columns=['title', 'abstract'])

chatgpt_abstracts = load_dataset('csv', data_files='../../datasets/origins/research-abstracts/research_abstracts-deduplicated.csv')[
    'train']


#substitutes = substitute_duplicates_uniform(abstracts, arxiv, 'title', 'word_count', 10000, 50, 600, 42)

Found cached dataset json (/Users/nicolaisivesind/.cache/huggingface/datasets/gfissore___json/gfissore--arxiv-abstracts-2021-23556c248bdbe0fc/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

 Counting words: 100%

Found cached dataset csv (/Users/nicolaisivesind/.cache/huggingface/datasets/csv/default-56ca7a3413583396/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

#### Plotting

In [163]:
chatgpt_abstracts = load_dataset('csv', data_files='../../datasets/origins/research-abstracts/research_abstracts-uniform-clean.csv')[
    'train']

Downloading and preparing dataset csv/default to /Users/nicolaisivesind/.cache/huggingface/datasets/csv/default-626230864bac12a1/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /Users/nicolaisivesind/.cache/huggingface/datasets/csv/default-626230864bac12a1/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset csv (/Users/nicolaisivesind/.cache/huggingface/datasets/csv/default-626230864bac12a1/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

In [164]:
#from data_analysis import plot_distribution, plot_histogram
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
from data_processing import sample_uniform_subset, completion_bar

matplotlib.use('MacOSX')


def plot_distribution(plots: list[dict], start, end, sigma=2, x_label=None, y_label=None, save_to=None, title=None, y_lim=None, h_lines=None, v_lines=None, legend_offset=1.0):
    # Set the plot style
    with plt.style.context('ggplot'):
        # Create the figure and axis objects
        fig, ax = plt.subplots(figsize=(10, 6))

        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.3)

        # Set custom y-axis limits if provided
        if y_lim:
            ax.set_ylim(y_lim)

        # Set horizontal lines if provided
        if h_lines:
            for h_line in h_lines:
                ax.axhline(h_line['value'], color=h_line['color'], linestyle='--', alpha=h_line['alpha'])
                ax.text(h_line['offset'][0], h_line['value'] + h_line['offset'][1], h_line['text'], color=h_line['color'])

        # Set vertical lines if provided
        if v_lines:
            for v_line in v_lines:
                ax.axvline(v_line['value'], color=v_line['color'], linestyle='--', alpha=0.8)
                ax.text(v_line['value'] + v_line['offset'][0], v_line['offset'][1], v_line['text'], color=v_line['color'])

        for i, plot in enumerate(plots):
            counts = np.zeros(end - start + 1)

            for data_point in plot['dataset']:
                count = data_point[plot['column_name']]

                if start <= count <= end:
                    index = data_point[plot['column_name']] - start
                    counts[index] += 1

            # Apply the Gaussian filter
            smoothed_counts = gaussian_filter1d(counts, sigma)

            # Plot the smoothed data with a label for the legend
            x_values = np.arange(start, end + 1)
            ax.plot(x_values, smoothed_counts, label=plot['display'], alpha=plot['alpha'], color=plot['color'])

            if plot['mode']:
                # Find the maximum y-value and its index in the smoothed_counts array
                max_y_index = np.argmax(smoothed_counts)
                max_y_value = smoothed_counts[max_y_index]
                max_x_value = x_values[max_y_index]

                # Draw a dashed line from the maximum y-value to the x-axis
                ax.axvline(max_x_value, ymin=0, ymax=max_y_value / ax.get_ylim()[1], color=plot['color'], linestyle='--', alpha=plot['alpha'])

                # Display the x-value at the base of the dashed line
                ax.text(max_x_value + 2 + 3 * len(str(max_x_value)), 0, f"{max_x_value}", color=plot['color'], ha='center', va='bottom')

        # Set labels and title if provided
        if x_label:
            ax.set_xlabel(x_label)
        if y_label:
            ax.set_ylabel(y_label)
        if title:
            ax.set_title(title)

        # Rotate x-axis labels for better readability
        plt.xticks(rotation=90)

        # Add a legend with an offset
        ax.legend(facecolor='white', bbox_to_anchor=(legend_offset, 1))

    # Display the plot
    plt.subplots_adjust(left=0.07, bottom=0.143, right=0.93, top=0.943)

    if save_to:
        plt.savefig(save_to)
    plt.show()


def plot_histogram(plots: list[dict], start, end, sigma=2, save_to=None):
    bins = end-start+1

    # Set the plot style
    with plt.style.context('ggplot'):
        # Create the figure and axis objects
        fig, ax = plt.subplots(figsize=(10, 6))

        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.3)

        for i, plot in enumerate(plots):
            counts = []

            for data_point in plot['dataset']:
                count = data_point[plot['column_name']]

                if start <= count <= end:
                    counts.append(count)

            ax.hist(counts, bins=bins, range=(start, end), alpha=plot['alpha'], label=plot['display'], color=plot['color'])

        # Set labels and title
        ax.set_xlabel('Length of text in words')
        ax.set_ylabel('Number of texts')

        # Rotate x-axis labels for better readability
        plt.xticks(rotation=90)

        # Add a legend
        ax.legend(facecolor='white')

    # Adjust the plot margins
    plt.subplots_adjust(left=0.07, bottom=0.143, right=0.93, top=0.943)

    # Save and display the plot
    if save_to:
        plt.savefig(save_to)
    plt.show()

def calculate_mean_y_per_x(x_values, y_values):
    xy_dict = {}
    for x, y in zip(x_values, y_values):
        if x not in xy_dict:
            xy_dict[x] = {'sum': 0, 'count': 0}
        xy_dict[x]['sum'] += y
        xy_dict[x]['count'] += 1

    mean_y_values = {x: y_data['sum'] / y_data['count'] for x, y_data in xy_dict.items()}
    return list(mean_y_values.keys()), list(mean_y_values.values())

def plot_scatter(plots: list[dict], d_lines=None, h_lines=None, v_lines=None, x_label=None, y_label=None, y_lim=None, legend_offset=(1.0, 1.0), average_curve=None, sigma=2, correlations=None):
    with plt.style.context('ggplot'):
        fig, ax = plt.subplots(figsize=(10, 6))

        ax.patch.set_facecolor('lightgrey')
        ax.patch.set_alpha(0.3)

        if y_lim:
            ax.set_ylim(y_lim)

        if h_lines:
            for h_line in h_lines:
                ax.axhline(h_line['value'], color=h_line['color'], linestyle='--', alpha=h_line['alpha'])
                ax.text(h_line['offset'][0], h_line['value'] + h_line['offset'][1], h_line['text'], color=h_line['color'])

        if v_lines:
            for v_line in v_lines:
                ax.axvline(v_line['value'], color=v_line['color'], linestyle='--', alpha=0.8)
                ax.text(v_line['value'] + v_line['offset'][0], v_line['offset'][1], v_line['text'], color=v_line['color'])

        for plot in plots:
            x_values = [data_point[plot['x']] for data_point in plot['dataset']]
            y_values = [data_point[plot['y']] for data_point in plot['dataset']]

            ax.scatter(x_values, y_values, label=plot['display'], alpha=plot['alpha'], color=plot['color'])

            if average_curve:
                x_unique, y_mean = calculate_mean_y_per_x(x_values, y_values)
                x_unique, y_mean = zip(*sorted(zip(x_unique, y_mean)))
                y_mean_filtered = gaussian_filter1d(y_mean, sigma)
                ax.plot(x_unique, y_mean_filtered, label=average_curve['display'], color=average_curve['color'], alpha=average_curve['alpha'])

            if correlations:
                for correlation in correlations:
                    interval = correlation.get('interval', (min(x_values), max(x_values)))
                    x_interval_values = [x for x in x_values if interval[0] <= x <= interval[1]]
                    y_interval_values = [y for x, y in zip(x_values, y_values) if interval[0] <= x <= interval[1]]
                    print(len(x_interval_values))

                    corr = np.corrcoef(x_interval_values, y_interval_values)[0, 1]
                    ax.text(correlation['positioning'][0], correlation['positioning'][1], f"{correlation['text']} {corr:.2f}", color=correlation['color'], alpha=correlation['alpha'])

        if d_lines:
            for d_line in d_lines:
                x_start, y_start = d_line['start']
                x_increment, y_increment = d_line['increment']
                x_max = max(ax.get_xlim())
                y_max = max(ax.get_ylim())

                x_end = min(x_max, (y_max - y_start) / y_increment * x_increment + x_start)
                y_end = x_end * y_increment / x_increment + y_start - x_start * y_increment / x_increment

                ax.plot([x_start, x_end], [y_start, y_end], label=d_line['display'], color=d_line['color'], linestyle='--', alpha=d_line['alpha'])


        if x_label:
            ax.set_xlabel(x_label)
        if y_label:
            ax.set_ylabel(y_label)

    ax.legend(facecolor='white', bbox_to_anchor=(legend_offset[0], legend_offset[1]))
    plt.show()



def plot_loss_curves(plots, x_label=None, y_label=None, legend_offset=(1.0, 1.0), sigma=2):
    for plot in plots:
        dataset = plot['dataset']
        positive_loss = []
        negative_loss = []

        for data_point in dataset:
            real_word_count = data_point[plot['benchmark']]
            generated_word_count = data_point[plot['predicted']]
            loss = real_word_count - generated_word_count

            if loss > 0:
                positive_loss.append((real_word_count, abs(loss)))
            else:
                negative_loss.append((real_word_count, abs(loss)))

        with plt.style.context('ggplot'):
            fig, ax = plt.subplots(figsize=(10, 6))

            ax.patch.set_facecolor('lightgrey')
            ax.patch.set_alpha(0.3)

            if positive_loss:
                x_values, y_values = zip(*positive_loss)
                x_unique, y_mean = calculate_mean_y_per_x(x_values, y_values)
                x_unique, y_mean = zip(*sorted(zip(x_unique, y_mean)))
                y_mean_filtered = gaussian_filter1d(y_mean, sigma)
                ax.plot(x_unique, y_mean_filtered, label=plot['positive-display'], color=plot['positive-color'], alpha=plot['alpha'])

            if negative_loss:
                x_values, y_values = zip(*negative_loss)
                x_unique, y_mean = calculate_mean_y_per_x(x_values, y_values)
                x_unique, y_mean = zip(*sorted(zip(x_unique, y_mean)))
                y_mean_filtered = gaussian_filter1d(y_mean, sigma)
                ax.plot(x_unique, y_mean_filtered, label=plot['negative-display'], color=plot['negative-color'], alpha=plot['alpha'])

            if x_label:
                ax.set_xlabel(x_label)
            if y_label:
                ax.set_ylabel(y_label)

            ax.legend(facecolor='white', bbox_to_anchor=(legend_offset[0], legend_offset[1]))
            plt.show()




In [165]:
arxiv_10k = random.sample(arxiv, k=10000)
uniform = sample_uniform_subset(arxiv, 'word_count', 10000, 50, 600)

 Sorting into lists: 99%
 Sampling data points: 100%

In [None]:
plot_distribution(plots=[{'dataset': arxiv, 'column_name': 'word_count', 'color': 'red', 'alpha': 0.2,
                       'display': 'Source dataset (2 000 000 data points)'},
                         {'dataset': chatgpt_abstracts, 'column_name': 'real_word_count', 'color': 'blue', 'alpha': 0.6,
                       'display': 'Selected real abstracts (10 000 data points, uniform word count selection)'},
                      {'dataset': chatgpt_abstracts, 'column_name': 'generated_word_count', 'color': 'limegreen', 'alpha': 0.8,
                       'display': 'Generated abstracts (10 000 data points\n'
                                  '                                    Instructed to match real abstract word count)'}],
                     start=50,
                     end=600)

In [105]:
plot_distribution(plots=[{'dataset': arxiv, 'column_name': 'word_count', 'color': 'darkmagenta', 'alpha': 0.6,
                          'display': 'arXiv-abstracts-2021 (~2m)', 'mode':False},
                         {'dataset': arxiv_10k, 'column_name': 'word_count', 'color': 'red', 'alpha': 0.6,
                          'display': 'arXiv-abstracts-2021 random subset (10k)', 'mode':True},
                         {'dataset': chatgpt_abstracts, 'column_name': 'real_word_count', 'color': 'blue', 'alpha': 0.6,
                          'display': 'ChatGPT-Research-Abstracts, real (10k)', 'mode':False},
                         {'dataset': chatgpt_abstracts, 'column_name': 'generated_word_count', 'color': 'limegreen', 'alpha': 0.8,
                          'display': 'ChatGPT-Research-Abstracts, generated (10k)', 'mode':True}],
                   h_lines=[{'value': 18, 'color': 'grey', 'alpha': 0.8, 'text': 'IASS_10k ≈ 18', 'offset': (400, 1)}],
                   v_lines=[{'value': 364, 'color': 'grey', 'alpha': 0.8, 'text': 'WC = 361', 'offset': (5, 50)}],
                   start=50,
                   end=600,
                   x_label='WC (length of data points in words)',
                   y_label='n (number of data points)',
                   y_lim=(0, 75),
                   legend_offset=1.02)

In [106]:
plot_histogram(plots=[{'dataset': arxiv, 'column_name': 'word_count', 'color': 'red', 'alpha': 0.6,
                       'display': 'Source dataset (2 000 000 data points)'},
                      {'dataset': chatgpt_abstracts, 'column_name': 'real_word_count', 'color': 'blue', 'alpha': 0.4,
                       'display': 'Selected real abstracts (10 000 data points)'},
                      {'dataset': chatgpt_abstracts, 'column_name': 'generated_word_count', 'color': 'limegreen', 'alpha': 0.8,
                       'display': 'Generated abstracts (10 000 data points\n'
                                  'Instructed to match real abstract word counts)'}],
                     start=350,
                     end=600)

In [None]:
plot_scatter(plots=[{'dataset': chatgpt_abstracts, 'x': 'real_word_count', 'y': 'generated_word_count', 'color': 'blue', 'alpha': 0.1,
                     'display': 'Data point in ChatGPT-Research-Abstracts'}],
             correlations=[{'interval': (50, 400), 'text': 'r of x ∈ [50, 400]   =', 'positioning':(460, 60), 'color':'magenta', 'alpha':0.8},
                           {'interval': (400, 600), 'text': 'r of x ∈ [400, 600] =', 'positioning':(460, 40), 'color':'green', 'alpha':0.8},
                           {'interval': (50, 600), 'text': 'r of x ∈ [50, 600]   =', 'positioning':(460, 20), 'color':'darkblue', 'alpha':0.8}],
             d_lines=[{'start': (0, 0), 'increment': (1, 1), 'color': 'orange', 'alpha': 0.8, 'display': 'Perfect correlation', 'offset': (0, 0)}],
             x_label='x: Real abstract word count',
             y_label='y: Generated abstract word count',
             y_lim=(0, 600),
             legend_offset=(0.5, 0.95),
             average_curve={'color': 'red', 'alpha': 0.8, 'display': 'Average word count correlation', 'offset': (10, 10)},
             sigma=2)

9858
143
10000


In [99]:


plot_loss_curves(plots=[{'dataset': chatgpt_abstracts, 'benchmark': 'real_word_count', 'predicted': 'generated_word_count', 'positive-color': 'blue','negative-color': 'red', 'alpha': 0.6, 'positive-display': 'Positive loss', 'negative-display': 'Negative loss'}],
                x_label='Absolute loss',
                y_label='Word count goal',
                legend_offset=(0.35, 0.95),
                sigma=2)