# Plotting
This notebook produces plots of word-count distributions, correlations and deviations for the arXiv abstracts dataset and the ChatGPT research-abstracts dataset.

In [1]:
import random

from datasets import load_dataset
from data_processing import count_and_reformat

#### Load datasets

In [2]:
# Take the 'train' split of the arXiv abstracts dataset and hand it to count_and_reformat,
# passing 'abstract' as the column to count and retaining only 'title' and 'abstract'.
arxiv_train_split = load_dataset("gfissore/arxiv-abstracts-2021")['train']
arxiv = count_and_reformat(
    dataset=arxiv_train_split,
    count_column='abstract',
    retain_columns=['title', 'abstract'],
)

Found cached dataset json (/Users/nicolaisivesind/.cache/huggingface/datasets/gfissore___json/gfissore--arxiv-abstracts-2021-23556c248bdbe0fc/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

 Counting words: 100%

#### Plotting

In [3]:
# Location of the raw research-abstracts CSV, relative to this notebook.
chatgpt_abstracts_csv = '../../datasets/origins/research-abstracts/research_abstracts-final-raw.csv'

# The CSV loader exposes the file's rows under the 'train' split.
chatgpt_abstracts = load_dataset('csv', data_files=chatgpt_abstracts_csv)['train']

Found cached dataset csv (/Users/nicolaisivesind/.cache/huggingface/datasets/csv/default-94b8f040614b0746/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

In [82]:
#from data_analysis import plot_distribution, plot_histogram
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
from data_processing import sample_uniform_subset, completion_bar
from data_analysis import calculate_mean_y_per_x, plot_distribution, plot_histogram, plot_scatter

# Select the native macOS GUI backend only when running on macOS. matplotlib.use()
# raises ImportError when the requested backend cannot be set up, so an unconditional
# call would make this cell fail on Linux/Windows; elsewhere the default backend is kept.
import sys

if sys.platform == 'darwin':
    matplotlib.use('MacOSX')

def _plot_smoothed_mean_curve(ax, losses, plot, kind, sigma):
    """Draw the Gaussian-smoothed per-x mean of ``losses`` on ``ax``; no-op when ``losses`` is empty.

    ``losses`` is a list of ``(x, absolute_deviation)`` pairs. The curve's label and colour are
    read from ``plot[f'{kind}-display']`` and ``plot[f'{kind}-color']``, its opacity from
    ``plot['alpha']``. The keys are only looked up when there is something to draw.
    """
    if not losses:
        return

    x_values, y_values = zip(*losses)
    x_unique, y_mean = calculate_mean_y_per_x(x_values, y_values)
    # Order the points by x so the curve is smoothed and drawn from left to right.
    x_sorted, y_sorted = zip(*sorted(zip(x_unique, y_mean)))
    # gaussian_filter1d returns an array of the input dtype; casting to float keeps
    # integer-valued means from being truncated by the smoothing.
    y_smoothed = gaussian_filter1d(np.asarray(y_sorted, dtype=float), sigma)
    ax.plot(x_sorted, y_smoothed,
            label=plot[f'{kind}-display'], color=plot[f'{kind}-color'], alpha=plot['alpha'])


def plot_loss_curves(plots, deviations=None, x_label=None, y_label=None, legend_offset=(1.0, 1.0), sigma=2):
    """Show one figure per entry in ``plots`` with smoothed mean absolute deviation curves.

    For every data point the deviation is ``data_point[predicted] - data_point[benchmark]``.
    Three curves are drawn against the benchmark value: the mean absolute deviation over all
    data points ('mean-abs'), over points with a positive deviation ('positive') and over
    points with a negative deviation ('negative').

    Args:
        plots: Iterable of dicts with the keys 'dataset' (iterable of mappings), 'benchmark'
            and 'predicted' (keys into each data point), 'alpha', and for each of the kinds
            'mean-abs', 'positive' and 'negative' a '<kind>-display' label and a
            '<kind>-color' colour.
        deviations: Optional iterable of dicts describing count annotations. Each dict holds
            'positioning' (x, y), 'color', 'alpha' and any of 'zero-text', 'positive-text',
            'negative-text'; the matching count is appended to the given text.
        x_label: Optional x-axis label.
        y_label: Optional y-axis label.
        legend_offset: (x, y) passed to the legend as ``bbox_to_anchor``.
        sigma: Standard deviation of the Gaussian kernel used to smooth each curve.

    Returns:
        None. Each figure is displayed with ``plt.show()``.
    """
    for plot in plots:
        positive_loss = []
        negative_loss = []
        all_losses = []
        zero_loss = 0

        for data_point in plot['dataset']:
            benchmark_value = data_point[plot['benchmark']]
            predicted_value = data_point[plot['predicted']]
            loss = predicted_value - benchmark_value

            # Every point counts towards the total mean absolute deviation, including exact
            # matches (deviation 0). Leaving those out would bias the total curve upwards.
            all_losses.append((benchmark_value, abs(loss)))

            if loss > 0:
                positive_loss.append((benchmark_value, abs(loss)))
            elif loss < 0:
                negative_loss.append((benchmark_value, abs(loss)))
            else:
                zero_loss += 1

        with plt.style.context('ggplot'):
            fig, ax = plt.subplots(figsize=(10, 6))

            ax.patch.set_facecolor('lightgrey')
            ax.patch.set_alpha(0.3)

            _plot_smoothed_mean_curve(ax, all_losses, plot, 'mean-abs', sigma)
            _plot_smoothed_mean_curve(ax, positive_loss, plot, 'positive', sigma)
            _plot_smoothed_mean_curve(ax, negative_loss, plot, 'negative', sigma)

            # Count shown after each annotation text, keyed by the text key that selects it.
            counts = {'zero-text': zero_loss,
                      'positive-text': len(positive_loss),
                      'negative-text': len(negative_loss)}
            for deviation in deviations or []:
                for text_key, count in counts.items():
                    if text_key in deviation:
                        x_pos, y_pos = deviation['positioning'][0], deviation['positioning'][1]
                        ax.text(x_pos, y_pos, f"{deviation[text_key]} {count}",
                                color=deviation['color'], alpha=deviation['alpha'])

            if x_label:
                ax.set_xlabel(x_label)
            if y_label:
                ax.set_ylabel(y_label)

            plt.subplots_adjust(left=0.07, bottom=0.143, right=0.93, top=0.943)
            # Only build a legend when at least one labelled curve was drawn (empty dataset otherwise).
            handles, _ = ax.get_legend_handles_labels()
            if handles:
                ax.legend(facecolor='white', bbox_to_anchor=(legend_offset[0], legend_offset[1]))
            plt.show()




In [5]:
# Seed the global RNG so the random 10k subset is identical on every Restart & Run All;
# without a seed random.sample() returns a different subset each run.
SAMPLING_SEED = 42
random.seed(SAMPLING_SEED)

arxiv_10k = random.sample(arxiv, k=10000)
# NOTE(review): presumably sample_uniform_subset also draws from the `random` module and is
# covered by the seed above; if it uses another RNG, seed that one too. TODO confirm.
uniform = sample_uniform_subset(arxiv, 'word_count', 10000, 50, 600)

 Sorting into lists: 99%
 Sampling data points: 100%

In [171]:
# Word count at which the vertical marker line is drawn.
# NOTE(review): the label previously read 'WC = 361' although the line was drawn at 364.
# The label is now derived from the plotted value so the two cannot disagree; confirm that
# 364 (and not 361) is the intended word count.
wc_marker = 364

plot_distribution(plots=[{'dataset': arxiv, 'column_name': 'word_count', 'color': 'darkmagenta', 'alpha': 0.6,
                          'display': 'arXiv-abstracts-2021 (~2m)', 'mode':False},
                         {'dataset': arxiv_10k, 'column_name': 'word_count', 'color': 'red', 'alpha': 0.6,
                          'display': 'arXiv-abstracts-2021 random subset (10k)', 'mode':True},
                         {'dataset': chatgpt_abstracts, 'column_name': 'real_word_count', 'color': 'blue', 'alpha': 0.6,
                          'display': 'ChatGPT-Research-Abstracts, real (10k)', 'mode':False},
                         {'dataset': chatgpt_abstracts, 'column_name': 'generated_word_count', 'color': 'limegreen', 'alpha': 0.8,
                          'display': 'ChatGPT-Research-Abstracts, generated (10k)', 'mode':True}],
                   h_lines=[{'value': 18, 'color': 'grey', 'alpha': 0.8, 'text': 'IASS_10k ≈ 18', 'offset': (400, 1)}],
                   v_lines=[{'value': wc_marker, 'color': 'grey', 'alpha': 0.8, 'text': f'WC = {wc_marker}', 'offset': (5, 50)}],
                   start=50,
                   end=600,
                   x_label='WC (length of data points in words)',
                   y_label='n (number of data points)',
                   y_lim=(0, 75),
                   legend_offset=1.02)

In [172]:
# Series passed to the histogram: the full source dataset plus the real and generated
# word counts of the ChatGPT research-abstracts dataset.
histogram_series = [
    {
        'dataset': arxiv,
        'column_name': 'word_count',
        'color': 'red',
        'alpha': 0.6,
        'display': 'Source dataset (2 000 000 data points)',
    },
    {
        'dataset': chatgpt_abstracts,
        'column_name': 'real_word_count',
        'color': 'blue',
        'alpha': 0.4,
        'display': 'Selected real abstracts (10 000 data points)',
    },
    {
        'dataset': chatgpt_abstracts,
        'column_name': 'generated_word_count',
        'color': 'limegreen',
        'alpha': 0.8,
        'display': 'Generated abstracts (10 000 data points\n'
                   'Instructed to match real abstract word counts)',
    },
]

# Plot with start=350 and end=600, as in the original call.
plot_histogram(plots=histogram_series, start=350, end=600)

In [86]:
# Scatter of generated vs. real abstract word counts with per-interval correlation labels.
# Each label text states the same bounds as the 'interval' it is paired with; the labels
# previously said 400 where the intervals passed to plot_scatter use 350.
plot_scatter(plots=[{'dataset': chatgpt_abstracts, 'x': 'real_word_count', 'y': 'generated_word_count', 'color': 'blue', 'alpha': 0.1,
                     'display': 'Data point in ChatGPT-Research-Abstracts'}],
             correlations=[{'interval': (50, 350), 'text': '∀x ∈ [50, 350],   r =', 'positioning':(400, 60), 'color':'magenta', 'alpha':0.8},
                           {'interval': (350, 600), 'text': '∀x ∈ [350, 600], r =', 'positioning':(400, 40), 'color':'green', 'alpha':0.8},
                           {'interval': (50, 600), 'text': '∀x ∈ [50, 600],   r =', 'positioning':(400, 20), 'color':'darkblue', 'alpha':0.8}],
             d_lines=[{'start': (0, 0), 'increment': (1, 1), 'color': 'orange', 'alpha': 0.8, 'display': 'Perfect correlation', 'offset': (0, 0)}],
             h_lines=[{'value': 18, 'color': 'grey', 'alpha': 0.8, 'text': 'IASS_10k ≈ 18', 'offset': (400, 1)}],
             x_label='x: Real abstract word count',
             y_label='y: Generated abstract word count',
             y_lim=(0, 600),
             legend_offset=(0.5, 0.95),
             average_curve={'color': 'red', 'alpha': 0.8, 'display': 'Average word count correlation', 'offset': (10, 10)},
             sigma=2)

NameError: name 'pearsonr' is not defined

In [84]:
# Deviation curves for generated vs. real word counts. The count annotations use the same
# colour as the curve they refer to (positive = blue, negative = red); they were previously
# swapped relative to 'positive-color' / 'negative-color'.
plot_loss_curves(plots=[{'dataset': chatgpt_abstracts,
                         'benchmark': 'real_word_count', 'predicted': 'generated_word_count',
                         'positive-color': 'blue','negative-color': 'red', 'alpha': 0.6,
                         'positive-display': 'Mean absolute positive deviation (MAPD)',
                         'negative-display': 'Mean absolute negative deviation (MAND)',
                         'mean-abs-display': 'Mean absolute total deviation (MATD)',
                         'mean-abs-color': 'purple'}],
                 deviations=[{'zero-text': 'Non-deviates:          ', 'positioning':(465, 30), 'color':'black', 'alpha':0.7},
                             {'positive-text': 'Positive deviates:    ', 'positioning':(465, 20), 'color':'blue', 'alpha':0.8},
                             {'negative-text': 'Negative deviates:  ', 'positioning':(465, 10), 'color':'red', 'alpha':0.8}],
                 x_label='Target word count',
                 y_label='Average absolute deviation',
                 legend_offset=(0.5, 0.95),
                 sigma=5)