# Execution file for generating Research Abstract Dataset
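See 'dataset_generator.py' for the implementation. The helper functions used below are imported from the `data_processing` and `data_generation` modules.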

In [1]:
import datasets as ds
from data_processing import count_and_reformat, filter_list, sample_uniform_subset, substitute_duplicates_uniform
from data_generation import generate_abstracts, generate_GPT_abstract, get_models

#### Load and preprocess dataset
This will take some time if the dataset is large. It only needs to be run once per Jupyter session, since the resulting variables stay in memory until the session/kernel is shut down.

In [2]:
# Load the arXiv abstracts corpus and keep only its 'train' split.
arxiv_splits = ds.load_dataset("gfissore/arxiv-abstracts-2021")
dataset = arxiv_splits['train']

# Run the counting/reformatting helper over the 'abstract' column, retaining
# only the 'title' and 'abstract' columns.
# NOTE(review): later cells read a 'word_count' column from this result, so
# count_and_reformat presumably adds it -- confirm in data_processing.
reformatted_dataset = count_and_reformat(
    dataset=dataset,
    count_column='abstract',
    retain_columns=['title', 'abstract'],
)

Found cached dataset json (/Users/nicolaisivesind/.cache/huggingface/datasets/gfissore___json/gfissore--arxiv-abstracts-2021-23556c248bdbe0fc/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


  0%|          | 0/1 [00:00<?, ?it/s]

 Counting words: 100%

#### Run code segments

In [3]:
# Draw a 10000-row subset from the reformatted dataset, keyed on 'word_count'
# with start=50 and end=600.
# NOTE(review): presumably the sample is uniform over word counts in that
# range and the bounds are word counts -- confirm in data_processing.
uniform_subset = sample_uniform_subset(
    dataset=reformatted_dataset,
    column='word_count',
    start=50,
    end=600,
    subset_size=10000,
)

 Sorting into lists: 99%
 Sampling data points: 100%

In [None]:
# Generate abstracts for every row of the sampled subset and write them to
# 'research_abstracts-uniform-clean' inside the research-abstracts directory.
# The recorded output shows this call creating a blank CSV when none exists
# and re-attempting after API errors, so expect it to run for a long time.
generate_abstracts(
    data=uniform_subset,
    target_file_name='research_abstracts-uniform-clean',
    target_dir_path='./../../datasets/origins/research-abstracts',
)

No file already exists. Creating blank CSV

 Generating: 2418/10000
 API-error. Reattempting API-call
 Generating: 2709/10000
 API-error. Reattempting API-call
 Generating: 2755/10000

In [3]:
# Reload the previously generated CSV from disk and take its 'train' split.
generated_splits = ds.load_dataset(
    'csv',
    data_files='../../datasets/origins/research-abstracts/research_abstracts-uniform.csv',
)
dataset_2 = generated_splits['train']

# Ask for substitute rows from the full reformatted dataset.
# NOTE(review): the trailing positional values 10000, 50, 600 equal the
# subset_size/start/end used when sampling, and 42 looks like a random seed --
# confirm the parameter order against substitute_duplicates_uniform.
substitutes = substitute_duplicates_uniform(
    dataset_2, reformatted_dataset, 'title', 'word_count', 10000, 50, 600, 42
)

# Combine the distinct titles already on disk with the substitutes' titles
# (newlines stripped from the latter).
existing_titles = dataset_2.unique('title')
substitute_titles = [row['title'].replace('\n', '') for row in substitutes]
titles = existing_titles + substitute_titles

# Sanity check: both counts are equal when no title occurs more than once.
print('list: ', len(titles))
print('unique: ', len(set(titles)))

# Generate abstracts for the substitutes only, targeting the same
# 'research_abstracts-uniform' file name that was loaded above.
generate_abstracts(
    data=substitutes,
    target_file_name='research_abstracts-uniform',
    target_dir_path='./../../datasets/origins/research-abstracts',
)


Found cached dataset csv (/Users/nicolaisivesind/.cache/huggingface/datasets/csv/default-b1e87654a6a27fb5/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

 Sorting into lists: 99%
 Sampling substitutes: 100%list:  10000
unique:  10000
CSV-file already exists. Will append new rows to existing document. Cancel execution if this is not intended.

 Generating: 27/27
Abstract generation complete.


