## How to get all preprints which have not been published in a journal yet?

We use [OpenAlex](https://openalex.org) to retrieve all articles that are preprints and have not yet been published in a peer-reviewed journal.

In [None]:
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
from itertools import chain
import pandas as pd
import pyalex

In [None]:
# Query parameters. Narrowing them keeps the output small and the run short.
year = 2020     # publication year to query
n_max = None    # maximum number of works to fetch; None fetches all of them

In [None]:
# Articles from `year` whose primary location holds the submitted version,
# ordered from most to least cited.
query = (
    Works()
    .filter(
        type="article",
        publication_year=year,
        primary_location={'version': 'submittedVersion'},
    )
    .sort(cited_by_count="desc")
)

# Column order of the exported CSV. Passing it explicitly to the DataFrame
# keeps the header intact even when the query returns no records.
columns = [
    'Title',
    'Publication Date',
    'DOI',
    'Host Organization',
    'Cited by Count',
    'Locations Count',
]


def _work_to_row(item):
    """Flatten one work record (a dict) into a row keyed by the CSV columns.

    Missing fields become None. 'Host Organization' is the display name of
    the primary location's source, or None when there is no source.
    """
    # A key that is present with a null value makes .get() return None rather
    # than the supplied default, so fall back with `or {}` at both levels.
    source = (item.get('primary_location') or {}).get('source') or {}
    return {
        'Title': item.get('title'),
        'Publication Date': item.get('publication_date'),
        'DOI': item.get('doi'),
        'Host Organization': source.get('display_name'),
        'Cited by Count': item.get('cited_by_count'),
        'Locations Count': item.get('locations_count'),
    }


# chain.from_iterable pulls pages one at a time; chain(*pages) would exhaust
# the paginator (fetching every page) before the first record is processed.
pages = query.paginate(per_page=200, n_max=n_max)
data = [_work_to_row(item) for item in chain.from_iterable(pages)]

df = pd.DataFrame(data, columns=columns)
df.to_csv(f'../results/openalex_preprints_{year}.csv')
                