# Retracted Paper 

We use [OpenAlex](https://openalex.org) to retrieve all articles which are a preprint, but have so far not been published by a peer-reviewed journal.

## 1. Load libraries & define helper functions

In [1]:
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers, Funders
from itertools import chain
import pandas as pd
import pyalex

In [2]:
# Determine if any of the locations (journals) the paper is already published
def is_any_location_published(locations):
    for location in locations:
        if location['version'] == 'publishedVersion':
            return True
    return False

In [3]:
# Combine all authos
def join_authors(list_of_authors):
    return ', '.join([author['author']['display_name'] for author in list_of_authors])

In [4]:
# Extract key information from the locations
def join_locations(list_of_locations):
    summary = []
    for location in list_of_locations:
        if location['source']:
            summary.append(f"{location['version']}: {location['source']['host_organization_name']} - {location['landing_page_url']}")
        else:
            summary.append(f"{location['version']} - {location['landing_page_url']}")
    return ', '.join(summary)

## 2. Set the Topic & Year

Set the year and the number of papers you want to obtain

In [5]:
# Variables reduce the size of the output and the time required for execution
topic = 'COVID'
year = 2023
n_max = 500        # when set to None all papers are queried

## 3. Get the preprints

Run te following code to get the preprints for the specified parameters

In [6]:
query = Works().search(topic).filter(type="article", publication_year=year, primary_location={'version': 'submittedVersion'}, locations={'is_published': False}).sort(cited_by_count="desc")

preprints = []

# Iterate over all query results
for item in chain(*query.paginate(per_page=200, n_max=n_max)):
    
    # Get key properties
    oa_id = item.get('id', None)
    title = item.get('title', None)
    publication_date = item.get('publication_date', None)
    doi = item.get('doi', None)
    cited_by_count = item.get('cited_by_count', None)
    locations_count = item.get('locations_count', None)
        
    # Join all authors
    authors = join_authors(item['authorships'])   
    locations = item.get('locations', None)
    locations_overview = join_locations(item['locations'])
    
    # Only append the paper to the preprints if is not published in any other journal
    if locations_count == 1 or not is_any_location_published(locations):
        preprints.append({'id': oa_id, 'title': title, 'publication_date': publication_date, 'doi': doi,
                          'cited': cited_by_count, 'authors': authors,
                          'locations': locations_overview, 'location_count': locations_count})


## 4. Store the data

In [7]:
df = pd.DataFrame.from_dict(preprints)
df.to_csv(f'../results/openalex_preprints_{year}_{n_max}.csv')
df.to_excel(f'../results/openalex_preprints_{year}_{n_max}.xlsx')
df.head()

OSError: Cannot save file into a non-existent directory: '../results'

## 5. Get a sample paper

In [13]:
paper = df.iloc[0]
paper

id                                   https://openalex.org/W1485155423
title               For Hunger-proof Cities: Sustainable Urban Foo...
publication_date                                           2023-09-05
doi                              https://doi.org/10.32920/24084468.v1
cited                                                             204
authors             Mustafa Koç, Rod MacRae, L. J. A. Mougeot, J. ...
locations           submittedVersion - https://doi.org/10.32920/24...
location_count                                                      1
Name: 0, dtype: object

## 5. Do basic analysis

TODO:
- barchart with journals
- barchart with publication year