In [3]:
import requests
import json
import os
import numpy as np
import pandas as pd
import re

In [None]:


def get_new_gdd_articles(output_path, n_recent_articles = None, min_date = None, max_date = None):
    """ 
    Get newly acquired articles from min_date to (optional) max_date. 
    Or get the most recent new articles added to GeoDeepDive.
    Return API resuls as a list of article metadata information.

    Example:
    get_new_gdd_articles(min_date='2023-06-07')
    get_new_gdd_articles(min_date='2023-06-01', max_date = '2023-06-08')
    get_new_gdd_articles(n_recent_articles = 1000)

    """

    # ======== Tests for input data type ==========
    if n_recent_articles is not None:
        if min_date is None and max_date is None:
            if isinstance(n_recent_articles, int) or isinstance(n_recent_articles, float):
                pass
            else:
                raise ValueError("When n_recent_articles is specified, it should be a numeric value.")
        else:
            raise ValueError("When n_recent_articles is specified, min_data and max_date should be None.")
        
    else: # n_recent_articles is None, should specify dates
        if min_date is None:
            raise ValueError("Either n_recent_articles or min_date should be specified")
        else:
            pattern = r'^\d{4}-\d{2}-\d{2}$'

            if not isinstance(min_date, str):
                raise ValueError("min_date should be a string. min_date should be a string with format 'yyyy-mm-dd'.")
            if re.match(pattern, min_date) is False:
                raise ValueError("min_date does not follow the correct format. min_date should be a string with format 'yyyy-mm-dd'.")
            
            if max_date is not None:
                if not isinstance(max_date, str):
                    raise ValueError("max_date should be a string. min_max_datedate should be a string with format 'yyyy-mm-dd'.")
                if re.match(pattern, max_date) is False:
                    raise ValueError("max_date should be a string with format 'yyyy-mm-dd'.")

    # ========== Query API by n most recent article ==========
    if n_recent_articles is not None:
        api_call = "https://geodeepdive.org/api/articles?recent" + f"&max={n_recent_articles}"

    # Query API by date range
    elif min_date is not None:
        if max_date is not None:
            api_call = f"https://xdd.wisc.edu/api/articles?min_acquired={min_date}&max_acquired={max_date}"
        
        else:
            api_call = f"https://xdd.wisc.edu/api/articles?min_acquired={min_date}"
    
    else:
        raise ValueError("Either n_recent_articles or min_date should be specified")
    

    # =========== Format the API return to Json file ==========
    response = requests.get(api_call).json()

    data = response['success']['data']

    # initialize the resulting dataframe
    gdd_df = pd.DataFrame()

    for article in data:
        one_article_dict = {}
        one_article_dict['_gddid'] = [article['_gddid']]

        if article['identifier'][0]['type'] == 'doi':
            one_article_dict['DOI'] = [article['identifier'][0]['id']]
        else: 
            one_article_dict['DOI'] = ['Non-DOI Article ID type']
        
        one_article_dict['url'] = [article['link'][0]['url']]
        one_article_dict['status'] = 'queried'

        one_article = pd.DataFrame(one_article_dict)
        gdd_df = pd.concat([gdd_df, one_article])
    
    gdd_df = gdd_df.reset_index(drop=True)

    result_dict = {}
    result_dict['n_returned_article'] = gdd_df.shape[0]
    result_dict['param_min_date'] = min_date
    result_dict['param_max_date'] = max_date
    result_dict['param_n_recent_articles'] = n_recent_articles
    result_dict['data'] = gdd_df.to_dict()

    # Write the JSON object to a file
    with open(output_path + '/gdd_api_return.json', "w") as file:
        json.dump(result_dict, file)



In [25]:
def crossref_extract(doi_path):
    """Extract metadata from the Crossref API for article's in the doi csv file.
    Extracted data are returned in a pandas dataframe.

    If certain DOI is not found on CrossRef, the DOI will be logged in the prediction_pipeline.log file. 
    
    Args:
        doi_path (str): Path to the doi list csv file.
        doi_col (str): Column name of DOI.
    
    Return:
        pandas Dataframe containing CrossRef metadata.
    """

    with open(doi_path) as json_file:
        data_dictionary = json.load(json_file)

    df = pd.DataFrame(data_dictionary['data'])
    doi_col = 'DOI'

    # a list of doi
    input_doi = df[doi_col].unique().tolist()

    # Initialize
    crossref = pd.DataFrame()

    # Loop through all doi, concatenate metadata into dataframe
    for doi in input_doi:
        cross_ref_url = f"https://api.crossref.org/works/{doi}"

         # make a request to the API
        cross_ref_response = requests.get(cross_ref_url)

        if cross_ref_response.status_code == 200:

            ref_json = pd.DataFrame(cross_ref_response.json())            
            ref_df = pd.DataFrame(ref_json.loc[:, 'message']).T.reset_index()
            ref_df['valid_for_prediction'] = 1
            if 'abstract' not in ref_df.columns:
                ref_df['abstract'] = ''

            crossref = pd.concat([crossref, ref_df])

        else: 
            pass
    
    # Clean up columns and return the resulting pandas data frame
    crossref_keep_col = ['valid_for_prediction', 'DOI',
        'URL',
        'abstract',
        'author',
        'container-title',
        'is-referenced-by-count', # times cited
        'language',
        'published', # datetime
        'publisher', 
        'subject', # keywords of journal
        'subtitle', # subtitle are missing sometimes
        'title'
        ]
    
    crossref = crossref.loc[:, crossref_keep_col].reset_index(drop = True)


    # join gdd_id to the metadata df
    print(df.columns)
    df = df.loc[:, [doi_col, 'gddid']]
    df['DOI'] = df['DOI'].str.lower() # CrossRef return lowercase DOI
    result_df = pd.merge(df, crossref, on='DOI', how='left')
    result_df = result_df.rename(columns = {'container-title': 'journal'})

    # Add valid_for_prediction indicator
    result_df['valid_for_prediction'] = result_df['valid_for_prediction'].fillna(value=0).astype(int)

    return result_df


In [9]:
import requests
import json
import os
import numpy as np
import pandas as pd
import re

min_date = '2023-06-05'
api_call = f"https://xdd.wisc.edu/api/articles?min_acquired={min_date}&full_results=true"

response = requests.get(api_call).json()

data = response['success']['data']
type(data)


list

In [28]:
min_date = '2023-06-05'

api_call = f"https://xdd.wisc.edu/api/articles?min_acquired={min_date}&full_results=true"

response = requests.get(api_call).json()
next_page = response['success']['next_page']

i = 0 # limit 3 times max

while (next_page != '') and (i < 3):
    print(f"going to the next page at {next_page}")
    next_response = requests.get(next_page)
    print('=========next response done=========')
    print(next_response)
    next_response_json = next_response.json()
    print('=========next response json done=========')

    data.append(next_response_json['success']['data'])
    
    print(f"There are {len(data)} articles.")

    next_page = next_response_json['success']['next_page']
    i += 1

    print(f"next page is now {next_page}, iteration {i}")


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [16]:
url = "https://xdd.wisc.edu/api/articles"
params = {"scroll_id": "08e46e34-2878-4e57-8fe5-72182e41883b"}

response = requests.get(url, params=params)
response
# json_response = response.json()

<Response [502]>