## Generating Linkages
This notebook generates potential linkages between datasets and publications by running full-text searches through several APIs (OpenAIRE, PubMed, and Dimensions). The search term for each dataset is its `title`; dataset `alt_titles` could also be used as search terms, but they are not used here. NOAA datasets are used in this example.

In [51]:
import os
import json
import pandas as pd
import re
import importlib
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
import datetime
import scholapi
importlib.reload(scholapi)

<module 'scholapi' from '/Users/sophierand/RCCustomers/scholapi.py'>

#### Create an instance of the `ScholInfraAPI` class from the richcontext.scholapi library (`scholapi`)

In [52]:
# API client shared by every search cell below; it is configured from a local
# config file (presumably resolved relative to the kernel's working directory
# -- confirm against scholapi).
SCHOL_CONFIG_FILE = "rc.cfg"
schol = scholapi.ScholInfraAPI(config_file=SCHOL_CONFIG_FILE)

#### Read in datasets and filter to those for a specific client

In [3]:
# Load the dataset records from the local JSON file.
# The file is decoded explicitly as UTF-8 so that parsing does not depend on
# the platform's default text encoding.
with open('/Users/sophierand/RCDatasets/datasets.json', encoding='utf-8') as json_file:
    datasets = json.load(json_file)

In [4]:
# Values of the 'provider' field that select the datasets used in this example.
# NOTE(review): "dataset-627" looks like a dataset id rather than a provider id
# -- confirm it is meant to be in this list.
noaa_provider_ids = ["provider-285", "provider-150","provider-287","provider-295","dataset-627"]
noaa_datasets = [record for record in datasets if record['provider'] in noaa_provider_ids]

### Full Text Search Functions

In [23]:
def parse_api_results(n,api_name,nresults):
    """Run a full-text search for one dataset title and parse every hit.

    Parameters
    ----------
    n : dict
        Dataset record; its ``"title"`` value is used as the search term.
    api_name : str
        Which API client on ``schol`` to query: ``"openaire"``, ``"pubmed"``
        or ``"dimensions"``.
    nresults
        Forwarded unchanged to the client's ``full_text_search`` as ``nresults``.

    Returns
    -------
    list or None
        The truthy parsed records, in result order, when the search returns a
        non-empty list. ``None`` when the search returns nothing (a message is
        printed) or returns a truthy value that is not a list.

    Raises
    ------
    ValueError
        If ``api_name`` is not one of the supported names. Previously this
        surfaced as an ``UnboundLocalError`` on ``search_results``.
    """
    # Name of the per-result parser method exposed by each API client.
    parser_names = {
        "openaire": "parse_oa",
        "pubmed": "parse_pubmed",
        "dimensions": "parse_dimensions",
    }
    if api_name not in parser_names:
        raise ValueError(
            "unsupported api_name {!r}; expected one of {}".format(
                api_name, sorted(parser_names)
            )
        )

    s_term = n["title"]
    client = getattr(schol, api_name)
    search_results = client.full_text_search(search_term = s_term, nresults = nresults)

    if not search_results:
        print('no results for search term {} from API {}'.format(s_term, api_name))
        return None
    if not isinstance(search_results, list):
        # Truthy but not a list: nothing to iterate over, so report no records.
        return None

    parse_result = getattr(client, parser_names[api_name])
    parsed = (parse_result(result = m) for m in search_results)
    # Drop hits for which the parser produced nothing usable.
    return [meta for meta in parsed if meta]

def convert_md_list(n,meta_list):
    """Turn a list of parsed publication records into a DataFrame for one dataset.

    Parameters
    ----------
    n : dict
        Dataset record; ``n["title"]`` and ``n["id"]`` are written to every row
        as the ``search_term`` and ``dataset_id`` columns.
    meta_list : list
        Parsed records, passed straight to ``pd.DataFrame``.

    Returns
    -------
    pandas.DataFrame or None
        One row per record, or ``None`` when ``meta_list`` is ``None`` or empty.
    """
    if meta_list is None or len(meta_list) == 0:
        return None

    df = pd.DataFrame(meta_list)
    # Records are not guaranteed to carry an "authors" key; only flatten the
    # column when it exists instead of failing with a KeyError.
    if "authors" in df.columns:
        # List-valued author cells become one comma-separated string; any
        # other value is left as it is.
        df["authors"] = df["authors"].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
    df['search_term'] = n["title"]
    df['dataset_id'] = n["id"]
    return df

def export_linkages(df,s_term,api_name,base_dir="/Users/sophierand/RichContextMetadata/metadata"):
    """Write one dataset's linkage table to a dated, per-API folder as CSV.

    The file is written to
    ``<base_dir>/<YYYYMMDD>_<api_name>_<term>/<term>.csv``, where ``<term>`` is
    ``s_term`` with spaces removed and path separators replaced by ``_``.

    Parameters
    ----------
    df
        Object exposing ``to_csv(path, index=False)`` (a pandas DataFrame at
        the call sites in this notebook).
    s_term : str
        Search term the results belong to; used to name the folder and file.
    api_name : str
        Name of the API the results came from; part of the folder name.
    base_dir : str, optional
        Root folder for all exports. Defaults to the location that was
        previously hard-coded.
    """
    stem = re.sub(" ","",s_term)
    # A separator inside the title would otherwise be read as a directory
    # boundary and make the export fail.
    for separator in (os.sep, os.altsep):
        if separator:
            stem = stem.replace(separator, "_")

    folder_name = os.path.join(
        base_dir,
        "{}_{}_{}".format(datetime.date.today().strftime("%Y%m%d"), api_name, stem),
    )
    # Creates missing parent folders too and is a no-op when the folder exists.
    os.makedirs(folder_name, exist_ok=True)
    df.to_csv(os.path.join(folder_name, "{}.csv".format(stem)), index = False)

### Running the below cells will create exports in the RichContextMetadata/metadata folder

#### OpenAIRE

In [38]:
# m = schol.openaire.full_text_search(search_term = "US Interagency Elevation Inventory")

In [None]:
# Search OpenAIRE for every NOAA dataset title and export each non-empty result table.
for n in noaa_datasets:
    s_term = n["title"]
    s_id = n["id"]  # not used below; convert_md_list reads the id from n itself
    meta_list = parse_api_results(n = n, api_name = 'openaire', nresults = 100)
    if not meta_list:
        # Nothing was found (or nothing parsed) for this title.
        continue
    df = convert_md_list(n = n, meta_list = meta_list)
    if df is None or len(df) == 0:
        continue
    export_linkages(df = df, s_term = s_term, api_name = 'openaire')

#### PubMed

In [None]:
# meta_full = schol.pubmed.full_text_search(search_term = "Sea Level Rise Inundation")

In [None]:
# Search PubMed for every NOAA dataset title and export each non-empty result table.
for n in noaa_datasets:
    s_term = n["title"]
    s_id = n["id"]  # not used below; convert_md_list reads the id from n itself
    meta_list = parse_api_results(n = n, api_name = 'pubmed', nresults = 100)
    if not meta_list:
        # Nothing was found (or nothing parsed) for this title.
        continue
    df = convert_md_list(n = n, meta_list = meta_list)
    if df is None or len(df) == 0:
        continue
    export_linkages(df = df, s_term = s_term, api_name = 'pubmed')

#### Dimensions

In [None]:
# Search Dimensions for every NOAA dataset title and export each non-empty result table.
for n in noaa_datasets:
    s_term = n["title"]
    s_id = n["id"]  # not used below; convert_md_list reads the id from n itself
    meta_list = parse_api_results(n = n, api_name = 'dimensions', nresults = 100)
    if not meta_list:
        # Nothing was found (or nothing parsed) for this title.
        continue
    df = convert_md_list(n = n, meta_list = meta_list)
    if df is None or len(df) == 0:
        continue
    export_linkages(df = df, s_term = s_term, api_name = 'dimensions')