## Generating Linkages
This notebook generates potential linkages between datasets and publications by running full text searches through various APIs. The search terms used are dataset `titles` (dataset `alt_titles` can also be used, though they are not here). NOAA datasets are used in this example.

In [15]:
import os
import json
import pandas as pd
import re
import importlib
import requests
import dimcli
from bs4 import BeautifulSoup
from collections import OrderedDict
import datetime
import richcontext.scholapi as scholapi
# importlib.reload(scholapi)

#### Create an instance of the `ScholInfraAPI` class from the `richcontext.scholapi` library (imported as `scholapi`)

In [3]:
# Shared scholapi client; every search cell below goes through `schol`.
# It is built from the relative config path "rc.cfg" (presumably resolved against
# the current working directory - confirm before running from elsewhere).
schol = scholapi.ScholInfraAPI(config_file="rc.cfg")

#### Read in datasets and filter to those for a specific client

In [4]:
# Load the dataset catalogue; later cells match records on their 'title' key and
# read their 'id' key.
# The encoding is given explicitly so decoding does not depend on the locale
# default that open() would otherwise use.
with open('/Users/sophierand/RCDatasets/datasets.json', encoding='utf-8') as json_file:
    datasets = json.load(json_file)

In [5]:
# Load the customer records; the next cell selects one by its 'name' key and reads
# its 'datasets' key.
# The encoding is given explicitly so decoding does not depend on the locale
# default that open() would otherwise use.
with open('/Users/sophierand/RCCustomers/customers.json', encoding='utf-8') as json_file:
    customers = json.load(json_file)

In [14]:
# Take the 'datasets' entry of the first customer record whose name is 'NOAA'.
# An IndexError here means no customer with that name was loaded.
noaa_datasets = [
    customer['datasets']
    for customer in customers
    if customer['name'] == 'NOAA'
][0]

### Full Text Search Functions

In [6]:
def parse_api_results(n,api_name,nresults):
    """Run a full-text search for one dataset title and parse every hit.

    Parameters
    ----------
    n : dict
        Dataset record; its "title" value is used as the search term.
    api_name : str
        One of "openaire", "pubmed" or "dimensions".
    nresults : int
        Passed through as the ``nresults`` argument of the API search call.

    Returns
    -------
    list or None
        The truthy parsed records, in search order, when the search returned a
        non-empty list. None when the search returned nothing (a message is
        printed) or returned something that is not a list.

    Raises
    ------
    ValueError
        If ``api_name`` is not one of the supported names. Previously this case
        surfaced as an UnboundLocalError further down the function.
    """
    # api_name -> (attribute on the shared `schol` client, name of its parser method)
    api_table = {
        "openaire": ("openaire", "parse_oa"),
        "pubmed": ("pubmed", "parse_pubmed"),
        "dimensions": ("dimensions", "parse_dimensions"),
    }
    if api_name not in api_table:
        raise ValueError(
            "unsupported api_name {!r}; expected one of {}".format(api_name, sorted(api_table))
        )

    s_term = n["title"]
    client_attr, parser_name = api_table[api_name]
    client = getattr(schol, client_attr)

    search_results = client.full_text_search(search_term = s_term, nresults = nresults)
    if not search_results:
        print('no results for search term {} from API {}'.format(s_term, api_name))
        return None
    if not isinstance(search_results, list):
        # Only list results are parsed; anything else yields None, as before.
        return None

    parse = getattr(client, parser_name)
    meta_list = []
    for m in search_results:
        meta = parse(result = m)
        # Parsers may return a falsy value for hits they reject; drop those.
        if meta:
            meta_list.append(meta)
    return meta_list

def convert_md_list(n,meta_list):
    """Turn parsed publication records into a DataFrame tagged with the dataset.

    Parameters
    ----------
    n : dict
        Dataset record; its "title" and "id" values are written to the
        ``search_term`` and ``dataset_id`` columns of every row.
    meta_list : list of dict
        Parsed publication records, one per row.

    Returns
    -------
    pandas.DataFrame or None
        The tagged frame, or None when ``meta_list`` is empty or None.
    """
    if not meta_list:
        return None

    df = pd.DataFrame(meta_list)
    # The "authors" column only exists when at least one record carried that key,
    # so flatten author lists to a comma-separated string only when it is present.
    if "authors" in df.columns:
        df["authors"] = df["authors"].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
    df['search_term'] = n["title"]
    df['dataset_id'] = n["id"]
    return df

def export_linkages(df,s_term,api_name,base_dir="/Users/sophierand/RichContextMetadata/metadata"):
    """Write one linkage table to a date-stamped folder as CSV.

    The file is written to
    ``<base_dir>/<YYYYMMDD>_<api_name>_<term>/<term>.csv`` where ``<term>`` is
    ``s_term`` with spaces removed and path separators replaced by "_".

    Parameters
    ----------
    df : pandas.DataFrame
        Table to export.
    s_term : str
        Search term that produced the table; used to name the folder and file.
    api_name : str
        Name of the API that produced the table; used to name the folder.
    base_dir : str, optional
        Root folder for the export. Defaults to the location this notebook has
        always written to.

    Returns
    -------
    str
        Path of the CSV file that was written.
    """
    # A "/" or "\" inside a title would otherwise be read as a directory boundary.
    slug = re.sub(r"[\\/]", "_", re.sub(" ", "", s_term))
    folder_name = "{}/{}_{}_{}".format(base_dir, datetime.date.today().strftime("%Y%m%d"), api_name, slug)
    path_name = folder_name + "/{}.csv".format(slug)
    # makedirs also creates missing parents and tolerates an already existing folder.
    os.makedirs(folder_name, exist_ok=True)
    df.to_csv(path_name, index = False, encoding='utf-8-sig')
    return path_name

### Running the below cells will create exports in the RichContextMetadata/metadata folder

#### OpenAire

In [44]:
# Dataset titles to search on OpenAIRE. Each title is listed once: the list is only
# used for the membership test below, so repeated entries had no effect on `oa_ds`.
oa_ds_names = [
    'National Data Buoy Center',
    'Long Range Forecasts',
    'Local Climatological Data',
    'Local Climate Analysis Tool Data',
    'Integrated Surface Database',
    'Global Precipitation Measurement',
    'Global Historical Climatology Network',
    'Global Forecast System',
    'ERA5 Reanalysis',
    'Digital Forecast Database',
    'Digital Elevation Model Data',
    'Coastal Topographic Lidar',
    'Coastal Flood Frequency',
    'Coastal Change Analysis Program Data',
    'Climate Resilience Screening Index and Domain Scores',
    'Billion-Dollar Weather and Climate Disasters',
    'Applied Climate Information System',
    'Next Generation Weather Radar',
    'Shuttle Radar Topography Mission',
    'Tidal Datum',
]
# Catalogue records whose title is one of the names above, in catalogue order.
oa_ds = [d for d in datasets if d['title'] in oa_ds_names]

In [18]:
# Ad-hoc probe of the OpenAIRE full-text search with a sample phrase and nresults = 5;
# the raw result is kept in `m` for manual inspection.
m = schol.openaire.full_text_search(search_term = "Sea Level Rise",nresults = 5)

In [20]:
# m[0]

In [45]:
# Search OpenAIRE for each selected dataset title and export one CSV per title
# that produced a non-empty table.
for n in oa_ds:
    s_term = n["title"]
    s_id = n["id"]
    meta_list = parse_api_results(n = n, api_name = 'openaire',nresults = 100)
    if not meta_list:
        continue
    df = convert_md_list(n = n, meta_list = meta_list)
    if df is None or len(df) == 0:
        continue
    export_linkages(df = df, s_term = s_term ,api_name = 'openaire')

#### Pubmed

In [78]:
# One-off PubMed full-text search for a sample phrase (no nresults argument is passed);
# the raw result is stored in `meta_full`.
meta_full = schol.pubmed.full_text_search(search_term = "Sea Level Rise Inundation")

Re-exporting for CSVs that were too large.

In [16]:
# noaa_ids = [
#     "dataset-612","dataset-613","dataset-620","dataset-615","dataset-616","dataset-624","dataset-622",
#     "dataset-611","dataset-621","dataset-614","dataset-610","dataset-623","dataset-618" 
#             ]
# noaa_datasets_sub_pubmed = [d for d in noaa_datasets if d['id'] in noaa_ids]

In [64]:
# Catalogue titles to search on PubMed.
pubmed_ds_names = [
    'Coastal Topographic Lidar',
    'Sea Level Rise Inundation',
    'Tidal Datum',
    'Integrated Surface Database',
    'Coastal Flood Frequency',
    'National Land Cover Database',
    'US Interagency Elevation Inventory',
    'Global Forecast System',
]
pubmed_ds = [record for record in datasets if record['title'] in pubmed_ds_names]
# Two hand-written search terms, each tied to an existing dataset id.
pubmed_ds += [
    {"title":"NEXRAD","id":"dataset-612"},
    {"id":"dataset-610","title":"Sea Level Rise Data"},
]

In [76]:
# Search PubMed for each selected term and export one CSV per term
# that produced a non-empty table.
for n in pubmed_ds:
    s_term = n["title"]
    s_id = n["id"]
    meta_list = parse_api_results(n = n, api_name = 'pubmed',nresults = 100)
    if not meta_list:
        continue
    df = convert_md_list(n = n, meta_list = meta_list)
    if df is None or len(df) == 0:
        continue
    export_linkages(df = df, s_term = s_term ,api_name = 'pubmed')

no results for search term US Interagency Elevation Inventory from API pubmed


#### Dimensions

In [87]:
# noaa_ids = [
#     "dataset-624","dataset-614","dataset-623","dataset-613","dataset-611","dataset-622"
# ]
# noaa_datasets_sub_dimensions = [d for d in noaa_datasets if d['id'] in noaa_ids]
# Catalogue titles to search on Dimensions.
dimensions_ds_names = [
    'Coastal Flood Frequency',
    'Coastal Change Analysis Program Data',
    'National Land Cover Database',
]
# Catalogue records whose title is one of the names above, in catalogue order.
dimensions_ds = [record for record in datasets if record['title'] in dimensions_ds_names]

In [88]:
# Search Dimensions for each selected dataset title and export one CSV per title
# that produced a non-empty table.
for n in dimensions_ds:
    s_term = n["title"]
    s_id = n["id"]
    meta_list = parse_api_results(n = n, api_name = 'dimensions',nresults = 100)
    if not meta_list:
        continue
    df = convert_md_list(n = n, meta_list = meta_list)
    if df is None or len(df) == 0:
        continue
    export_linkages(df = df, s_term = s_term ,api_name = 'dimensions')

no results for search term Coastal Change Analysis Program Data from API dimensions


In [80]:
# One-off Dimensions full-text search for a sample phrase (no nresults argument is passed).
# This rebinds `meta_full`, the same name the PubMed probe cell assigns.
meta_full = schol.dimensions.full_text_search(search_term = "Sea Level Rise Inundation")

### Single Search Term

In [50]:
def parse_dimensions(result):
    """Extract title, journal, DOI and authors from one Dimensions publication record.

    Parameters
    ----------
    result : mapping
        A single publication record. It must provide "type" and "title";
        "journal", "doi" and "authors" are optional.

    Returns
    -------
    OrderedDict or None
        None when the record's type is neither "article" nor "preprint".
        Otherwise a mapping that always holds "title" and holds "journal",
        "doi" and "authors" only when they could be read from the record.
        "authors" is a list of "last_name, first_name" strings.

    Notes
    -----
    Only missing keys and wrongly shaped values (KeyError / TypeError) count as
    "field not available". Any other exception propagates instead of being
    silently discarded.
    """
    if result["type"] not in ["article","preprint"]:
        return None

    meta = OrderedDict()
    meta["title"] = result["title"]
    try:
        # TypeError covers a journal entry that is present but not subscriptable (e.g. None).
        meta["journal"] = result["journal"]["title"]
    except (KeyError, TypeError):
        pass
    try:
        meta["doi"] = result["doi"]
    except KeyError:
        pass
    try:
        # A single author entry without both name parts discards the whole list,
        # which keeps the field all-or-nothing as before.
        meta["authors"] = [b["last_name"] + ", " + b["first_name"] for b in result["authors"]]
    except (KeyError, TypeError):
        pass
    return meta

In [39]:
# Log in to Dimensions. Credentials are taken from the DIMENSIONS_USERNAME and
# DIMENSIONS_PASSWORD environment variables so that no secret has to be typed into
# the notebook; when a variable is unset the value previously written here is used.
dimcli.login(username=os.environ.get('DIMENSIONS_USERNAME', 'sr2661@nyu.edu'),
                password=os.environ.get('DIMENSIONS_PASSWORD', ''),
                verbose=False
                )
# DSL query object used by the single-search cells below.
api_obj = dimcli.Dsl(verbose=False)

In [42]:
# Dimensions DSL query: publications matching both quoted phrases ("NOAA" AND
# "Social Vulnerability Index"), returning publications[all] with a limit of 100.
query = 'search publications for "\\"NOAA\\" AND \\"Social Vulnerability Index\\"" return publications[all] limit 100'
response = api_obj.query(query)
# The publication records of the response; the next cell parses them one by one.
search_results = response.publications

In [51]:
# Parse every publication record and keep only those the parser accepted
# (parse_dimensions returns None for anything that is not an article or preprint).
meta_list = []
for m in search_results:
    meta = parse_dimensions(result = m)
    if not meta:
        continue
    meta_list.append(meta)


In [56]:
# Tabulate the parsed records and tag every row with the query and dataset they belong to.
df = pd.DataFrame(meta_list)
# parse_dimensions leaves "authors" out of a record when it cannot be read, so the
# column is absent when no record carried it; flatten author lists only when present.
if "authors" in df.columns:
    df["authors"] = df['authors'].apply(lambda x:', '.join(x) if isinstance(x,list) else x)
df['search_term'] = "NOAA AND 'Social Vulnerability Index'"
df['dataset_id'] = "dataset-613"

In [59]:
# Export the single-search table to a date-stamped folder.
folder_name = "/Users/sophierand/RichContextMetadata/metadata/{}_dimensions_svi_noaa".format(datetime.date.today().strftime("%Y%m%d"))
path_name = folder_name + "/dimensions_svi_noaa.csv"
# makedirs creates missing parent folders too and does not fail when the folder already exists.
os.makedirs(folder_name, exist_ok=True)
df.to_csv(path_name,index = False, encoding='utf-8-sig')

In [None]:
# Template for a query against the `full_data_exact` search index with a limit of 1000.
# NOTE(review): `search_term` is not assigned in any cell visible in this notebook, and this
# line rebinds the `query` name used by the single-search cell - define `search_term` first.
query = 'search publications in full_data_exact for "{}" return publications[all] limit 1000'.format(search_term)

In [30]:
# NOTE(review): `a` is not assigned in any cell visible in this notebook; the saved
# output below shows it held a dimcli.Dataset when this cell was last run.
a

<dimcli.Dataset object #4691627200. Records: 10/2215>

In [None]:
# Scratch expressions showing how exact phrases are escaped inside a DSL query string.
# Nothing is assigned here; the second string is a "{}" template wrapped in escaped quotes.
'search publications for "\\"NOAA\\" AND \\"phrase 2\\""',

"\\"{}\\""

In [None]:
search publications in authors for "\"Jennifer A Doudna\"" return publications