In [1]:
# TITLE

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

# magics and warnings
%load_ext autoreload
%autoreload 2
import warnings; warnings.simplefilter('ignore')

import os, random, codecs, json, time
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np

seed = 99
random.seed(seed)
np.random.seed(seed)

import nltk, sklearn
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="white")
sns.set_context("notebook", font_scale=1.2, rc={"lines.linewidth": 2.5})

In [3]:
# datasets

who_latest = "datasets/WHO_18_03_2020.csv"
dimensions_latest = "datasets/Dimensions_18_03_2020.csv"

df_who = pd.read_csv(who_latest)
df_dimensions = pd.read_csv(dimensions_latest)

In [4]:
# clean DOIs

def clean_doi(d):
    if isinstance(d,str):
        d = d.replace("https://doi.org/","")
        d = d.replace("doi:","")
        return d
    return d

In [5]:
df_who["DOI"] = df_who["DOI"].apply(clean_doi)
df_dimensions["DOI"] = df_dimensions["DOI"].apply(clean_doi)

In [6]:
df_who.head()

Unnamed: 0,Title,Authors,Abstract,Published Year,Published Month,Journal,Volume,Issue,Pages,Accession Number,DOI,Ref,Covidence #,Study,Notes,Tags
0,Platelet-to-lymphocyte ratio is associated wit...,"Qu, Rong; Ling, Yun; Zhang, Yi-Huizhi; Wei, Li...","INTRODUCTION: SinceDecember 2019, novelcoronav...",2020.0,,Journal of medical virology,,,,,10.1002/jmv.25767,9121,#8973,Qu 2020,,* Case study/series; * Opinion piece; Clinical...
1,Epidemiologic and Clinical Characteristics of ...,"Qian, Guo-Qing; Yang, Nai-Bin; Ding, Feng; Ma,...",BACKGROUND: Recent studies have focused initia...,2020.0,,QJM : monthly journal of the Association of Ph...,,,,,10.1093/qjmed/hcaa089,9120,#9128,Qian 2020,,* Epidemiological study; * Opinion piece; Epid...
2,Correlation between travellers departing from ...,"Ping Zhong, M. D. Songxue Guo M. D. Ting Chen ...",Highlight We found a strong correlation betwee...,2020.0,,Journal of Travel Medicine,,,,,,9099,#9222,PingZhong 2020,,* Epidemiological study; * Opinion piece; Epid...
3,On the front lines of coronavirus: the Italian...,"Paterlini, Marta",Italy has rapidly become the country hit secon...,2020.0,,BMJ,368.0,,m1065-m1065,,10.1136/bmj.m1065,9119,#8989,Paterlini 2020,,"* Opinion piece; Epidemiology; Ethics, social ..."
4,Coronavirus cases have dropped sharply in Sout...,"Normile, Dennis",Europe is now the epicenter of the COVID-19 pa...,2020.0,,Science,,,,,10.1126/science.abb7566,9620,#9246,Normile 2020,,* Opinion piece; Epidemiology; Infection preve...


In [7]:
df_dimensions.head()

Unnamed: 0,Date added,Publication ID,DOI,PMID,PMCID,Title,Abstract,Source title,Source UID,Publisher,...,Research Organizations - standardized,GRID IDs,City of Research organization,Country of Research organization,Funder,UIDs of supporting grants,Times cited,Altmetric,Source Linkout,Dimensions URL
0,2020-03-16,pub.1125672140,10.5812/iji.102184,,,COVID-19: The New Threat,,International Journal of Infection,jour.1051749,Kowsar Medical Institute,...,,,,,,,,,,https://app.dimensions.ai/details/publication/...
1,2020-03-16,pub.1125670218,10.1515/cclm-2020-0188,,,Prominent changes in blood coagulation of pati...,Abstract Background As the number of patients...,Clinical Chemistry and Laboratory Medicine,jour.1294896,De Gruyter,...,,,,,National Natural Science Foundation of China,grant.8360207,,2.0,,https://app.dimensions.ai/details/publication/...
2,2020-03-16,pub.1125671401,10.3348/kjr.2020.0163,,,What Is Needed to Make Interventional Radiolog...,,Korean Journal of Radiology,jour.1023226,The Korean Society of Radiology (KAMJE),...,National University of Singapore; Tan Tock Sen...,grid.4280.e; grid.240988.f,Singapore; Singapore,Singapore; Singapore,,,,,,https://app.dimensions.ai/details/publication/...
3,2020-03-16,pub.1125671336,10.32598/jpr.8.2.139,,,COVID-19 Infection in Iranian Children: A Case...,,Journal of Pediatrics Review,jour.1154967,Negah Scientific Publisher,...,,,,,,,,,,https://app.dimensions.ai/details/publication/...
4,2020-03-16,pub.1125671402,10.3348/kjr.2020.0164,,,Computed Tomographic Findings in COVID-19,,Korean Journal of Radiology,jour.1023226,The Korean Society of Radiology (KAMJE),...,Hainan Medical University,grid.443397.e,Haikou,China,,,,,,https://app.dimensions.ai/details/publication/...


In [8]:
# check DOIs
print("WHO")
print(df_who.shape)
print(df_who[pd.notna(df_who["DOI"])].shape)

print("Dimensions")
print(df_dimensions.shape)
print(df_dimensions[pd.notna(df_dimensions["DOI"])].shape)

WHO
(2048, 16)
(1699, 16)
Dimensions
(2956, 31)
(2888, 31)


In [9]:
df_join = df_dimensions.join(df_who.set_index("DOI"), how='inner', on="DOI", lsuffix='dimensions', rsuffix='who')

In [10]:
df_join = df_join[pd.notna(df_join["DOI"])]

In [11]:
df_join.shape

(1021, 46)

In [12]:
df_join.head()

Unnamed: 0,Date added,Publication ID,DOI,PMID,PMCID,Titledimensions,Abstractdimensions,Source title,Source UID,Publisher,...,Journal,Volumewho,Issuewho,Pages,Accession Number,Ref,Covidence #,Study,Notes,Tags
1,2020-03-16,pub.1125670218,10.1515/cclm-2020-0188,,,Prominent changes in blood coagulation of pati...,Abstract Background As the number of patients...,Clinical Chemistry and Laboratory Medicine,jour.1294896,De Gruyter,...,Clinical chemistry and laboratory medicine,,,,,8928,#8776,Han 2020,,"* Case study/series; Clinical aspects, diagnos..."
2,2020-03-16,pub.1125671401,10.3348/kjr.2020.0163,,,What Is Needed to Make Interventional Radiolog...,,Korean Journal of Radiology,jour.1023226,The Korean Society of Radiology (KAMJE),...,Korean journal of radiology,,,,,8907,#8694,Pua 2020,,* Case study/series; Infection prevention and ...
4,2020-03-16,pub.1125671402,10.3348/kjr.2020.0164,,,Computed Tomographic Findings in COVID-19,,Korean Journal of Radiology,jour.1023226,The Korean Society of Radiology (KAMJE),...,,,,,,8906,#8757,Joob 2020,,"* Opinion piece; Clinical aspects, diagnosis, ..."
6,2020-03-16,pub.1125670220,10.1515/cclm-2020-0285,,,Potential preanalytical and analytical vulnera...,Abstract A novel zoonotic coronavirus outbreak...,Clinical Chemistry and Laboratory Medicine,jour.1294896,De Gruyter,...,Clinical chemistry and laboratory medicine,,,,,8926,#8733,Lippi 2020,,"* Narrative review; Clinical aspects, diagnosi..."
7,2020-03-16,pub.1125671400,10.3348/kjr.2020.0157,,,Evolution of Computed Tomography Manifestation...,,Korean Journal of Radiology,jour.1023226,The Korean Society of Radiology (KAMJE),...,Korean journal of radiology,,,,,8908,#8673,Sun 2020,,"* Case study/series; Clinical aspects, diagnos..."


In [13]:
who_dois = df_who[pd.notnull(df_who["DOI"])]["DOI"].tolist()
dimensions_dois = df_dimensions[pd.notnull(df_dimensions["DOI"])]["DOI"].tolist()
dimensions_pmids = df_dimensions[(pd.notnull(df_dimensions["PMID"])) & ~(pd.notnull(df_dimensions["DOI"]))]["PMID"].tolist()

In [14]:
len(set(dimensions_dois).intersection(set(who_dois)))

950

In [15]:
all_dois = list(set(dimensions_dois).union(set(who_dois)))

In [16]:
print(len(all_dois))

3533


In [17]:
extra_pmids = list(set(dimensions_pmids))

In [18]:
print(len(extra_pmids))

17


#### TODO
*Clean up this mess and align the two (or more) datasets to have all publications.*

## Focus on Dimensions to test the Altmetrics API

Examples:
* PMID http://api.altmetric.com/v1/id/241939?key=
* DOI http://api.altmetric.com/v1/doi/10.1038/news.2011.490?key=

In [19]:
# get API key
import configparser
config = configparser.ConfigParser()
config.read("credentials/conf.ini")
api_key = config["ALTMETRICS"]["key"]

In [20]:
import requests

doi_base_url = "http://api.altmetric.com/v1/fetch/doi/"
pmid_base_url = "http://api.altmetric.com/v1/fetch/id/"

In [90]:
payload = {'key': api_key}
r = requests.get(doi_base_url+"10.3389/fdigh.2019.00004", params=payload)

In [91]:
r.json()

{'altmetric_id': 56825787,
 'counts': {'readers': {'citeulike': '0', 'mendeley': '8', 'connotea': '0'},
  'total': {'posts_count': 26},
  'twitter': {'unique_users_count': 24, 'posts_count': 26}},
 'citation': {'altmetric_jid': '565496cd2a83ee7c2d8b4582',
  'authors': ['Giovanni Colavizza', 'Maud Ehrmann', 'Fabio Bortoluzzi'],
  'doi': '10.3389/fdigh.2019.00004',
  'first_seen_on': '2019-03-11T07:51:13+00:00',
  'handles': [],
  'isbns': [],
  'issns': ['2297-2668'],
  'journal': 'Frontiers in Digital Humanities',
  'last_mentioned_on': 1572254582,
  'links': ['https://www.frontiersin.org/articles/10.3389/fdigh.2019.00004/full'],
  'pdf_url': 'https://www.frontiersin.org/articles/10.3389/fdigh.2019.00004/pdf',
  'epubdate': '2019-03-11T00:00:00+00:00',
  'publisher': 'Frontiers',
  'title': 'Index-Driven Digitization and Indexation of Historical Archives',
  'type': 'article',
  'mendeley_url': 'https://www.mendeley.com/catalogue/e8ad1211-d223-306a-98c2-be3ea8615def/'},
 'altmetric_sco

In [59]:
r.json()["posts"]

{'twitter': [{'license': 'gnip',
   'citation_ids': [56825787],
   'author': {'tweeter_id': '1015681547415031808'},
   'tweet_id': '1105013238209626112'},
  {'license': 'gnip',
   'citation_ids': [56825787],
   'author': {'tweeter_id': '303772119'},
   'tweet_id': '1105016337280180224'},
  {'license': 'gnip',
   'citation_ids': [56825787],
   'author': {'tweeter_id': '788660782586265600'},
   'tweet_id': '1105017903185842176'},
  {'license': 'gnip',
   'citation_ids': [56825787],
   'author': {'tweeter_id': '15516583'},
   'tweet_id': '1105030787341144064'},
  {'license': 'gnip',
   'citation_ids': [56825787],
   'author': {'tweeter_id': '710106109676281857'},
   'tweet_id': '1105043233623736322'},
  {'license': 'gnip',
   'citation_ids': [56825787],
   'author': {'tweeter_id': '186378357'},
   'tweet_id': '1105044322146111488'},
  {'license': 'gnip',
   'citation_ids': [56825787],
   'author': {'tweeter_id': '546921239'},
   'tweet_id': '1105051405474324480'},
  {'license': 'gnip',
  

In [139]:
# get and save all results
out_folder = "json_altmetrics"
all_tweet_ids = list()
all_author_ids = list()

for doi in tqdm.notebook(all_dois):
    r = requests.get(doi_base_url+doi, params=payload)
    if not r.status_code == 200:
        print("Not found",doi)
        continue
    f_name = doi.replace(".","_")
    f_name = f_name.replace("/",":")
    with codecs.open(os.path.join(out_folder,f_name)+".json","w") as f:
        json.dump(r.json(), f)
    if isinstance(r.json()["posts"],dict) and "twitter" in r.json()["posts"].keys():
        for tweet in r.json()["posts"]["twitter"]:
            all_tweet_ids.append((doi,tweet["tweet_id"],tweet["author"]["tweeter_id"]))
            all_author_ids.append(tweet["author"]["tweeter_id"])

Not found 10.31525/ct1-nct04288713
Not found 10.1007/s11298-020-7917-9
Not found 10.1016/j.talanta.2020.120865
Not found 10.1111/j.1467-6346.2020.09353.x
Not found 10.1016/j.jhin.2020.03.012
Not found 10.1631/jzus.B2010011
Not found 10.1002/9781119482307
Not found 10.22038/jctm.2020.46924.1264
Not found 10.26434/chemrxiv.11846943.v5
Not found 10.26434/chemrxiv.11936292.v1
Not found 10.21203/rs.3.rs-16376/v1
Not found 10.1007/s35128-020-0312-0
Not found 10.1016/s0262-4079(20)30526-1
Not found 10.5811/westjem.2020.1.46760
Not found 10.4324/9781315660516-27
Not found 10.1016/j.orbis.2020.02.010
Not found 10.2139/ssrn.3547745
Not found 10.4324/9781315660516-5
Not found 10.1016/s0262-4079(20)30375-4
Not found 10.3760/cma.j.issn.1001-0939.2020.03.006
Not found https://belitungraya.org/BRP/index.php/bnj/article/view/1058
Not found 10.1016/s0262-4079(20)30424-3
Not found 10.21203/rs.3.rs-16763/v1
Not found 10.3760/cma.j.cn112338-20200221-00139
Not found 10.1111/tbed.13385
Not found 10.1177/216

Not found 10.21203/rs.2.23981/v1
Not found 10.1201/9781351023504-2
Not found 10.21037/fomm.2020.02.01
Not found 10.1007/s11111-019-00333-6
Not found 10.26434/chemrxiv.11938173.v1
Not found 10.1016/s0262-4079(20)30233-5
Not found 10.1201/9781351023504-5
Not found 10.1292/jvms.19-0491
Not found 10.1111/jam.14532
Not found 10.3760/cma.j.issn.1673-0860.2020.04.001
Not found 10.3760/cma.j.cn112338-20200221-00146
Not found 10.1136/vr.m972
Not found 10.1016/j.explore.2020.02.022
Not found 10.4324/9781315660516-1
Not found 10.1201/9780429001208-9
Not found 10.2139/ssrn.3536663
Not found 10.1080/00206814.2020.1731856
Not found 10.1002/9781119371199.ch14
Not found 10.1097/CM9.0000000000000776
Not found 10.4103/mjdrdypu.mjdrdypu_31_20
Not found 10.21203/rs.3.rs-16659/v1
Not found 10.36106/ijsr/7835401
Not found 10.1016/S0262-4079(20)30476-0
Not found 10.1002/9781119371199.ch4
Not found 10.3760/cma.j.issn.1001-0939.2020.03.008
Not found 10.1002/cbin.11276
Not found 10.1096/fj.201902031rrr
Not foun

Not found 10.23736/S0026-4725.20.05250-0
Not found 10.1089/gen.40.03.04
Not found 10.1016/j.onehlt.2020.100127
Not found 10.31525/ct1-nct04259892
Not found 10.1016/s0262-4079(20)30524-8
Not found 10.1016/j.compbiomed.2020.103670
Not found 10.31525/ct1-nct04284046
Not found 10.4324/9781315660516-25
Not found 10.1201/9780429001208-5
Not found 10.12834/vetit.2173.11599.1
Not found 10.1126/science.abb7506
Not found 10.2196/preprints.18533
Not found 10.22158/ibes.v2n2p26
Not found 10.1016/S0262-4079(20)30402-4
Not found 10.2139/ssrn.3542817
Not found 10.1111/tbed.13422
Not found 10.1016/j.micpath.2019.103922
Not found 10.3760/cma.j.cn112142-20200219-00089
Not found 10.1016/j.psj.2019.11.044
Not found 10.1016/s0262-4079(20)30234-7
Not found 10.14202/vetworld.2020.400-406
Not found 10.3390/jrfm13020036 ERT - Y - EJOU 10.3390/jcm9020596
Not found 10.1096/fj.201902534r
Not found 10.1111/tbed.13339
Not found 10.26434/chemrxiv.11846943.v4
Not found 10.1002/ped4.12178
Not found 10.3934/mbe.2020149

Not found 10.1016/S0262-4079(20)30188-3
Not found 10.34172/hpp.2020.15
Not found 10.1016/S2468-2667(20)30050-5
Not found 10.1126/science.abb5683
Not found 10.1007/s11071-020-05560-3
Not found 10.2139/ssrn.3546741
Not found 10.1016/s0262-4079(20)30475-9
Not found 10.7507/1672-2531.202001121
Not found 10.1080/15376516.2019.1669249
Not found 10.1016/j.scib.2020.02.005
Not found 10.1254/jpssuppl.93.0_2-ES-2
Not found 10.12968/denn.2020.16.3.147
Not found 10.3760/cma.j.issn.0253-2727.2020.0004
Not found 10.26434/chemrxiv.11955273.v1
Not found 10.1016/s1473-3099(20)30048-7
Not found 10.1111/jscm.12225
Not found 10.2139/ssrn.3547219
Not found 10.1089/dna.2019.5340
Not found 10.1126/science.abb6154
Not found 10.21203/rs.3.rs-15734/v1
Not found 10.1016/s0262-4079(20)30521-2
Not found 10.1007/s12033-019-00222-1
Not found 10.1201/9780429001208-4
Not found 10.1016/j.nwh.2020.02.002
Not found 10.1136/vr.m740
Not found 10.3760/cma.j.cn112138-20200219-00097
Not found 10.1038/s41438-020-0240-5
Not fou

Not found 10.1016/j.tmaid.2020.101622
Not found 10.3760/cma.j.cn112150-20200227-00196
Not found 10.1111/ijcp.13365
Not found 10.1016/j.fopow.2020.03.014
Not found 10.2139/ssrn.3545758
Not found 10.31525/ct1-nct04261270
Not found 10.1007/s15006-020-0080-0
Not found 10.1201/9780429001208-8
Not found 10.2478/acph-2019-0015
Not found 10.31525/ct1-nct04287686
Not found 10.1016/j.ajp.2020.101990
Not found 10.1016/b978-0-12-818882-8.00004-8
Not found 10.2196/preprints.18503
Not found 10.1016/j.cmi.2020.02.020
Not found 10.3389/fimmu.2019.03131
Not found 10.21275/sr20228084927
Not found 10.3390/v12020214
Not found 10.1021/cen-09805-buscon4
Not found 10.3760/cma.j.cn112338-20200210-00086
Not found 10.3390/molecules25040942
Not found 10.1201/9780429001208-20


In [153]:
# get and save all results
out_folder = "json_altmetrics"

for pmid in tqdm(extra_pmids):
    r = requests.get(pmid_base_url+str(int(pmid)), params=payload)
    if not r.status_code == 200:
        print("Not found",str(int(pmid)))
        continue
    f_name = str(int(pmid))
    with codecs.open(os.path.join(out_folder,f_name)+".json","w") as f:
        json.dump(r.json(), f)
    if isinstance(r.json()["posts"],dict) and "twitter" in r.json()["posts"].keys():
        for tweet in r.json()["posts"]["twitter"]:
            all_tweet_ids.append((str(int(pmid)),tweet["tweet_id"],tweet["author"]["tweeter_id"]))
            all_author_ids.append(tweet["author"]["tweeter_id"])

HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))

Not found 32138488
Not found 32051073
Not found 32078595
Not found 32078596
Not found 32051074
Not found 32009128
Not found 32096366
Not found 32096367
Not found 32049463
Not found 32153144
Not found 32105049
Not found 32051071



In [168]:
import pickle

pickle.dump(set(all_author_ids), open("all_author_ids.pk", "wb"))
pickle.dump(set(all_tweet_ids), open("all_tweet_ids.pk", "wb"))
with codecs.open("all_tweet_ids.csv", "w") as f:
    f.write("publication_id,tweet_id,user_id\n")
    for tweet in list(set(all_tweet_ids)):
        f.write(",".join(tweet)+"\n")

In [169]:
len(all_author_ids)

1014463

In [170]:
len(set(all_author_ids))

400003

In [171]:
len(all_tweet_ids)

1014452

In [172]:
len(set(all_tweet_ids))

1014452

In [23]:
import pickle

all_author_ids = list(pickle.load(open("all_author_ids.pk","rb")))
all_tweet_ids = list(pickle.load(open("all_tweet_ids.pk","rb")))

## Twitter

In [24]:
config['TWITTER']['api_key']

'tOORsnuIC9CWSBHFxLnUo4jDV'

In [25]:
import tweepy

In [26]:
# Tweepy Hello World

# authentication (OAuth)
auth = tweepy.OAuthHandler(config['TWITTER']['api_key'], config['TWITTER']['api_secret_key'])
auth.set_access_token(config['TWITTER']['access_token'], config['TWITTER']['access_secret_token'])

In [27]:
#api = tweepy.API(auth, wait_on_rate_limit=True)

#public_tweets = api.home_timeline()
#for tweet in public_tweets[:5]:
#    print(tweet.text)

In [31]:
# get all tweets without re-tweets

api = tweepy.API(auth, wait_on_rate_limit=True)

get_tweets = api.statuses_lookup(all_tweet_ids[:100],tweet_mode="extended")
tweets = [t for t in get_tweets]

In [33]:
tweets[0]._json

{'created_at': 'Tue Feb 25 16:33:53 +0000 2020',
 'id': 1232342964120293382,
 'id_str': '1232342964120293382',
 'full_text': 'Un viejo fármaco demuestra ser eficaz contra el coronavirus       Breakthrough: Chloroquine phosphate has shown apparent efficacy in treatment of COVID-19 associated pneumonia in clinical studies. - PubMed - NCBI https://t.co/L8nBxf6kof',
 'truncated': False,
 'display_text_range': [0, 236],
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [],
  'urls': [{'url': 'https://t.co/L8nBxf6kof',
    'expanded_url': 'https://www.ncbi.nlm.nih.gov/pubmed/32074550/',
    'display_url': 'ncbi.nlm.nih.gov/pubmed/3207455…',
    'indices': [213, 236]}]},
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 232147005,
  'id_str': '23214700

## Dimensions

In [21]:
# get credentials key
import configparser
config = configparser.ConfigParser()
config.read("credentials/conf.ini")
dimensions_username = config["DIMENSIONS"]["username"]
dimensions_password = config["DIMENSIONS"]["password"]

In [26]:
# check that all workd
import requests

#   The credentials to be used
login = {
    'username': dimensions_username,
    'password': dimensions_password
}

#   Send credentials to login url to retrieve token. Raise
#   an error, if the return code indicates a problem.
#   Please use the URL of the system you'd like to access the API
#   in the example below.
resp = requests.post('https://app.dimensions.ai/api/auth.json', json=login)
resp.raise_for_status()

#   Create http header using the generated token.
headers = {
    'Authorization': "JWT " + resp.json()['token']
}

#   Execute DSL query.
resp = requests.post(
    'https://app.dimensions.ai/api/dsl.json',
    data='search publications where doi in ["10.1016/j.joi.2017.11.005","10.3389/fdigh.2019.00004","10.1007/s35128-020-0312-0"] return publications[basics]'.encode(),
    headers=headers)

#   Display raw result
print(resp.json())



In [23]:
import requests

#   The credentials to be used
login = {
    'username': dimensions_username,
    'password': dimensions_password
}

#   Send credentials to login url to retrieve token. Raise
#   an error, if the return code indicates a problem.
#   Please use the URL of the system you'd like to access the API
#   in the example below.
resp = requests.post('https://app.dimensions.ai/api/auth.json', json=login)
resp.raise_for_status()

#   Create http header using the generated token.
headers = {
    'Authorization': "JWT " + resp.json()['token']
}

In [55]:
# get and save all results
out_folder = "jsons/json_dimensions"
all_cited_ids = list() # contains the Dimensions IDs of the cited articles (from the sources)
all_source_ids = list() # contains the Dimensions IDs of the source articles
query_template_1 = 'search publications where doi in ["'
query_template_2 = '"] return publications[basics+extras] limit 500'
limit = 500
current_payload = list()

for n,doi in tqdm(enumerate(all_dois)):
    current_payload.append(doi)
    if (n > 0 and n % limit == 0) or n >= (len(all_dois)-1): # query Dimensions, limit reached
        #print((query_template_1+'","'.join(current_payload)+query_template_2))
        resp = requests.post(
            'https://app.dimensions.ai/api/dsl.json',
            data=(query_template_1+'","'.join(current_payload)+query_template_2).encode(),
            headers=headers)
        current_payload = list()
        #print(resp.json())

        #   Display raw result
        r = resp.json()
        #print(r["_stats"]["total_count"])
        #print(len(r["publications"]))

        for result in r["publications"]:
            f_name = result["id"].replace(".","_")
            with codecs.open(os.path.join(out_folder,f_name)+".json","w") as f:
                json.dump(result, f, indent=4)
            if "references" in result.keys():
                all_cited_ids.extend(result["references"])
            all_source_ids.append(result["id"])

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [59]:
len(set(all_dois))

3533

In [58]:
len(set(all_source_ids))

3168

In [61]:
all_cited_ids = list(set(all_cited_ids))
len(all_cited_ids)

36912

In [62]:
# get and save all CITED
out_folder = "jsons/json_dimensions_cited"
query_template_1 = 'search publications where id in ["'
query_template_2 = '"] return publications[basics+extras] limit 500'
limit = 500
current_payload = list()

for n,did in tqdm(enumerate(list(all_cited_ids))):
    current_payload.append(did)
    if (n > 0 and n % limit == 0) or n >= (len(all_cited_ids)-1): # query Dimensions, limit reached
        #print((query_template_1+'","'.join(current_payload)+query_template_2))
        resp = requests.post(
            'https://app.dimensions.ai/api/dsl.json',
            data=(query_template_1+'","'.join(current_payload)+query_template_2).encode(),
            headers=headers)
        current_payload = list()
        #print(resp.json())

        #   Display raw result
        r = resp.json()
        #print(r["_stats"]["total_count"])
        #print(len(r["publications"]))

        for result in r["publications"]:
            f_name = result["id"].replace(".","_")
            with codecs.open(os.path.join(out_folder,f_name)+".json","w") as f:
                json.dump(result, f, indent=4)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [103]:
# check that all workd
import requests

#   The credentials to be used
login = {
    'username': dimensions_username,
    'password': dimensions_password
}

#   Send credentials to login url to retrieve token. Raise
#   an error, if the return code indicates a problem.
#   Please use the URL of the system you'd like to access the API
#   in the example below.
resp = requests.post('https://app.dimensions.ai/api/auth.json', json=login)
resp.raise_for_status()

#   Create http header using the generated token.
headers = {
    'Authorization': "JWT " + resp.json()['token']
}

#   Execute DSL query.
resp = requests.post(
    'https://app.dimensions.ai/api/dsl.json',
    data='search publications where reference_ids = \"pub.1125511397\" return publications[basics+extras] limit 1000'.encode(),
    headers=headers)

#   Display raw result
print(resp.json())



In [104]:
'search publications where reference_ids = \"pub.1093117302\" return publications[basics+extras]'.encode()

b'search publications where reference_ids = "pub.1093117302" return publications[basics+extras]'

In [25]:
# get and save all CITING
out_folder = "jsons/json_dimensions_citing"
query_template_1 = 'search publications where reference_ids = \"'
query_template_2 = '\" return publications[basics+extras] limit 1000'
total_citing = list()
query_limit = 0

for did in tqdm(extras):#all_source_ids):
    resp = requests.post(
        'https://app.dimensions.ai/api/dsl.json',
        data=(query_template_1+did+query_template_2).encode(),
        headers=headers)
    #print(resp.json())
    
    # wait on 30 requests limit per minute
    query_limit += 1
    if query_limit >= 28:
        time.sleep(120)
        query_limit = 0

    #   Display raw result
    try:
        r = resp.json()
    except:
        print(did)
        print(resp.status_code)
        continue
    #print(r["_stats"]["total_count"])
    #print(len(r["publications"]))

    for result in r["publications"]:
        f_name = result["id"].replace(".","_")
        total_citing.append(result["id"])
        with codecs.open(os.path.join(out_folder,f_name)+".json","w") as f:
            json.dump(result, f, indent=4)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


