### We will now bring together the list of CrossRef-acquired DOIs and the last few DOIs that were identified manually

In [232]:
import pandas as pd
import numpy as np

## Bring in the main list of all 2378 publications claimed in BRC phase 2
 (file as exported from notebook A4: './A4out_CrossRef_FuzzyMatched_with_retry_07Aug19.csv')

In [233]:
# Main publication list; the first CSV column is used as the row index.
main_csv_path = './A4out_CrossRef_FuzzyMatched_with_retry_07Aug19.csv'
df_main = pd.read_csv(main_csv_path, index_col=0)
df_main.shape

(2378, 20)

In [234]:
# show the first two rows of the main list
df_main.iloc[:2]

Unnamed: 0,DOI,ID,complete,crossref_API_out,csv_post_title,DOI_API,title,title_API,title_match,DOI_match,CrossRefTitle_low,DOI_fuzzyRatio,TitleMatch01,Title_fuzzyRatio,Title_partialRatio,Complete_fuzzyRatio,Complete_partialRatio,good_list,DOI_retry_API,title_retry_API
0,10.1186/s12881-014-0095-4,1125,"&amp; , fenwick al, goos jac, rankin j, lord h...",{'title': ['Apparently synonymous substitution...,apparently synonymous substitutions in fgfr2 a...,10.1186/s12881-014-0095-4,['Apparently synonymous substitutions in FGFR2...,Apparently synonymous substitutions in FGFR2af...,False,True,apparently synonymous substitutions in fgfr2af...,100.0,0,99.0,99.0,51.0,99.0,True,,
1,10.1183/13993003.00321-2016,1996,", pattinson kt, turner mr. a wider pathologica...",{'title': ['A wider pathological network under...,a wider pathological network underlying breath...,10.1183/13993003.00321-2016,['A wider pathological network underlying brea...,A wider pathological network underlying breath...,True,True,a wider pathological network underlying breath...,100.0,1,100.0,100.0,73.0,100.0,True,,


## Then bring in the shorter list of Curated (manually checked) DOIs

In [235]:
# Curated DOI list: only file columns 0, 1 and 4 are read,
# and file column 0 becomes the row index.
curated_csv_path = './A5in_Curated_DOIs(08Aug).csv'
df_curated = pd.read_csv(curated_csv_path, usecols=[0, 1, 4], index_col=0)
df_curated.head(5)

Unnamed: 0,ID,DOI_curated
5,1312,10.3174/ajnr.a4208
7,62,10.1016/j.atherosclerosis.2012.03.036
10,168,10.1002/ajmg.a.35558
20,1920,10.1016/j.fertnstert.2014.12.123
21,1023,10.1136/archdischild-2013-304601


In [236]:
# number of non-missing values in each column of the curated list
df_curated.notna().sum()

ID             385
DOI_curated    374
dtype: int64

In [237]:
# Pull the bare DOI ("10.<registrant>/<suffix>") out of each curated entry,
# which may be a full web URL; entries with no match become NaN.
doi_pattern = r'(10\.\d{4,9}\/.[^ ]+)'
df_curated['curatedDOI'] = df_curated['DOI_curated'].str.extract(doi_pattern)

In [238]:
# how many curated entries yielded a DOI
df_curated.curatedDOI.count()

374

In [239]:
# first five rows, now including the extracted curatedDOI column
df_curated.head(n=5)

Unnamed: 0,ID,DOI_curated,curatedDOI
5,1312,10.3174/ajnr.a4208,10.3174/ajnr.a4208
7,62,10.1016/j.atherosclerosis.2012.03.036,10.1016/j.atherosclerosis.2012.03.036
10,168,10.1002/ajmg.a.35558,10.1002/ajmg.a.35558
20,1920,10.1016/j.fertnstert.2014.12.123,10.1016/j.fertnstert.2014.12.123
21,1023,10.1136/archdischild-2013-304601,10.1136/archdischild-2013-304601


### Then add the 'curatedDOI' column to df_main (left merge using the 'ID' column)

In [240]:
# Left join: keep every row of the main list and attach the curated
# columns wherever the 'ID' values match.
df_DOIs = pd.merge(df_main, df_curated, how='left', on='ID')
df_DOIs.shape

(2378, 22)

In [241]:
# non-missing values per column after the merge
df_DOIs.notna().sum()

DOI                      1440
ID                       2378
complete                 2378
crossref_API_out         2378
csv_post_title           2378
DOI_API                  2378
title                    2378
title_API                2352
title_match              2378
DOI_match                2378
CrossRefTitle_low        2352
DOI_fuzzyRatio           1440
TitleMatch01             2378
Title_fuzzyRatio         2352
Title_partialRatio       2352
Complete_fuzzyRatio      2352
Complete_partialRatio    2352
good_list                2378
DOI_retry_API             379
title_retry_API           379
DOI_curated               374
curatedDOI                374
dtype: int64

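### Now define the DOI to use for each reference:
    - If a curated DOI is available, use this
    - Otherwise, use the CrossRef DOI (DOI_API) when the row is flagged in good_list
    - Otherwise, leave FinalDOI empty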

In [242]:
# create the FinalDOI column with every row set to NaN; later cells fill it in
df_DOIs['FinalDOI']= np.nan

In [243]:
# Series.mask replaces values where the condition is True
# (Series.where is the reverse: it replaces values where the condition is False).
#
# The result is assigned back to the column rather than calling
# mask(..., inplace=True) on df_DOIs['FinalDOI']: that form is a chained
# in-place update on a column selection, and under pandas Copy-on-Write it
# does not write through to df_DOIs.
final_doi = df_DOIs['FinalDOI']

# 1) use the CrossRef DOI for rows flagged in good_list
final_doi = final_doi.mask(df_DOIs['good_list'] == True, df_DOIs['DOI_API'])

# 2) a curated DOI, when present, overrides the CrossRef one
final_doi = final_doi.mask(df_DOIs['curatedDOI'].notna(), df_DOIs['curatedDOI'])

# 3) trim stray spaces and full stops from both ends of the DOI
df_DOIs['FinalDOI'] = final_doi.str.strip(' .')


In [244]:
# non-missing values per column, now including FinalDOI
df_DOIs.notna().sum()

DOI                      1440
ID                       2378
complete                 2378
crossref_API_out         2378
csv_post_title           2378
DOI_API                  2378
title                    2378
title_API                2352
title_match              2378
DOI_match                2378
CrossRefTitle_low        2352
DOI_fuzzyRatio           1440
TitleMatch01             2378
Title_fuzzyRatio         2352
Title_partialRatio       2352
Complete_fuzzyRatio      2352
Complete_partialRatio    2352
good_list                2378
DOI_retry_API             379
title_retry_API           379
DOI_curated               374
curatedDOI                374
FinalDOI                 2368
dtype: int64

In [245]:
# rows for which no good DOI could be found
missing_final_doi = df_DOIs['FinalDOI'].isna()
df_DOIs.loc[missing_final_doi]

Unnamed: 0,DOI,ID,complete,crossref_API_out,csv_post_title,DOI_API,title,title_API,title_match,DOI_match,...,Title_fuzzyRatio,Title_partialRatio,Complete_fuzzyRatio,Complete_partialRatio,good_list,DOI_retry_API,title_retry_API,DOI_curated,curatedDOI,FinalDOI
233,,797,"brittain js, brown p. the many roads to tremor...",{'title': ['Simultaneous Thalamic and Posterio...,the many roads to tremor: an invited commentar...,10.1111/j.1525-1403.2012.00503.x,['Simultaneous Thalamic and Posterior Subthala...,Simultaneous Thalamic and Posterior Subthalami...,False,False,...,53.0,56.0,47.0,56.0,False,10.1016/j.expneurol.2013.06.009,Effects of low-frequency thalamic deep brain s...,,,
396,,966,"coffey s, prendergast b. medical therapies for...",{'title': ['Proceedings: Moving Toward Cell-Ba...,medical therapies for treatment of valvular he...,10.5966/sctm.2015-0118,['Proceedings: Moving Toward Cell-Based Therap...,Proceedings: Moving Toward Cell-Based Therapie...,False,False,...,54.0,56.0,48.0,54.0,False,10.1136/heartjnl-2016-310482,Serum biomarkers in valvular heart disease,,,
809,,1252,"gutowska-owsiak d, salimi m, selvakumar ta, wa...",{'title': ['Nrf2 links epidermal barrier funct...,histamine exerts multiple effects on expressio...,10.1002/emmm.201200219,['Nrf2 links epidermal barrier function with a...,Nrf2 links epidermal barrier function with ant...,False,False,...,42.0,52.0,31.0,58.0,False,10.1111/j.1600-0625.2011.01412.x,IL-17 downregulates filaggrin and affects kera...,,,
978,,1408,"isbarn h, briganti b, de visschere pj, fã•_tte...",{'title': ['Transperineal template-guided mapp...,systematic ultrasound-guided saturation and te...,10.1111/iju.12660,['Transperineal template-guided mapping biopsy...,Transperineal template-guided mapping biopsy o...,False,False,...,45.0,65.0,28.0,65.0,False,10.1016/j.urolonc.2014.08.007,Prognostic effect of neuroendocrine differenti...,,,
1133,,861,"lang s, ford kj, john t, pollard aj, mccarthy ...",{'title': ['Energy advice service as perceived...,immunisation errors reported to a vaccine advi...,10.1111/j.1470-6431.2010.00924.x,['Energy advice service as perceived by Swedis...,Energy advice service as perceived by Swedish ...,False,False,...,40.0,51.0,28.0,50.0,False,10.1093/pubmed/fdv112,A quantitative review of healthcare profession...,,,
1682,,1033,pollard aj. meningococcal disease prevention i...,"{'title': ['Neonatal meningococcal disease'], ...",meningococcal disease prevention in india,10.1111/jpc.13856,['Neonatal meningococcal disease'],Neonatal meningococcal disease,False,False,...,59.0,70.0,48.0,80.0,False,10.4161/hv.7.8.16270,Meningococcal disease: The advances and challe...,,,
1697,,34,"prendergast b, coope lt, crijns h, falkenstein...",{'title': ['Twenty years of research in the Hu...,the german centre for cardiovascular research,10.1111/nbu.12102,['Twenty years of research in the Human Nutrit...,Twenty years of research in the Human Nutritio...,False,False,...,34.0,49.0,28.0,43.0,False,10.1016/j.bbamcr.2015.11.010,Heart regeneration,,,
1751,,1006,"richens jl, vere ka, light ra, soria d, gariba...",{'title': ['The Relationship Between Brain Agi...,practical detection of a definitive biomarker ...,10.3724/sp.j.1206.2012.00351,['The Relationship Between Brain Aging and Pre...,The Relationship Between Brain Aging and Precl...,False,False,...,38.0,54.0,31.0,54.0,False,10.3233/jad-2011-111505,Identification of SPARC-like 1 Protein as Part...,,,
1870,,223,"schmidt j, iversen akn, tenzer s, gostick e, p...",{'title': ['Polyubiquitination of lysine-48 is...,rapid antigen processing,10.1002/eji.201444830,['Polyubiquitination of lysine-48 is an essent...,Polyubiquitination of lysine-48 is an essentia...,False,False,...,35.0,88.0,41.0,46.0,False,10.1016/s0168-8278(12)60059-2,45 RAPID AND EFFICIENT ANTIGEN PROCESSING AND ...,,,
1890,,1611,"sen a, selway r and nashe l. from channels to ...",{'title': ['Practical guide to dynamic pelvic ...,from channels to commissioning - a practical...,10.1002/jmri.25998,['Practical guide to dynamic pelvic floor MRI'],Practical guide to dynamic pelvic floor MRI,False,False,...,41.0,51.0,31.0,58.0,False,10.1016/j.yebeh.2018.03.016,Bitemporal seizure spread and its effect on au...,,,


In [246]:
# largest per-column count of non-missing values
df_DOIs.notna().sum().max()

2378

In [247]:
# Show every row whose FinalDOI also appears on another row.
# The duplicate mask is computed on the frame that has already had rows
# without a FinalDOI removed, so the mask and the frame share one index.
# (Building the mask from the unfiltered df_DOIs made pandas reindex the
# boolean key with a warning, and counted missing DOIs as duplicates of
# each other before they were discarded.)
rows_with_doi = df_DOIs.dropna(subset=['FinalDOI'])
rows_with_doi[rows_with_doi['FinalDOI'].duplicated(keep=False)]  # optionally: .to_clipboard()

  """Entry point for launching an IPython kernel.


Unnamed: 0,DOI,ID,complete,crossref_API_out,csv_post_title,DOI_API,title,title_API,title_match,DOI_match,...,Title_fuzzyRatio,Title_partialRatio,Complete_fuzzyRatio,Complete_partialRatio,good_list,DOI_retry_API,title_retry_API,DOI_curated,curatedDOI,FinalDOI
1726,10.1128/CVI.00099-14,1119,"ramasamy mn, clutterbuck ea, haworth k, barel ...",{'title': ['Immunogenicity of meningococcal po...,the immunogenicity of quadrivalent meningococc...,10.1111/cei.13202,['Immunogenicity of meningococcal polysacchari...,Immunogenicity of meningococcal polysaccharide...,False,False,...,65.0,64.0,40.0,64.0,False,10.1128/cvi.00099-14,Randomized Clinical Trial To Evaluate the Immu...,10.1128/cvi.00099-14,10.1128/cvi.00099-14,10.1128/cvi.00099-14
1727,,1102,"ramasamy mn, clutterbuck ea, haworth k, bowman...",{'title': ['Randomized Clinical Trial To Evalu...,randomized clinical trial to evaluate the immu...,10.1128/cvi.00099-14,['Randomized Clinical Trial To Evaluate the Im...,Randomized Clinical Trial To Evaluate the Immu...,True,False,...,100.0,100.0,76.0,100.0,True,,,,,10.1128/cvi.00099-14


In [248]:
# Drop the duplicate row found above: index 1727 (ID 1102) ends up with the
# same FinalDOI as index 1726 (ID 1119), which carries the curated DOI.
# errors='ignore' keeps this cell safe to re-run: without it, a second
# execution raises KeyError because label 1727 is already gone.
df_DOIs.drop(index=[1727], errors='ignore', inplace=True)

In [249]:
# list the column labels now present in df_DOIs
df_DOIs.columns

Index(['DOI', 'ID', 'complete', 'crossref_API_out', 'csv_post_title',
       'DOI_API', 'title', 'title_API', 'title_match', 'DOI_match',
       'CrossRefTitle_low', 'DOI_fuzzyRatio', 'TitleMatch01',
       'Title_fuzzyRatio', 'Title_partialRatio', 'Complete_fuzzyRatio',
       'Complete_partialRatio', 'good_list', 'DOI_retry_API',
       'title_retry_API', 'DOI_curated', 'curatedDOI', 'FinalDOI'],
      dtype='object')

In [250]:
# Keep only the identifier, the full reference text and the chosen DOI,
# then discard any row that has a missing value in one of them.
output_columns = ['ID', 'complete', 'FinalDOI']
DOIs_out = df_DOIs.reindex(columns=output_columns)
DOIs_out = DOIs_out.dropna()

# write the result for the metrics step, then display it ordered by ID
DOIs_out.to_csv('./A5out_2367_correctDOIs_for_metrics(08Aug19).csv')
DOIs_out.sort_values(by='ID')

Unnamed: 0,ID,complete,FinalDOI
342,1,"chappell ma, woolrich mw, kazan s, jezzard p, ...",10.1002/mrm.24260
1893,2,"serres s, soto ms, hamilton a, mcateer ma, car...",10.1073/pnas.1117412109
1206,3,"lindsay ac, biasiolli l, lee, jm, kylintireas ...",10.1016/j.jcmg.2011.10.007
1105,4,"kolsch h, lehmann dj, ibrahim-verbaas ca, comb...",10.1007/s00702-011-0732-4
1317,5,"markwick, a., zamboni, g., & de jager, c. a. (...",10.1080/13803395.2012.672966
2142,6,valent. identification and functional characte...,10.1371/journal.pone.0034541
1861,7,"schã•_del j, bardella c, sciesielski lk, br...",10.1038/ng.2204
1457,8,"munnå½ s, held kr, magli cm, ata b, wells ...",10.1016/j.fertnstert.2012.01.106
1027,9,"jubb am, browning l, campo l, turley h, steers...",10.1111/j.1365-2559.2011.04138.x
1114,10,"kramer-marek g, gijsen m, kiesewetter do, benn...",10.2967/jnumed.111.096685


In [251]:
# non-missing values per column of the exported frame
DOIs_out.notna().sum()

ID          2367
complete    2367
FinalDOI    2367
dtype: int64

In [252]:
# largest per-column count of non-missing values after dropping the duplicate
df_DOIs.notna().sum().max()

2377

In [253]:
# Report the share of references for which a final DOI was obtained.
print('Digital Object Identifiers (DOIs) forund for % of refernces in Spreadsheet:')
n_with_doi = DOIs_out.FinalDOI.count()      # references with a final DOI
n_references = df_DOIs.count().max()        # largest per-column non-missing count in df_DOIs
pct_with_doi = ((n_with_doi / n_references) * 100).round(2)
print(pct_with_doi, '%')


Digital Object Identifiers (DOIs) forund for % of refernces in Spreadsheet:
99.58 %
