### We will now bring together the list of CrossRef-acquired DOIs and the last few DOIs that were identified manually

In [32]:
import pandas as pd
import numpy as np

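## Bring in the main list of all 2377 publications claimed in BRC phase 2
File as exported from notebook A4 ('./A4out_CrossRef_FuzzyMatched_with_retry_Oct19.csv'); it loads as 2378 rows because one reference is duplicated (the duplicate row is dropped further below).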

In [33]:
# Load the fuzzy-matched CrossRef results; the first CSV column is used as the row index.
MAIN_CSV = './A4out_CrossRef_FuzzyMatched_with_retry_Oct19.csv'
df_main = pd.read_csv(MAIN_CSV, index_col=0)
df_main.shape

(2378, 21)

In [34]:
# Preview the first two rows to confirm the file loaded as expected.
df_main.head(2)

Unnamed: 0,ID,DOI,complete,csv_post_title,crossref_API_out,title,DOI_API,title_API,title_match,DOI_match,...,DOI_fuzzyRatio,Title_fuzzyRatio,Title_partialRatio,Complete_fuzzyRatio,Complete_partialRatio,TitleMatch01,good_list,good_list2,DOI_retry_API,title_retry_API
0,1125,10.1186/s12881-014-0095-4,"&amp; , fenwick al, goos jac, rankin j, lord h...",apparently synonymous substitutions in fgfr2 a...,{'title': ['Apparently synonymous substitution...,['Apparently synonymous substitutions in FGFR2...,10.1186/s12881-014-0095-4,Apparently synonymous substitutions in FGFR2af...,False,True,...,100.0,99.0,99.0,51.0,99.0,0,True,True,,
1,1996,10.1183/13993003.00321-2016,", pattinson kt, turner mr. a wider pathologica...",a wider pathological network underlying breath...,{'title': ['A wider pathological network under...,['A wider pathological network underlying brea...,10.1183/13993003.00321-2016,A wider pathological network underlying breath...,True,True,...,100.0,100.0,100.0,73.0,100.0,1,True,True,,


## Then bring in the shorter list of Curated (manually checked) DOIs

In [60]:
# Read only three columns of the curated sheet: column 0 becomes the index,
# and columns 1 and 5 load as 'ID' and 'curated_DOI' (see the preview output).
CURATED_CSV = './A5in_curated_API_list(14Oct19).csv'
df_curated = pd.read_csv(CURATED_CSV, usecols=[0, 1, 5], index_col=0)
df_curated.head()

Unnamed: 0_level_0,ID,curated_DOI
Column1,Unnamed: 1_level_1,Unnamed: 2_level_1
105.0,1467.0,10.3109/10428194.2015.1122783
33.0,1639.0,10.1080/23279095.2014.1003067
37.0,1313.0,10.1093/brain/awu328
38.0,1448.0,10.1523/jneurosci.3075-14.2015
39.0,1270.0,10.1007/s11892-014-0559-0


In [61]:
# Non-null count per column, to see how many curated rows actually carry a DOI.
df_curated.count()

ID             176
curated_DOI    165
dtype: int64

In [62]:
# Extract the bare DOI ("10.<registrant>/<suffix>") from each curated entry,
# which may be a plain DOI or a web URL containing one.
# \S+ takes the suffix up to the first whitespace character; the previous
# pattern used a bare '.', which also accepted a space straight after the slash.
# expand=False returns a Series, so a single column is assigned explicitly.
df_curated['curatedDOI'] = df_curated.curated_DOI.str.extract(r'(10\.\d{4,9}/\S+)', expand=False)

In [63]:
# Number of DOIs successfully extracted; compare with the non-null curated_DOI count.
df_curated['curatedDOI'].count()

165

In [64]:
# Check the extracted 'curatedDOI' column alongside the raw 'curated_DOI' values.
df_curated.head()

Unnamed: 0_level_0,ID,curated_DOI,curatedDOI
Column1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
105.0,1467.0,10.3109/10428194.2015.1122783,10.3109/10428194.2015.1122783
33.0,1639.0,10.1080/23279095.2014.1003067,10.1080/23279095.2014.1003067
37.0,1313.0,10.1093/brain/awu328,10.1093/brain/awu328
38.0,1448.0,10.1523/jneurosci.3075-14.2015,10.1523/jneurosci.3075-14.2015
39.0,1270.0,10.1007/s11892-014-0559-0,10.1007/s11892-014-0559-0


### Then add the 'curatedDOI' column to df_main (left merge on the 'ID' column)

In [65]:
# Left merge: keep every row of df_main and attach the curated DOI columns where the ID matches.
df_DOIs = df_main.merge(df_curated, on='ID', how='left')
# A left merge adds rows when an ID occurs more than once in df_curated; fail loudly if that happens.
assert len(df_DOIs) == len(df_main), 'duplicate IDs in df_curated multiplied rows in the merge'
df_DOIs.shape

(2378, 23)

In [66]:
# Non-null counts per column after the merge (shows how many rows gained a curated DOI).
df_DOIs.count()

ID                       2378
DOI                      1442
complete                 2378
csv_post_title           2378
crossref_API_out         2378
title                    2378
DOI_API                  2378
title_API                2082
title_match              2378
DOI_match                2378
CrossRefTitle_low        2082
DOI_fuzzyRatio           1442
Title_fuzzyRatio         2082
Title_partialRatio       2082
Complete_fuzzyRatio      2082
Complete_partialRatio    2082
TitleMatch01             2378
good_list                2378
good_list2               2378
DOI_retry_API             562
title_retry_API           562
curated_DOI               165
curatedDOI                165
dtype: int64

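### Now define the DOI to use for each reference (later rules override earlier ones):
- If `good_list` is True, use the CrossRef DOI (`DOI_API`)
- Otherwise, if `good_list2` is True, use the retried CrossRef DOI (`DOI_retry_API`)
- If a curated DOI is available, use this in preference to either of the above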

In [67]:
# Start with an all-missing FinalDOI column; it is filled in by the rules in the next cell.
df_DOIs['FinalDOI']= np.nan

In [68]:
# Build FinalDOI in priority order; each later rule overrides the earlier ones.
# Be careful: Series.mask replaces values where the condition is True
# (Series.where is the reverse: it replaces where the condition is False).
#
# The result is assigned back to the column instead of calling
# mask(..., inplace=True) on df_DOIs['FinalDOI']: mutating a selected column
# in place is chained assignment and is not guaranteed to update df_DOIs
# under pandas Copy-on-Write.
first_pass_ok = df_DOIs.good_list == True
retry_ok = (df_DOIs.good_list == False) & (df_DOIs.good_list2 == True)
has_curated = df_DOIs.curatedDOI.notna()

df_DOIs['FinalDOI'] = (
    df_DOIs['FinalDOI']
    .mask(first_pass_ok, df_DOIs.DOI_API)        # first CrossRef lookup accepted
    .mask(retry_ok, df_DOIs.DOI_retry_API)       # only the retried lookup accepted
    .mask(has_curated, df_DOIs.curatedDOI)       # a curated DOI always wins
    .str.strip(' .')                             # trim stray spaces and full stops
)


In [69]:
# Non-null counts again; the FinalDOI row shows how many references now have a DOI.
df_DOIs.count()

ID                       2378
DOI                      1442
complete                 2378
csv_post_title           2378
crossref_API_out         2378
title                    2378
DOI_API                  2378
title_API                2082
title_match              2378
DOI_match                2378
CrossRefTitle_low        2082
DOI_fuzzyRatio           1442
Title_fuzzyRatio         2082
Title_partialRatio       2082
Complete_fuzzyRatio      2082
Complete_partialRatio    2082
TitleMatch01             2378
good_list                2378
good_list2               2378
DOI_retry_API             562
title_retry_API           562
curated_DOI               165
curatedDOI                165
FinalDOI                 2367
dtype: int64

In [70]:
# List the rows whose non-null FinalDOI is shared with at least one other row.
# The duplicate mask is computed on the already-filtered frame so that it lines
# up with it; indexing the filtered frame with a mask built from the full
# df_DOIs makes pandas warn that the boolean Series key will be reindexed.
rows_with_final_doi = df_DOIs.dropna(subset=['FinalDOI'])
rows_with_final_doi[rows_with_final_doi.duplicated(subset=['FinalDOI'], keep=False)]

  """Entry point for launching an IPython kernel.


Unnamed: 0,ID,DOI,complete,csv_post_title,crossref_API_out,title,DOI_API,title_API,title_match,DOI_match,...,Complete_fuzzyRatio,Complete_partialRatio,TitleMatch01,good_list,good_list2,DOI_retry_API,title_retry_API,curated_DOI,curatedDOI,FinalDOI
607,2063,10.1111/bjh.14177,"eyre ta, clifford r, bloor a, boyle l, roberts...",ncri phase ii study of chop in combination wit...,{'title': ['NCRI phase II study of CHOP in com...,['NCRI phase II study of CHOP in combination w...,10.1111/bjh.14177,NCRI phase II study of CHOP in combination wit...,True,True,...,66.0,100.0,1,True,True,,,,,10.1111/bjh.14177
608,1359,10.1186/s12885-015-1048-9,"eyre ta, clifford r, corran r, boyle l, franci...",single arm ncri phase ii study of chop in comb...,{'title': ['NCRI phase II study of CHOP in com...,['NCRI phase II study of CHOP in combination w...,10.1111/bjh.14177,NCRI phase II study of CHOP in combination wit...,False,False,...,64.0,88.0,0,True,False,,,,,10.1111/bjh.14177
725,2022,10.1111/liv.13163,"gathercole ll, hazlehurst jm, armstrong mj, cr...",advanced non-alcoholic fatty liver disease and...,{'title': ['Non-alcoholic fatty liver disease'...,['Non-alcoholic fatty liver disease'],10.1111/j.1753-0407.2012.00204.x,Non-alcoholic fatty liver disease,False,False,...,18.0,100.0,0,True,False,,,,,10.1111/j.1753-0407.2012.00204.x
872,1785,10.1016/j.metabol.2016.01.001,"hazlehurst jm, woods c, marjot t, cobbold jf, ...",non-alcoholic fatty liver disease and diabetes,{'title': ['Non-alcoholic fatty liver disease'...,['Non-alcoholic fatty liver disease'],10.1111/j.1753-0407.2012.00204.x,Non-alcoholic fatty liver disease,False,False,...,33.0,100.0,0,True,False,,,,,10.1111/j.1753-0407.2012.00204.x
1609,2287,10.1111/liv.13284,"pavlides m, banerjee r, tunnicliffe em, kelly ...",multi-parametric magnetic resonance imaging fo...,{'title': ['Non-alcoholic fatty liver disease'...,['Non-alcoholic fatty liver disease'],10.1111/j.1753-0407.2012.00204.x,Non-alcoholic fatty liver disease,False,False,...,24.0,100.0,0,True,False,,,,,10.1111/j.1753-0407.2012.00204.x
1726,1119,10.1128/CVI.00099-14,"ramasamy mn, clutterbuck ea, haworth k, barel ...",the immunogenicity of quadrivalent meningococc...,{'title': ['Immunogenicity of meningococcal po...,['Immunogenicity of meningococcal polysacchari...,10.1111/cei.13202,Immunogenicity of meningococcal polysaccharide...,False,False,...,40.0,64.0,0,False,True,10.1128/cvi.00099-14,Randomized Clinical Trial To Evaluate the Immu...,,,10.1128/cvi.00099-14
1727,1102,,"ramasamy mn, clutterbuck ea, haworth k, bowman...",randomized clinical trial to evaluate the immu...,{'title': ['Randomized Clinical Trial To Evalu...,['Randomized Clinical Trial To Evaluate the Im...,10.1128/cvi.00099-14,Randomized Clinical Trial To Evaluate the Immu...,True,False,...,76.0,100.0,1,True,True,,,,,10.1128/cvi.00099-14


In [71]:
# Following manual checking of the final references, rows that share a
# FinalDOI with another row revert to the value in their 'DOI' column.
# Missing FinalDOIs are excluded explicitly: DataFrame.duplicated treats NaN
# values as equal to each other, so they would otherwise count as duplicates.
# The result is assigned back (no inplace=True on a selected column) so the
# update reliably reaches df_DOIs.
shares_final_doi = (
    df_DOIs['FinalDOI'].notna()
    & df_DOIs.duplicated(subset=['FinalDOI'], keep=False)
)
df_DOIs['FinalDOI'] = df_DOIs['FinalDOI'].mask(shares_final_doi, df_DOIs.DOI)

In [72]:
# Drop duplicate rows if needed.
# The rows at index 1726 and 1727 (IDs 1119 and 1102) are duplicates of each
# other; the row at index 1727 is removed. Note that 1727 is an index label,
# not an ID.
# errors='ignore' keeps this cell re-runnable once the row has already gone.
df_DOIs = df_DOIs.drop(index=[1727], errors='ignore')

In [75]:
# Re-check for shared FinalDOIs; an empty result means no duplicates remain.
# The mask is built on the filtered frame itself so it aligns with it, which
# avoids pandas' "Boolean Series key will be reindexed" warning.
remaining_with_doi = df_DOIs.dropna(subset=['FinalDOI'])
remaining_with_doi[remaining_with_doi.duplicated(subset=['FinalDOI'], keep=False)]

  """Entry point for launching an IPython kernel.


Unnamed: 0,ID,DOI,complete,csv_post_title,crossref_API_out,title,DOI_API,title_API,title_match,DOI_match,...,Complete_fuzzyRatio,Complete_partialRatio,TitleMatch01,good_list,good_list2,DOI_retry_API,title_retry_API,curated_DOI,curatedDOI,FinalDOI


In [74]:
# References for which no good DOI could be found (FinalDOI is missing).
no_final_doi = df_DOIs['FinalDOI'].isna()
df_DOIs.loc[no_final_doi]

Unnamed: 0,ID,DOI,complete,csv_post_title,crossref_API_out,title,DOI_API,title_API,title_match,DOI_match,...,Complete_fuzzyRatio,Complete_partialRatio,TitleMatch01,good_list,good_list2,DOI_retry_API,title_retry_API,curated_DOI,curatedDOI,FinalDOI
396,966,,"coffey s, prendergast b. medical therapies for...",medical therapies for treatment of valvular he...,{'title': ['Proceedings: Moving Toward Cell-Ba...,['Proceedings: Moving Toward Cell-Based Therap...,10.5966/sctm.2015-0118,Proceedings: Moving Toward Cell-Based Therapie...,False,False,...,48.0,54.0,0,False,False,10.1136/heartjnl-2016-310482,Serum biomarkers in valvular heart disease,,,
809,1252,,"gutowska-owsiak d, salimi m, selvakumar ta, wa...",histamine exerts multiple effects on expressio...,{'title': ['Topical hesperidin improves epider...,['Topical hesperidin improves epidermal permea...,10.1111/j.1600-0625.2012.01455.x,Topical hesperidin improves epidermal permeabi...,False,False,...,38.0,42.0,0,False,False,10.1111/j.1600-0625.2011.01412.x,IL-17 downregulates filaggrin and affects kera...,,,
978,1408,,"isbarn h, briganti b, de visschere pj, fã•_tte...",systematic ultrasound-guided saturation and te...,{'title': ['Transperineal template-guided mapp...,['Transperineal template-guided mapping biopsy...,10.1111/iju.12660,Transperineal template-guided mapping biopsy o...,False,False,...,28.0,65.0,0,False,False,10.1016/j.urolonc.2014.08.007,Prognostic effect of neuroendocrine differenti...,,,
1059,234,,"kavvoura fk, owen kr. maturity onset diabetes ...",maturity onset diabetes of the young: clinical...,{'title': ['Characteristics of maturity onset ...,['Characteristics of maturity onset diabetes o...,10.1111/pedi.12289,Characteristics of maturity onset diabetes of ...,False,False,...,56.0,65.0,0,False,False,10.2217/dmt.12.82,Biomarkers currently used for the diagnosis of...,,,
1133,861,,"lang s, ford kj, john t, pollard aj, mccarthy ...",immunisation errors reported to a vaccine advi...,{'title': ['Energy advice service as perceived...,['Energy advice service as perceived by Swedis...,10.1111/j.1470-6431.2010.00924.x,Energy advice service as perceived by Swedish ...,False,False,...,28.0,50.0,0,False,False,10.1093/pubmed/fdv112,A quantitative review of healthcare profession...,,,
1682,1033,,pollard aj. meningococcal disease prevention i...,meningococcal disease prevention in india,"{'title': ['Neonatal meningococcal disease'], ...",['Neonatal meningococcal disease'],10.1111/jpc.13856,Neonatal meningococcal disease,False,False,...,48.0,80.0,0,False,False,10.4161/hv.7.8.16270,Meningococcal disease: The advances and challe...,,,
1697,34,,"prendergast b, coope lt, crijns h, falkenstein...",the german centre for cardiovascular research,{'title': ['Twenty years of research in the Hu...,['Twenty years of research in the Human Nutrit...,10.1111/nbu.12102,Twenty years of research in the Human Nutritio...,False,False,...,28.0,43.0,0,False,False,10.1016/j.bbamcr.2015.11.010,Heart regeneration,,,
1733,1056,,"rand l, sheehan m. healthcare resource allocat...",healthcare resource allocation: balancing prin...,,,,,False,False,...,,,0,False,False,10.1177/1471301215615780,"Charles Foster, Jonathan Herring and Israel Do...",,,
1751,1006,,"richens jl, vere ka, light ra, soria d, gariba...",practical detection of a definitive biomarker ...,,,,,False,False,...,,,0,False,False,,,,,
1890,1611,,"sen a, selway r and nashe l. from channels to ...",from channels to commissioning - a practical...,{'title': ['Practical guide to dynamic pelvic ...,['Practical guide to dynamic pelvic floor MRI'],10.1002/jmri.25998,Practical guide to dynamic pelvic floor MRI,False,False,...,31.0,58.0,0,False,False,10.1016/j.yebeh.2018.03.016,Bitemporal seizure spread and its effect on au...,,,


In [77]:
# Non-null counts per column after the duplicate row was dropped.
df_DOIs.count()

ID                       2377
DOI                      1442
complete                 2377
csv_post_title           2377
crossref_API_out         2377
title                    2377
DOI_API                  2377
title_API                2081
title_match              2377
DOI_match                2377
CrossRefTitle_low        2081
DOI_fuzzyRatio           1442
Title_fuzzyRatio         2081
Title_partialRatio       2081
Complete_fuzzyRatio      2081
Complete_partialRatio    2081
TitleMatch01             2377
good_list                2377
good_list2               2377
DOI_retry_API             562
title_retry_API           562
curated_DOI               165
curatedDOI                165
FinalDOI                 2366
dtype: int64

In [50]:
# List the column names of the combined frame.
df_DOIs.columns

Index(['ID', 'DOI', 'complete', 'csv_post_title', 'crossref_API_out', 'title',
       'DOI_API', 'title_API', 'title_match', 'DOI_match', 'CrossRefTitle_low',
       'DOI_fuzzyRatio', 'Title_fuzzyRatio', 'Title_partialRatio',
       'Complete_fuzzyRatio', 'Complete_partialRatio', 'TitleMatch01',
       'good_list', 'good_list2', 'DOI_retry_API', 'title_retry_API',
       'curated_DOI', 'curatedDOI', 'FinalDOI'],
      dtype='object')

In [51]:
# Copy the rows that share a non-null FinalDOI to the system clipboard.
# The duplicate mask is computed on the filtered frame so it aligns with it
# (avoids pandas' "Boolean Series key will be reindexed" warning).
# NOTE(review): to_clipboard needs a clipboard backend, so this cell may fail
# on a headless kernel -- confirm it is still wanted in the final notebook.
clip_rows = df_DOIs.dropna(subset=['FinalDOI'])
clip_rows[clip_rows['FinalDOI'].duplicated(keep=False)].to_clipboard()

  """Entry point for launching an IPython kernel.


In [78]:
# Keep only the three output columns and drop any row with a missing value in
# one of them, write the result to CSV, then display it ordered by ID.
output_columns = ['ID', 'complete', 'FinalDOI']
DOIs_out = df_DOIs.reindex(columns=output_columns).dropna()
DOIs_out.to_csv('./A5out_2366_correctDOIs_for_metrics(14Oct19).csv')
DOIs_out.sort_values(by='ID')

Unnamed: 0,ID,complete,FinalDOI
342,1,"chappell ma, woolrich mw, kazan s, jezzard p, ...",10.1002/mrm.24260
1893,2,"serres s, soto ms, hamilton a, mcateer ma, car...",10.1073/pnas.1117412109
1206,3,"lindsay ac, biasiolli l, lee, jm, kylintireas ...",10.1016/j.jcmg.2011.10.007
1105,4,"kolsch h, lehmann dj, ibrahim-verbaas ca, comb...",10.1007/s00702-011-0732-4
1317,5,"markwick, a., zamboni, g., & de jager, c. a. (...",10.1080/13803395.2012.672966
...,...,...,...
634,2416,"faull ok, pattinson kt. the cortical connectiv...",10.7554/elife.21749
877,2417,"hellner k, dorrell l. recent advances in under...",10.12688/f1000research.9701.1
976,2419,"iro ma, snape md, voysey m, jawad s, finn a, h...",10.1016/j.vaccine.2016.11.009
541,2420,"duane f, aznar mc, bartlett f, cutter dj, darb...",10.1016/j.radonc.2017.01.008


In [79]:
# Non-null counts of the exported frame (rows with any missing value were dropped above).
DOIs_out.count()

ID          2366
complete    2366
FinalDOI    2366
dtype: int64

In [80]:
# Largest per-column non-null count in df_DOIs; used below as the total number of references.
df_DOIs.count().max()

2377

In [81]:
# Report the percentage of spreadsheet references for which a DOI was found.
n_with_doi = DOIs_out.FinalDOI.count()   # references that ended up with a FinalDOI
n_references = len(df_DOIs)              # all references (row count of the combined frame)
print('Digital Object Identifiers (DOIs) found for % of references in Spreadsheet:')
print(round(n_with_doi / n_references * 100, 2), '%')


Digital Object Identifiers (DOIs) forund for % of refernces in Spreadsheet:
99.54 %
