### Bring together the list of CrossRef-acquired DOIs and the last few manually identified DOIs

In [1]:
import pandas as pd
import numpy as np

## Bring in the main list of all 2377 publications claimed in BRC phase 2 (the file loads below as 2378 rows)
The file is the export from notebook A4 ('./Source_files/A4out_CrossRef_FuzzyMatched_with_retry_Oct19.csv')

In [2]:
# Load the CrossRef fuzzy-matching results exported by notebook A4.
# The first CSV column is used as the row index.
main_csv_path = './Source_files/A4out_CrossRef_FuzzyMatched_with_retry_Oct19.csv'
df_main = pd.read_csv(main_csv_path, index_col=0)
df_main.shape

(2378, 21)

In [3]:
# Preview the first two rows of the main list
df_main.iloc[:2]

Unnamed: 0,ID,DOI,complete,csv_post_title,crossref_API_out,title,DOI_API,title_API,title_match,DOI_match,...,DOI_fuzzyRatio,Title_fuzzyRatio,Title_partialRatio,Complete_fuzzyRatio,Complete_partialRatio,TitleMatch01,good_list,good_list2,DOI_retry_API,title_retry_API
0,1125,10.1186/s12881-014-0095-4,"&amp; , fenwick al, goos jac, rankin j, lord h...",apparently synonymous substitutions in fgfr2 a...,{'title': ['Apparently synonymous substitution...,['Apparently synonymous substitutions in FGFR2...,10.1186/s12881-014-0095-4,Apparently synonymous substitutions in FGFR2af...,False,True,...,100.0,99.0,99.0,51.0,99.0,0,True,True,,
1,1996,10.1183/13993003.00321-2016,", pattinson kt, turner mr. a wider pathologica...",a wider pathological network underlying breath...,{'title': ['A wider pathological network under...,['A wider pathological network underlying brea...,10.1183/13993003.00321-2016,A wider pathological network underlying breath...,True,True,...,100.0,100.0,100.0,73.0,100.0,1,True,True,,


## Then bring in the shorter list of Curated (manually checked) DOIs

In [4]:
# Load the manually curated DOIs. Only CSV columns 0, 1 and 5 are read;
# column 0 becomes the row index, leaving 'ID' and 'curated_DOI' as data columns.
curated_csv_path = './Source_files/A5in_curated_API_list(14Oct19).csv'
df_curated = pd.read_csv(curated_csv_path, usecols=[0, 1, 5], index_col=0)
df_curated.head()

Unnamed: 0_level_0,ID,curated_DOI
Column1,Unnamed: 1_level_1,Unnamed: 2_level_1
105.0,1467.0,10.3109/10428194.2015.1122783
33.0,1639.0,10.1080/23279095.2014.1003067
37.0,1313.0,10.1093/brain/awu328
38.0,1448.0,10.1523/jneurosci.3075-14.2015
39.0,1270.0,10.1007/s11892-014-0559-0


In [5]:
# Number of non-missing values in each column of the curated list
df_curated.notna().sum()

ID             176
curated_DOI    165
dtype: int64

### Then add the 'curated_DOI' column to the main list (left merge of df_curated onto df_main using the 'ID' column)

In [6]:
# Left join: every row of the main list is kept, and curated_DOI is filled
# in only where df_curated has a row with the same ID.
df_DOIs = pd.merge(df_main, df_curated, how='left', on='ID')
df_DOIs.shape

(2378, 22)

In [7]:
# Number of non-missing values per column after the merge
df_DOIs.notna().sum()

ID                       2378
DOI                      1442
complete                 2378
csv_post_title           2378
crossref_API_out         2378
title                    2378
DOI_API                  2378
title_API                2082
title_match              2378
DOI_match                2378
CrossRefTitle_low        2082
DOI_fuzzyRatio           1442
Title_fuzzyRatio         2082
Title_partialRatio       2082
Complete_fuzzyRatio      2082
Complete_partialRatio    2082
TitleMatch01             2378
good_list                2378
good_list2               2378
DOI_retry_API             562
title_retry_API           562
curated_DOI               165
dtype: int64

### Now define the DOI to use for each reference:
    - If there was a good match on title or DOI originally, use this DOI
    - Where this was not the case, use exact matches from the search of CrossRef using the 'complete' record
    - If a 'curated_DOI' is available, use this preferentially
    - Strip any leading/trailing whitespace or full stops from the DOIs

In [8]:
# Start with an all-missing FinalDOI column; it is filled in by the rules that follow.
df_DOIs = df_DOIs.assign(FinalDOI=np.nan)

In [9]:
# Fill FinalDOI in increasing order of priority: each later rule overwrites
# the result of the earlier ones for the rows it applies to.
#
# Careful: Series.mask replaces values where the condition is True
# (Series.where is the reverse: it replaces where the condition is False).
#
# The result of each mask is assigned back to the column. Calling
# df_DOIs['FinalDOI'].mask(..., inplace=True) instead would modify a temporary
# Series, which leaves df_DOIs unchanged when pandas Copy-on-Write is active.

# 1. The original CrossRef lookup was judged a good match -> use its DOI.
first_lookup_ok = df_DOIs.good_list == True
df_DOIs['FinalDOI'] = df_DOIs['FinalDOI'].mask(first_lookup_ok, df_DOIs.DOI_API)

# 2. The original lookup was not good but the retry lookup was -> use the retry DOI.
retry_lookup_ok = (df_DOIs.good_list == False) & (df_DOIs.good_list2 == True)
df_DOIs['FinalDOI'] = df_DOIs['FinalDOI'].mask(retry_lookup_ok, df_DOIs.DOI_retry_API)

# 3. A manually curated DOI, when present, takes precedence over both lookups.
has_curated_doi = df_DOIs.curated_DOI.notna()
df_DOIs['FinalDOI'] = df_DOIs['FinalDOI'].mask(has_curated_doi, df_DOIs.curated_DOI)

# Remove leading/trailing spaces and full stops (characters inside the DOI are untouched).
df_DOIs['FinalDOI'] = df_DOIs['FinalDOI'].str.strip(' .')


In [10]:
# Non-missing values per column, now including FinalDOI
df_DOIs.notna().sum()

ID                       2378
DOI                      1442
complete                 2378
csv_post_title           2378
crossref_API_out         2378
title                    2378
DOI_API                  2378
title_API                2082
title_match              2378
DOI_match                2378
CrossRefTitle_low        2082
DOI_fuzzyRatio           1442
Title_fuzzyRatio         2082
Title_partialRatio       2082
Complete_fuzzyRatio      2082
Complete_partialRatio    2082
TitleMatch01             2378
good_list                2378
good_list2               2378
DOI_retry_API             562
title_retry_API           562
curated_DOI               165
FinalDOI                 2367
dtype: int64

In [11]:
# Load the table of re-checked DOIs: each row pairs a previously assigned
# DOI ('DOI') with its replacement ('Recurated_DOI').
rechecked_csv_path = './Source_files/A5in_Rechecked_DOIs_after_author_numbers(Nov19).csv'
df_future_self = pd.read_csv(rechecked_csv_path)
df_future_self

Unnamed: 0,DOI,Recurated_DOI
0,10.1056/nejmx120009,10.1056/nejmx120009
1,10.1016/s0140-6736(12)60768-5,10.1016/s0140-6736(12)60768-5
2,10.1097/md.0b013e3182963750,10.1097/MD.0b013e31828a01f9
3,10.1056/nejmc1303486,10.1056/NEJMoa1110039
4,10.1136/bmj.f4799,10.1136/bmj.f4303
5,10.1038/sj.bdj.2013.807,10.1371/journal.pmed.1001474
6,10.1002/ana.24105,10.1002/ana.23838
7,10.1056/nejmc1315848,10.1056/NEJMoa1307557
8,10.1038/sj.bdj.2015.166,10.1016/S0140-6736(14)62007-9
9,10.1056/nejmc1505499,10.1056/NEJMoa1411627


In [12]:
# Attach the re-checked DOIs by matching FinalDOI against the 'DOI' column of
# df_future_self. Both frames have a 'DOI' column, so after the merge the
# original one appears as DOI_x and the re-check lookup key as DOI_y.
df_future = df_DOIs.merge(df_future_self, left_on='FinalDOI', right_on='DOI', how='left')

# Where a re-curated DOI exists it replaces FinalDOI. The result is assigned
# back to the column: an in-place mask on df_future['FinalDOI'] would act on a
# temporary Series and not update df_future under pandas Copy-on-Write.
has_recurated_doi = df_future.Recurated_DOI.notna()
df_future['FinalDOI'] = df_future['FinalDOI'].mask(has_recurated_doi, df_future.Recurated_DOI)

df_future

Unnamed: 0,ID,DOI_x,complete,csv_post_title,crossref_API_out,title,DOI_API,title_API,title_match,DOI_match,...,Complete_partialRatio,TitleMatch01,good_list,good_list2,DOI_retry_API,title_retry_API,curated_DOI,FinalDOI,DOI_y,Recurated_DOI
0,1125,10.1186/s12881-014-0095-4,"&amp; , fenwick al, goos jac, rankin j, lord h...",apparently synonymous substitutions in fgfr2 a...,{'title': ['Apparently synonymous substitution...,['Apparently synonymous substitutions in FGFR2...,10.1186/s12881-014-0095-4,Apparently synonymous substitutions in FGFR2af...,False,True,...,99.0,0,True,True,,,,10.1186/s12881-014-0095-4,,
1,1996,10.1183/13993003.00321-2016,", pattinson kt, turner mr. a wider pathologica...",a wider pathological network underlying breath...,{'title': ['A wider pathological network under...,['A wider pathological network underlying brea...,10.1183/13993003.00321-2016,A wider pathological network underlying breath...,True,True,...,100.0,1,True,True,,,,10.1183/13993003.00321-2016,,
2,506,,"adib-samii p, rost n, traylor m, devan w, ...",17q25 locus is associated with white matter hy...,{'title': ['17q25 Locus Is Associated With Whi...,['17q25 Locus Is Associated With White Matter ...,10.1161/strokeaha.113.679936,17q25 Locus Is Associated With White Matter Hy...,True,False,...,100.0,1,True,True,,,,10.1161/strokeaha.113.679936,,
3,1430,10.1093/annonc/mdu449,"and i. tomlinson*, findlay jm, middleton mr, t...",a systematic review and meta-analysis of somat...,{'title': ['A systematic review and meta-analy...,['A systematic review and meta-analysis of som...,10.1093/annonc/mdu449,A systematic review and meta-analysis of somat...,True,True,...,100.0,1,True,True,,,,10.1093/annonc/mdu449,,
4,848,,"dichgans m, malik r, kã•_nig ir, rosand j, ...",shared genetic susceptibility to ischemic stro...,{'title': ['Shared Genetic Susceptibility to I...,['Shared Genetic Susceptibility to Ischemic St...,10.1161/strokeaha.113.002707,Shared Genetic Susceptibility to Ischemic Stro...,False,False,...,100.0,0,True,False,,,,10.1161/strokeaha.113.002707,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373,1838,10.1007/s00109-015-1377-9,"zois ce, harris al. glycogen metabolism has a ...",glycogen metabolism has a key role in the canc...,{'title': ['Glycogen metabolism has a key role...,['Glycogen metabolism has a key role in the ca...,10.1007/s00109-015-1377-9,Glycogen metabolism has a key role in the canc...,True,True,...,100.0,1,True,True,,,,10.1007/s00109-015-1377-9,,
2374,1103,10.1093/brain/awu143,"zokaei n, mcneill a, proukakis c, beavan m, ja...",visual short-term memory deficits associated w...,{'title': ['Visual short-term memory deficits ...,['Visual short-term memory deficits associated...,10.1093/brain/awu143,Visual short-term memory deficits associated w...,False,True,...,98.0,0,True,True,,,,10.1093/brain/awu143,,
2375,1065,10.1212/NXI.0000000000000016,"zuliani l, ferlazzo e, andrigo c, casano a, ci...","glycine receptor antibodies in 2 cases of new,...",{'title': ['Glycine receptor antibodies in 2 c...,['Glycine receptor antibodies in 2 cases of ne...,10.1212/nxi.0000000000000016,"Glycine receptor antibodies in 2 cases of new,...",False,True,...,100.0,0,True,True,,,,10.1212/nxi.0000000000000016,,
2376,69,,"zuliani l, graus f, giometto b, bien c, vincen...",central nervous system neuronal surface antibo...,{'title': ['Central nervous system neuronal su...,['Central nervous system neuronal surface anti...,10.1136/jnnp-2011-301237,Central nervous system neuronal surface antibo...,True,False,...,100.0,1,True,True,,,,10.1136/jnnp-2011-301237,,


In [13]:
# Show every row whose (non-missing) FinalDOI is shared with another row.
# The duplicate mask is built on the already-filtered frame so that it has the
# same index as the frame it selects from; building it on df_future and
# applying it to the filtered frame triggers pandas'
# "Boolean Series key will be reindexed" warning.
rows_with_doi = df_future.dropna(subset=['FinalDOI'])
rows_with_doi[rows_with_doi.duplicated(subset=['FinalDOI'], keep=False)]

  """Entry point for launching an IPython kernel.


Unnamed: 0,ID,DOI_x,complete,csv_post_title,crossref_API_out,title,DOI_API,title_API,title_match,DOI_match,...,Complete_partialRatio,TitleMatch01,good_list,good_list2,DOI_retry_API,title_retry_API,curated_DOI,FinalDOI,DOI_y,Recurated_DOI
607,2063,10.1111/bjh.14177,"eyre ta, clifford r, bloor a, boyle l, roberts...",ncri phase ii study of chop in combination wit...,{'title': ['NCRI phase II study of CHOP in com...,['NCRI phase II study of CHOP in combination w...,10.1111/bjh.14177,NCRI phase II study of CHOP in combination wit...,True,True,...,100.0,1,True,True,,,,10.1111/bjh.14177,,
608,1359,10.1186/s12885-015-1048-9,"eyre ta, clifford r, corran r, boyle l, franci...",single arm ncri phase ii study of chop in comb...,{'title': ['NCRI phase II study of CHOP in com...,['NCRI phase II study of CHOP in combination w...,10.1111/bjh.14177,NCRI phase II study of CHOP in combination wit...,False,False,...,88.0,0,True,False,,,,10.1111/bjh.14177,,
725,2022,10.1111/liv.13163,"gathercole ll, hazlehurst jm, armstrong mj, cr...",advanced non-alcoholic fatty liver disease and...,{'title': ['Non-alcoholic fatty liver disease'...,['Non-alcoholic fatty liver disease'],10.1111/j.1753-0407.2012.00204.x,Non-alcoholic fatty liver disease,False,False,...,100.0,0,True,False,,,,10.1111/j.1753-0407.2012.00204.x,,
872,1785,10.1016/j.metabol.2016.01.001,"hazlehurst jm, woods c, marjot t, cobbold jf, ...",non-alcoholic fatty liver disease and diabetes,{'title': ['Non-alcoholic fatty liver disease'...,['Non-alcoholic fatty liver disease'],10.1111/j.1753-0407.2012.00204.x,Non-alcoholic fatty liver disease,False,False,...,100.0,0,True,False,,,,10.1111/j.1753-0407.2012.00204.x,,
1609,2287,10.1111/liv.13284,"pavlides m, banerjee r, tunnicliffe em, kelly ...",multi-parametric magnetic resonance imaging fo...,{'title': ['Non-alcoholic fatty liver disease'...,['Non-alcoholic fatty liver disease'],10.1111/j.1753-0407.2012.00204.x,Non-alcoholic fatty liver disease,False,False,...,100.0,0,True,False,,,,10.1111/j.1753-0407.2012.00204.x,,
1726,1119,10.1128/CVI.00099-14,"ramasamy mn, clutterbuck ea, haworth k, barel ...",the immunogenicity of quadrivalent meningococc...,{'title': ['Immunogenicity of meningococcal po...,['Immunogenicity of meningococcal polysacchari...,10.1111/cei.13202,Immunogenicity of meningococcal polysaccharide...,False,False,...,64.0,0,False,True,10.1128/cvi.00099-14,Randomized Clinical Trial To Evaluate the Immu...,,10.1128/cvi.00099-14,,
1727,1102,,"ramasamy mn, clutterbuck ea, haworth k, bowman...",randomized clinical trial to evaluate the immu...,{'title': ['Randomized Clinical Trial To Evalu...,['Randomized Clinical Trial To Evaluate the Im...,10.1128/cvi.00099-14,Randomized Clinical Trial To Evaluate the Immu...,True,False,...,100.0,1,True,True,,,,10.1128/cvi.00099-14,,


In [14]:
# Following manual checking of the final references, rows whose FinalDOI is
# shared with another row are reset to the DOI originally supplied (DOI_x).
#
# Missing FinalDOIs are excluded explicitly: DataFrame.duplicated treats NaN
# values as equal to each other, so without the notna() guard every row that
# has no FinalDOI would also be overwritten with its unverified DOI_x.
# The result is assigned back to the column rather than masked in place, which
# would not update df_future under pandas Copy-on-Write.
shares_final_doi = (
    df_future['FinalDOI'].notna()
    & df_future.duplicated(subset=['FinalDOI'], keep=False)
)
df_future['FinalDOI'] = df_future['FinalDOI'].mask(shares_final_doi, df_future.DOI_x)

In [20]:
# Rows with index labels 1726 and 1727 (reference IDs 1119 and 1102) are the
# same publication after manual curation; keep 1726 and drop 1727.
# Inspect the pair with: df_future.loc[1726:1727]
#
# NOTE: 1727 is a row index label, not a reference ID, so it depends on the
# row order of the input file.
# errors='ignore' plus reassignment makes the cell safe to re-run: a second
# in-place drop of the same label would raise KeyError.
df_future = df_future.drop(index=[1727], errors='ignore')

In [22]:
# Reference ID 1326 is the preliminary version of reference ID 1812; after
# manual curation they are duplicates (both resolve to 10.1056/nejmoa1411627),
# so the row for ID 1326 is removed.
# df_future.query('ID==1326') shows that this reference sits at index label 1730.
#
# errors='ignore' plus reassignment makes the cell safe to re-run: a second
# in-place drop of the same label would raise KeyError.
df_future = df_future.drop(index=[1730], errors='ignore')

In [26]:
# Confirm that reference ID 1326 is gone (the result should be an empty frame)
df_future[df_future['ID'] == 1326]

Unnamed: 0,ID,DOI_x,complete,csv_post_title,crossref_API_out,title,DOI_API,title_API,title_match,DOI_match,...,Complete_partialRatio,TitleMatch01,good_list,good_list2,DOI_retry_API,title_retry_API,curated_DOI,FinalDOI,DOI_y,Recurated_DOI


In [27]:
# Re-check for shared FinalDOIs after the manual fixes (expected: no rows).
# The duplicate mask is built on the filtered frame so its index matches the
# frame being selected from; a mask built on df_future triggers pandas'
# "Boolean Series key will be reindexed" warning.
rows_with_doi = df_future.dropna(subset=['FinalDOI'])
rows_with_doi[rows_with_doi.duplicated(subset=['FinalDOI'], keep=False)]

  """Entry point for launching an IPython kernel.


Unnamed: 0,ID,DOI_x,complete,csv_post_title,crossref_API_out,title,DOI_API,title_API,title_match,DOI_match,...,Complete_partialRatio,TitleMatch01,good_list,good_list2,DOI_retry_API,title_retry_API,curated_DOI,FinalDOI,DOI_y,Recurated_DOI


In [28]:
# References for which no good DOI could be found (FinalDOI is still missing)
lacks_final_doi = df_future['FinalDOI'].isna()
df_future.loc[lacks_final_doi]

Unnamed: 0,ID,DOI_x,complete,csv_post_title,crossref_API_out,title,DOI_API,title_API,title_match,DOI_match,...,Complete_partialRatio,TitleMatch01,good_list,good_list2,DOI_retry_API,title_retry_API,curated_DOI,FinalDOI,DOI_y,Recurated_DOI
396,966,,"coffey s, prendergast b. medical therapies for...",medical therapies for treatment of valvular he...,{'title': ['Proceedings: Moving Toward Cell-Ba...,['Proceedings: Moving Toward Cell-Based Therap...,10.5966/sctm.2015-0118,Proceedings: Moving Toward Cell-Based Therapie...,False,False,...,54.0,0,False,False,10.1136/heartjnl-2016-310482,Serum biomarkers in valvular heart disease,,,,
809,1252,,"gutowska-owsiak d, salimi m, selvakumar ta, wa...",histamine exerts multiple effects on expressio...,{'title': ['Topical hesperidin improves epider...,['Topical hesperidin improves epidermal permea...,10.1111/j.1600-0625.2012.01455.x,Topical hesperidin improves epidermal permeabi...,False,False,...,42.0,0,False,False,10.1111/j.1600-0625.2011.01412.x,IL-17 downregulates filaggrin and affects kera...,,,,
978,1408,,"isbarn h, briganti b, de visschere pj, fã•_tte...",systematic ultrasound-guided saturation and te...,{'title': ['Transperineal template-guided mapp...,['Transperineal template-guided mapping biopsy...,10.1111/iju.12660,Transperineal template-guided mapping biopsy o...,False,False,...,65.0,0,False,False,10.1016/j.urolonc.2014.08.007,Prognostic effect of neuroendocrine differenti...,,,,
1059,234,,"kavvoura fk, owen kr. maturity onset diabetes ...",maturity onset diabetes of the young: clinical...,{'title': ['Characteristics of maturity onset ...,['Characteristics of maturity onset diabetes o...,10.1111/pedi.12289,Characteristics of maturity onset diabetes of ...,False,False,...,65.0,0,False,False,10.2217/dmt.12.82,Biomarkers currently used for the diagnosis of...,,,,
1133,861,,"lang s, ford kj, john t, pollard aj, mccarthy ...",immunisation errors reported to a vaccine advi...,{'title': ['Energy advice service as perceived...,['Energy advice service as perceived by Swedis...,10.1111/j.1470-6431.2010.00924.x,Energy advice service as perceived by Swedish ...,False,False,...,50.0,0,False,False,10.1093/pubmed/fdv112,A quantitative review of healthcare profession...,,,,
1682,1033,,pollard aj. meningococcal disease prevention i...,meningococcal disease prevention in india,"{'title': ['Neonatal meningococcal disease'], ...",['Neonatal meningococcal disease'],10.1111/jpc.13856,Neonatal meningococcal disease,False,False,...,80.0,0,False,False,10.4161/hv.7.8.16270,Meningococcal disease: The advances and challe...,,,,
1697,34,,"prendergast b, coope lt, crijns h, falkenstein...",the german centre for cardiovascular research,{'title': ['Twenty years of research in the Hu...,['Twenty years of research in the Human Nutrit...,10.1111/nbu.12102,Twenty years of research in the Human Nutritio...,False,False,...,43.0,0,False,False,10.1016/j.bbamcr.2015.11.010,Heart regeneration,,,,
1733,1056,,"rand l, sheehan m. healthcare resource allocat...",healthcare resource allocation: balancing prin...,,,,,False,False,...,,0,False,False,10.1177/1471301215615780,"Charles Foster, Jonathan Herring and Israel Do...",,,,
1751,1006,,"richens jl, vere ka, light ra, soria d, gariba...",practical detection of a definitive biomarker ...,,,,,False,False,...,,0,False,False,,,,,,
1890,1611,,"sen a, selway r and nashe l. from channels to ...",from channels to commissioning - a practical...,{'title': ['Practical guide to dynamic pelvic ...,['Practical guide to dynamic pelvic floor MRI'],10.1002/jmri.25998,Practical guide to dynamic pelvic floor MRI,False,False,...,58.0,0,False,False,10.1016/j.yebeh.2018.03.016,Bitemporal seizure spread and its effect on au...,,,,


In [29]:
# Non-missing values per column after the duplicate rows were removed
df_future.notna().sum()

ID                       2376
DOI_x                    1442
complete                 2376
csv_post_title           2376
crossref_API_out         2376
title                    2376
DOI_API                  2376
title_API                2081
title_match              2376
DOI_match                2376
CrossRefTitle_low        2081
DOI_fuzzyRatio           1442
Title_fuzzyRatio         2081
Title_partialRatio       2081
Complete_fuzzyRatio      2081
Complete_partialRatio    2081
TitleMatch01             2376
good_list                2376
good_list2               2376
DOI_retry_API             561
title_retry_API           561
curated_DOI               164
FinalDOI                 2365
DOI_y                      14
Recurated_DOI              14
dtype: int64

In [30]:
# List the column labels of df_future (DOI_x / DOI_y are the suffixed 'DOI'
# columns produced by the merge with df_future_self)
df_future.columns

Index(['ID', 'DOI_x', 'complete', 'csv_post_title', 'crossref_API_out',
       'title', 'DOI_API', 'title_API', 'title_match', 'DOI_match',
       'CrossRefTitle_low', 'DOI_fuzzyRatio', 'Title_fuzzyRatio',
       'Title_partialRatio', 'Complete_fuzzyRatio', 'Complete_partialRatio',
       'TitleMatch01', 'good_list', 'good_list2', 'DOI_retry_API',
       'title_retry_API', 'curated_DOI', 'FinalDOI', 'DOI_y', 'Recurated_DOI'],
      dtype='object')

In [31]:
# Final check that no FinalDOI is shared between rows (expected: no rows).
# The duplicate mask is built on the filtered frame so its index matches the
# frame being selected from; a mask built on df_future triggers pandas'
# "Boolean Series key will be reindexed" warning.
rows_with_doi = df_future.dropna(subset=['FinalDOI'])
rows_with_doi[rows_with_doi['FinalDOI'].duplicated(keep=False)]

  """Entry point for launching an IPython kernel.


Unnamed: 0,ID,DOI_x,complete,csv_post_title,crossref_API_out,title,DOI_API,title_API,title_match,DOI_match,...,Complete_partialRatio,TitleMatch01,good_list,good_list2,DOI_retry_API,title_retry_API,curated_DOI,FinalDOI,DOI_y,Recurated_DOI


In [32]:
# Keep only the columns needed downstream and discard any row that has a
# missing value in one of them (i.e. references without a FinalDOI).
output_columns = ['ID', 'complete', 'FinalDOI']
DOIs_out = df_future.reindex(columns=output_columns).dropna(axis=0, how='any')

# Display ordered by reference ID (DOIs_out itself is left unsorted)
DOIs_out.sort_values(by='ID')

Unnamed: 0,ID,complete,FinalDOI
342,1,"chappell ma, woolrich mw, kazan s, jezzard p, ...",10.1002/mrm.24260
1893,2,"serres s, soto ms, hamilton a, mcateer ma, car...",10.1073/pnas.1117412109
1206,3,"lindsay ac, biasiolli l, lee, jm, kylintireas ...",10.1016/j.jcmg.2011.10.007
1105,4,"kolsch h, lehmann dj, ibrahim-verbaas ca, comb...",10.1007/s00702-011-0732-4
1317,5,"markwick, a., zamboni, g., & de jager, c. a. (...",10.1080/13803395.2012.672966
...,...,...,...
634,2416,"faull ok, pattinson kt. the cortical connectiv...",10.7554/elife.21749
877,2417,"hellner k, dorrell l. recent advances in under...",10.12688/f1000research.9701.1
976,2419,"iro ma, snape md, voysey m, jawad s, finn a, h...",10.1016/j.vaccine.2016.11.009
541,2420,"duane f, aznar mc, bartlett f, cutter dj, darb...",10.1016/j.radonc.2017.01.008


In [33]:

# Non-missing values per column of the output table
DOIs_out.notna().sum()

ID          2365
complete    2365
FinalDOI    2365
dtype: int64

In [34]:
# Write the ID / complete / FinalDOI table to disk (row index included)
output_csv_path = './A5out.csv'
DOIs_out.to_csv(output_csv_path)

In [35]:
# Largest per-column count of non-missing values in df_future
df_future.notna().sum().max()

2376

In [36]:
# Report the share of references for which a final DOI was found.
# The denominator is the number of rows in df_DOIs, i.e. the full reference
# list before the two duplicate rows were dropped from df_future.
# len() is used for the row count: count().max() only equals the number of
# rows when at least one column has no missing values.
# NOTE(review): confirm the denominator should include the two dropped duplicates.
n_references = len(df_DOIs)
n_with_doi = DOIs_out.FinalDOI.count()

print('Percentage of references in Spreadsheet for which Digital Object Identifiers (DOIs) were found:')
print(((n_with_doi / n_references) * 100).round(2), '%')


Digital Object Identifiers (DOIs) found for % of refernces in Spreadsheet:
99.45 %
