In [14]:
%load_ext autoreload
%autoreload 2

from artfinder import CrossrefArticle, Crossref, load_csv, strict_filter
import pandas as pd
import datetime
import re
import logging
from json import dumps

logging.basicConfig(level=logging.INFO)
crosref = Crossref(app='artfinder', email='aapopov1@mephi.ru')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
df = load_csv('database/processed/kabashin_full.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   publisher               251 non-null    string        
 1   license                 149 non-null    object        
 2   is_referenced_by_count  251 non-null    int64         
 3   link                    203 non-null    object        
 4   authors                 251 non-null    object        
 5   abstract                78 non-null     string        
 6   title                   251 non-null    string        
 7   doi                     251 non-null    string        
 8   type                    251 non-null    string        
 9   journal                 247 non-null    string        
 10  issn                    210 non-null    string        
 11  volume                  206 non-null    string        
 12  issue                   169 non-null    string    

In [3]:
all_refs = []
for ret_list in df['references']:
    if ret_list is not None:
        all_refs.extend(ret_list)

print(f"Total references: {len(all_refs)}")
all_refs = list(set(all_refs))
print(f"Unique references: {len(all_refs)}")
all_refs[:10]

Total references: 7595
Unique references: 4181


['10.1021/acsanm.3c04567',
 '10.1002/bbpc.19840881010',
 '10.1088/0963-9659/5/5/005',
 '10.1007/s11095-015-1718-y',
 '10.1103/PhysRevLett.113.247401',
 '10.2351/1.5061377',
 '10.1007/s10973-018-7099-9',
 '10.1063/1.1886896',
 '10.1088/1361-6560/ac80e6',
 '10.1021/acsnano.7b00476']

In [6]:
results, failed = crosref.get_refs(df, concurrent_lim=30)

INFO:artfinder.api:Found 4181 unique references.


293/4181: https://api.crossref.org/works/10.1039/C2CP42895D34031-17.10.048:AID-SIMO261>3.0.CO;2-P

ERROR:artfinder.api:Error fetching https://api.crossref.org/works/10.1016/j.biomaterals.2017.10.048: 404


450/4181: https://api.crossref.org/works/10.1118/1.4934370K-02819-7.01965149

ERROR:artfinder.api:Error fetching https://api.crossref.org/works/10.1038/s41467-018-02819-7.: 404


1008/4181: https://api.crossref.org/works/10.1364/OL.39.0051921852-79805)04:03&lt;92::AID-CVDE92&gt;3.0.CO;2-C

ERROR:artfinder.api:Error fetching https://api.crossref.org/works/10.1002/(SICI)1521-3862(199805)04:03&lt;92::AID-CVDE92&gt;3.0.CO;2-C: 404


1684/4181: https://api.crossref.org/works/10.1021/acsnano.5b000424406862.005117097

ERROR:artfinder.api:Error fetching https://api.crossref.org/works/10.2217/nnm.16: 404


2155/4181: https://api.crossref.org/works/10.1016/j.bcp.2014.02.02501202.00107.082

ERROR:artfinder.api:Error fetching https://api.crossref.org/works/10.17185/duepublico/70584: 404


3203/4181: https://api.crossref.org/works/10.1039/c3nr02657d10.1c021993001-84<263::AID-MOP19>3.0.CO;2-82-9-C

ERROR:artfinder.api:Error fetching https://api.crossref.org/works/10.1016/j.apsusc.2004.02..013: 404


4181/4181: https://api.crossref.org/works/10.1007/s00339-003-2489-605906.01252:AID-JRS653>3.0.CO;2-EO;2-U2-2

In [7]:
results.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4175 entries, 0 to 0
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   publisher               4175 non-null   string        
 1   license                 2705 non-null   object        
 2   is_referenced_by_count  4175 non-null   object        
 3   link                    4056 non-null   object        
 4   authors                 4175 non-null   object        
 5   abstract                1108 non-null   string        
 6   title                   4152 non-null   string        
 7   doi                     4175 non-null   string        
 8   type                    4175 non-null   string        
 9   journal                 4131 non-null   string        
 10  issn                    4117 non-null   string        
 11  volume                  4081 non-null   string        
 12  issue                   3683 non-null   string        
 

In [9]:
articles = results[results['type'].isin(['journal-article', 'proceedings-article'])]

In [15]:
articles = articles[articles['publication_date'] > '1993']

In [18]:
articles['title'].fillna('', inplace=True)

In [20]:
kaba = load_csv('database/processed/kabashin_full.csv')
barc = load_csv('database/processed/barcikowski_full.csv')

In [21]:
kaba_barc = pd.concat([kaba, barc], ignore_index=True)

In [25]:
kaba_barc.drop_duplicates(subset=['doi'], inplace=True)

In [27]:
strict_refs = articles[articles['title'].map(lambda x: strict_filter(x))]

In [29]:
strict_refs[~strict_refs['doi'].isin(kaba_barc['doi'])].info()

<class 'pandas.core.frame.DataFrame'>
Index: 185 entries, 0 to 0
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   publisher               185 non-null    string        
 1   license                 101 non-null    object        
 2   is_referenced_by_count  185 non-null    object        
 3   link                    180 non-null    object        
 4   authors                 185 non-null    object        
 5   abstract                46 non-null     string        
 6   title                   185 non-null    string        
 7   doi                     185 non-null    string        
 8   type                    185 non-null    string        
 9   journal                 185 non-null    string        
 10  issn                    185 non-null    string        
 11  volume                  185 non-null    string        
 12  issue                   159 non-null    string        
 1