## Lookup for citations -- based on the wild examples

In [2]:
## All imports
import re
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [8]:
tqdm.pandas()

In [3]:
## load the file which contains the citation for which we need to perform the lookup
wild_exp_journal = pd.read_csv('wild_exp_info.csv')

In [4]:
wild_exp_journal.shape

(2080785, 5)

In [5]:
## A look into the file..
wild_exp_journal.head()

Unnamed: 0,index,citations,label_category,title,first_author
0,0,{{Citation | last = Buchanan | first = Patrick...,journal,"As Adelphia Goes, so Goes America?",Patrick \u2018Pat\u2019 Joseph Buchanan
1,1,{{Citation | last = Edwards | first = Adam | t...,journal,Any Old Iron?,Adam Edwards
2,2,{{Citation | last = Stuart | first = Patience ...,journal,National Register of Historic Places Registrat...,Patience Stuart
3,3,{{Citation | last = Vince | first = Alan | tit...,journal,Obituary: Keith Watson,Alan Vince
4,4,{{Citation | last = | first = | author-link = ...,journal,The ARIA Nominees,No author found


### Extract the title of the citation and the author, if they exist - which will be used for CrossRef lookup

In [6]:
def get_title(citation_text):
    """Get title of citation based on finding the placeholder keywords using regular expressions"""
    title_res = re.findall('title\s{0,10}=\s{0,10}([^|]+)', citation_text) 
    if len(title_res) == 0:
        if 'sports-reference' in citation_text:
            return re.findall('(C|c)ite\s{0,10}sports-reference\s{0,10}|([^|]+)', citation_text)[1][1].strip()
        else:
            article_res = re.findall('article\s{0,10}=\s{0,10}([^|]+)', citation_text)
            if len(article_res) != 0:
                return article_res[0].strip()
            return None
    return title_res[0].strip()

wild_exp_journal['title'] = wild_exp_journal['citations'].progress_apply(lambda x: get_title(x))

100%|██████████| 2080785/2080785 [00:10<00:00, 200349.00it/s]


In [7]:
wild_exp_journal.head()

Unnamed: 0,citations,label_category,title
0,{{Citation | last = Buchanan | first = Patrick...,journal,"As Adelphia Goes, so Goes America?"
1,{{Citation | last = Edwards | first = Adam | t...,journal,Any Old Iron?
2,{{Citation | last = Stuart | first = Patience ...,journal,National Register of Historic Places Registrat...
3,{{Citation | last = Vince | first = Alan | tit...,journal,Obituary: Keith Watson
4,{{Citation | last = | first = | author-link = ...,journal,The ARIA Nominees


In [8]:
wild_exp_journal[~wild_exp_journal['title'].notnull()].shape

(5709, 3)

In [9]:
def get_author(citation_text):    
    """Get author based on finding the placeholder keywords using regular expressions"""
    def check_first_last_res(first_res, last_res):
        if len(first_res) != 0 and len(last_res) != 0:
            return first_res[0].strip() + ' ' + last_res[0].strip()
        if len(first_res) != 0:
            return first_res[0].strip()
        if len(last_res) != 0:
            return last_res[0].strip()
    
    ## https://en.wikipedia.org/wiki/Template:Citation -  only these keywords are available
    first_res = re.findall('first\s{0,10}=\s{0,10}([^|]+)', citation_text)
    last_res = re.findall('last\s{0,10}=\s{0,10}([^|]+)', citation_text)
    
    author_res = re.findall('author\s{0,10}=\s{0,10}([^|]+)', citation_text)
    author1_res = re.findall('author1\s{0,10}=\s{0,10}([^|]+)', citation_text)
    
    first1_res = re.findall('first1\s{0,10}=\s{0,10}([^|]+)', citation_text)
    last1_res = re.findall('last1\s{0,10}=\s{0,10}([^|]+)', citation_text)
    
    if first_res or last_res:
        return check_first_last_res(first_res, last_res)
    elif author_res:
        return author_res[0].strip()
    elif author1_res:
        return author1_res[0].strip()
    elif first1_res or last1_res:
        return check_first_last_res(first1_res, last1_res)
    else:
        return None

In [10]:
## For each citation, get the first author if it exists
wild_exp_journal['first_author'] = wild_exp_journal['citations'].progress_apply(lambda x: get_author(x))

100%|██████████| 2080785/2080785 [00:29<00:00, 71345.61it/s]


In [11]:
wild_exp_journal.head()

Unnamed: 0,citations,label_category,title,first_author
0,{{Citation | last = Buchanan | first = Patrick...,journal,"As Adelphia Goes, so Goes America?",Patrick \u2018Pat\u2019 Joseph Buchanan
1,{{Citation | last = Edwards | first = Adam | t...,journal,Any Old Iron?,Adam Edwards
2,{{Citation | last = Stuart | first = Patience ...,journal,National Register of Historic Places Registrat...,Patience Stuart
3,{{Citation | last = Vince | first = Alan | tit...,journal,Obituary: Keith Watson,Alan Vince
4,{{Citation | last = | first = | author-link = ...,journal,The ARIA Nominees,


In [12]:
## Total number of citations for which their is no author
wild_exp_journal[~wild_exp_journal['first_author'].notnull()].shape

(1487541, 4)

In [13]:
## Fill NaN values in first author column as it is missing and can act as distraction for the lookup
wild_exp_journal['first_author'].fillna(value='No author found', inplace=True)

In [14]:
def check_whitespace_author(first_author):
    """Check if the first author column has whitespace and replace it"""
    if not first_author.strip():
        return 'No author found'
    return first_author
    
wild_exp_journal['first_author'] = wild_exp_journal['first_author'].progress_apply(lambda x: check_whitespace_author(x))

100%|██████████| 2080785/2080785 [00:03<00:00, 531132.56it/s]


In [15]:
wild_exp_journal.head()

Unnamed: 0,citations,label_category,title,first_author
0,{{Citation | last = Buchanan | first = Patrick...,journal,"As Adelphia Goes, so Goes America?",Patrick \u2018Pat\u2019 Joseph Buchanan
1,{{Citation | last = Edwards | first = Adam | t...,journal,Any Old Iron?,Adam Edwards
2,{{Citation | last = Stuart | first = Patience ...,journal,National Register of Historic Places Registrat...,Patience Stuart
3,{{Citation | last = Vince | first = Alan | tit...,journal,Obituary: Keith Watson,Alan Vince
4,{{Citation | last = | first = | author-link = ...,journal,The ARIA Nominees,No author found


In [16]:
## Saving the file for which the lookup needs to be performed
wild_exp_journal.to_csv('wild_exp_info.csv', index=True, index_label='index')

In [21]:
## Example as to how a citation might look like
wild_exp_journal.iloc[0]['citations']

'{{Citation | last = Buchanan | first = Patrick \\u2018Pat\\u2019 Joseph | url = | title = As Adelphia Goes, so Goes America? | date = February 14, 2005 | publisher = The American cause }}'

### Once lookup is done..

1. Assign filename for each index in the dataframe
2. For each file/metadata found, get the 3 DOIs if they exist and their corresponding scores

In [6]:
wild_exp_journal.head()

Unnamed: 0,index,citations,label_category,title,first_author
0,0,{{Citation | last = Buchanan | first = Patrick...,journal,"As Adelphia Goes, so Goes America?",Patrick \u2018Pat\u2019 Joseph Buchanan
1,1,{{Citation | last = Edwards | first = Adam | t...,journal,Any Old Iron?,Adam Edwards
2,2,{{Citation | last = Stuart | first = Patience ...,journal,National Register of Historic Places Registrat...,Patience Stuart
3,3,{{Citation | last = Vince | first = Alan | tit...,journal,Obituary: Keith Watson,Alan Vince
4,4,{{Citation | last = | first = | author-link = ...,journal,The ARIA Nominees,No author found


In [7]:
def assign_metadata(index):
    """Check if the file exists for a given index"""
    if os.path.exists('lookup/_{}.json'.format(index)):
        return '_{}.json'.format(index)
    return 'Metadata does not exist'
    
## assigning name of the file for each citation
wild_exp_journal['metadata_file'] = wild_exp_journal['index'].progress_apply(lambda x: assign_metadata(x))

100%|██████████| 2080785/2080785 [00:08<00:00, 253571.91it/s]


In [8]:
## Number of citations for which metadata did not exist in CrossRef
## This is because some of the citations neither author and title did not exist
## or CrossRef was unable to parse the request -- due to invalid characters
print('Metadata did not exist for these many citations: {} out of {}'.format(
    wild_exp_journal[wild_exp_journal['metadata_file'] == 'Metadata does not exist'].shape[0],
    wild_exp_journal.shape[0]
))

Metadata did not exist for these many citations: 3811 out of 2080785


In [9]:
wild_exp_journal.head()

Unnamed: 0,index,citations,label_category,title,first_author,metadata_file
0,0,{{Citation | last = Buchanan | first = Patrick...,journal,"As Adelphia Goes, so Goes America?",Patrick \u2018Pat\u2019 Joseph Buchanan,_0.json
1,1,{{Citation | last = Edwards | first = Adam | t...,journal,Any Old Iron?,Adam Edwards,_1.json
2,2,{{Citation | last = Stuart | first = Patience ...,journal,National Register of Historic Places Registrat...,Patience Stuart,_2.json
3,3,{{Citation | last = Vince | first = Alan | tit...,journal,Obituary: Keith Watson,Alan Vince,_3.json
4,4,{{Citation | last = | first = | author-link = ...,journal,The ARIA Nominees,No author found,_4.json


In [14]:
def get_identifier_or_score(filename, identifier=True):
    """Return the 3 identifiers (DOIs) or confidence scores for each citation"""
    column_name = 'DOI' if identifier else 'score'
    if filename == 'Metadata does not exist':
        return None
    else:
        with open('lookup/{}'.format(filename)) as f:
            content = json.loads(f.read())
            if len(content) > 0:
                if 'message' in content and content['message'] == 'No result was found in CrossRef':
                    return None
                else:
                    return np.array([content[i][column_name] for i in range(len(content))])
            return None

In [15]:
## Get the DOIs for each citation which we extracted from CrossRef
wild_exp_journal['identifier'] = wild_exp_journal['metadata_file'].progress_apply(
    lambda x: get_identifier_or_score(x))

100%|██████████| 2080785/2080785 [04:14<00:00, 8166.49it/s]


In [16]:
wild_exp_journal.head()

Unnamed: 0,index,citations,label_category,title,first_author,metadata_file,identifier
0,0,{{Citation | last = Buchanan | first = Patrick...,journal,"As Adelphia Goes, so Goes America?",Patrick \u2018Pat\u2019 Joseph Buchanan,_0.json,"[10.1038/scientificamerican1297-48, 10.1029/jd..."
1,1,{{Citation | last = Edwards | first = Adam | t...,journal,Any Old Iron?,Adam Edwards,_1.json,"[10.1364/ofc.2016.w3b.4, 10.1016/s0140-6736(83..."
2,2,{{Citation | last = Stuart | first = Patience ...,journal,National Register of Historic Places Registrat...,Patience Stuart,_2.json,"[10.1177/019263654502913320, 10.12968/bjsn.201..."
3,3,{{Citation | last = Vince | first = Alan | tit...,journal,Obituary: Keith Watson,Alan Vince,_3.json,"[10.1049/iet-tv.44.16506, 10.1161/01.cir.23.2...."
4,4,{{Citation | last = | first = | author-link = ...,journal,The ARIA Nominees,No author found,_4.json,"[10.4016/32852.01, 10.1007/978-1-84882-864-3_4..."


In [20]:
print('Identifier does not exist for these many citations: {} out of {}'.format(
    wild_exp_journal[~wild_exp_journal['identifier'].notnull()].shape[0],
    wild_exp_journal.shape[0]
))

Identifier does not exist for these many citations: 77188 out of 2080785


In [21]:
## Get the confidence scores for each citation which we extracted from CrossRef
wild_exp_journal['conf_score'] = wild_exp_journal['metadata_file'].progress_apply(
    lambda x: get_identifier_or_score(x, identifier=False))

100%|██████████| 2080785/2080785 [04:23<00:00, 7896.74it/s]


In [22]:
wild_exp_journal.head()

Unnamed: 0,index,citations,label_category,title,first_author,metadata_file,identifier,conf_score
0,0,{{Citation | last = Buchanan | first = Patrick...,journal,"As Adelphia Goes, so Goes America?",Patrick \u2018Pat\u2019 Joseph Buchanan,_0.json,"[10.1038/scientificamerican1297-48, 10.1029/jd...","[32.524315, 29.46917, 29.390476]"
1,1,{{Citation | last = Edwards | first = Adam | t...,journal,Any Old Iron?,Adam Edwards,_1.json,"[10.1364/ofc.2016.w3b.4, 10.1016/s0140-6736(83...","[17.818737, 17.74486, 17.038055]"
2,2,{{Citation | last = Stuart | first = Patience ...,journal,National Register of Historic Places Registrat...,Patience Stuart,_2.json,"[10.1177/019263654502913320, 10.12968/bjsn.201...","[22.847187, 22.367924, 22.256752]"
3,3,{{Citation | last = Vince | first = Alan | tit...,journal,Obituary: Keith Watson,Alan Vince,_3.json,"[10.1049/iet-tv.44.16506, 10.1161/01.cir.23.2....","[18.879705, 18.771431, 18.371412]"
4,4,{{Citation | last = | first = | author-link = ...,journal,The ARIA Nominees,No author found,_4.json,"[10.4016/32852.01, 10.1007/978-1-84882-864-3_4...","[21.100235, 18.031717, 18.031717]"


In [23]:
print('Confidence scores does not exist for these many citations: {} out of {}'.format(
    wild_exp_journal[~wild_exp_journal['conf_score'].notnull()].shape[0],
    wild_exp_journal.shape[0]
))

Confidence scores does not exist for these many citations: 77188 out of 2080785


In [24]:
## Save the results which we got from the metadata
wild_exp_journal.to_parquet('wild_exp.gzip', compression='gzip')

### If the file with the lookup for no confidence threshold is there.. then load it

In [3]:
wild_exp_journal = pd.read_parquet('wild_exp.gzip')

In [4]:
print('The total number of citations are: {}'.format(wild_exp_journal.shape))

The total number of citations are: (2080785, 8)


In [6]:
threshold_score = 34.997

In [9]:
def get_score_based_on_threshold(conf_score):
    if conf_score is None:
        return None
    else:
        res = np.array([i for i, j in enumerate(conf_score) if j > threshold_score])
        if len(res) == 0:
            return None
        else:
            return res
    
wild_exp_journal['updated_conf_index'] = wild_exp_journal['conf_score'].progress_apply(
    lambda x: get_score_based_on_threshold(x))

100%|██████████| 2080785/2080785 [00:09<00:00, 209875.31it/s]


In [10]:
wild_exp_journal.head()

Unnamed: 0,index,citations,label_category,title,first_author,metadata_file,identifier,conf_score,updated_conf_index
0,0,{{Citation | last = Buchanan | first = Patrick...,journal,"As Adelphia Goes, so Goes America?",Patrick \u2018Pat\u2019 Joseph Buchanan,_0.json,"[10.1038/scientificamerican1297-48, 10.1029/jd...","[32.524315, 29.46917, 29.390476]",
1,1,{{Citation | last = Edwards | first = Adam | t...,journal,Any Old Iron?,Adam Edwards,_1.json,"[10.1364/ofc.2016.w3b.4, 10.1016/s0140-6736(83...","[17.818737, 17.74486, 17.038055]",
2,2,{{Citation | last = Stuart | first = Patience ...,journal,National Register of Historic Places Registrat...,Patience Stuart,_2.json,"[10.1177/019263654502913320, 10.12968/bjsn.201...","[22.847187, 22.367924, 22.256752]",
3,3,{{Citation | last = Vince | first = Alan | tit...,journal,Obituary: Keith Watson,Alan Vince,_3.json,"[10.1049/iet-tv.44.16506, 10.1161/01.cir.23.2....","[18.879705, 18.771431, 18.371412]",
4,4,{{Citation | last = | first = | author-link = ...,journal,The ARIA Nominees,No author found,_4.json,"[10.4016/32852.01, 10.1007/978-1-84882-864-3_4...","[21.100235, 18.031717, 18.031717]",


In [11]:
print('Total number of identifiers greater than the confidence score: {}'.format(
    wild_exp_journal.shape[0] - wild_exp_journal[~wild_exp_journal['updated_conf_index'].notnull()].shape[0]))
print('which is a percentage: {}'.format(
    wild_exp_journal[wild_exp_journal['updated_conf_index'].notnull()].shape[0] / wild_exp_journal.shape[0] * 100))

Total number of identifiers greater than the confidence score: 271345
which is a percentage: 13.040511153242646


In [9]:
def get_updated_identifier(row):
    if row['updated_conf_index'] is None:
        return None
    else:
        return row['identifier'][row['updated_conf_index']]
    
wild_exp_journal['updated_identifier'] = wild_exp_journal.progress_apply(get_updated_identifier, axis=1)

100%|██████████| 2080785/2080785 [06:04<00:00, 5707.44it/s]


In [10]:
wild_exp_journal.head()

Unnamed: 0,index,citations,label_category,title,first_author,metadata_file,identifier,conf_score,updated_conf_index,updated_identifier
0,0,{{Citation | last = Buchanan | first = Patrick...,journal,"As Adelphia Goes, so Goes America?",Patrick \u2018Pat\u2019 Joseph Buchanan,_0.json,"[10.1038/scientificamerican1297-48, 10.1029/jd...","[32.524315, 29.46917, 29.390476]","[0, 1, 2]","[10.1038/scientificamerican1297-48, 10.1029/jd..."
1,1,{{Citation | last = Edwards | first = Adam | t...,journal,Any Old Iron?,Adam Edwards,_1.json,"[10.1364/ofc.2016.w3b.4, 10.1016/s0140-6736(83...","[17.818737, 17.74486, 17.038055]",,
2,2,{{Citation | last = Stuart | first = Patience ...,journal,National Register of Historic Places Registrat...,Patience Stuart,_2.json,"[10.1177/019263654502913320, 10.12968/bjsn.201...","[22.847187, 22.367924, 22.256752]","[0, 1, 2]","[10.1177/019263654502913320, 10.12968/bjsn.201..."
3,3,{{Citation | last = Vince | first = Alan | tit...,journal,Obituary: Keith Watson,Alan Vince,_3.json,"[10.1049/iet-tv.44.16506, 10.1161/01.cir.23.2....","[18.879705, 18.771431, 18.371412]","[0, 1, 2]","[10.1049/iet-tv.44.16506, 10.1161/01.cir.23.2...."
4,4,{{Citation | last = | first = | author-link = ...,journal,The ARIA Nominees,No author found,_4.json,"[10.4016/32852.01, 10.1007/978-1-84882-864-3_4...","[21.100235, 18.031717, 18.031717]",[0],[10.4016/32852.01]


In [16]:
wild_exp_journal.to_parquet('wild_exp_with_confidence_score.gzip', compression='gzip')

ArrowIOError: Arrow error: IOError: [Errno 28] No space left on device