## Lookup for citations -- based on the wild examples

In [1]:
## All imports
import re
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
tqdm.pandas()

In [3]:
## load the file which contains the citation for which we need to perform the lookup
wild_exp_journal = pd.read_csv('./wild_examples_journal.csv')
# wild_exp_book = pd.read_csv('../wild_examples_book.csv')

In [4]:
wild_exp_journal.shape

(1105231, 7)

### Extract the title of the citation and the author, if they exist - which will be used for CrossRef lookup

In [6]:
def get_title(citation_text):
    """Get title of citation based on finding the placeholder keywords using regular expressions"""
    title_res = re.findall('title\s{0,10}=\s{0,10}([^|]+)', citation_text) 
    if len(title_res) == 0:
        if 'sports-reference' in citation_text:
            return re.findall('(C|c)ite\s{0,10}sports-reference\s{0,10}|([^|]+)', citation_text)[1][1].strip()
        else:
            article_res = re.findall('article\s{0,10}=\s{0,10}([^|]+)', citation_text)
            if len(article_res) != 0:
                return article_res[0].strip()
            return None
    return title_res[0].strip()

wild_exp_book['title'] = wild_exp_book['citations'].progress_apply(lambda x: get_title(x))

100%|██████████| 3806097/3806097 [00:28<00:00, 133743.82it/s]


In [7]:
wild_exp_book.head()

Unnamed: 0,id,page_title,citations,ID_list,type_of_citation,predicted_label_no,existing_label,title
0,290,A,{{cite encyclopedia | last = Hall-Quest | firs...,,cite encyclopedia,book,NO LABEL,A
1,569,Anthropology,{{cite book | first=Robert | last=Fletcher | c...,,cite book,book,NO LABEL,"The Saturday Lectures, Delivered in the Lectur..."
2,569,Anthropology,{{cite book | last=Lewis | first=H.S. | author...,,cite book,book,NO LABEL,"Histories of Anthropology Annual, Vol. I}}"
3,624,Alaska,{{cite news |url=http://www.washingtontimes.co...,,cite news,book,NO LABEL,Topic\u2014Sarah Palin
4,633,Algae,{{cite book| author = Thomas Kelly Cheyne |aut...,,cite book,book,NO LABEL,Encyclop\xe6dia biblica: a critical dictionary...


In [8]:
wild_exp_book[~wild_exp_book['title'].notnull()].shape

(92570, 8)

In [9]:
def get_author(citation_text):    
    """Get author based on finding the placeholder keywords using regular expressions"""
    def check_first_last_res(first_res, last_res):
        if len(first_res) != 0 and len(last_res) != 0:
            return first_res[0].strip() + ' ' + last_res[0].strip()
        if len(first_res) != 0:
            return first_res[0].strip()
        if len(last_res) != 0:
            return last_res[0].strip()
    
    ## https://en.wikipedia.org/wiki/Template:Citation -  only these keywords are available
    first_res = re.findall('first\s{0,10}=\s{0,10}([^|]+)', citation_text)
    last_res = re.findall('last\s{0,10}=\s{0,10}([^|]+)', citation_text)
    
    author_res = re.findall('author\s{0,10}=\s{0,10}([^|]+)', citation_text)
    author1_res = re.findall('author1\s{0,10}=\s{0,10}([^|]+)', citation_text)
    
    first1_res = re.findall('first1\s{0,10}=\s{0,10}([^|]+)', citation_text)
    last1_res = re.findall('last1\s{0,10}=\s{0,10}([^|]+)', citation_text)
    
    if first_res or last_res:
        return check_first_last_res(first_res, last_res)
    elif author_res:
        return author_res[0].strip()
    elif author1_res:
        return author1_res[0].strip()
    elif first1_res or last1_res:
        return check_first_last_res(first1_res, last1_res)
    else:
        return None

In [10]:
## For each citation, get the first author if it exists
wild_exp_book['first_author'] = wild_exp_book['citations'].progress_apply(lambda x: get_author(x))

100%|██████████| 3806097/3806097 [00:57<00:00, 66306.15it/s]


In [11]:
wild_exp_book.head()

Unnamed: 0,id,page_title,citations,ID_list,type_of_citation,predicted_label_no,existing_label,title,first_author
0,290,A,{{cite encyclopedia | last = Hall-Quest | firs...,,cite encyclopedia,book,NO LABEL,A,Olga Wilbourne Hall-Quest
1,569,Anthropology,{{cite book | first=Robert | last=Fletcher | c...,,cite book,book,NO LABEL,"The Saturday Lectures, Delivered in the Lectur...",Robert Fletcher
2,569,Anthropology,{{cite book | last=Lewis | first=H.S. | author...,,cite book,book,NO LABEL,"Histories of Anthropology Annual, Vol. I}}",H.S. Lewis
3,624,Alaska,{{cite news |url=http://www.washingtontimes.co...,,cite news,book,NO LABEL,Topic\u2014Sarah Palin,
4,633,Algae,{{cite book| author = Thomas Kelly Cheyne |aut...,,cite book,book,NO LABEL,Encyclop\xe6dia biblica: a critical dictionary...,Thomas Kelly Cheyne


In [12]:
## Total number of citations for which their is no author
wild_exp_book[~wild_exp_book['first_author'].notnull()].shape

(1839309, 9)

In [13]:
wild_exp_book.iloc[1]['citations']

'{{cite book | first=Robert | last=Fletcher | chapter=Paul Broca and the French School of Anthropology | title=The Saturday Lectures, Delivered in the Lecture-room of the U. S. National Museum under the Auspices of the Anthropological and Biological Societies of Washington in March and April 1882 | year=1882 | location=Boston; Washington, DC | publisher=D. Lothrop & Co.; Judd & Detweiler | chapter-url={{Google books|9dEJAQAAIAAJ|plainurl=yes}}}}'

In [14]:
## Fill NaN values in first author column as it is missing and can act as distraction for the lookup
wild_exp_book['first_author'].fillna(value='No author found', inplace=True)

In [15]:
def check_whitespace_author(first_author):
    """Check if the first author column has whitespace and replace it"""
    if not first_author.strip():
        return 'No author found'
    return first_author
    
wild_exp_book['first_author'] = wild_exp_book['first_author'].progress_apply(lambda x: check_whitespace_author(x))

100%|██████████| 3806097/3806097 [00:06<00:00, 591315.43it/s]


In [27]:
wild_exp_book.head(7)

Unnamed: 0,id,page_title,citations,ID_list,type_of_citation,predicted_label_no,existing_label,title,first_author
0,290,A,{{cite encyclopedia | last = Hall-Quest | firs...,,cite encyclopedia,book,NO LABEL,A,Olga Wilbourne Hall-Quest
1,569,Anthropology,{{cite book | first=Robert | last=Fletcher | c...,,cite book,book,NO LABEL,"The Saturday Lectures, Delivered in the Lectur...",Robert Fletcher
2,569,Anthropology,{{cite book | last=Lewis | first=H.S. | author...,,cite book,book,NO LABEL,"Histories of Anthropology Annual, Vol. I}}",H.S. Lewis
3,624,Alaska,{{cite news |url=http://www.washingtontimes.co...,,cite news,book,NO LABEL,Topic\u2014Sarah Palin,No author found
4,633,Algae,{{cite book| author = Thomas Kelly Cheyne |aut...,,cite book,book,NO LABEL,Encyclop\xe6dia biblica: a critical dictionary...,Thomas Kelly Cheyne
5,698,Atlantic Ocean,{{Cite web |url=http://www.ngdc.noaa.gov/mgg/g...,,cite web,book,NO LABEL,Volumes of the World's Oceans from ETOPO1,B.W. Eakins
6,864,Andy Warhol,{{cite book |last=Broughton |first=Philip Delv...,,cite book,book,NO LABEL,The Art of the Sale,Philip Delves Broughton


In [17]:
## Saving the file for which the lookup needs to be performed
wild_exp_book.to_csv('../wild_exp_info_book.csv', index=True, index_label='index')

In [18]:
## Example as to how a citation might look like
wild_exp_book.iloc[0]['citations']

"{{cite encyclopedia | last = Hall-Quest | first = Olga Wilbourne | editor-last = Johnston | editor-first = Bernard | encyclopedia = [[Collier's Encyclopedia]] | title = A | edition = First | year = 1997 | publisher = P.F. Collier | volume = I: A to Ameland | location = New York, NY | ref = harv }}"

### Once lookup is done..

1. Assign filename for each index in the dataframe
2. For each file/metadata found, get the 3 DOIs if they exist and their corresponding scores

In [5]:
wild_exp_info = pd.read_csv('./wild_exp_info.csv')

In [6]:
print('The total examples of the wild examples: {}'.format(wild_exp_info.shape[0]))
wild_exp_info.head()

The total examples of the wild examples: 1105231


Unnamed: 0,index,id,page_title,citations,ID_list,type_of_citation,predicted_label_no,existing_label,title,first_author
0,0,689,Asia,{{cite journal |title=What is Asia? |url=http:...,,cite journal,journal,NO LABEL,What is Asia?,Philip Bowring
1,1,844,Amsterdam,{{cite web|url=http://www.os.amsterdam.nl/tabe...,,cite web,journal,NO LABEL,Amsterdam in cijfers 2010,No author found
2,2,983,Albert Camus,{{cite journal|last=Cohn|first=Robert Greer|ti...,{JSTOR=393607},cite journal,journal,NO LABEL,The True Camus,Robert Greer Cohn
3,3,1078,Antisemitism,{{Cite news|url=https://www.nytimes.com/2002/0...,,cite news,journal,NO LABEL,Tunisian Synagogue Blast Called Accident,Donald G. McNeil Jr.
4,4,1098,Foreign relations of Armenia,{{cite web |url=http://www.foreign.gov.mv/v2/e...,,cite web,journal,NO LABEL,Bilateral Relations : Ministry of Foreign Affairs,No author found


In [7]:
def assign_metadata(index):
    """Check if the file exists for a given index"""
    if os.path.exists('lookup_journal/_{}.json'.format(index)):
        return '_{}.json'.format(index)
    return 'Metadata does not exist'
    
## assigning name of the file for each citation
wild_exp_info['metadata_file'] = wild_exp_info['index'].progress_apply(lambda x: assign_metadata(x))

100%|██████████| 1105231/1105231 [00:12<00:00, 87895.59it/s] 


In [8]:
## Number of citations for which metadata did not exist in CrossRef
## This is because some of the citations neither author and title did not exist
## or CrossRef was unable to parse the request -- due to invalid characters
print('Metadata did not exist for these many citations: {} out of {}'.format(
    wild_exp_info[wild_exp_info['metadata_file'] == 'Metadata does not exist'].shape[0],
    wild_exp_info.shape[0]
))

Metadata did not exist for these many citations: 1157 out of 1105231


In [9]:
wild_exp_info.head()

Unnamed: 0,index,id,page_title,citations,ID_list,type_of_citation,predicted_label_no,existing_label,title,first_author,metadata_file
0,0,689,Asia,{{cite journal |title=What is Asia? |url=http:...,,cite journal,journal,NO LABEL,What is Asia?,Philip Bowring,_0.json
1,1,844,Amsterdam,{{cite web|url=http://www.os.amsterdam.nl/tabe...,,cite web,journal,NO LABEL,Amsterdam in cijfers 2010,No author found,_1.json
2,2,983,Albert Camus,{{cite journal|last=Cohn|first=Robert Greer|ti...,{JSTOR=393607},cite journal,journal,NO LABEL,The True Camus,Robert Greer Cohn,_2.json
3,3,1078,Antisemitism,{{Cite news|url=https://www.nytimes.com/2002/0...,,cite news,journal,NO LABEL,Tunisian Synagogue Blast Called Accident,Donald G. McNeil Jr.,_3.json
4,4,1098,Foreign relations of Armenia,{{cite web |url=http://www.foreign.gov.mv/v2/e...,,cite web,journal,NO LABEL,Bilateral Relations : Ministry of Foreign Affairs,No author found,_4.json


In [10]:
def get_identifier_or_score(filename, identifier=True):
    """Return the 3 identifiers (DOIs) or confidence scores for each citation"""
    column_name = 'DOI' if identifier else 'score'
    if filename == 'Metadata does not exist':
        return None
    else:
        with open('lookup_journal/{}'.format(filename)) as f:
            content = json.loads(f.read())
            if len(content) > 0:
                if 'message' in content and content['message'] == 'No result was found in CrossRef':
                    return None
                else:
                    return np.array([content[i][column_name] for i in range(len(content))])
            return None

In [11]:
## Get the DOIs for each citation which we extracted from CrossRef
wild_exp_info['identifier'] = wild_exp_info['metadata_file'].progress_apply(
    lambda x: get_identifier_or_score(x))

100%|██████████| 1105231/1105231 [11:20<00:00, 1622.97it/s]


In [12]:
wild_exp_info.head()

Unnamed: 0,index,id,page_title,citations,ID_list,type_of_citation,predicted_label_no,existing_label,title,first_author,metadata_file,identifier
0,0,689,Asia,{{cite journal |title=What is Asia? |url=http:...,,cite journal,journal,NO LABEL,What is Asia?,Philip Bowring,_0.json,"[10.5790/hongkong/9789888208722.001.0001, 10.5..."
1,1,844,Amsterdam,{{cite web|url=http://www.os.amsterdam.nl/tabe...,,cite web,journal,NO LABEL,Amsterdam in cijfers 2010,No author found,_1.json,"[10.1016/j.jcss.2009.10.016, 10.1016/s0300-297..."
2,2,983,Albert Camus,{{cite journal|last=Cohn|first=Robert Greer|ti...,{JSTOR=393607},cite journal,journal,NO LABEL,The True Camus,Robert Greer Cohn,_2.json,"[10.2307/2908484, 10.2307/2929259, 10.1017/s00..."
3,3,1078,Antisemitism,{{Cite news|url=https://www.nytimes.com/2002/0...,,cite news,journal,NO LABEL,Tunisian Synagogue Blast Called Accident,Donald G. McNeil Jr.,_3.json,"[10.21236/ad0730733, 10.1515/ebr.freedmensynag..."
4,4,1098,Foreign relations of Armenia,{{cite web |url=http://www.foreign.gov.mv/v2/e...,,cite web,journal,NO LABEL,Bilateral Relations : Ministry of Foreign Affairs,No author found,_4.json,"[10.1037/h0021153, 10.2307/2750243, 10.2172/55..."


In [13]:
print('Identifier does not exist for these many citations: {} out of {}'.format(
    wild_exp_info[~wild_exp_info['identifier'].notnull()].shape[0], wild_exp_info.shape[0]
))

Identifier does not exist for these many citations: 57399 out of 1105231


In [14]:
## Get the confidence scores for each citation which we extracted from CrossRef
wild_exp_info['conf_score'] = wild_exp_info['metadata_file'].progress_apply(
    lambda x: get_identifier_or_score(x, identifier=False))

100%|██████████| 1105231/1105231 [06:16<00:00, 2937.18it/s]


In [15]:
wild_exp_info.head()

Unnamed: 0,index,id,page_title,citations,ID_list,type_of_citation,predicted_label_no,existing_label,title,first_author,metadata_file,identifier,conf_score
0,0,689,Asia,{{cite journal |title=What is Asia? |url=http:...,,cite journal,journal,NO LABEL,What is Asia?,Philip Bowring,_0.json,"[10.5790/hongkong/9789888208722.001.0001, 10.5...","[30.679447, 30.14759, 30.14759]"
1,1,844,Amsterdam,{{cite web|url=http://www.os.amsterdam.nl/tabe...,,cite web,journal,NO LABEL,Amsterdam in cijfers 2010,No author found,_1.json,"[10.1016/j.jcss.2009.10.016, 10.1016/s0300-297...","[29.915737, 20.72509, 19.624651]"
2,2,983,Albert Camus,{{cite journal|last=Cohn|first=Robert Greer|ti...,{JSTOR=393607},cite journal,journal,NO LABEL,The True Camus,Robert Greer Cohn,_2.json,"[10.2307/2908484, 10.2307/2929259, 10.1017/s00...","[38.344334, 38.344334, 26.110119]"
3,3,1078,Antisemitism,{{Cite news|url=https://www.nytimes.com/2002/0...,,cite news,journal,NO LABEL,Tunisian Synagogue Blast Called Accident,Donald G. McNeil Jr.,_3.json,"[10.21236/ad0730733, 10.1515/ebr.freedmensynag...","[22.869783, 22.84087, 22.050879]"
4,4,1098,Foreign relations of Armenia,{{cite web |url=http://www.foreign.gov.mv/v2/e...,,cite web,journal,NO LABEL,Bilateral Relations : Ministry of Foreign Affairs,No author found,_4.json,"[10.1037/h0021153, 10.2307/2750243, 10.2172/55...","[22.427742, 20.759182, 19.855873]"


In [16]:
print('Confidence scores does not exist for these many citations: {} out of {}'.format(
    wild_exp_info[~wild_exp_info['conf_score'].notnull()].shape[0],
    wild_exp_info.shape[0]
))

Confidence scores does not exist for these many citations: 57399 out of 1105231


In [17]:
## Save the results which we got from the metadata
wild_exp_info.to_parquet('wild_exp.gzip', compression='gzip')

### If the file with the lookup for no confidence threshold is there.. then load it

In [18]:
wild_exp_journal = pd.read_parquet('wild_exp.gzip')

In [19]:
print('The total number of citations are: {}'.format(wild_exp_journal.shape))

The total number of citations are: (1105231, 13)


In [21]:
threshold_score = 34.997

In [22]:
def get_score_based_on_threshold(conf_score):
    if conf_score is None:
        return None
    else:
        res = np.array([i for i, j in enumerate(conf_score) if j > threshold_score])
        if len(res) == 0:
            return None
        else:
            return res
    
wild_exp_journal['updated_conf_index'] = wild_exp_journal['conf_score'].progress_apply(
    lambda x: get_score_based_on_threshold(x))

100%|██████████| 1105231/1105231 [00:05<00:00, 214329.83it/s]


In [23]:
wild_exp_journal.head()

Unnamed: 0,index,id,page_title,citations,ID_list,type_of_citation,predicted_label_no,existing_label,title,first_author,metadata_file,identifier,conf_score,updated_conf_index
0,0,689,Asia,{{cite journal |title=What is Asia? |url=http:...,,cite journal,journal,NO LABEL,What is Asia?,Philip Bowring,_0.json,"[10.5790/hongkong/9789888208722.001.0001, 10.5...","[30.679447, 30.14759, 30.14759]",
1,1,844,Amsterdam,{{cite web|url=http://www.os.amsterdam.nl/tabe...,,cite web,journal,NO LABEL,Amsterdam in cijfers 2010,No author found,_1.json,"[10.1016/j.jcss.2009.10.016, 10.1016/s0300-297...","[29.915737, 20.72509, 19.624651]",
2,2,983,Albert Camus,{{cite journal|last=Cohn|first=Robert Greer|ti...,{JSTOR=393607},cite journal,journal,NO LABEL,The True Camus,Robert Greer Cohn,_2.json,"[10.2307/2908484, 10.2307/2929259, 10.1017/s00...","[38.344334, 38.344334, 26.110119]","[0, 1]"
3,3,1078,Antisemitism,{{Cite news|url=https://www.nytimes.com/2002/0...,,cite news,journal,NO LABEL,Tunisian Synagogue Blast Called Accident,Donald G. McNeil Jr.,_3.json,"[10.21236/ad0730733, 10.1515/ebr.freedmensynag...","[22.869783, 22.84087, 22.050879]",
4,4,1098,Foreign relations of Armenia,{{cite web |url=http://www.foreign.gov.mv/v2/e...,,cite web,journal,NO LABEL,Bilateral Relations : Ministry of Foreign Affairs,No author found,_4.json,"[10.1037/h0021153, 10.2307/2750243, 10.2172/55...","[22.427742, 20.759182, 19.855873]",


In [24]:
print('Total number of identifiers greater than the confidence score: {}'.format(
    wild_exp_journal.shape[0] - wild_exp_journal[~wild_exp_journal['updated_conf_index'].notnull()].shape[0]))
print('which is a percentage: {}'.format(
    wild_exp_journal[wild_exp_journal['updated_conf_index'].notnull()].shape[0] / float(wild_exp_journal.shape[0]) * 100))

Total number of identifiers greater than the confidence score: 260752
which is a percentage: 23.5925340494


In [25]:
def get_updated_identifier(row):
    if row['updated_conf_index'] is None:
        return None
    else:
        return row['identifier'][row['updated_conf_index']]
    
wild_exp_journal['updated_identifier'] = wild_exp_journal.progress_apply(get_updated_identifier, axis=1)

100%|██████████| 1105231/1105231 [00:37<00:00, 29452.21it/s]


In [26]:
wild_exp_journal.head()

Unnamed: 0,index,id,page_title,citations,ID_list,type_of_citation,predicted_label_no,existing_label,title,first_author,metadata_file,identifier,conf_score,updated_conf_index,updated_identifier
0,0,689,Asia,{{cite journal |title=What is Asia? |url=http:...,,cite journal,journal,NO LABEL,What is Asia?,Philip Bowring,_0.json,"[10.5790/hongkong/9789888208722.001.0001, 10.5...","[30.679447, 30.14759, 30.14759]",,
1,1,844,Amsterdam,{{cite web|url=http://www.os.amsterdam.nl/tabe...,,cite web,journal,NO LABEL,Amsterdam in cijfers 2010,No author found,_1.json,"[10.1016/j.jcss.2009.10.016, 10.1016/s0300-297...","[29.915737, 20.72509, 19.624651]",,
2,2,983,Albert Camus,{{cite journal|last=Cohn|first=Robert Greer|ti...,{JSTOR=393607},cite journal,journal,NO LABEL,The True Camus,Robert Greer Cohn,_2.json,"[10.2307/2908484, 10.2307/2929259, 10.1017/s00...","[38.344334, 38.344334, 26.110119]","[0, 1]","[10.2307/2908484, 10.2307/2929259]"
3,3,1078,Antisemitism,{{Cite news|url=https://www.nytimes.com/2002/0...,,cite news,journal,NO LABEL,Tunisian Synagogue Blast Called Accident,Donald G. McNeil Jr.,_3.json,"[10.21236/ad0730733, 10.1515/ebr.freedmensynag...","[22.869783, 22.84087, 22.050879]",,
4,4,1098,Foreign relations of Armenia,{{cite web |url=http://www.foreign.gov.mv/v2/e...,,cite web,journal,NO LABEL,Bilateral Relations : Ministry of Foreign Affairs,No author found,_4.json,"[10.1037/h0021153, 10.2307/2750243, 10.2172/55...","[22.427742, 20.759182, 19.855873]",,


In [27]:
wild_exp_journal.iloc[0]

index                                                                 0
id                                                                  689
page_title                                                         Asia
citations             {{cite journal |title=What is Asia? |url=http:...
ID_list                                                            None
type_of_citation                                           cite journal
predicted_label_no                                              journal
existing_label                                                 NO LABEL
title                                                     What is Asia?
first_author                                             Philip Bowring
metadata_file                                                   _0.json
identifier            [10.5790/hongkong/9789888208722.001.0001, 10.5...
conf_score                              [30.679447, 30.14759, 30.14759]
updated_conf_index                                              

In [28]:
wild_exp_journal.iloc[2]

index                                                                 2
id                                                                  983
page_title                                                 Albert Camus
citations             {{cite journal|last=Cohn|first=Robert Greer|ti...
ID_list                                                  {JSTOR=393607}
type_of_citation                                           cite journal
predicted_label_no                                              journal
existing_label                                                 NO LABEL
title                                                    The True Camus
first_author                                          Robert Greer Cohn
metadata_file                                                   _2.json
identifier            [10.2307/2908484, 10.2307/2929259, 10.1017/s00...
conf_score                            [38.344334, 38.344334, 26.110119]
updated_conf_index                                              

In [29]:
wild_exp_journal.to_parquet('wild_exp_with_confidence_score.gzip', compression='gzip')

In [34]:
wild_exp_journal[wild_exp_journal['updated_identifier'].notnull()]

Unnamed: 0,index,id,page_title,citations,ID_list,type_of_citation,predicted_label_no,existing_label,title,first_author,metadata_file,identifier,conf_score,updated_conf_index,updated_identifier
2,2,983,Albert Camus,{{cite journal|last=Cohn|first=Robert Greer|ti...,{JSTOR=393607},cite journal,journal,NO LABEL,The True Camus,Robert Greer Cohn,_2.json,"[10.2307/2908484, 10.2307/2929259, 10.1017/s00...","[38.344334, 38.344334, 26.110119]","[0, 1]","[10.2307/2908484, 10.2307/2929259]"
16,16,3226,Azores,{{cite journal|first=Ant\xf3nio de Brum|last=F...,,cite journal,journal,NO LABEL,Geodin\xe2mica e perigosidade natural nas ilha...,Ant\xf3nio de Brum Ferreira,_16.json,"[10.18055/finis1494, 10.18055/finis2540, 10.18...","[118.19752, 52.82293, 51.64595]","[0, 1, 2]","[10.18055/finis1494, 10.18055/finis2540, 10.18..."
25,25,5195,Economy of Canada,{{citation|author=Natural Resources Canada|tit...,,citation,journal,NO LABEL,Reducing diesel energy in rural and remote com...,Natural Resources Canada,_25.json,"[10.4095/289124, 10.4095/315201, 10.4095/315203]","[48.002598, 40.940514, 40.940514]","[0, 1, 2]","[10.4095/289124, 10.4095/315201, 10.4095/315203]"
32,32,6115,P versus NP problem,{{cite journal | last1 = Hartmanis | first1 = ...,,cite journal,journal,NO LABEL,"G\xf6del, von Neumann, and the '''P''' = '''NP...",Juris Hartmanis,_32.json,"[10.1142/9789812794499_0033, 10.1016/0020-0190...","[71.68191, 41.519527, 41.519527]","[0, 1, 2]","[10.1142/9789812794499_0033, 10.1016/0020-0190..."
35,35,7274,Cushitic languages,{{cite journal |last1=Cooper |first1=Julien |t...,,cite journal,journal,NO LABEL,Toponymic Strata in Ancient Nubia Until the Co...,Julien Cooper,_35.json,"[10.5070/d64110028, 10.1163/9789004422216, 10....","[86.3757, 28.04935, 20.515572]",[0],[10.5070/d64110028]
36,36,7381,Cyberspace,{{Cite journal|last=Bryant|first=William|date=...,,cite journal,journal,NO LABEL,Cyberspace Superiority A Conceptual Model,William Bryant,_36.json,"[10.4324/9781315688183, 10.1007/978-3-319-2358...","[40.480827, 25.906298, 23.591625]",[0],[10.4324/9781315688183]
37,37,7463,Cold fusion,{{cite journal |ref=harv |mode=cs2\n | last=U...,,cite journal,journal,NO LABEL,A Report of the Energy Research Advisory Board...,U.S. Department of Energy\n US DOE,_37.json,"[10.2172/890707, 10.2172/5692641, 10.2172/1011...","[70.40054, 62.718155, 62.392334]","[0, 1, 2]","[10.2172/890707, 10.2172/5692641, 10.2172/1011..."
46,46,10951,Fahrenheit 451,{{cite journal |last=Smolla |first=Rodney A. |...,{ISSN=0026-2234},cite journal,journal,NO LABEL,The Life of the Mind and a Life of Meaning: Re...,Rodney A. Smolla,_46.json,"[10.1080/10811680.2019.1660552, 10.1016/b978-0...","[36.935966, 30.486177, 26.806963]",[0],[10.1080/10811680.2019.1660552]
47,47,11393,Four Noble Truths,{{Citation | last =Sharf | first =Robert H. | ...,,citation,journal,NO LABEL,The Rhetoric of Experience and the Study of Re...,Robert H. Sharf,_47.json,"[10.1163/1568527952598549, 10.1086/489616, 10....","[35.970562, 31.049137, 31.049137]",[0],[10.1163/1568527952598549]
48,48,11561,Father Christmas,{{cite journal | url=https://archive.org/strea...,,cite journal,journal,NO LABEL,Gifts Placed in the Stocking at Christmas,"Lees, Edwin",_48.json,"[10.1093/nq/s5-xi.265.66c, 10.1097/01.eem.0000...","[62.620388, 20.278206, 19.735504]",[0],[10.1093/nq/s5-xi.265.66c]


In [37]:
z = wild_exp_journal[wild_exp_journal['updated_identifier'].notnull()]

In [48]:
all_doi_identifiers = np.concatenate(z['updated_identifier'].tolist()).ravel()

In [54]:
len(set(all_doi_identifiers))

320887

In [57]:
len(set(z['page_title'].tolist()))

164831