## Loading the curated citations dataset and do processing

In [1]:
# All imports
import glob
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from scripts.const import CITATION_TEMPLATES

import findspark
findspark.init('/Users/harshdeep/Downloads/spark-1.6.0-bin-hadoop2.6/')
from pyspark import SparkContext, SQLContext


import warnings
warnings.filterwarnings("ignore")

### Load the citations (extracted) parquet file with 21 million citations

In [2]:
sc = SparkContext()
sqlContext = SQLContext(sc)

In [3]:
all_citations = sqlContext.read.parquet('./citations_separated.parquet/')

In [4]:
# Get citation count for all the 25 citations which can be parsed
citation_count = all_citations.groupby('type_of_citation').count().toPandas()

In [5]:
citation_count['type_of_citation'].unique()

array([u'cite journal', u'cite dvd notes', u'cite thesis', u'gnis',
       u'cite episode', u'cite encyclopedia', u'cite arxiv',
       u'cite newsgroup', u'cite web', u'cite mailing list',
       u'timatic visa policy', u'cite av media', u'harvnb',
       u'soccerbase season', u'cite sports-reference', u'citation',
       u'cite conference', u'nhle', u'cite news', u'cite report',
       u'harvard citation no brackets', u'cite av media notes',
       u'cite press release', u'cite serial', u'geonet3', u'cite map',
       u'cite techreport', u'cite podcast',
       u'national heritage list for england', u'cite book', u'cite gnis',
       u'nrisref', u'cite interview', u'cite speech', u'cite sign'],
      dtype=object)

### Get the top 300 templates (according to DLAB) and compare it with the number of citations we have got

In [6]:
top300_templates = pd.read_csv('top300_templates.csv')

In [7]:
# Only consider the templates which can be parsed by mwparserfromhell
parseable_template_count = top300_templates.loc[top300_templates['template'].isin(CITATION_TEMPLATES)]

In [8]:
merged_counts = pd.merge(
    parseable_template_count, citation_count,
    left_on='template', right_on='type_of_citation', how='inner'
).drop('template', axis=1)

merged_counts.columns = ['dlab_count', 'type_of_citation', 'curated_count']

In [9]:
merged_counts['bigger_than'] = merged_counts['curated_count'] - merged_counts['dlab_count']
merged_counts

Unnamed: 0,dlab_count,type_of_citation,curated_count,bigger_than
0,13340076,cite web,13971631,631555
1,4003325,cite news,4099397,96072
2,1765330,cite book,2573371,808041
3,1413205,cite journal,1747097,333892
4,304742,citation,484974,180232
5,62177,cite press release,61998,-179
6,39439,cite episode,40203,764
7,34989,cite map,35402,413
8,33637,cite encyclopedia,43864,10227
9,32269,cite report,35374,3105


In [10]:
# Load all csv files and add them into one dataframe
files = glob.glob('citations_ids.csv/part-*')
citation_dataframes = [pd.read_csv(f, header=None, sep=',') for f in files]
citation_with_ids = pd.concat(citation_dataframes, ignore_index=True)

In [11]:
print(citation_with_ids.shape)
total_citations = citation_with_ids.shape[0]

(3147372, 4)


In [12]:
citation_with_ids.head()

Unnamed: 0,0,1,2,3
0,1831197,Fatal Deceit,{ISSN=0093-7673},"[{last=Gliatto, first=Tom}]"
1,1831220,WHO Model Formulary 2008,{ISBN=9789241547659},
2,1831220,Protamine sulfate as an effective alternative ...,"{PMID=2786000, DOI=10.1016/0166-0934(89)90132-8}","[{last=Kenneth Cornetta}, {last=W.French Ander..."
3,1831220,Protamine sulfate enhances lipid-mediated gene...,"{PMID=9349433, DOI=10.1038/sj.gt.3300484}","[{last=Sorgi, first=FL}, {last=Bhattacharya, S..."
4,1831220,Successful cardiopulmonary bypass in diabetics...,"{PMID=6743419, PMC=481594, DOI=10.1136/hrt.52....","[{last=Walker, first=WS}, {last=Reid, KG}, {la..."


In [13]:
citation_with_ids

Unnamed: 0,0,1,2,3
0,1831197,Fatal Deceit,{ISSN=0093-7673},"[{last=Gliatto, first=Tom}]"
1,1831220,WHO Model Formulary 2008,{ISBN=9789241547659},
2,1831220,Protamine sulfate as an effective alternative ...,"{PMID=2786000, DOI=10.1016/0166-0934(89)90132-8}","[{last=Kenneth Cornetta}, {last=W.French Ander..."
3,1831220,Protamine sulfate enhances lipid-mediated gene...,"{PMID=9349433, DOI=10.1038/sj.gt.3300484}","[{last=Sorgi, first=FL}, {last=Bhattacharya, S..."
4,1831220,Successful cardiopulmonary bypass in diabetics...,"{PMID=6743419, PMC=481594, DOI=10.1136/hrt.52....","[{last=Walker, first=WS}, {last=Reid, KG}, {la..."
5,1831220,Management of the patient with protamine hyper...,"{PMID=6334459, DOI=10.1097/00000542-198412000-...","[{last=Campbell, first=FW}, {last=Goldstein, M..."
6,1831220,Hemodynamic changes after protamine administra...,"{PMID=15681944, DOI=10.1097/00000542-200502000...","[{last=Welsby, first=IJ}, {last=Newman, MF}, {..."
7,1831220,The toxicology of heparin reversal with protam...,{DOI=10.1080/17425255.2016.1194395},"[{last=Sokolowska, first=E}, {last=Kalaska, B}..."
8,1831239,The protamine family of sperm nuclear proteins,"{PMID=17903313, PMC=2375014, DOI=10.1186/gb-20...","[{last=Balhorn, first=R}]"
9,1831239,Protamine-induced condensation and decondensat...,"{PMID=10506559, DOI=10.1126/science.286.5437.120}","[{last=Brewer, first=LR}, {last=Corzett, first..."


In [14]:
citation_with_ids.columns = [
    'id', 'title_of_citation', 'id_list', 'authors',
    # 'title_of_page', publisher_place', 'type_of_citation', 'publisher_name'
]

In [15]:
# Percentage of values present for title of page, title of citation and authors
(citation_with_ids.count() * 100) / total_citations

id                   100.000000
title_of_citation     99.881234
id_list              100.000000
authors               92.977157
dtype: float64

In [16]:
# citation_with_ids.groupby('type_of_citation').size()

In [17]:
# Formulate a structure for the ID_List in which we can do something meaningful
citation_with_ids['id_list'] = citation_with_ids['id_list'].apply(
    lambda x: list(item.split('=') for item in x.replace('{','').replace('}','').replace(' ', '').split(','))
)

In [18]:
# Get the kinds of ids associated with each tuple
kinds_of_ids = set()
def update_ids(x):
    for item in x:
        kinds_of_ids.add(item[0])

_ = citation_with_ids['id_list'].apply(lambda x: update_ids(x))

# Add the columns with NoneType in the previous DF
for id_ in kinds_of_ids:
    citation_with_ids[id_] = None

In [19]:
print('Total kind of Citation IDs: {}'.format(len(kinds_of_ids)))

Total kind of Citation IDs: 51


In [20]:
# Set the value of identifiers for each column, for e.g. DOI, ISBN etc.
def set_citation_val(x):
    for item in x['id_list']:
        citation_with_ids.at[x.name, item[0]] = item[1] if len(item) >= 2 else None

_ = citation_with_ids.apply(lambda x: set_citation_val(x), axis=1)

In [21]:
citation_with_ids.head()

Unnamed: 0,id,title_of_citation,id_list,authors,DOI,ISBN,JFM,1997),ARXIV,SSRN,...,+david+w.,5-79-051006-X,5-7905-1008-6,info:pmid/25772496&rft.eissn,+xin,cribb,No.3(Summer,MR,ZBL,PMID
0,1831197,Fatal Deceit,"[[ISSN, 0093-7673]]","[{last=Gliatto, first=Tom}]",,,,,,,...,,,,,,,,,,
1,1831220,WHO Model Formulary 2008,"[[ISBN, 9789241547659]]",,,9789241547659.0,,,,,...,,,,,,,,,,
2,1831220,Protamine sulfate as an effective alternative ...,"[[PMID, 2786000], [DOI, 10.1016/0166-0934(89)9...","[{last=Kenneth Cornetta}, {last=W.French Ander...",10.1016/0166-0934(89)90132-8,,,,,,...,,,,,,,,,,2786000.0
3,1831220,Protamine sulfate enhances lipid-mediated gene...,"[[PMID, 9349433], [DOI, 10.1038/sj.gt.3300484]]","[{last=Sorgi, first=FL}, {last=Bhattacharya, S...",10.1038/sj.gt.3300484,,,,,,...,,,,,,,,,,9349433.0
4,1831220,Successful cardiopulmonary bypass in diabetics...,"[[PMID, 6743419], [PMC, 481594], [DOI, 10.1136...","[{last=Walker, first=WS}, {last=Reid, KG}, {la...",10.1136/hrt.52.1.112,,,,,,...,,,,,,,,,,6743419.0


In [22]:
# Save the file in the Pandas format
# citation_with_ids.to_csv('citations_with_ids.csv')

# How robust are the `Crossref` and the `Google Books API`?

In [114]:
# Importing the methods written in the script to access google books and cross ref apis
from scripts.run_apis import run_google_book_get_info, run_crossref_get_info

Let's take a few samples and query the crossref and Google books API for DOI and ISBN respectively.

In [115]:
mask_isbn_or_doi = citation_with_ids['DOI'].notnull() | citation_with_ids['ISBN'].notnull()

citation_with_isbn_or_doi = citation_with_ids[mask_isbn_or_doi][['id', 'title_of_citation', 'ISBN', 'DOI', 'authors']]

In [116]:
citation_with_isbn_or_doi = citation_with_isbn_or_doi.sample(n=100)

In [117]:
citation_with_isbn_or_doi.head()

Unnamed: 0,id,title_of_citation,ISBN,DOI,authors
1662599,14816661,Intercellular transfer of carcinoembryonic ant...,,10.4049/jimmunol.179.7.4424,"[{last=Stern-Ginossar, first=N}, {last=Nedvetz..."
2161734,47868959,The Beach Boys,978-0-345-27398-7,,"[{link=Byron Preiss, last=Preiss, first=Byron}]"
1865967,14241815,John Maynard Keynes,9780230229204,,"[{link=Paul Davidson (economist), last=Davidso..."
708192,50610544,Value Creation: Strategies for the Chemical In...,978-3527312665,,"[{last=Budde, first=Florian}, {last=Frankem\xf..."
3082627,42443400,University of Wisconsin: Renewal to Revolution...,978-0-299-16290-0,,"[{last=Cronon, first=E. David}, {last=Jenkins,..."


In [118]:
citation_with_isbn_or_doi['retrieved_title'] = [[] for i in citation_with_isbn_or_doi.index]
citation_with_isbn_or_doi['retrieved_author'] = [[] for i in citation_with_isbn_or_doi.index]
citation_with_isbn_or_doi['api_type'] = ['' for i in citation_with_isbn_or_doi.index]

Crossref is great for DOI, but does not return a lot of information for ISBN. Google books is better for ISBN but it limits the amount of requests one could send so thats why we are testing it on a smaller sample case.

In [119]:
for i in range(len(citation_with_isbn_or_doi)):
    title = []
    author = []
    row = citation_with_isbn_or_doi.iloc[i] # Get the particular row
    if row['DOI']:
        result_crossref = run_crossref_get_info(doi=row['DOI'])
        citation_with_isbn_or_doi.iloc[i, 7] = 'Crossref'

        if result_crossref.status_code != 200:
            title.append('No title mentioned')
            author.append('No authors mentioned')
            continue

        crossref_message = result_crossref.json()['message']
        
        if 'title' in crossref_message:
            title.extend(crossref_message['title'])
        else:
            title.append('No title mentioned')
        
        if 'author' in crossref_message:
            author.extend([
                a.get('given', '') + ' ' + a.get('family', '')
                for a in crossref_message['author']
            ])
        else:
            author.append('No authors mentioned')

    if not row['DOI'] and row['ISBN']:
        isbn = row['ISBN'].replace('-', '')
        result_google = run_google_book_get_info(isbn=isbn).json()
        citation_with_isbn_or_doi.iloc[i, 7] = 'Google'
        
        if 'items' not in result_google:
            row['retrieved_title'] = 'No title mentioned'
            row['retrieved_author'] = 'No authors mentioned'
            continue
        
        for item in result_google['items']:
            title.append(item['volumeInfo'].get('title', 'No title mentioned'))
            author.extend(item['volumeInfo'].get('authors', ['No authors mentioned']))
    
    if i % 50 == 0:
        print('Done with {} citations'.format(i + 1))
    
    citation_with_isbn_or_doi.iloc[i]['retrieved_title'].extend(title)
    citation_with_isbn_or_doi.iloc[i]['retrieved_author'].extend(author)

Done with 1 citations
Done with 51 citations


In [120]:
citation_with_isbn_or_doi

Unnamed: 0,id,title_of_citation,ISBN,DOI,authors,retrieved_title,retrieved_author,api_type
1662599,14816661,Intercellular transfer of carcinoembryonic ant...,,10.4049/jimmunol.179.7.4424,"[{last=Stern-Ginossar, first=N}, {last=Nedvetz...",[Intercellular Transfer of Carcinoembryonic An...,"[N. Stern-Ginossar, S. Nedvetzki, G. Markel, R...",Crossref
2161734,47868959,The Beach Boys,978-0-345-27398-7,,"[{link=Byron Preiss, last=Preiss, first=Byron}]",[The Beach Boys],[Byron Preiss],Google
1865967,14241815,John Maynard Keynes,9780230229204,,"[{link=Paul Davidson (economist), last=Davidso...",[John Maynard Keynes],[Paul Davidson],Google
708192,50610544,Value Creation: Strategies for the Chemical In...,978-3527312665,,"[{last=Budde, first=Florian}, {last=Frankem\xf...",[Value Creation],"[Florian Budde, Utz-Hellmuth Felcht, Heiner Fr...",Google
3082627,42443400,University of Wisconsin: Renewal to Revolution...,978-0-299-16290-0,,"[{last=Cronon, first=E. David}, {last=Jenkins,...","[Univ of Wisconsin V4: Renewal to Revolution, ...","[E. David Cronon, John W. Jenkins]",Google
1803658,4004781,Planet Quest: The Epic Discovery of Alien Sola...,978-0-19-288083-3,,"[{last=Croswell, first=Ken}]",[Planet Quest],[Ken Croswell],Google
1520549,31194185,Africa,978-1-74104-482-9,,"[{last=Pitcher, first=Gemma}]",[Africa],[Gemma Pitcher],Google
3140894,43654526,Il Mereghetti,8860736269,,[{last=Paolo Mereghetti}],"[Il Mereghetti, Il Mereghetti]","[Paolo Mereghetti, Alberto Pezzotta, Paolo Mer...",Google
549814,12065590,A Reduction of Doxastic Logic to Action Logic,,10.1023/A:1005666218871,"[{last=Wansing, first=H.}]",[],[Heinrich Wansing],Crossref
1691756,1131083,The Silence and the Scorpion: The Coup against...,978-0-786-72744-5,,"[{last=Nelson, first=Brian A.}]",[The Silence and the Scorpion],[Brian A. Nelson],Google


Doing analysis of the Google and Crossref API - as to how many authors and title are equal?

In [121]:
total_google_samples = len(citation_with_isbn_or_doi[citation_with_isbn_or_doi['api_type'] == 'Google'])
total_crossref_samples = len(citation_with_isbn_or_doi[citation_with_isbn_or_doi['api_type'] == 'Crossref'])

In [122]:
print('Google Samples: {}\nCrossref Samples: {}'.format(total_google_samples, total_crossref_samples))

Google Samples: 53
Crossref Samples: 47


Lets perform some API robustness test on titles..

In [257]:
# Using Fuzzy String Matching to get an approximate matching of string instead of actual one
from fuzzywuzzy import fuzz

def get_ratio(row, col1, col2):
    actual_ = row[col1] if row[col1] else 'No title'
    retrieved_ = row[col2][0] if len(row[col2]) >= 1 else 'No retrieved title'
    return fuzz.token_set_ratio(actual_, retrieved_)

As you can see Crossref is more precise with its results and its API is more robust than Google Books since Google Books is more like a search engine for books and returns more broader results. Also, Crossref is a specialist API and hence results are more specific.

Some of the edge cases which are not addressed is sometimes the title retrieved are in another language, but these cases are far and less.

In [151]:
citation_with_isbn_or_doi['title_percent_match'] = citation_with_isbn_or_doi.apply(
    get_ratio, args=('title_of_citation', 'retrieved_title'), axis=1)
citation_with_isbn_or_doi[['api_type', 'title_percent_match']].groupby('api_type').mean()

Unnamed: 0_level_0,title_percent_match
api_type,Unnamed: 1_level_1
Crossref,96.744681
Google,81.566038


Lets now apply the robustness test on authors..

In [248]:
# Preprocess authors so that we disappear the 'last=' and 'first=' phrase and convert them into list
def preprocess_authors(row):
    authors = ''
    authors = (
        'No authors' if isinstance(row['authors'], float) or not row['authors']
        else row['authors'].split('}, {')
    )
    for ch in [']', '[', '{', '}', 'first=', 'last=', ',', 'link=']:
        authors = [i.replace(ch, '') for i in authors]
    return len(authors), ', '.join(authors)

In [250]:
citation_temp_authors = citation_with_isbn_or_doi[['authors', 'retrieved_author', 'api_type']]

# Get the length of the number of author and the length of the number of retrieved authors
citation_temp_authors['len_authors'], citation_temp_authors['joined_authors'] = zip(*citation_temp_authors.apply(preprocess_authors, axis=1))
citation_temp_authors['len_retrieved_author'] = citation_temp_authors['retrieved_author'].apply(lambda x: len(x))
citation_temp_authors['retrieved_author'] = citation_temp_authors['retrieved_author'].apply(lambda x: ', '.join(x))

86 times the number of authors were equal or more when retrieved which means both the APIs were quite robust when in comes to term of retreiving the number of authors for a publication.

In [256]:
(citation_temp_authors['len_authors'] <= citation_temp_authors['len_retrieved_author']).value_counts()

True     86
False    14
dtype: int64

As you can see the percentages are low but this is partially because of the different annotations in the way which the authors were written in, but again we see that Crossref API is more specific and precise than Google Books.

In [259]:
citation_temp_authors['author_percent_match'] = citation_temp_authors.apply(
    get_ratio, args=('joined_authors', 'retrieved_author'), axis=1)
citation_temp_authors[['api_type', 'author_percent_match']].groupby('api_type').mean()

Unnamed: 0_level_0,author_percent_match
api_type,Unnamed: 1_level_1
Crossref,43.319149
Google,22.698113


### How do the identifiers appear with each other?

* Do we have citations with a lot of two identifiers?

In [None]:
identifiers_existing = citation_with_ids[['DOI', 'ISBN', 'ISSN', 'PMC', 'PMID']].notnull()

In [None]:
identifiers_existing.head()

In [None]:
all_columns = identifiers_existing.columns
frequency_citation = dict()

def get_frequency_of_identifiers_appearing(x):
    available_citation_types = tuple([column for column in all_columns if x[column]])
    frequency_citation.setdefault(available_citation_types, 0)
    frequency_citation[available_citation_types] += 1

_ = identifiers_existing.apply(lambda x: get_frequency_of_identifiers_appearing(x), axis=1)

In [None]:
# Make a graph of the frequency distribution calculated above
names = list(frequency_citation.keys())
values = list(frequency_citation.values())

plt.rcParams["figure.figsize"] = (12,5)
plt.xticks(rotation=90)
plt.bar(range(len(frequency_citation)),values,tick_label=names)
plt.show()

## Loading the wikipedia dataset with identifiers

In [None]:
CITATION_WITH_IDENTIFIERS = '../Citations_with_Identifiers/enwiki.tsv.tar.gz'

wiki_en_identifiers = pd.read_csv(CITATION_WITH_IDENTIFIERS, compression='gzip', sep='\t')

In [None]:
wiki_en_identifiers.head(5)

In [None]:
print('Total citation identifiers for English Wikipedia: {}'.format(wiki_en_identifiers.shape[0]))

In [None]:
wiki_en_identifiers['type'].unique() # Labels which have unique IDSs

In [None]:
# Remove the one with the NaN value
wiki_en_identifiers = wiki_en_identifiers[wiki_en_identifiers['type'].notnull()]

In [None]:
# Adding a boolean to check if the citation is in other dataset - to 
wiki_en_identifiers['is_in_other_dataset'] = False

## Revision Analysis

In [None]:
curated_title_id = citation_with_ids[['title_of_page', 'r_id', 'r_parentid']]
curated_title_id.head()

As we can see that many parent ids in our dataset are not present in the citation with identifiers dataset which should be kept in mind for further analysis and can be classified as a reason that we might get less citations.

In [None]:
r_parentid_which_are_present = curated_title_id['r_parentid'].isin(wiki_en_identifiers['rev_id'])
total_number_of_r_parentid_in_wiki = np.sum(r_parentid_which_are_present)
print(curated_title_id.shape[0], wiki_en_identifiers['rev_id'].shape[0], total_number_of_r_parentid_in_wiki)

## Comparing the two datasets

The gap exists between the two datasets (3.8 mil, 3.14 mil) of about 400,000  because we are looking only at certain citation formats which can be parsed by the `mwparserfromhell`. But still we have got 90% of the citation data by looking at just mere numbers. The 10% deficit is because of the dataset used by wiki identifiers is for revision where we are using a dataset relating to a particular date.

In [None]:
gap = wiki_en_identifiers.shape[0] - total_citations

print('The total gap between between total number of wikipedias citations and our citations: {}'.format(gap))

In [None]:
def get_citations_specific_to_type(wiki_type, curated_type):
    type_wiki_identifiers = wiki_en_identifiers[wiki_en_identifiers['type'] == wiki_type]
    type_citations_curated = citation_with_ids[citation_with_ids[curated_type].notnull()]
    
    # Just considering the unique ones since they are a lot of duplicated DOIs
    # Maybe one citation is cited in many different pages
    number_of_identifiers_wiki = type_wiki_identifiers['id'].shape[0]
    number_of_identifiers_curated = type_citations_curated['DOI'].shape[0]
    print('The total number of unique {} wiki identifiers: {}'.format(wiki_type, number_of_identifiers_wiki))
    print('The total number of unique {} curated identifiers: {}'.format(curated_type, number_of_identifiers_curated))
    
    print('\nThe difference between wiki and curated is: {}'.format(
        number_of_identifiers_wiki - number_of_identifiers_curated)
    )
    
    return type_wiki_identifiers, type_citations_curated

### How many DOI identifiers are common?

In [None]:
doi_wiki_identifiers, doi_citations_curated = get_citations_specific_to_type('doi', 'DOI')

In [None]:
# Check if curated DOIs are contained in the already obtained dataset from Wikipedia
doi_which_are_present = doi_wiki_identifiers['id'].isin(doi_citations_curated['DOI'])
total_number_of_doi_identifiers_in_wiki = np.sum(doi_which_are_present)
wiki_en_identifiers['is_in_other_dataset'].loc[doi_which_are_present.index] = doi_which_are_present.values

In [None]:
print(
    'Stats:\nTotal Curated: {} \nTotal Wiki:{} \nCurated which are in Wiki: {} \nGap: {} -> Wiki which are not identified: {}'.format(
        doi_citations_curated.shape[0],
        doi_wiki_identifiers.shape[0],
        total_number_of_doi_identifiers_in_wiki,
        doi_citations_curated.shape[0] - total_number_of_doi_identifiers_in_wiki,
        doi_which_are_present[~doi_which_are_present].shape[0]
    )
)

### How many ISBN (also ISSN) identifiers are common?

* ISBNs are International Standard Book Numbers. They can be assigned to monographic publications, such as books, e-books and audiobooks.
* ISMNs are International Standard Music Numbers. They can be assigned to notated music (scores and sheet music) whether published in print, online or in other media.
* ISSNs are International Standard Serial Numbers. They can be assigned to periodical publications, such as magazines and journals.

In [None]:
isbn_wiki_identifiers, isbn_citations_curated = get_citations_specific_to_type('isbn', 'ISBN')

### Trying to normalize all the ISBN (also need to do for ISSN)

* So if '00-11-223344' it becomes '0011223344'

In [None]:
# Check if the wikipedia citation identifiers does not have hyphens
np.sum(isbn_wiki_identifiers['id'].apply(lambda x: '-' in x))

In [None]:
isbn_citations_curated['ISBN'] = isbn_citations_curated['ISBN'].apply(lambda x: x.replace('-', ''))

In [None]:
# Check if curated DOIs are contained in the already obtained dataset from Wikipedia

isbn_which_are_present = isbn_wiki_identifiers['id'].isin(isbn_citations_curated['ISBN'])
total_number_of_isbn_identifiers_in_wiki = np.sum(isbn_which_are_present)
wiki_en_identifiers['is_in_other_dataset'].loc[isbn_which_are_present.index] = isbn_which_are_present.values

In [None]:
print(
    'Stats:\nTotal Curated: {} \nTotal Wiki:{} \nCurated which are in Wiki: {} \nGap: {} -> Wiki which are not identified: {}'.format(
        isbn_citations_curated.shape[0],
        isbn_wiki_identifiers.shape[0],
        total_number_of_isbn_identifiers_in_wiki,
        isbn_citations_curated.shape[0] - total_number_of_isbn_identifiers_in_wiki,
        isbn_which_are_present[~isbn_which_are_present].shape[0]
    )
)

### Now time for ISSN...

In [None]:
# But the stats for this does not matter!!!
# because the hypothesis is that ISSN is contained inside ISBN - but only some of them do!
isbn_wiki_identifiers, issn_citations_curated = get_citations_specific_to_type('isbn', 'ISSN')

In [None]:
# Normalizing it again like ISBN
issn_citations_curated['ISSN'] = issn_citations_curated['ISSN'].apply(lambda x: x.replace('-', ''))

In [None]:
issn_which_are_present = issn_citations_curated['ISSN'].isin(isbn_wiki_identifiers['id'])
total_number_of_issn_identifiers_in_wiki = np.sum(issn_which_are_present)
wiki_en_identifiers['is_in_other_dataset'].loc[issn_which_are_present.index] = issn_which_are_present.values

In [None]:
print(
    'Stats:\nTotal Curated: {} \nCurated which are in Wiki: {} \nGap: {} -> Wiki which are not identified: {}'.format(
        issn_citations_curated.shape[0],
        total_number_of_issn_identifiers_in_wiki,
        issn_citations_curated.shape[0] - total_number_of_issn_identifiers_in_wiki,
        issn_which_are_present[~issn_which_are_present].shape[0]
    )
)

What we can see is that ISSN exists in our `curated` dataset and only some of them of these are contained in the existing `wikipedia dataset`. Most of them do not exist and hence the hypothesis is potentially not correct. Also, some of these can be counter examples since they are magazines and music volumes which are not scientific in nature.

### How many PMID identifiers are common?

In [None]:
pmid_wiki_identifiers, pmid_citations_curated = get_citations_specific_to_type('pmid', 'PMID')

In [None]:
# Check if curated PMIDs are contained in the already obtained dataset from Wikipedia
pmid_which_are_present = pmid_citations_curated['PMID'].isin(pmid_wiki_identifiers['id'])
total_number_of_pmid_identifiers_in_wiki = np.sum(pmid_which_are_present)
wiki_en_identifiers['is_in_other_dataset'].loc[pmid_which_are_present.index] = pmid_which_are_present.values

In [None]:
print(
    'Stats:\nTotal Curated: {} \nTotal Wiki:{} \nCurated which are in Wiki: {} \nGap: {} -> Wiki which are not identified: {}'.format(
        pmid_citations_curated.shape[0],
        pmid_wiki_identifiers.shape[0],
        total_number_of_pmid_identifiers_in_wiki,
        pmid_citations_curated.shape[0] - total_number_of_pmid_identifiers_in_wiki,
        pmid_which_are_present[~pmid_which_are_present].shape[0]
    )
)

### How many PMC identifiers are common?

In [None]:
pmc_wiki_identifiers, pmc_citations_curated = get_citations_specific_to_type('pmc', 'PMC')

In [None]:
# Check if curated PMCs are contained in the already obtained dataset from Wikipedia
pmc_which_are_present = pmc_citations_curated['PMC'].isin(pmc_wiki_identifiers['id'])
total_number_of_pmc_identifiers_in_wiki = np.sum(pmc_which_are_present)
wiki_en_identifiers['is_in_other_dataset'].loc[pmc_which_are_present.index] = pmc_which_are_present.values

In [None]:
print(
    'Stats:\nTotal Curated: {} \nTotal Wiki:{} \nCurated which are in Wiki: {} \nGap: {} -> Wiki which are not identified: {}'.format(
        pmc_citations_curated.shape[0],
        pmc_wiki_identifiers.shape[0],
        total_number_of_pmc_identifiers_in_wiki,
        pmc_citations_curated.shape[0] - total_number_of_pmc_identifiers_in_wiki,
        pmc_which_are_present[~pmc_which_are_present].shape[0]
    )
)

### How many ArXiV identifiers are common?

In [None]:
arxiv_wiki_identifiers, arxiv_citations_curated = get_citations_specific_to_type('arxiv', 'ARXIV')

In [None]:
# Check if curated PMCs are contained in the already obtained dataset from Wikipedia
arxiv_which_are_present = arxiv_citations_curated['ARXIV'].isin(arxiv_wiki_identifiers['id'])
total_number_of_arxiv_identifiers_in_wiki = np.sum(arxiv_which_are_present)
wiki_en_identifiers['is_in_other_dataset'].loc[arxiv_which_are_present.index] = arxiv_which_are_present.values

In [None]:
print(
    'Stats:\nTotal Curated: {} \nTotal Wiki:{} \nCurated which are in Wiki: {} \nGap: {} -> Wiki which are not identified: {}'.format(
        arxiv_citations_curated.shape[0],
        arxiv_wiki_identifiers.shape[0],
        total_number_of_arxiv_identifiers_in_wiki,
        arxiv_citations_curated.shape[0] - total_number_of_arxiv_identifiers_in_wiki,
        arxiv_which_are_present[~arxiv_which_are_present].shape[0]
    )
)

## Check which citations in already existing ones are not in curated?