In [1]:
### All Imports
import ast
import json
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [2]:
## Load the CSV that holds the 10000 randomly selected citations
selected_citations = pd.read_csv(
    filepath_or_buffer='selected_citations_for_evaluation_10000.csv'
)

In [3]:
## Drop the two 'Unnamed' columns (presumably index columns left over from
## earlier CSV exports -- confirm) and preview the first rows.
## Re-assigning instead of dropping in place, together with errors='ignore',
## keeps this cell idempotent: running it a second time no longer raises a
## KeyError for columns that are already gone.
selected_citations = selected_citations.drop(
    ['Unnamed: 0', 'Unnamed: 0.1'], axis=1, errors='ignore')
selected_citations.head()

Unnamed: 0,authors_list,citation_title,citation,page_title,id_list,DOI
0,"['Friedrich B', 'Feng Y', 'Cohen P', 'Risler T...",The serine/threonine kinases SGK2 and SGK3 are...,Map(Title -> The serine/threonine kinases SGK2...,SGK2,"[['PMID', '12632189'], ['DOI', '10.1007/s00424...",10.1007/s00424-002-0993-8
1,"['Scherer Stephen', 'Cheung J', 'MacDonald JR'...",Human Chromosome 7: DNA Sequence and Biology,Map(Title -> Human Chromosome 7: DNA Sequence ...,Ectrodactyly,"[['PMID', '12690205'], ['PMC', '2882961'], ['D...",10.1126/science.1083423
2,"['Kuiper GG', 'Carlsson B', 'Grandien K', 'Enm...",Comparison of the ligand binding specificity a...,Map(Title -> Comparison of the ligand binding ...,Androstenedione,"[['PMID', '9048584'], ['DOI', '10.1210/endo.13...",10.1210/endo.138.3.4979
3,['Makepeace Tsao'],A New Synthesis of Mescaline,"Map(Title -> A New Synthesis of Mescaline, Vol...",Mescaline,"[['DOI', '10.1021/ja01155a562']]",10.1021/ja01155a562
4,['Rennie David'],Two Thoughts on Abraham Maslow.,"Map(Title -> Two Thoughts on Abraham Maslow., ...",Abraham Maslow,"[['DOI', '10.1177/0022167808320537']]",10.1177/0022167808320537


In [4]:
## Compare the number of rows with the number of distinct citation titles
print('Number of total citations: {} and number with unique title: {}'.format(
    len(selected_citations),
    selected_citations['citation_title'].nunique(),
))

Number of total citations: 10000 and number with unique title: 9707


In [5]:
## Build one request-parameter dict (title, first author, DOI) per citation
def get_params(dataset):
    """Turn every citation row into a dict of lookup parameters.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``citation_title``, ``authors_list`` (the
        string form of a Python list, parsed with ``ast.literal_eval``) and
        ``DOI``.

    Returns
    -------
    list of dict
        One dict per kept row, in row order, with the keys
        ``'query.bibliographic'`` (the title), ``'query.author'`` (the first
        entry of the author list) and ``'DOI'``. Rows whose first author is
        the placeholder ``'No authors'`` are left out.
    """
    requests = []
    for position in range(len(dataset)):
        # Fetch the row once and read all three fields from it.
        row = dataset.iloc[position]
        title = row['citation_title']
        first_author = ast.literal_eval(row['authors_list'])[0]
        if first_author == 'No authors':
            # Without an author there is nothing to query on; skip the row.
            continue
        requests.append({
            'query.bibliographic': title,
            'query.author': first_author,
            'DOI': row['DOI'],
        })

    print('Constructed parameters for requests')
    return requests

In [6]:
## Build the request parameters for every citation that has an author.
## Note: get_params does not de-duplicate, so this is a count of kept rows.
params = get_params(dataset=selected_citations)
print('Total number of unique params: {}'.format(len(params)))

Constructed parameters for requests
Total number of unique params: 9764


In [7]:
## Get all the indices and shuffle them.
## A fixed seed makes the train/test split identical on every
## "Restart & Run All"; without it each run evaluates a different split.
## A private RandomState is used so the global NumPy RNG is left untouched.
## Counts printed by a re-run will differ from outputs recorded with an
## unseeded shuffle.
RANDOM_SEED = 42
indices = np.arange(len(params))
np.random.RandomState(RANDOM_SEED).shuffle(indices)

In [8]:
## Make an 80/20 train and test split for checking which heuristic is the best heuristic
SPLIT_THRESHOLD = (len(params) * 80) // 100

training_indices = indices[:SPLIT_THRESHOLD]
testing_indices = indices[SPLIT_THRESHOLD:]

## Membership tests against a NumPy array scan the whole array each time,
## which made the filters below quadratic; sets give the same answer in O(1).
training_index_set = set(training_indices.tolist())
testing_index_set = set(testing_indices.tolist())

## The content/DOI lists follow the order of `params` (ascending index),
## NOT the shuffled order of `training_indices` / `testing_indices`, so they
## must not be zipped with the index arrays.
training_content = [j for i, j in enumerate(params) if i in training_index_set]
training_dois = [i['DOI'] for i in training_content]
testing_content = [j for i, j in enumerate(params) if i in testing_index_set]
testing_dois = [i['DOI'] for i in testing_content]

In [9]:
## Check, for the result at position `score_number`, whether its DOI is one of the original DOIs
def get_eval(score_number, training=True):
    """Evaluate the result at a given rank of every cached lookup response.

    For each index of the chosen split the file
    ``lookup_eval/result_<index>.txt`` is parsed as JSON and the entry at
    position ``score_number`` of ``message -> items`` is selected. The DOI of
    that entry is then tested for membership in the split's original DOIs.

    NOTE(review): a retrieved DOI counts as "present" when it equals the DOI
    of *any* citation in the split, not specifically the citation whose
    request produced it. Confirm that this is the intended metric.

    Parameters
    ----------
    score_number : int
        Zero-based position inside the returned item list (0 is the first
        item).
    training : bool, default True
        When True use ``training_indices`` / ``training_dois``, otherwise
        ``testing_indices`` / ``testing_dois`` (module-level names).

    Returns
    -------
    tuple of (list, list)
        ``present``: retrieved DOIs that occur among the original DOIs;
        ``not_present``: retrieved DOIs that do not. Both keep the order in
        which the responses were read. Entries without a ``DOI`` key are
        recorded as ``'No DOI'``.

    Side effects
    ------------
    Prints four summary counters and shows a tqdm progress bar.
    """
    if training:
        indices__, dois__ = training_indices, training_dois
    else:
        indices__, dois__ = testing_indices, testing_dois

    # Set lookup replaces a linear scan of the DOI list for every result.
    known_dois = set(dois__)

    crossref_invalid = 0  ## Requests which are not present in CrossRef for some reason
    no_result_for_heuristic = 0  ## Responses with fewer than score_number + 1 items

    retrieved_dois = []
    for i in tqdm_notebook(range(len(indices__))):

        with open('lookup_eval/result_{}.txt'.format(indices__[i])) as f:
            file_content = json.load(f)

        message = file_content['message']
        if 'items' not in message or len(message['items']) == 0:
            crossref_invalid += 1
            continue

        items = message['items']
        if len(items) <= score_number:
            no_result_for_heuristic += 1
            continue

        res = items[score_number]  ## score_number represents the threshold
        # Only the DOI is needed. The title was previously extracted as well
        # but never used, and indexing an empty title list raised IndexError.
        retrieved_dois.append(res.get('DOI', 'No DOI'))

    # Plain lists instead of an intermediate DataFrame: with zero collected
    # results the DataFrame had no 'ID' column and the lookup raised KeyError.
    present = []
    not_present = []
    for doi in retrieved_dois:
        if doi in known_dois:
            present.append(doi)
        else:
            not_present.append(doi)

    print('Total number of retreieved IDs present in original: {}'.format(len(present)))
    print('Total number of retreieved IDs NOT present in original: {}'.format(len(not_present)))
    print('Total number of IDs for which CrossRef request is not valid: {}'.format(crossref_invalid))
    print('Total number of IDs for which there is no result for that heuristic: {}'.format(no_result_for_heuristic))
    return present, not_present

In [10]:
## Evaluate the first item (index 0) of every response on the training split
present_first_score, not_present_first_score = get_eval(score_number=0)

HBox(children=(IntProgress(value=0, max=7811), HTML(value='')))


Total number of retreieved IDs present in original: 5258
Total number of retreieved IDs NOT present in original: 2510
Total number of IDs for which CrossRef request is not valid: 43
Total number of IDs for which there is no result for that heuristic: 0


In [11]:
## Evaluate the second item (index 1) of every response on the training split
present_second_score, not_present_second_score = get_eval(score_number=1)

HBox(children=(IntProgress(value=0, max=7811), HTML(value='')))


Total number of retreieved IDs present in original: 345
Total number of retreieved IDs NOT present in original: 7407
Total number of IDs for which CrossRef request is not valid: 43
Total number of IDs for which there is no result for that heuristic: 16


In [12]:
## Evaluate the third item (index 2) of every response on the training split
present_third_score, not_present_third_score = get_eval(score_number=2)

HBox(children=(IntProgress(value=0, max=7811), HTML(value='')))


Total number of retreieved IDs present in original: 96
Total number of retreieved IDs NOT present in original: 7647
Total number of IDs for which CrossRef request is not valid: 43
Total number of IDs for which there is no result for that heuristic: 25


In [13]:
## NOTE(review): this comparison can never print True. Every `present_*` list
## only holds DOIs that ARE among the training DOIs, while
## `not_present_first_score` only holds DOIs that are NOT, so the two are
## disjoint by construction. Answering "did rank 2/3 succeed where rank 1
## failed?" needs a per-request comparison, which get_eval does not expose.
## The set below only removes the quadratic list-membership scan.
first_rank_misses = set(not_present_first_score)
print('Is 2nd threshold any better: {}'.format(
    not first_rank_misses.isdisjoint(present_second_score)))
print('Is 3rd threshold any better: {}'.format(
    not first_rank_misses.isdisjoint(present_third_score)))

Is 2nd threshold any better: False
Is 3rd threshold any better: False


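From the part above, we see that the best heuristic is to take the first item in the result list (index 0), i.e. the one with the highest score: on the training split it recovers 5258 of the original DOIs, against 345 for the second item and 96 for the third. Now we check it on the testing set.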

### Testing part: check the best threshold found in the first part

In [14]:
## Size of the held-out split
print('Total points in the testing set: {}'.format(testing_indices.shape[0]))

Total points in the testing set: 1953


In [15]:
## Index 0 (the first item) was the best heuristic on the training split,
## so only that one is evaluated on the testing split
presenting_testing_score, not_present_testing_score = get_eval(
    score_number=0, training=False)

HBox(children=(IntProgress(value=0, max=1953), HTML(value='')))


Total number of retreieved IDs present in original: 1331
Total number of retreieved IDs NOT present in original: 617
Total number of IDs for which CrossRef request is not valid: 5
Total number of IDs for which there is no result for that heuristic: 0
