In [82]:
import pdfminer
import numpy as np
import pdfminer.converter
import pdfminer.layout
import pdfminer.pdfinterp
import pdfminer.pdfpage
import re
import pandas as pd
from mars.utils import extract_text_from_pdf

In [11]:
file_name = 'C:\\Users\jakwi\Downloads\\1906.11668.pdf'

In [89]:
def get_longest(text_list): 
    lengths = [] 
    for text in text_list: 
        lengths.append(len(text))
    lengths = np.array(lengths)
    return text_list[int(np.argmax(lengths))]

def extract_citations_from_jobin2019(file_name):
    """Extracts citation numbers from jobin2019 - preprint version"""
    
    # get text
    separated_text = extract_text_from_pdf(file_name)['separated_text']
    
    # getting the longest - the proper text on page
    text_on_pages = [get_longest(st) for st in separated_text]
    text_on_pages = np.array(text_on_pages)
        
    # split words like buckets
    # everything above justice fairness and equity is Transparency 
    # later it is excluded from text and everything above Non-maleficience is Justice, fairness, and equity, etc.
    split_words = [
    'Transparency',
    'Justice, fairness, and equity', 
    'Non-maleficence', 
    'Responsibility and accountability', 
    'Privacy',
    'Beneficence',
    'Freedom and autonomy',
    'Trust',
    'Sustainability',
    'Dignity',
    'Solidarity',
    'Discussion'] # stopword - last one will be excluded 
    
    threads = dict.fromkeys(split_words)
    
    # clean
    for i, text in enumerate(text_on_pages): 
        text = text.replace('\n', '')
        text = text.replace('(cf. Table 2)', '')
        text = text.replace('1.5', '')


        text_on_pages[i] = text
        
    relevant_text = ' '.join(text_on_pages[7:13])

    for i in range(len(list(threads.keys()))-1):
        earlier_thread = list(threads.keys())[i]
        split_thread = list(threads.keys())[i + 1] 
        splitted = relevant_text.split(split_thread, 1)           
        threads[earlier_thread] = splitted[0]

        relevant_text = splitted[1]
        
        # pop empty one
    threads.pop('Discussion')

    all_citations = dict.fromkeys(list(threads.keys()))
    for key, text in threads.items(): 

        citations = np.array(re.findall(r'\d+', text))
        additional_citations = re.findall(r'\d+[–]\d+', text)
        for ac in additional_citations: 
            range_ = re.findall(r'\d+', ac)
            lower, higher = int(range_[0]), int(range_[1])
            between = np.arange(lower+1, higher)
            citations = np.append(citations, between)
        all_citations[key] = (citations)

    for key, ac in all_citations.items(): 
        ac = np.unique(ac)
        all_citations[key] = ac
        
    return all_citations

In [90]:
citations = extract_citations_from_jobin2019(file_name)

In [91]:
citations

{'Transparency': array(['23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33',
        '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44',
        '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55',
        '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66',
        '67', '68', '69', '70', '71'], dtype='<U11'),
 'Justice, fairness, and equity': array(['23', '25', '27', '28', '29', '31', '33', '35', '36', '37', '38',
        '39', '41', '42', '43', '44', '45', '46', '47', '48', '50', '51',
        '52', '54', '55', '56', '57', '58', '59', '60', '62', '63', '65',
        '66', '67', '68', '69', '70', '72', '73', '74', '75', '76', '77',
        '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88',
        '89', '90', '91', '92', '93', '94', '95'], dtype='<U11'),
 'Non-maleficence': array(['100', '101', '102', '23', '25', '27', '30', '31', '32', '33',
        '34', '35', '36', '37', '38', '39', '40', '44', '47', '48', '50',
 

In [92]:
all_citations = np.array([]) 
for c in citations.values():
    all_citations = np.append(all_citations, c)
    

In [93]:
all_citations = np.sort(np.unique(all_citations).astype(int))


In [94]:
citations_dict = dict.fromkeys(all_citations) 

for k in citations_dict.keys(): 
    citations_dict[k] = np.zeros(11) 

In [95]:
mapping = {
    0: 'Transparency',
    1: 'Justice, fairness, and equity', 
    2: 'Non-maleficence', 
    3: 'Responsibility and accountability', 
    4: 'Privacy',
    5: 'Beneficence',
    6: 'Freedom and autonomy',
    7: 'Trust',
    8: 'Sustainability',
    9: 'Dignity',
    10: 'Solidarity'
}


In [97]:
for key1, val1 in citations_dict.items(): 
    for key2, val2 in mapping.items(): 
        if np.isin(key1, citations[val2]) : 
            citations_dict[key1][key2] = 1

  mask |= (ar1 == a)


In [98]:
data = pd.DataFrame(citations_dict).T.astype(int)

In [99]:
data.columns = mapping.values()

In [100]:
data

Unnamed: 0,Transparency,"Justice, fairness, and equity",Non-maleficence,Responsibility and accountability,Privacy,Beneficence,Freedom and autonomy,Trust,Sustainability,Dignity,Solidarity
8,0,0,1,0,0,0,0,0,0,0,0
23,1,1,1,1,1,1,0,1,0,0,0
24,1,0,0,0,0,0,0,1,0,0,0
25,1,1,1,0,1,0,0,0,0,0,0
26,1,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
101,0,0,1,0,0,0,0,0,0,1,0
102,0,0,1,0,0,1,0,0,0,0,0
103,0,0,0,1,0,0,0,0,0,0,0
104,0,0,0,0,1,0,0,1,0,0,0
