In [54]:
import networkx as nx
import operator
import json
import math
import spacy
nlp = spacy.load("en_core_web_sm")

import warnings
warnings.filterwarnings("ignore")

In [95]:
# Load the parsed PDF layout. Despite its name, `json_string` holds the decoded
# JSON object (not a string); it is later handed to FindObject, which indexes
# it by 'pages'.
json_path = 'PE_sample.pdf.json'
with open(json_path, mode='r', encoding='utf-8') as jsonfile:
    json_string = json.loads(jsonfile.read())

In [164]:
def FindObject(json):
    """Group the textual layout elements of a parsed document by heading.

    Walks every element of every page in ``json['pages']``. Each element whose
    ``'type'`` is ``'heading'`` closes the group collected so far and starts a
    new one; the heading element itself becomes the first member of the new
    group.

    Args:
        json: mapping with a ``'pages'`` list; every page has an ``'elements'``
            list. (The parameter name shadows the ``json`` module inside this
            function only.)

    Returns:
        ``(texts, titles)`` where ``titles`` holds the text of each heading in
        document order and ``texts`` holds ``len(titles) + 1`` element lists:
        ``texts[0]`` is whatever preceded the first heading and ``texts[k]``
        is the group opened by ``titles[k - 1]``.
    """
    text_types = ['word', 'line', 'character', 'paragraph', 'heading', 'list']
    groups = []
    headings = []
    current = []

    for page in json['pages']:
        for element in page['elements']:
            try:
                kind = element['type']
                if kind == 'heading':
                    # Extract the title first: if that raises TypeError the
                    # element is skipped without touching any of the lists.
                    headings.append(GetText(element))
                    groups.append(current)
                    current = []
                if kind in text_types:
                    current.append(element)
            except TypeError:
                # Elements that cannot be indexed like a mapping are ignored.
                continue

    # Flush the group that was still open when the document ended.
    groups.append(current)
    return groups, headings

def GetText(text_object):
    """Recursively flatten a layout element into plain text.

    Args:
        text_object: mapping with a ``'type'`` key and, for the handled types,
            a ``'content'`` key.

    Returns:
        - ``'paragraph'``, ``'heading'``, ``'list'``, ``'line'``: the
          concatenated text of every child in ``'content'``.
        - ``'word'``: the concatenated text of its children when ``'content'``
          is a list, otherwise the content followed by a single space.
        - ``'character'``: the content itself.
        - any other type: the empty string.
    """
    kind = text_object['type']

    if kind in ('paragraph', 'heading', 'list', 'line'):
        return "".join(GetText(child) for child in text_object['content'])

    if kind == 'word':
        body = text_object['content']
        if type(body) is list:
            return "".join(GetText(child) for child in body)
        # String concatenation (not formatting) so that non-string content
        # still raises TypeError.
        return "" + body + ' '

    if kind == 'character':
        return "" + text_object['content']

    return ""

In [165]:
# Turn each group of layout elements into one text blob; every element's text
# is followed by a blank line.
text_lists, titles = FindObject(json_string)
sections = [
    "".join(GetText(text_Obj) + '\n\n' for text_Obj in text_list)
    for text_list in text_lists
]
text = ""  # scratch name from the original cell, left empty as before

In [121]:
def increment_edge (graph, node0, node1):
    """Add 1.0 to the ``"weight"`` of the edge between two nodes.

    If the edge does not exist yet it is created with ``weight=1.0``.

    Args:
        graph: graph object offering ``has_edge``, ``add_edge`` and
            ``graph[u][v]`` attribute access.
        node0, node1: the edge's endpoints.
    """
    if not graph.has_edge(node0, node1):
        graph.add_edge(node0, node1, weight=1.0)
        return

    graph[node0][node1]["weight"] += 1.0



def link_sentence (doc, sent, lemma_graph, seen_lemma):
    """Add one sentence's kept tokens to the lemma co-occurrence graph.

    Tokens whose ``pos_`` is ADJ, NOUN, PROPN or VERB are keyed by
    ``(lemma_, pos_)``. A key's node id is its insertion position in
    ``seen_lemma``. Each kept token is linked (via ``increment_edge``) to the
    previously kept tokens of the same sentence that lie at most 3 token
    positions before it.

    Args:
        doc: indexable token sequence; tokens expose ``i``, ``lemma_``, ``pos_``.
        sent: span of ``doc`` with ``start`` and (exclusive) ``end`` indices.
        lemma_graph: graph mutated in place (nodes added, edge weights bumped).
        seen_lemma: dict mutated in place, mapping ``(lemma, pos)`` to the set
            of token indices where that key occurred. Shared across sentences.
    """
    POS_KEPT = ["ADJ", "NOUN", "PROPN", "VERB"]
    MAX_TOKEN_DISTANCE = 3

    # Node ids are positions in seen_lemma's insertion order. Build the
    # key -> position map once per call instead of materialising and scanning
    # the key list for every token (which was quadratic in vocabulary size).
    node_ids = {key: position for position, key in enumerate(seen_lemma)}

    visited_tokens = []
    visited_nodes = []

    for i in range(sent.start, sent.end):
        token = doc[i]

        if token.pos_ not in POS_KEPT:
            continue

        key = (token.lemma_, token.pos_)

        if key not in seen_lemma:
            seen_lemma[key] = set([token.i])
            # A new key is always appended last, so its position is len - 1.
            node_ids[key] = len(seen_lemma) - 1
        else:
            seen_lemma[key].add(token.i)

        node_id = node_ids[key]

        if not node_id in lemma_graph:
            lemma_graph.add_node(node_id)

        # Walk back from the most recently kept token; positions only grow
        # further away, so stop at the first one outside the window.
        for prev_index, prev_node in zip(reversed(visited_tokens), reversed(visited_nodes)):
            if (token.i - prev_index) > MAX_TOKEN_DISTANCE:
                break
            increment_edge(lemma_graph, node_id, prev_node)

        visited_tokens.append(token.i)
        visited_nodes.append(node_id)
            
# Start with an empty graph and lemma table. The per-section processing cell
# rebuilds both for every section, so nothing is linked here: the loop that
# used to follow iterated `doc.sents` before any cell had defined `doc`, which
# raises NameError on a fresh kernel (Restart & Run All).
lemma_graph = nx.Graph()
seen_lemma = {}

In [122]:
def collect_phrases (chunk, phrases, counts):
    """Score one span and record it under the set of lemmas it contains.

    Reads the module-level names ``doc``, ``seen_lemma`` and ``ranks``; they
    must be set for the document ``chunk`` belongs to before calling.

    Args:
        chunk: span with ``start``, (exclusive) ``end`` and ``text``.
        phrases: dict mutated in place; maps a sorted tuple of
            ``(lemma, pos)`` keys to a set of ``(phrase_text, phrase_rank)``.
        counts: dict mutated in place; maps the same tuple to how many spans
            produced it.
    """
    # NOTE(review): this is one more than the number of tokens visited by the
    # loop below (range(start, end)); confirm whether the +1 is intended.
    span_len = chunk.end - chunk.start + 1
    rank_total = 0.0
    unranked = 0
    lemma_keys = set()

    for position in range(chunk.start, chunk.end):
        tok = doc[position]
        lemma_key = (tok.lemma_, tok.pos_)

        if lemma_key not in seen_lemma:
            unranked += 1
            continue

        # A key's node id is its insertion position in seen_lemma.
        node_id = list(seen_lemma.keys()).index(lemma_key)
        rank_total += ranks[node_id]
        lemma_keys.add(lemma_key)

    # Discount spans that contain tokens without a graph node.
    discount = span_len / (span_len + (2.0 * unranked) + 1.0)

    # Square root of the averaged rank sum (the ranks themselves are summed,
    # not squared), then scaled by the discount.
    score = math.sqrt(rank_total / (span_len + unranked)) * discount

    # Lower-case the surface text and drop apostrophes.
    surface = chunk.text.lower().replace("'", "")

    # Order-independent key built from the span's ranked lemmas.
    compound = tuple(sorted(lemma_keys))

    if compound in phrases:
        phrases[compound].add((surface, score))
        counts[compound] += 1
    else:
        phrases[compound] = {(surface, score)}
        counts[compound] = 1

In [192]:
# Run the ranking pipeline once per section. sections[0] (the text before the
# first heading) is skipped, so sections[1:][i] lines up with titles[i].
Output = []
for i, section in enumerate(sections[1:]):
    # collect_phrases() reads doc, seen_lemma and ranks as module-level names,
    # so they have to be rebound here for every section.
    doc = nlp(section)
    lemma_graph = nx.Graph()
    seen_lemma = {}
    for sent in doc.sents:
        link_sentence(doc, sent, lemma_graph, seen_lemma)
    ranks = nx.pagerank(lemma_graph)

    # Noun chunks first, then named entities, all into the same tables.
    phrases = {}
    counts = {}
    for span in list(doc.noun_chunks) + list(doc.ents):
        collect_phrases(span, phrases, counts)

    # Keep the best-ranked surface form for each lemma combination.
    min_phrases = {}
    for compound_key, rank_tuples in phrases.items():
        phrase, rank = max(rank_tuples, key=operator.itemgetter(1))
        min_phrases[phrase] = (rank, counts[compound_key])

    # Highest rank first; dict insertion order preserves the ordering.
    by_rank = sorted(min_phrases.items(), key=lambda item: item[1][0], reverse=True)
    Dict = {
        phrase: {'count': count, 'rank_score': rank}
        for phrase, (rank, count) in by_rank
    }
    Final = {'section_title': titles[i], 'text_rank': Dict}

    Output.append(Final)
    print('-------------------------------')

generative adversarial networks 2 0.106824835688465
real data 1 0.10629389219348619
xi hanzhou chen1 1 0.08388310647038866
geo-privacy trajectory data 1 0.0818632554021497
university park 2 0.08030965824367783
geo-privacy protection 3 0.07812891562995869
a geo- privacy protection layer 1 0.07551316055418666
vision paper 2 0.07545483515791312
synthetic trajectories 1 0.07513842432606543
clio andris1 1 0.07428355057364551
xi hanzhou 1 0.06956517013928551
validation metrics 1 0.06545398127768326
the possible data generation scenarios 1 0.06478774748674457
analysis tasks 1 0.06350908751549264
generative 1 0.060674792353492636
trajectory data publication 1 0.057770688559283384
trajectory 1 0.05703420238438389
usage 1 0.056930592345935845
clio andris1 

department 1 0.055040746915250856
trajectory data 2 0.052009718591623656
the pennsylvania {xiliu,hzc176,clio}@psu.edu state university 2 0.05169509709108916
geo 1 0.050969800864812884
pa 1 0.050942231411226405
geography 1 0.050024090506973984

movements 1 0.03438091699968419
sequences 3 0.033800580472756844
the summary statistics 1 0.032669744820773935
neighborhoods 1 0.032107316279784984
shopping 1 0.03203740712307066
dining 1 0.03182356870374628
semantics 1 0.031713658308296824
, t1 s2 1 0.031366437601069413
work 1 0.029891907152944465
bikes 1 0.02851025833818552
interest 1 0.02782157989217786
one unique property 1 0.027706342935520333
pi 1 0.027626496504545718
si 1 0.027626496504545715
trajectory 1 0.02735834697179378
each individual people 1 0.027093161778818285
the related activities 1 0.02595938190848019
gans 1 0.025853628574377382
the generative model 1 0.025732018071404662
the original dataset 1 0.023707384102255983
the urban topology 1 0.023301119078662676
the property 1 0.023142279929033685
uber/lyft 2 0.022937101103971323
more interpretable formats 1 0.02272438542820008
< x, y, t 1 0.02205763099941876
the geometric information 1 0.02068027900746018
the user 1 0.018051469468254134
the usage 1 0.01673148132954577
a 

planning 1 0.05021169565965534
trajectory related geo-privacy issues 1 0.04998975089824028
aforementioned trajectory embedding and time-stamp issues 1 0.048696806048539196
the summary statistical properties 1 0.0485980907891652
gans 2 0.04828797599749258
these data 4 0.04719106739218477
problems 1 0.04671529722908414
the trajgans framework 1 0.04577684603456603
discussion 2 0.045572395134977545
oﬀ 1 0.04507167613694863
high-level patterns 1 0.04466429992746334
oﬀer 1 0.04458103257106128
people 2 0.040603283040162336
their continued use 1 0.03724454956675105
behavior 1 0.036929241309459036
the model 2 0.03268639713957817
policy 1 0.03202862732574096
a local optimum 1 0.031912962522346884
a deep understanding 1 0.031733426127070974
the built environment 1 0.0309322168174939
these challenges 1 0.02959435966860713
this vision paper 1 0.029151541273735183
the complex distribution 1 0.028448703367780912
a more straightforward approach 1 0.02583711946644009
5 conclusions 1 0.02416841222614159

In [193]:
# Persist the per-section results; ensure_ascii=False keeps non-ASCII
# characters readable in the file.
output_path = 'output_sample.json'
with open(output_path, mode='w', encoding="utf-8") as outfile:
    json.dump(Output, outfile, indent=4, ensure_ascii=False)