In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import xml.etree.ElementTree as ET
import re
import pprint
from bs4 import BeautifulSoup
from dataclasses import dataclass
import json

In [3]:
def read_tei(tei_file):
    with open(tei_file, 'r') as tei:
        soup = BeautifulSoup(tei, 'lxml')
        return soup
    raise RuntimeError('Cannot generate a soup from the input')

In [4]:
def elem_to_text(elem, default=''):
    if elem:
        return elem.getText(separator=' ', strip=True)
    else:
        return default

In [5]:
@dataclass
class Person:
    firstname: str
    middlename: str
    surname: str

turing_author = Person(firstname='Alan', middlename='M', surname='Turing')

f"{turing_author.firstname} {turing_author.surname} authored many influential publications in computer science."

'Alan Turing authored many influential publications in computer science.'

In [83]:
class TEIFile(object):
  
###########################################################################
#                            class initializer
###########################################################################
    def __init__(self, filename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._title = ''
        self._abstract = ''
        
###########################################################################
#                      getting paper sections(headings)
###########################################################################
    def get_sections(self):
      '''
      get body text
      '''
      body = self.soup.find("body")
      try:
        assert (body == self.soup.body)
      except:
        print("*************not equal*********************")
      body_text = []
      if body:        
        all_divs = body.find_all('div')
        for div in all_divs:
          if not div.get("type"):# not reference or appendex
            section={"section":'', "sec_text":[]}    
            if len(list(div) ) > 0:    
              if list(div)[0].name != 'p':
                # #if (len(list(div)))>1:
                #   for par in div.find_all('p'): # with that I got the text inside the p
                #     text={"paragraph":"", "cite_spans":[]}
                #     text["paragraph"] = par.get_text(separator=' ', strip=True)
                #     text["cite_spans"] = self._get_cite_spans(par) 
                #     section["sec_text"].append(text)
                 # in this case we have section name
                section["section"] = list(div)[0]  # get the title of section
            for par in div.find_all('p'): # with that I got the text inside the p
              text={"paragraph":"", "cite_spans":[]}
              text["paragraph"] = par.get_text(separator=' ', strip=True)
              text["cite_spans"] = self._get_grobid_cite_spans(par)#_get_cite_spans(par)   self._get_grobid_cite_spans(par)
              text["cite_spans"] = self._get_cite_spans_links(par, text["cite_spans"])
              section["sec_text"].append(text)
            body_text.append(section)           
      #print("sections", body_text)
      return body_text
 

    # def doi(self):
    #     '''
    #     supposed to be paper id
    #     '''
    #     idno_elem = self.soup.find('idno', type='DOI')
    #     if not idno_elem:
    #         return ''
    #     else:
    #         return idno_elem.getText()

    def paper_title(self):
        if not self._title:
            self._title = self.soup.title.getText()
        return self._title

    def paper_abstract(self):
        '''
        paper abstract
        '''
        if not self._abstract:
            abstract = self.soup.abstract.getText(separator=' ', strip=True)
            self._abstract = abstract
        return self._abstract

    def authors(self):
      '''
      paper authors
      '''
      authors_in_header = self.soup.analytic.find_all('author')
      result = []
      for author in authors_in_header:
        # print("author  ",author )
        persname = author.persname
        if not persname:
          continue   
        firstname = elem_to_text(persname.find("forename", type="first"))
        middlename = elem_to_text(persname.find("forename", type="middle"))
        surname = elem_to_text(persname.surname)
        person = Person(firstname, middlename, surname)
        result.append({"first":person.firstname,"middle":person.middlename,"last":person.surname})
      self._authors = result

      return result



    def text(self):
        '''
        paper plain text
        '''
        print("text",self._text)
        if not self._text:
            divs_text = []
            
            for div in self.soup.body.find_all("div"):
              if not div.get("type"):
                    div_text = div.get_text(separator=' ', strip=True)
                    divs_text.append(div_text)

            plain_text = " ".join(divs_text)
            self._text = plain_text
        return self._text

    def _get_references(self):
      ref_list =[]
      ref_sec = self.soup.find('div', type='references')
      if not ref_sec:
          return ''
      else:
        i=1
        for elem in ref_sec:
          if len(list(elem))>1:
            for refs in elem:                 
              if len(list(refs))>1:                                                             
                    i= i + 1 
                    ref_list.append(refs)
      return ref_list

    def _build_bib_entry(self,elem):
        ref_dic ={"ref_id":"","ref_title":"","authors":[],"DOI":"","ISSN":"","ISSNe":"" ,"year":"","publisher":"","volume":"","issue":"" }# ref_dic ={"ref_id":"","ref_title":"","authors":[],"idno":"","date":"","publisher":"","volume":"","issue":"", "pages":"" }
        ref_dic["ref_id"] = elem.get("xml:id")
        authors = elem.find_all('author')
        result = []
        for author in authors:
          persname = author.persname
          if not persname:
              pass   
          firstname = elem_to_text(persname.find("forename", type="first"))
          middlename = elem_to_text(persname.find("forename", type="middle"))
          surname = elem_to_text(persname.surname)
          person = Person(firstname, middlename, surname)
          per={}
          per["fisrt"] = person.firstname
          per["middle"] = person.middlename
          per["last"] = person.surname
          result.append(per)            
        ref_dic["authors"]= result               
        title = elem.find_all("title")[0]
        ref_dic["ref_title"] = title.get_text(separator=' ', strip=True)
        idno_elem = elem.find_all('idno')
        if idno_elem:
          if len(idno_elem)>1:
            for elem in idno_elem:
              elem_type = elem.get("type")
              if elem_type == 'DOI':
                ref_dic["DOI"] = elem.get_text(separator=' ', strip=True)
              elif elem_type == 'ISSN': 
                ref_dic["ISSN"] = elem.get_text(separator=' ', strip=True)
              elif elem_type == 'ISSNe': 
                ref_dic["ISSNe"] = elem.get_text(separator=' ', strip=True)

        date_ = elem.find('date')
        if  date_:
          ref_dic["year"] = date_.get("when")
        else:
          ref_dic["year"] = date_


        publisher = elem.find('publisher')
        if not publisher :
          pass
        else:
          ref_dic["publisher"] = publisher.get_text(separator=' ', strip=True)
        biblscope = elem.find_all('biblscope')
        if len(biblscope) ==0:
          pass
        else:            
            for bib in biblscope:
              unit = bib.get("unit")
              if unit =="volume":
                ref_dic["volume"] = bib.get_text(separator=' ', strip=True)
              if unit =="issue":
                ref_dic["issue"] = bib.get_text(separator=' ', strip=True)  
        return ref_dic

    def refrences(self):
      '''
      Get all references
      '''
      bib_entries=[]# for all metadata
      ref_list=[] # for all references
      ref_list = self._get_references()
      for i, elem in enumerate(ref_list):
        ref_dic = self._build_bib_entry(elem)    
        bib_entries.append(ref_dic)     
      return bib_entries  

    def parsepaper(self): 
      '''
      Parse paper
      '''
      self.paper={}
      self.papermeta = {}
      self.papermeta["title"] = self.paper_title()
      self.papermeta["authors"] =  self.authors()
      
      self.papermeta["abstract"] =  self.paper_abstract()
      #self.papermeta["DOI"] =  self.doi()
      self.paper["meta"] = self.papermeta
      self.paper["body"] = self.get_sections()
      self.paper["references"] = self.refrences()
      return self.paper       

    def _get_cite_spans(self,par):
      '''
       private method for getting citation in a aparagraph
      input: text represent the text of one section from article specified outside this function
      output: dataset dictionary contains {number of sentence:(x,y)}; x the citation(s) , y the sentence that contains citations 
      '''
      dataset = []
      text = par.get_text(separator=' ', strip=True)
      cite_spans = []

      # for all types of citations
      regex_find_citation = r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}[a-c]?(;\s)?)+\s?\)|\
                              \[(\d{1,3},\s?)+\d{1,3}\]|\
                              \[[\d,-]+\]|(\([A-Z][a-z]+, \d+[a-c]?\))|\
                              ([A-Z][a-z]+ (et al\.)? \(\d+[a-c]?\))|\
                              [A-Z][a-z]+ and [A-Z][a-z]+ \(\d+[a-c]?\)]"

      # reg1 = r'(([A-Z])\w+( and [A-Z]\w+)?) *\((19|20)\d{2}\)|([A-Z][a-z]+ ((\w+) )*(et *al. *)?\((19|20)\d{2}\))'  # Author (Year)
      # reg2 = r'\(([^)(])+, *(19|20)\d{2}([^)]+)?\)'# '\(([^)(]+)?(?:19|20)\d{2}?([^)]+)?\)' #(Author1 ,Year1) or (Author1 ,Year; Author2 ,Year2; etc)
      # reg3 = r'\[\d+\ *(, *\d+)*\]' # [1] or [1,2,3]
      # reg = reg1 + "|" +reg2 + "|"+ reg3
      #+++++++++++++++++++++++++++++++++++++++++++++++++++
      #     get references from xml.tei file paragraph
      #+++++++++++++++++++++++++++++++++++++++++++++++++++
      
      idx_list = []
      refs_idx = par.find_all('ref', type="bibr")      
      if refs_idx :
        #print("refs_idx", refs_idx)
        for idx in refs_idx:
          target = idx.get("target")
          if target:
            idx_list.append(idx.get("target")[1:])
          # else:
          #   idx_list.append("None")  
        #print(" just take idx_list: ",idx_list, " refs_idx", refs_idx,"text", text)
       #++++++++++++++++++++++++++++++++++++++++++++++++++++
       #   join rferences with the dataset
       #++++++++++++++++++++++++++++++++++++++++++++++++++++
      
      if len(idx_list) > 1:  # got it from beautifulsoup
        #x = re.findall(reg, text) # cites inside the text
        cite_spans = list({"text":i.group(),"start":i.start(),"end":i.end()} for i in re.finditer(reg, text)) # got it from regular expression
      i = 0
      
      #count_cite_spans = 0
      for  item in cite_spans:  # cite_spans=[{"text":[1],"start":,"end":}, {"text":[1,2],"start":,"end":}]        
        refs = item["text"]
        reg_year = r'\d{4}'
        years = re.findall(reg_year, refs)
        count = len(years)
        #count_cite_spans += count
        if count > 1 :
          refs = refs.split(';')
        elif count == 1 :
          pass
        else:
          refs = item["text"][1:-1]
          refs = refs.split(',')          
          count = len(refs)
          #count_cite_spans += count
        item["refs_id"] = []
       
        for _ in range(count):  
          try:        
            links[idx_list[i]].append(idx_list[i]) 
            i = i+1    
          except:   
            pass  # this will be wrong! 
      if cite_spans:
        dataset.append({"cite_spans":cite_spans}) 
      # if dataset:
      #   print("dataset", dataset)  
      return dataset

    def _get_cite_spans_links(self,par, idx_list):  
      i = 0
      refs_idx = par.find_all('ref', type="bibr")      
      if refs_idx :
        if len(idx_list) == len(refs_idx):
        #print("refs_idx", refs_idx)
          for idx in refs_idx:
            target = idx.get("target")
            if target:
              #bib_entry ={"entry":idx.get("target")[1:], "text":idx.get_text(separator=' ', strip=True)}
              idx_list[i]["entry"] =idx.get("target")[1:]
              #idx_list.append(bib_entry)
              i += 1
         
      return idx_list    

    def _get_grobid_cite_spans(self,par):  
      text = par.get_text(separator=' ', strip=True)
      # all kind of citations
      r1 = r"\(\s?(([A-Za-z\-]+\s)+([A-Za-z\-]+\.?)?,?\s\d{2,4}[a-c]?(;\s)?)+\s?\)|"
      r2=  r"\[(\d{1,3},\s?)+\d{1,3}\]|"
      r3 = r"\[[\d,-]+\]|(\([A-Z][a-z]+, \d+[a-c]?\))|"
      r4 = r"([A-Z][a-z]+ (et al\.)? \(\d+[a-c]?\))|"
      r5 = r"[A-Z][a-z]+ and [A-Z][a-z]+ \(\d+[a-c]?\)]"                                             

      regex_find_citation = r1 + r2 + r3 + r4 + r5
      return list({"text":i.group(),"start":i.start(),"end":i.end()} for i in re.finditer(regex_find_citation , text))
   
    def to_json(self, path):
      '''
       parse paper to json file
      '''
      result= self.parsepaper()
      # res = json.dumps(result)
      with open(path, "w+") as json_file: 
        json.dump(result, json_file) 
      


In [84]:
# from pathlib import Path

# rootdir = Path('/content/drive/My Drive/project/ss_sample/ss-sample/second_sample_maks')

# for f in rootdir.glob('**/*'):
#   if f.is_file() and f.name.endswith(".tei.xml") :
#     print(f)
# # Return a list of regular files only, not directories

# # file_list = [f for f in rootdir.glob('**/*') if f.is_file()]

file 77e2fe8914531c28683d831f1554528171f0ce20.pdf.tei.xml was deleted cause the citation style was not supported

In [85]:
# https://stackoverflow.com/questions/19587118/iterating-through-directories-with-python
import time
start = time.time()
import os
from os.path import join
directory = Path("/content/drive/My Drive/project/ss_sample/ss-sample/second_sample_maks")
out_directory="/content/drive/My Drive/project/ss_sample/ss-sample/output2/"
wrongfiles = []
wrong =[]
right = []
rightfiles=[]


for filename in directory.glob('**/*'):
  if filename.is_file() and filename.name.endswith(".tei.xml") :
      tei_path = filename   
      tei = TEIFile(tei_path)
      try:
        json_path  = out_directory + filename.name.strip('.tei.xml').strip('/content/')+".json"
        #print(json_path)
        tei.to_json(json_path)   
        right.append(tei)
        rightfiles.append(tei_path) 
      except:
        #tei.to_json(json_path)   
        wrong.append(tei)  
        wrongfiles.append(tei_path)   
print(time.time() - start)

50.94217586517334


In [86]:
len(wrong), len(right),  len(wrong) + len(right)

(4, 1005, 1009)

In [87]:
result = right[11].parsepaper()


In [88]:
pprint.pprint(result)

{'body': [{'sec_text': [{'cite_spans': [],
                         'paragraph': 'Implant placement in maxillary anterior '
                                      'region has always been challenging for '
                                      'the implantologists. Different levels '
                                      'of gingival display along with the '
                                      'uncertainty of soft and hard tissue '
                                      'changes post-extraction make things '
                                      'highly predictable in this part of the '
                                      'alveolar ridges. This difficulty is '
                                      "augmented by the patient's desire to "
                                      'have teeth in this aesthetic zone '
                                      'immediately. Researchers have devised '
                                      'certain techniques to address this '
                      

In [12]:
pprint.pprint(result['meta'])

{'abstract': '16. Abstract This report documents the activities of an 11-month '
             'research project that considered various pedestrian-related '
             'factors and developed revised warrants for the installation of a '
             'traffic signal that are more sensitive to pedestrians and '
             'cyclists. Three warrant recommendations were made as a result of '
             'this research. The recommendations are summarized below with the '
             'warrants that are affected for each recommendation. The research '
             'shows that results from warrant analyses with the revised '
             'warrants better match professional engineering judgment than the '
             'results of warrant analyses using the current warrants. The '
             'warrant recommendations should not be used until they are '
             'officially adopted by the Texas Department of Transportation. '
             'Include pedestrians and cyclists in the minor-st

In [None]:
tei = TEIFile("/content/drive/My Drive/project/ss_sample/ss-sample/grobid/4e45bb05f6ba28e5e6d7942a09c21d900f1b4d5a.pdf.tei.xml")

In [None]:
tei.doi()

'10.1017/s0950268800051955'

In [None]:
result = tei.parsepaper()#
result['meta']

{'DOI': '10.1017/s0950268800051955',
 'abstract': 'Iso lation , purification and characterization of 3 new cytotoxins of a K. pneumoniae strain isolated from ready to eat pork sausage are reported . Purification process involved extraction of cytotoxins with polymyxi n B sulphate. salt precipitation, gel filtration and anion exchange chromatography. Klebsiella cytotoxin (KCT) I. a g lycoprotein of about 65 kDa was verocytotoxic, enterotoxic and dermonecrotic. KCT II was erythemogenic, verocytotoxi c and enterotox ic protein of co 55 kDa, while KE:T \'" was about double in MW (110 kDa) hadverocytotoxicity but neither enterotox ic ity nor dermatotoxicity. KCT I and II caused granulation, conglomeration, shrinkage, detachment and lysis of MDBK and Vero cells. while KCT \'" induced enlargement, vacuolation, granulation, multinucleolation and syncyti a formation in exposed cell s. All the three cytotoxins induced specific neutralizing antibodies and cytotoxins were detectable in nanogram qu

# New method

In [None]:
pip install tei-reader

In [None]:
from tei_reader import TeiReader

In [None]:
reader = TeiReader()
corpora = reader.read_file('/content/drive/My Drive/project/ss_sample/ss-sample/grobid/4e45bb05f6ba28e5e6d7942a09c21d900f1b4d5a.pdf.tei.xml') # or read_string
print(corpora.text)



In [None]:
!pip install -U ipython
!pip install tei2neo


In [None]:
#!python -m spacy download de_core_news_sm

In [None]:
from tei2neo import parse, GraphUtils
graph = Graph(host="localhost", user="neo4j", password="password")
doc, status, soup = parse(
	filename=file, 
	start_with_tag='TEI', 
	idno='20-MS-221'
)
tx = graph.begin()
doc.save(tx)
tx.commit()

In [59]:
import json

In [60]:
path ='/content/drive/My Drive/project/ss_sample/ss-sample/second_sample_maks/0032c94075eea9e18b159376278ce3bc403557ac/0032c94075eea9e18b159376278ce3bc403557ac.json'

In [72]:
with open(path, 'r') as json_file:
  f = json_file.read()
  pprint.pprint((f))

('{"entities":[],"magId":"2783595625","journalVolume":"165","journalPages":"77-81","pmid":"","fieldsOfStudy":["Economics"],"year":2018,"outCitations":["fa094265bd3a0ea3b07e156e227f52500af8fb0a","67e2aa6b41ac1052b535f552c4439d60ee82c25e","990c709cc04d22a448986a3b560f63d9290f39d1","033adaea461d8780352a623b7e97d7cb5571582d","865b1607a955f9fca1b46425a4c10e8fa79e3397","d53354ee88e8c4e355ab1f8adbf2f10d3c4ef511","4d33b6710a4f9ec34cfe57e5f037ae7b558c3741","f5849bd39f4a2d120d57f3fa63b58f7f60fff9cd","bb588fbd19a4f05b6d13e1fa029f9299f23c42a6","71e727e5526f543d1cae7f8aa10d5a0e30fde1b1","7db10b3d8b36e824691363082aaba1c4ce14baad","5add0432dcc9b8cd180e8fa510105e0870dc2a8f","a5ac284172122877ddd663dfacbfd4d086438935","b71bd26ef7464f24ff3d4a1ca4c5907f4af8acca","1ccc584e35694a3c59dfcc9dbe077ae00cfdf27f","2b47eb770a28142aec1b3d66fd8b4bbc5eabdf2a","3dc135ce8cbc730992d5b1a084314e0538b127d8","70d0c16add4f4e5a0459fc2aef41cbcb36cdf94c","8b8e5b3c26d65d1250b5212316ea5744d57858cd","2a3c806c7f31438767078b71bbbd1e1