In [None]:
# Mount Google Drive so the TEI input and JSON output folders used further down
# (under /content/drive/My Drive/...) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import xml.etree.ElementTree as ET
import re
import pprint
from bs4 import BeautifulSoup
from dataclasses import dataclass
import json

In [None]:
def read_tei(tei_file):
    """Parse a TEI XML file into a BeautifulSoup tree.

    Args:
        tei_file: Path of the TEI file to read.

    Returns:
        The BeautifulSoup object built with the 'lxml' parser.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Explicit encoding: without it open() falls back to the platform locale,
    # so the same file could decode differently on another machine.
    with open(tei_file, 'r', encoding='utf-8') as tei:
        # The previous trailing `raise RuntimeError(...)` was unreachable
        # (the function always returned or propagated an exception first).
        return BeautifulSoup(tei, 'lxml')

In [None]:
def elem_to_text(elem, default=''):
    """Return the text of ``elem``, or ``default`` when ``elem`` is falsy.

    The text is obtained with ``elem.getText(separator=' ', strip=True)``,
    i.e. text fragments are stripped and joined with single spaces.
    """
    if not elem:
        return default
    return elem.getText(separator=' ', strip=True)

In [None]:
@dataclass
class Person:
    """Name parts of one author, as plain strings."""
    firstname: str   # filled from <forename type="first"> by the TEI parsing code below
    middlename: str  # filled from <forename type="middle">
    surname: str     # filled from <surname>

# Usage example: build a Person by keyword and read its fields.
turing_author = Person(firstname='Alan', middlename='M', surname='Turing')

# Bare expression: the notebook displays the formatted string as the cell output.
f"{turing_author.firstname} {turing_author.surname} authored many influential publications in computer science."

'Alan Turing authored many influential publications in computer science.'

In [None]:
class TEIFile(object):
    """Accessor for the content of one TEI XML file (as parsed by ``read_tei``).

    The file is parsed once in the constructor; the methods extract the title,
    abstract, authors, body sections and bibliography from the parsed tree and
    ``to_json`` writes them out as a single JSON document.
    """

    ###########################################################################
    #                            class initializer
    ###########################################################################
    def __init__(self, filename):
        """Parse ``filename`` and initialise the per-file caches."""
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._title = ''
        self._abstract = ''
        self._authors = []

    ###########################################################################
    #                               small helpers
    ###########################################################################
    @staticmethod
    def _node_to_str(node):
        """Return a plain ``str`` for a parse-tree node (tag or text node).

        Tags are flattened to their text; text nodes are copied into a plain
        ``str`` so the result never references the parse tree and is always
        JSON serialisable.
        """
        if getattr(node, "name", None):
            return node.get_text(separator=' ', strip=True)
        return str(node)

    @staticmethod
    def _persname_to_dict(persname):
        """Convert a ``persname`` element to ``{"first", "middle", "last"}``."""
        person = Person(
            firstname=elem_to_text(persname.find("forename", type="first")),
            middlename=elem_to_text(persname.find("forename", type="middle")),
            surname=elem_to_text(persname.surname),
        )
        return {"first": person.firstname,
                "middle": person.middlename,
                "last": person.surname}

    ###########################################################################
    #                      getting paper sections(headings)
    ###########################################################################
    def get_sections(self):
        """Return the body as a list of sections.

        Every ``div`` under ``body`` that has no ``type`` attribute (i.e. is
        not a reference/appendix div) yields one dict::

            {"section": <heading text or ''>,
             "sec_text": [{"paragraph": str, "cite_spans": [...]}, ...]}

        The heading is the first child of the div when that child is not a
        ``p`` element. Returns an empty list when the document has no body.
        """
        body = self.soup.find("body")
        body_text = []
        if body is None:
            return body_text
        for div in body.find_all('div'):
            if div.get("type"):  # reference or appendix div
                continue
            section = {"section": '', "sec_text": []}
            children = list(div)
            if children and getattr(children[0], "name", None) != 'p':
                # Store text, not the parse-tree node: a tag object here would
                # make json serialisation of the result fail.
                section["section"] = self._node_to_str(children[0])
            for par in div.find_all('p'):
                section["sec_text"].append({
                    "paragraph": par.get_text(separator=' ', strip=True),
                    "cite_spans": self._get_grobid_cite_spans(par),
                })
            body_text.append(section)
        return body_text

    def doi(self):
        """Return the text of the first ``<idno type="DOI">``, or '' if absent.

        Restored from the commented-out version because a later notebook cell
        calls ``tei.doi()``. It is intentionally not added to ``parsepaper``.
        """
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return ''
        return idno_elem.getText()

    def paper_title(self):
        """Return the document title ('' when there is no title element)."""
        if not self._title:
            title = self.soup.title
            self._title = title.getText() if title is not None else ''
        return self._title

    def paper_abstract(self):
        """Return the abstract text ('' when there is no abstract element)."""
        if not self._abstract:
            abstract = self.soup.abstract
            if abstract is not None:
                self._abstract = abstract.getText(separator=' ', strip=True)
        return self._abstract

    def authors(self):
        """Return the header authors as ``{"first", "middle", "last"}`` dicts.

        Authors are read from the ``analytic`` element; authors without a
        ``persname`` child are skipped. The result is also kept in
        ``self._authors``.
        """
        analytic = self.soup.analytic
        author_elems = analytic.find_all('author') if analytic is not None else []
        result = []
        for author in author_elems:
            persname = author.persname
            if persname is None:
                continue
            result.append(self._persname_to_dict(persname))
        self._authors = result
        return result

    def text(self):
        """Return the plain text of all untyped body divs joined by spaces.

        The value is computed on first use and cached in ``self._text``.
        """
        if not self._text:
            body = self.soup.body
            divs = body.find_all("div") if body is not None else []
            self._text = " ".join(
                div.get_text(separator=' ', strip=True)
                for div in divs
                if not div.get("type")
            )
        return self._text

    def _get_references(self):
        """Return the bibliography entry elements, or [] if there are none.

        Looks inside ``<div type="references">`` for children that have more
        than one child themselves, and collects *their* children that again
        have more than one child. Always returns a list (the previous version
        returned '' when the references div was missing).
        """
        ref_sec = self.soup.find('div', type='references')
        if not ref_sec:
            return []
        ref_list = []
        for container in ref_sec:
            if len(list(container)) > 1:
                for entry in container:
                    if len(list(entry)) > 1:
                        ref_list.append(entry)
        return ref_list

    def _build_bib_entry(self, elem):
        """Build the metadata dict for one bibliography entry element.

        Returns a dict with keys ``ref_id``, ``ref_title``, ``authors``,
        ``DOI``, ``ISSN``, ``ISSNe``, ``year``, ``publisher``, ``volume`` and
        ``issue``. Missing values stay '' except ``ref_id`` and ``year``,
        which are None when the attribute/element is missing.
        """
        ref_dic = {"ref_id": "", "ref_title": "", "authors": [], "DOI": "",
                   "ISSN": "", "ISSNe": "", "year": "", "publisher": "",
                   "volume": "", "issue": ""}
        ref_dic["ref_id"] = elem.get("xml:id")

        # Authors without a persname are skipped; the previous `pass` fell
        # through and crashed on `None.find(...)`. The key is "first" (was the
        # typo "fisrt"), matching what authors() produces.
        ref_dic["authors"] = [
            self._persname_to_dict(author.persname)
            for author in elem.find_all('author')
            if author.persname is not None
        ]

        # First title of the entry; '' instead of IndexError when absent.
        ref_dic["ref_title"] = elem_to_text(elem.find("title"))

        # Use a dedicated loop variable: the old loop rebound `elem`, so the
        # date/publisher/biblscope lookups below searched inside the last
        # <idno>. Entries with a single <idno> are handled as well.
        for idno in elem.find_all('idno'):
            idno_type = idno.get("type")
            if idno_type in ("DOI", "ISSN", "ISSNe"):
                ref_dic[idno_type] = idno.get_text(separator=' ', strip=True)

        date_elem = elem.find('date')
        # Stays None when there is no <date> or it has no "when" attribute.
        ref_dic["year"] = date_elem.get("when") if date_elem is not None else None

        publisher = elem.find('publisher')
        if publisher is not None:
            ref_dic["publisher"] = publisher.get_text(separator=' ', strip=True)

        for scope in elem.find_all('biblscope'):
            unit = scope.get("unit")
            if unit in ("volume", "issue"):
                ref_dic[unit] = scope.get_text(separator=' ', strip=True)
        return ref_dic

    def refrences(self):
        """Return the metadata dicts of all bibliography entries."""
        return [self._build_bib_entry(elem) for elem in self._get_references()]

    def parsepaper(self):
        """Parse the whole paper.

        Returns (and stores in ``self.paper``) a dict with keys ``meta``
        (title, authors, abstract), ``body`` and ``references``.
        """
        self.paper = {}
        self.papermeta = {}
        self.papermeta["title"] = self.paper_title()
        self.papermeta["authors"] = self.authors()
        self.papermeta["abstract"] = self.paper_abstract()
        self.paper["meta"] = self.papermeta
        self.paper["body"] = self.get_sections()
        self.paper["references"] = self.refrences()
        return self.paper

    def _get_cite_spans(self, par):
        """Find citation strings in a paragraph with regular expressions.

        Not used by ``get_sections`` (which relies on
        ``_get_grobid_cite_spans``); kept as the regex-based alternative.

        Args:
            par: Paragraph element.

        Returns:
            ``[]`` when nothing was found, otherwise a one-element list
            ``[{"cite_spans": [span, ...]}]`` where each span is
            ``{"text", "start", "end", "refs_id"}``. ``refs_id`` holds the
            targets (without the leading '#') of the paragraph's
            ``<ref type="bibr">`` elements, assigned to the spans in document
            order according to how many citations each span contains.
        """
        text = par.get_text(separator=' ', strip=True)
        reg1 = r'(([A-Z])\w+( and [A-Z]\w+)?) *\((19|20)\d{2}\)|([A-Z][a-z]+ ((\w+) )*(et *al. *)?\((19|20)\d{2}\))'  # Author (Year)
        reg2 = r'\(([^)(])+, *(19|20)\d{2}([^)]+)?\)'  # (Author1, Year1) or (Author1, Year1; Author2, Year2; ...)
        reg3 = r'\[\d+\ *(, *\d+)*\]'  # [1] or [1,2,3]
        reg = reg1 + "|" + reg2 + "|" + reg3

        # Reference targets taken from the TEI markup of the paragraph.
        idx_list = [
            ref.get("target")[1:]
            for ref in par.find_all('ref', type="bibr")
            if ref.get("target")
        ]

        cite_spans = []
        # NOTE(review): paragraphs with exactly one <ref> are skipped by this
        # threshold; unclear whether `> 0` was intended - confirm.
        if len(idx_list) > 1:
            cite_spans = [
                {"text": m.group(), "start": m.start(), "end": m.end()}
                for m in re.finditer(reg, text)
            ]

        next_ref = 0
        for item in cite_spans:
            cited = item["text"]
            # Author/year styles: one citation per 4-digit year.
            count = len(re.findall(r'\d{4}', cited))
            if count == 0:
                # Numeric style "[1, 2, 3]": one citation per comma item.
                count = len(cited[1:-1].split(','))
            # Previously this step referenced an undefined name inside a bare
            # `except`, so refs_id was always left empty. The slice simply
            # yields fewer ids when the markup has fewer refs than the regex.
            item["refs_id"] = idx_list[next_ref:next_ref + count]
            next_ref += len(item["refs_id"])

        dataset = []
        if cite_spans:
            dataset.append({"cite_spans": cite_spans})
        return dataset

    def _get_grobid_cite_spans(self, par):
        """Return the bibliography references marked up inside a paragraph.

        One ``{"entry": <target without leading '#'>, "text": <ref text>}``
        dict per ``<ref type="bibr">`` that has a ``target`` attribute.
        """
        return [
            {"entry": ref.get("target")[1:],
             "text": ref.get_text(separator=' ', strip=True)}
            for ref in par.find_all('ref', type="bibr")
            if ref.get("target")
        ]

    def to_json(self, path):
        """Parse the paper and write the result to ``path`` as JSON."""
        result = self.parsepaper()
        # Serialise once, before opening the file, so a serialisation error
        # never leaves a truncated output file behind.
        payload = json.dumps(result)
        with open(path, "w") as json_file:
            json_file.write(payload)
      


Note: the file `77e2fe8914531c28683d831f1554528171f0ce20.pdf.tei.xml` was removed from the sample because its citation style is not supported.

In [None]:
%time
import os
from os.path import join
directory ="/content/drive/My Drive/project/ss_sample/ss-sample/grobid"
out_directory="/content/drive/My Drive/project/ss_sample/ss-sample/output"
wrongfiles = []
wrong =[]
right = []
rifgtfiles=[]
for filename in os.listdir(directory):
    if filename.endswith(".pdf.tei.xml") : 
      tei_path = join(directory, filename)      
      tei = TEIFile(tei_path)
      tei_path  = join(out_directory,filename.strip('.pdf.tei.xml')+".json")
      try:
        tei.to_json(tei_path)    
        print("======"*40)
        print("right  ",filename)
        right.append(tei)
        rifgtfiles.append(tei_path) 
      except:
        print("======"*40)
        print("wrong ",filename)
        wrong.append(tei)  
        wrongfiles.append(tei_path)   
    else:
        continue

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs
right   04a06ee282ac397ae34729ee5bc49e3614d516c1.pdf.tei.xml
right   4e45bb05f6ba28e5e6d7942a09c21d900f1b4d5a.pdf.tei.xml
right   0f2325700e7c631808374ce9baa4671902245e4b.pdf.tei.xml
right   6de3132a8da214a57f294680dc71a00670d08460.pdf.tei.xml
right   ccb3323b132de3c21d38c41b4031f1de2a227ede.pdf.tei.xml
right   f77f9a7619abdf12fc1758e425ab17c7d37ba11a.pdf.tei.xml
right   e7b636baf805d554d159408827ada5f5a625c349.pdf.tei.xml
right   cc8ba20d7557c4e1ecdcac4d3a46cd98257ca59d.pdf.tei.xml


In [None]:
# Show the output path of every file that was converted successfully.
for written_path in rifgtfiles:
  print(written_path)


/content/drive/My Drive/project/ss_sample/ss-sample/output/04a06ee282ac397ae34729ee5bc49e3614d516c1.json
/content/drive/My Drive/project/ss_sample/ss-sample/output/4e45bb05f6ba28e5e6d7942a09c21d900f1b4d5a.json
/content/drive/My Drive/project/ss_sample/ss-sample/output/0f2325700e7c631808374ce9baa4671902245e4b.json
/content/drive/My Drive/project/ss_sample/ss-sample/output/6de3132a8da214a57f294680dc71a00670d08460.json
/content/drive/My Drive/project/ss_sample/ss-sample/output/ccb3323b132de3c21d38c41b4031f1de2a227.json
/content/drive/My Drive/project/ss_sample/ss-sample/output/77f9a7619abdf12fc1758e425ab17c7d37ba11a.json
/content/drive/My Drive/project/ss_sample/ss-sample/output/7b636baf805d554d159408827ada5f5a625c349.json
/content/drive/My Drive/project/ss_sample/ss-sample/output/cc8ba20d7557c4e1ecdcac4d3a46cd98257ca59.json


In [None]:
# Sections of the sixth successfully converted file. The index follows the order in which
# os.listdir() returned the files in the batch cell, so it may select a different paper on another run.
result= right[5].get_sections()

In [None]:
pprint.pprint(result)

[{'sec_text': [{'cite_spans': [],
                'paragraph': 'This article discusses a system designed for '
                             'repetitive movement learning. The concrete '
                             'target domain is a motor-skill, which requires '
                             'both knowledge and physical ability. Cyclic '
                             'behavior of a segmented unit constitutes '
                             'repetitive movement. It is hard to learn a '
                             'stabilized way of such movement without any '
                             'guides. Therefore, the present study provides a '
                             'supporting system based on a model for guiding '
                             'players to make appropriate motor actions. Two '
                             'types of functionality, which work with a fixed '
                             'model and a fluctuation model, can be optionally '
                             'select

In [None]:
# NOTE(review): when the cells run top to bottom, `result` is the list returned by get_sections()
# above, so this subscript raises TypeError. The output shown below looks like it came from a
# parsepaper() result of an earlier session - confirm which paper was meant and rebuild `result` first.
result['meta']

{'abstract': "The modulation of visuomotor processing of various body movements by motor expertise due to dance practice was investigated in 12 professional contemporary dancers and 12 right-handed controls. 212 video pairs of dance actions lasting 3 seconds were shown to participants, while their event-related brain potentials (ERPs) were recorded. The second video of each pair might be either the repetition of the previous one, or a slight variation of it, along 3 main dimensions (time, space and body). The task consisted in responding to static images of a dance action by pressing a button. A repetition suppression (RS) effect elicited by a repetition of the same video was visible in both groups, whereas only in dancers it was found a significant modulation of brain responses to deviant stimuli indexing a strong effect of neural plasticity due to motor practice. SwLORETA source reconstruction, performed on the ERPs difference waves ''different'' minus ''same'' videos (450-550 ms) re

In [None]:
tei = TEIFile("/content/drive/My Drive/project/ss_sample/ss-sample/grobid/4e45bb05f6ba28e5e6d7942a09c21d900f1b4d5a.pdf.tei.xml")

In [None]:
tei.doi()

'10.1017/s0950268800051955'

In [None]:
result = tei.parsepaper()#
result['meta']

{'DOI': '10.1017/s0950268800051955',
 'abstract': 'Iso lation , purification and characterization of 3 new cytotoxins of a K. pneumoniae strain isolated from ready to eat pork sausage are reported . Purification process involved extraction of cytotoxins with polymyxi n B sulphate. salt precipitation, gel filtration and anion exchange chromatography. Klebsiella cytotoxin (KCT) I. a g lycoprotein of about 65 kDa was verocytotoxic, enterotoxic and dermonecrotic. KCT II was erythemogenic, verocytotoxi c and enterotox ic protein of co 55 kDa, while KE:T \'" was about double in MW (110 kDa) hadverocytotoxicity but neither enterotox ic ity nor dermatotoxicity. KCT I and II caused granulation, conglomeration, shrinkage, detachment and lysis of MDBK and Vero cells. while KCT \'" induced enlargement, vacuolation, granulation, multinucleolation and syncyti a formation in exposed cell s. All the three cytotoxins induced specific neutralizing antibodies and cytotoxins were detectable in nanogram qu

# New method

In [None]:
pip install tei-reader

In [None]:
from tei_reader import TeiReader

In [None]:
# Alternative approach: read one TEI file with the tei-reader package and print
# the `text` attribute of the object it returns.
reader = TeiReader()
corpora = reader.read_file('/content/drive/My Drive/project/ss_sample/ss-sample/grobid/4e45bb05f6ba28e5e6d7942a09c21d900f1b4d5a.pdf.tei.xml') # or read_string
print(corpora.text)



In [None]:
!pip install -U ipython
!pip install tei2neo


In [None]:
#!python -m spacy download de_core_news_sm

In [None]:
# Experiment: parse a TEI document with tei2neo and save it through a graph transaction.
# NOTE(review): `Graph` is used below but never imported in this notebook (only `parse` and
# `GraphUtils` are) - presumably it comes from the Neo4j client library; confirm and import it.
# NOTE(review): `file` is not assigned anywhere in the visible notebook - set it to a TEI path first.
# NOTE(review): connection credentials are hard-coded; read them from the environment before sharing.
from tei2neo import parse, GraphUtils
graph = Graph(host="localhost", user="neo4j", password="password")
doc, status, soup = parse(
	filename=file, 
	start_with_tag='TEI', 
	idno='20-MS-221'
)
# Save the parsed document inside one transaction and commit it.
tx = graph.begin()
doc.save(tx)
tx.commit()