In [48]:
import os, glob
import numpy as np
import spacy as sp
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from spacy import displacy
import joblib
from multiprocessing import Pool

# The 'xx' model is the biggest multi-language one; it catches the most names.
# The 'en' model does the best job of parsing organizations, and it also labels
# verbs and other parts of speech.
# Install the model with `python -m spacy download en_core_web_sm`.
# The full package name is loaded instead of the 'en' shortcut: shortcut links
# were removed in spaCy 3, while the package name loads under both 2.x and 3.x.

nlp = sp.load('en_core_web_sm')


In [49]:
# Sample read
# Smoke test of the parsing pipeline on hand-picked <ack> snippets.
# The doubled quotes ("") inside the literals are real characters of the strings --
# presumably CSV escaping carried over from the extracted data; TODO confirm.
# NOTE: `sample` is assigned twice in this cell; the later assignment wins, so this
# first snippet is NOT the one parsed below.
sample = '<ack><title>Acknowledgements</title><p>The authors wish to acknowledge Diya Ma, Matthew-Lun Wong, Ka-Long Ko, Ka-Hei Ko and Jin-Peng Lee for their important contributions to the software development.</p><sec id=""FPar1""><title>Funding</title><p id=""Par28"">The work described in this paper was supported by a grant from the Research Grants Council of the Hong Kong Special Administrative Region, China (Project No.: CUHK 14113214), grants from the Innovation and Technology Commission (Project No: ITS/149/14FP, GHP/028/14SZ, ITS/293/14FP), grants from CUHK Technology and Business Development Fund (Project No.: TBF16MED002, TBF16MED004), a grant from The Science, Technology and Innovation Commission of Shenzhen Municipality (Project No.: CXZZ20140606164105361), and a grant from The Scientific Research Project of Guangdong Province (Project No.: 2014B090901055).</p></sec></ack>'
# `sample2` is not referenced anywhere else in this cell.
sample2 = '<ack id=""ack0010""><title>Acknowledgements</title><p>The authors thank Dr. R Kaneko for the gift of the iSip2 vector; and Mss. T Honma, K Harada, A Morita, and Y Shimoda for providing technical and secretarial assistance. We thank the staff at the Department of Genetic and Behavioral Neuroscience and Bioresource Center, Gunma University Graduate School of Medicine for their critical comments and technical assistance. This study was supported by <funding-source id=""gs1"">Grants-in-Aid for Scientific Research</funding-source> (23115503, 26290002, 15H01415 and 15H05872 to Y.Y.), a Grant-in-Aid for Scientific Research on Innovative Areas (Comprehensive Brain Science Network) (to Y.Y.) from the <funding-source id=""gs2"">Ministry of Education, Culture, Sports, Science and Technology (MEXT)</funding-source> of Japan, a grant from the Co-operative Study Program of the <funding-source id=""gs3"">National Institute for Physiological Sciences</funding-source>, Japan (to Y.Y.), and a grant from the <funding-source id=""gs4"">Takeda Science Foundation</funding-source> (to Y.Y.).</p></ack>'
# Active sample: this reassignment is the markup that actually gets parsed below.
sample = u'<ack id=""ack0005""><title>Acknowledgments</title><p>The project was supported by a start-up funding provided to the author by the <funding-source id=""gs0005"">Department of Neurology of the University of Utah</funding-source>.</p><p>This project was inspired by studying the work of Dr. Ed Dudek and the results of the initial experiments were discussed with him.</p><p>I am also grateful to Dr. Erika Scholl for her assistance in measuring rat serum osmolarity and to Dr. Noel Carlson for his insightful comments on the manuscript.</p></ack>'

# Parse the markup with the lxml backend and flatten the first <ack> element to
# plain text, joining its text nodes with a single space.
soup = BeautifulSoup(sample,'lxml')
samp_txt = soup.find_all('ack')[0].get_text(separator=' ')

print(samp_txt)
# Run the module-level spaCy pipeline (`nlp`) over the flattened text.
doc = nlp(samp_txt)

# One line per named entity: text, start/end character offsets, entity label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Acknowledgments The project was supported by a start-up funding provided to the author by the  Department of Neurology of the University of Utah . This project was inspired by studying the work of Dr. Ed Dudek and the results of the initial experiments were discussed with him. I am also grateful to Dr. Erika Scholl for her assistance in measuring rat serum osmolarity and to Dr. Noel Carlson for his insightful comments on the manuscript.
the  Department of Neurology of the University of Utah 90 144 ORG
Ed Dudek 201 209 PERSON
Erika Scholl 304 316 PERSON
Noel Carlson 381 393 PERSON


In [50]:
# Input CSV files to parse. glob returns matches in arbitrary (filesystem-dependent)
# order, so sort them to make the processing order -- and therefore which file's
# result is left in `dx` after the loop -- reproducible across machines and runs.
F_CSV = sorted(glob.glob("../source_data/extracted/*.csv"))

In [88]:
def parse_df(row, k):
    """Extract verbs, nouns, person names and organizations from one acknowledgment.

    Parameters
    ----------
    row : object exposing ``filename`` and ``Acknowledgment_Tag`` attributes
        In this notebook, a row produced by ``DataFrame.iterrows()``.
        ``Acknowledgment_Tag`` holds the raw ``<ack>...</ack>`` markup, or a
        missing value (``None`` / NaN) when there is nothing to parse.
    k : int
        Row index; only used to print a progress marker every 1000 rows.

    Returns
    -------
    dict
        Always contains ``"filename"``. When an ``<ack>`` element is found it
        additionally contains ``"Verbs"``, ``"Nouns"``, ``"Names"`` and
        ``"Organizations"``, each a single ';'-joined string.
    """
    if k % 1000 == 0:
        print(k)

    item = {"filename": row.filename}

    markup = row.Acknowledgment_Tag
    # Anything that is not non-blank text has no acknowledgment to parse. Testing
    # for text (rather than `type(x) == float`) also covers NaN values that arrive
    # as float subclasses such as numpy.float64, which the exact-type comparison
    # let through to BeautifulSoup.
    if not isinstance(markup, (str, bytes)) or not markup.strip():
        return item

    soup = BeautifulSoup(markup, 'lxml')

    # Remove <title> elements (section headings such as "Acknowledgements" or
    # "Funding") so their words are not counted as acknowledgment content.
    for ele in soup.find_all('title'):
        ele.decompose()

    ack = soup.find('ack')
    if ack is None:
        return item

    # Text nodes are joined with a space so adjacent elements do not run together.
    doc = nlp(ack.get_text(separator=' '))

    item["Verbs"] = ';'.join(
        tok.lemma_ for tok in doc if tok.pos_ == 'VERB' and not tok.is_stop)
    item["Nouns"] = ';'.join(
        tok.text for tok in doc if tok.pos_ == 'NOUN' and not tok.is_stop)
    # Keep only multi-word PERSON entities. split() without an argument ignores
    # runs of whitespace, so stray double/leading/trailing spaces introduced by
    # the text flattening cannot make a single word look like several.
    item["Names"] = ';'.join(
        ent.text for ent in doc.ents
        if ent.label_ == 'PERSON' and len(ent.text.split()) > 1)
    item["Organizations"] = ';'.join(
        ent.text for ent in doc.ents if ent.label_ == 'ORG')
    return item

In [92]:
# Columns written for every input file. Fixing them keeps the CSV schema stable
# even when no row of a file had an acknowledgment (such rows only carry
# "filename") and regardless of how the installed pandas orders dict keys.
PARSED_COLUMNS = ["Names", "Nouns", "Organizations", "Verbs", "filename"]

# Create the output directory up front; to_csv does not create missing folders.
OUT_DIR = "../parsed_data/new_parse/"
os.makedirs(OUT_DIR, exist_ok=True)

# Loop-invariant: wrap parse_df once so each call below is captured lazily.
dfunc = joblib.delayed(parse_df)

for f in F_CSV:
    # NOTE(review): only the first 2000 rows of each file are read -- confirm
    # whether this is an intentional sample or a leftover debugging limit.
    df = pd.read_csv(f, nrows=2000)

    # Parallel(1) means n_jobs=1, i.e. rows are parsed one after another.
    with joblib.Parallel(1) as MP:
        dx = MP(dfunc(row, k) for k, row in df.iterrows())

    # One record per input row; keys a row did not produce become missing values.
    dx = pd.DataFrame(dx, columns=PARSED_COLUMNS)
    f_save = os.path.join(OUT_DIR,
                          os.path.basename(f))
    dx.to_csv(f_save)
# Show the parsed frame of the last file processed.
dx

0
1000
0
1000
0
1000
0
1000


Unnamed: 0,Names,Nouns,Organizations,Verbs,filename
0,R. Ganesan;Indira Gandhi Centre,authors;support;study;P;p;values;publication,the Austrian Science Fund (FWF;the Scientifi...,want;thank;provide,CALPHAD/PMC4270480.nxml
1,Stephan Puchegger;Olivia Appay,support;project;No;P;project;No;addition;autho...,the Austrian Science Fund (FWF;the Scientifi...,acknowledge;wish;thank;want;thank;prepare,CALPHAD/PMC4270483.nxml
2,H. Flandorfer;St. Puchegger,support;study;authors;suggestions;discussions;...,the Austrian Science Fund (FWF;SEM,come;want;acknowledge,CALPHAD/PMC4456117.nxml
3,,,,,CASE_(Phila)/PMC6034458.nxml
4,Maria Carr;Michael Yamashita,contributions;support;project,,thank;acknowledge,CASE_(Phila)/PMC6058279.nxml
5,Hitoshi Sakuraba,suggestions,Meiji Pharmaceutical University,thank,CASE_(Phila)/PMC6058397.nxml
6,,,,,CASE_(Phila)/PMC6058921.nxml
7,,,,,CASE_(Phila)/PMC6034467.nxml
8,,,,,CASE_(Phila)/PMC6034476.nxml
9,,,,,CASE_(Phila)/PMC6034456.nxml
