!pip install textract

In [1]:
import csv
import os
import re
from string import punctuation

import docx

In [2]:
from Bio import Entrez
email='simgeekiz48@gmail.com'
from Bio.Entrez import efetch, read

In [3]:
# Input document: the .docx file that is opened below and whose tables are
# scanned for literature references.
rba_path = 'data/ferrofumaraat.docx'

In [4]:
# Derive the output CSV path by swapping only the file extension.
# str.replace would also rewrite a '.docx' appearing elsewhere in the path and,
# for an extension that is not exactly '.docx' (e.g. '.DOCX'), would leave the
# path unchanged so that csv_path == rba_path and the later open(csv_path, 'w')
# would truncate the input document.
csv_path = os.path.splitext(rba_path)[0] + '.csv'


In [5]:
def search(query):
    """Search the PubMed database for ``query`` and return the parsed result.

    The request asks for at most 20 hits sorted by relevance and identifies
    the caller with the notebook-level ``email`` value.

    Parameters
    ----------
    query : str
        Text passed to ESearch as the ``term`` parameter.

    Returns
    -------
    The structure produced by ``Entrez.read``. Callers in this notebook read
    its ``'IdList'`` entry.
    """
    Entrez.email = email
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='20',
                            retmode='xml',
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the response handle even when parsing raises.
        handle.close()

def fetch_details(id_list):
    """Fetch the full PubMed records for the given ids.

    Parameters
    ----------
    id_list : sequence of str
        PubMed ids, e.g. the ``'IdList'`` entry of a ``search`` result.

    Returns
    -------
    The structure produced by ``Entrez.read``. Callers in this notebook
    iterate over its ``'PubmedArticle'`` entry. For an empty ``id_list`` a
    plain dict with empty ``'PubmedArticle'`` and ``'PubmedBookArticle'``
    lists is returned without contacting the service.
    """
    if not id_list:
        # EFetch needs at least one id; skip the request and hand back an
        # empty result exposing the keys callers index.
        return {'PubmedArticle': [], 'PubmedBookArticle': []}

    ids = ','.join(id_list)
    Entrez.email = email
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the response handle even when parsing raises.
        handle.close()


In [6]:
from docx import Document
# Open the input document; the cells below read its tables.
doc = Document(rba_path)

In [7]:
# Turn the document's second table into a list of dictionaries: the first row
# supplies the keys, every following row supplies one record.
table = doc.tables[1]

# Cell texts of every row, in document order.
row_texts = [tuple(cell.text for cell in row.cells) for row in table.rows]

data = []
if row_texts:
    # Header row -> dictionary keys; remaining rows -> one dict each.
    keys = row_texts[0]
    data = [dict(zip(keys, values)) for values in row_texts[1:]]

In [8]:
# Column labels (Dutch and English) that may appear in bold in the first row
# of a table; bold runs matching them are not treated as reference text.
table_headers = ['bron', 'bewijs', 'effect', 'opmerkingen', 'source', 'evidence', 'effect', 'remarks']

# Collect the text of every table paragraph that contains bold text which is
# neither a "ref..." label, a first-row header label, nor a numbered heading.
referenceList = []
ref_count = 0
for table in doc.tables:
    for row_index, row in enumerate(table.rows):
        for cell in row.cells:
            for paragraph in cell.paragraphs:
                bold_parts = []
                for run in paragraph.runs:
                    if not run.bold:
                        continue

                    lowered = run.text.lower().strip()
                    if lowered.startswith('ref'):
                        # Bold "ref..." label: counted, not collected.
                        ref_count += 1
                        continue
                    if row_index == 0 and lowered in table_headers:
                        # Bold column header in the first row of the table.
                        continue

                    fragment = run.text.strip()
                    if fragment in paragraph.text.strip():
                        bold_parts.append(fragment + ' ')

                bold_text = ''.join(bold_parts)
                # NOTE(review): the '.' in the pattern is unescaped, so it matches any
                # character between the two digits -- confirm whether a literal dot
                # (e.g. "1.2") was intended.
                if bold_text.strip() and re.search(r'^[0-9].[0-9]', bold_text) is None:
                    referenceList.append(paragraph.text.strip().replace('\xa0',' '))

In [9]:
# Start a fresh output CSV (truncating any existing file) that contains only
# the column header row; matched references are appended to it later.
header = ['record_id', 'title', 'abstract', 'doi', 'final_included']
with open(csv_path, mode='w', newline='') as csvfile:
    csv.writer(csvfile, delimiter=',').writerow(header)

In [10]:
# Look up every extracted reference on PubMed and append each exact title
# match (record id, title, abstract, DOI, inclusion flag) to the CSV file.
for ref in referenceList:
    ref = ref.strip()

    # Entries that are empty or only a "Reference ..." label cannot be searched.
    if ref == '' or ref.lower().startswith('reference'):
        print('There might be some problem with the reference below. \
            Please check and add the reference if it is not on the list',ref)
        continue

    # Reset before anything can raise, so a failed lookup never reuses the
    # previous reference's value (or hits an undefined name on the first one).
    referenceFound = False
    try:
        # Search the article in the PubMed database.
        PubmedSearchResults = search(ref)
        id_list = PubmedSearchResults['IdList']
        textTitle = ref
        # No hits: nothing to fetch, so skip the request.
        papers = fetch_details(id_list) if id_list else {'PubmedArticle': []}

        wanted_title = textTitle.strip(punctuation).strip()
        for paper in papers['PubmedArticle']:
            article = paper['MedlineCitation']['Article']
            paperTitle = article['ArticleTitle'].strip(punctuation).strip()
            if paperTitle != wanted_title:
                continue

            doi = ''
            for article_id in paper['PubmedData']['ArticleIdList']:
                if article_id.attributes['IdType'] == 'doi':
                    doi = str(article_id)

            abstract = ''
            if 'Abstract' in article:
                abstract = ' '.join([str(x) for x in article['Abstract']['AbstractText']])

            # Take the id from the matched record itself: positions in
            # papers['PubmedArticle'] need not line up with id_list (hits that
            # are not PubmedArticle records are not part of that list).
            pmid = str(paper['MedlineCitation']['PMID'])
            csv_row = [pmid, paperTitle, abstract, doi, 1]
            # newline='' matches how the header row was written and is what the
            # csv module expects for files it writes to.
            with open(csv_path, 'a', newline='') as csvfile:
                cw = csv.writer(csvfile, delimiter=',')
                cw.writerow(csv_row)
            referenceFound = True
            break

    except Exception as err:
        # Show why the lookup failed; the manual-check notice below is then
        # printed exactly once for this reference.
        print('Lookup failed:', repr(err))

    if not referenceFound:
        print('The reference could not be found automaticaly, please add it to the list manually')
        print(ref)

The reference could not be found automaticaly, please add it to the list manually
Rationale of iron dosage and formulations in under three children.
The reference could not be found automaticaly, please add it to the list manually
Guidelines for the use of iron supplements to prevent and treat iron deficiency anemia.
The reference could not be found automaticaly, please add it to the list manually
Iron deficiency in infancy and childhood.
The reference could not be found automaticaly, please add it to the list manually
Low dose 'Sprinkles'- an innovative approach to treat iron deficiency anemia in infants and young children.
The reference could not be found automaticaly, please add it to the list manually
Iron deficiency in rural Ghanaian children.
Agyei-Frempong MT, Asare G, Owiredu WK, Yeboah FA.
The reference could not be found automaticaly, please add it to the list manually
Effectiveness of two programs of intermittent
The reference could not be found automaticaly, please add it t

In [12]:
# Select one extracted reference (index 7) so the PubMed lookup can be run on
# it alone in the following cells.
search_query = referenceList[7]
print(search_query)

Iron deficiency in infancy and childhood.


In [13]:
# Search PubMed for the selected reference and fetch the records of the hits.
# The result variables are reset first so that a failed lookup cannot leave
# values from an earlier cell run behind for the next cell.
textTitle = search_query
id_list = []
papers = {'PubmedArticle': []}
try:
    PubmedSearchResults = search(search_query)
    id_list = PubmedSearchResults['IdList']
    print(id_list)
    # Fetch the full records only when the search returned hits.
    if id_list:
        papers = fetch_details(id_list)
except Exception as err:
    print('The reference could not be found automaticaly, please add it to the list manually')
    # Show the cause instead of silently swallowing it.
    print('Lookup failed:', repr(err))


['34519599', '29897579', '31566157', '28233303', '34424013', '32778008', '31091416', '22770698', '34692346', '23731448', '25636824', '23772831', '34203528', '28933876', '24117340', '27778088', '30477632', '25512881', '30783211', '29070552']


In [14]:
# Compare each fetched record's title with the selected reference and append
# the first exact match to the CSV file.
referenceFound = False
wanted_title = textTitle.strip(punctuation).strip()
for paper in papers['PubmedArticle']:
    article = paper['MedlineCitation']['Article']
    paperTitle = article['ArticleTitle'].strip(punctuation).strip()
    if paperTitle != wanted_title:
        print('It is not this item ', paperTitle )
        continue

    print('item found', paperTitle)
    doi = ''
    for article_id in paper['PubmedData']['ArticleIdList']:
        if article_id.attributes['IdType'] == 'doi':
            doi = str(article_id)

    abstract = ''
    if 'Abstract' in article:
        abstract = ' '.join([str(x) for x in article['Abstract']['AbstractText']])

    # Take the id from the matched record itself: positions in
    # papers['PubmedArticle'] need not line up with id_list.
    pmid = str(paper['MedlineCitation']['PMID'])
    csv_row = [pmid, paperTitle, abstract, doi, 1]
    # newline='' matches how the header row was written and is what the csv
    # module expects for files it writes to.
    with open(csv_path, 'a', newline='') as csvfile:
        cw = csv.writer(csvfile, delimiter=',')
        cw.writerow(csv_row)
    referenceFound = True
    break

if not referenceFound:
    print('The reference could not be found automaticaly, please add it to the list manually')
    # Report the reference checked in this cell, not the leftover loop
    # variable `ref` from the earlier batch loop.
    print(textTitle)

It is not this item  Iron Deficiency in Infancy and Sluggish Cognitive Tempo and ADHD Symptoms in Childhood and Adolescence
It is not this item  Iron Deficiency, Anemia, and Low Vitamin B-12 Serostatus in Middle Childhood Are Associated with Behavior Problems in Adolescent Boys: Results from the Bogotá School Children Cohort
It is not this item  Iron deficiency in healthy 18-month-old Danish children is associated with no oral iron supplementation in infancy and prolonged exclusive breast-feeding
It is not this item  Associations Among Infant Iron Deficiency, Childhood Emotion and Attention Regulation, and Adolescent Problem Behaviors
It is not this item  Iron deficiency in infancy and neurocognitive and educational outcomes in young adulthood
It is not this item  Young adult outcomes associated with lower cognitive functioning in childhood related to iron-fortified formula in infancy
It is not this item  The Benefits and Risks of Iron Supplementation in Pregnancy and Childhood
It is n

In [94]:
# Inspect the identifier list attached to the first fetched article.
papers['PubmedArticle'][0]['PubmedData']['ArticleIdList']

[StringElement('34496782', attributes={'IdType': 'pubmed'}), StringElement('10.1186/s12887-021-02638-8', attributes={'IdType': 'doi'}), StringElement('10.1186/s12887-021-02638-8', attributes={'IdType': 'pii'}), StringElement('PMC8424794', attributes={'IdType': 'pmc'})]

In [98]:
# Print every identifier of the first fetched article with its position, and
# flag the one whose IdType attribute is 'doi'.
article_ids = papers['PubmedArticle'][0]['PubmedData']['ArticleIdList']
for position, article_id in enumerate(article_ids):
    print(article_id, position)
    if article_id.attributes['IdType'] == 'doi':
        print('doi', article_id)

34496782 0
10.1186/s12887-021-02638-8 1
doi 10.1186/s12887-021-02638-8
10.1186/s12887-021-02638-8 2
PMC8424794 3


In [99]:
# Look at the identifier stored at index 1 of the first article on its own.
papers['PubmedArticle'][0]['PubmedData']['ArticleIdList'][1]

StringElement('10.1186/s12887-021-02638-8', attributes={'IdType': 'doi'})

In [56]:
# Number of article records in the current `papers` result.
len(papers['PubmedArticle'])

3