In [1]:
import docx
from string import punctuation
import re
import csv 
import os
from simhash import Simhash, SimhashIndex

In [2]:
from Bio import Entrez
# Contact address that search() and fetch_details() assign to Entrez.email
# before every NCBI request.
email='simgeekiz48@gmail.com'
from Bio.Entrez import efetch, read

In [3]:
# Input Word document to process; the CSV output directory and file name are
# derived from this path.
rba_path = 'data/newdata/1. Risicoanalyse kinderformularium gentamicine nieuw.docx'

In [4]:
# Output CSVs live in a `csv` folder next to the input document.
csv_dir = os.path.abspath(os.path.join(os.path.dirname(rba_path), 'csv'))
# exist_ok replaces the check-then-create pattern, so the cell is safe to
# re-run and cannot fail if the folder appears between the check and the call.
os.makedirs(csv_dir, exist_ok=True)

head, filename = os.path.split(rba_path)
# Swap only the real extension: str.replace would also rewrite a '.docx' that
# occurs in the middle of the name, and would leave a non-.docx name unchanged
# (so the "CSV" would be written under the document's own file name).
csv_path = os.path.join(csv_dir, os.path.splitext(filename)[0] + '.csv')

In [5]:
def search(query):
    """Run a relevance-sorted PubMed search and return the parsed result.

    Parameters
    ----------
    query : str
        Search term sent to PubMed.

    Returns
    -------
    The structure produced by ``Entrez.read`` for the ESearch response.
    ``searchArticle`` reads its ``'IdList'`` entry; ``retmax`` limits the
    request to 20 IDs.
    """
    Entrez.email = email
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='20',
                            retmode='xml',
                            term=query)
    # Always release the HTTP response, even when parsing fails.
    try:
        return Entrez.read(handle)
    finally:
        handle.close()

def fetch_details(id_list):
    """Fetch the PubMed records for the given IDs and return them parsed.

    Parameters
    ----------
    id_list : iterable
        PubMed IDs; each one is converted with ``str`` before being joined
        into the comma-separated ``id`` parameter.

    Returns
    -------
    The structure produced by ``Entrez.read`` for the EFetch response.
    Callers in this notebook read its ``'PubmedArticle'`` entry.
    """
    # str() lets integer IDs through as well as the string IDs ESearch returns.
    ids = ','.join(str(pubmed_id) for pubmed_id in id_list)
    Entrez.email = email
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    # Always release the HTTP response, even when parsing fails.
    try:
        return Entrez.read(handle)
    finally:
        handle.close()


# Search the article in the Pubmed Database
def searchArticle(query):
    """Search PubMed for ``query`` and fetch the records of the hits.

    Parameters
    ----------
    query : str
        Search term passed to ``search``.

    Returns
    -------
    tuple
        ``(id_list, papers)``: the IDs from the search result's ``'IdList'``
        and the parsed records from ``fetch_details``. Callers must unpack
        the tuple; ``papers['PubmedArticle']`` holds the article records.
        When the search has no hits, ``papers`` is a plain dict with empty
        ``'PubmedArticle'`` and ``'PubmedBookArticle'`` lists.
    """
    id_list = search(query)['IdList']

    if not id_list:
        # A fetch request with no IDs cannot return any records, so skip the
        # network call and hand back an empty result of the shape callers index.
        return id_list, {'PubmedArticle': [], 'PubmedBookArticle': []}

    return id_list, fetch_details(id_list)

In [6]:
from docx import Document
# Parse the Word file once; collectFromEndnote() and collectFromTables() read
# this module-level `doc`.
doc = Document(os.path.abspath(rba_path))

In [68]:
def collectFromEndnote(document=None):
    """Collect the text of every paragraph from the reference heading onward.

    The heading is the first paragraph whose text starts with 'referen'
    (case-insensitive) and that contains at least one bold run. The heading
    paragraph itself is included in the result.

    Parameters
    ----------
    document : optional
        Object exposing ``paragraphs`` (each with ``text`` and ``runs``).
        Defaults to the module-level ``doc``.

    Returns
    -------
    list of str
        Paragraph texts from the heading to the end of the document, or an
        empty list when no such heading exists.
    """
    if document is None:
        document = doc

    referenceList = []
    findBegin = False

    for paragraph in document.paragraphs:
        # 'referen' also covers 'references', so one prefix test is enough.
        if not findBegin and paragraph.text.lower().startswith('referen'):
            findBegin = any(run.bold for run in paragraph.runs)
        if findBegin:
            referenceList.append(paragraph.text)
    return referenceList

In [69]:
# Paragraph texts from the bold "Referen..." heading to the end of the document.
referenceList = collectFromEndnote()

In [113]:
# Scratch cell for the query-shortening fallback.
# NOTE(review): `id_list` and `ref` are not assigned at module level by any
# visible cell, and createNewSearchTerm is defined in a later cell, so this
# fails on a fresh kernel run from top to bottom.
trial = 1
if not id_list:
    new_search_query = createNewSearchTerm(ref, trial)
    trial += 1

In [131]:
# NOTE(review): `ref` is not assigned at module level by any visible cell;
# this relies on leftover kernel state.
new_search_query = createNewSearchTerm(ref, 7)

In [132]:
new_search_query

False

In [34]:
# NOTE(review): `id_list` is not assigned at module level by any visible cell;
# this relies on leftover kernel state.
papers = fetch_details(id_list)

In [130]:
def createNewSearchTerm(searchTerm, trial):
    """Return a shorter query built from the leading words of ``searchTerm``.

    The first trial keeps the first half of the space-separated words; every
    later trial keeps one word fewer than the previous one.

    Parameters
    ----------
    searchTerm : str
        Query to shorten; split on single spaces.
    trial : int
        1-based attempt number.

    Returns
    -------
    str or bool
        The shortened query, or ``False`` once half the word count minus the
        words already dropped is no longer greater than 2.
    """
    words = searchTerm.split(' ')
    half = len(words) / 2
    # Trial 1 drops nothing beyond the halving; trial n drops n - 1 more words.
    dropped = 0 if trial == 1 else trial - 1

    if half - dropped > 2:
        keep = int(half) - dropped
        return ' '.join(str(word) for word in words[0:keep])
    return False

In [None]:
### version 1
# Run-level scan of the 'effect' column of every table in `doc`: bold runs are
# concatenated into a title and the first run containing 'et al' is captured
# as the raw author string. Results go to `referenceListfromTable` (titles)
# and `authorListfromTable` (dicts with 'title' and 'authors').
# NOTE(review): this looks like an earlier draft of collectFromTables() —
# confirm before relying on it.
# NOTE(review): `table_headers`, `data`, `untilSummary` and `justReadBold` are
# assigned in this cell but never read in it.
table_headers = ['bron', 'bewijs', 'effect', 'opmerkingen', 'source', 'evidence', 'effect', 'remarks']
data = []
referenceListfromTable = []
authorListfromTable = []
for table in doc.tables:
    for i, row in enumerate(table.rows):

        text = [cell.text.lower() for cell in row.cells]

        # The first row is treated as the header; remember which column is
        # 'effect' (None when the table has no such column).
        if i == 0:
            effect_index = text.index('effect') if 'effect' in text else None
            continue

#         if effect_index is None:
#             continue

        for j, cell in enumerate(row.cells):

            # With effect_index None this never matches, so the table is skipped.
            if j==effect_index:

                bold_text = ''
                authorsText = ''
                authorsTextRaw = ''
                untilSummary = False
                
                for paragraph in cell.paragraphs:
                    for k, run in enumerate(paragraph.runs):
                        # Bold runs accumulate into the title until an author
                        # string has been captured.
                        if run.bold and authorsTextRaw == '':
                            justReadBold = True
                            bold_text += run.text.strip() + ' '
                        # NOTE(review): for k == 0, runs[k-1] is runs[-1],
                        # i.e. the paragraph's last run, not a previous one.
                        elif ('et al' in run.text and not paragraph.runs[k-1].bold 
                                  and authorsTextRaw == ''):

                            # Prepend the neighbouring run unless its text is
                            # already contained in this run.
                            if (paragraph.runs[k-1].text not in run.text):
                                authorsTextRaw = paragraph.runs[k-1].text + run.text
                            else:
                                authorsTextRaw = run.text
                            authorsText = authorsTextRaw.strip(punctuation).strip()
                            # Drop 'et al' and everything after it.
                            authorsText = re.sub(r'et al.*\d*', '',authorsText).strip()
                        
                        
                        # Underlined 'samenvatting'/'summary' runs are only
                        # printed; scanning continues afterwards.
                        elif run.underline and (run.text.lower().startswith('samenvatting') 
                          or run.text.lower().startswith('summary')):
                            print(run.text)
#                             print(authorsText, authorsTextRaw)
#                         else:
#                             print(run.text)
                # Skip empty titles and titles starting with digit, any
                # character, digit.
                if bold_text.strip() != ''  and re.search(r'^[0-9].[0-9]', bold_text) is None: 
                    referenceListfromTable.append(bold_text.strip().replace('\xa0',' '))
                    authorListfromTable.append({
                        'title': bold_text.strip().replace('\xa0',' '),
#                         'authors': authorsText,
                        'authors': authorsTextRaw
                    })

In [63]:
def collectFromTables(document=None):
    """Extract title/author pairs from the 'effect' column of every table.

    The first row of each table is treated as the header; tables without a
    cell whose lower-cased text is exactly 'effect' contribute nothing. In
    each later row, the cell in the 'effect' column is scanned paragraph by
    paragraph:

    * paragraphs whose first and last runs are bold are appended to the
      title, as long as no author paragraph has been seen yet;
    * a paragraph whose last run is not bold, directly after a paragraph whose
      last run is bold, fixes the title and is appended to the authors;
    * an underlined paragraph starting with 'samenvatting' or 'summary' ends
      the scan of that cell.

    Paragraphs without runs are skipped. Cells whose title is empty, or
    starts with digit / any character / digit, are dropped.

    Parameters
    ----------
    document : optional
        Object exposing ``tables`` -> ``rows`` -> ``cells`` -> ``paragraphs``
        -> ``runs``. Defaults to the module-level ``doc``.

    Returns
    -------
    list of dict
        One ``{'p_title': str, 'p_authors': str}`` per accepted cell.
        ``p_title`` is stripped with non-breaking spaces replaced by spaces;
        ``p_authors`` keeps the leading space produced by joining onto an
        empty string.
    """
    # table_headers = ['bron', 'bewijs', 'effect', 'opmerkingen', 'source', 'evidence', 'effect', 'remarks']
    if document is None:
        document = doc

    referencesFromTables = []
    for table in document.tables:
        effect_index = None
        for i, row in enumerate(table.rows):

            if i == 0:
                header = [cell.text.lower() for cell in row.cells]
                effect_index = header.index('effect') if 'effect' in header else None
                continue

            if effect_index is None:
                # No 'effect' column in this table: nothing to extract.
                continue

            for j, cell in enumerate(row.cells):
                if j != effect_index:
                    continue

                bold_text = ''
                p_title = ''
                p_authors = ''

                for p, paragraph in enumerate(cell.paragraphs):
                    runs = paragraph.runs
                    if len(runs) < 1:
                        continue

                    if runs[0].bold and runs[-1].bold and p_title == '':
                        bold_text += paragraph.text.strip() + ' '

                    elif (not runs[-1].bold
                          and p > 0
                          and len(cell.paragraphs[p-1].runs) > 0
                          and cell.paragraphs[p-1].runs[-1].bold):
                        # First non-bold paragraph after the bold title block.
                        p_title = bold_text
                        p_authors = ' '.join([p_authors, paragraph.text.strip()])

                    elif (runs[0].underline
                          and runs[0].text.lower().startswith(('samenvatting', 'summary'))):
                        break

                # NOTE(review): '.' in the pattern matches any character, not
                # just a literal dot — confirm whether '\.' was intended.
                if p_title.strip() != '' and re.search(r'^[0-9].[0-9]', p_title) is None:
                    referencesFromTables.append({
                        'p_title': p_title.strip().replace('\xa0', ' '),
                        'p_authors': p_authors,
                    })
    return referencesFromTables

In [65]:
# List of {'p_title', 'p_authors'} dicts, one per accepted 'effect' cell.
referencesFromTables = collectFromTables()

In [66]:
def writeFromTablesToCsv():
    """Look up each table-derived reference on PubMed and write matches to CSV.

    Reads the module-level ``referencesFromTables`` and (re)creates the file
    at the module-level ``csv_path`` with the header
    ``record_id, title, abstract, doi, final_included``. A reference is
    written when a returned article title equals the reference title once
    surrounding punctuation and whitespace are stripped from both. References
    that cannot be matched, or whose lookup raises, are printed so they can be
    added manually.

    Fixes compared with the previous version: the title is read from the
    'p_title' key that collectFromTables() produces ('title' still accepted),
    the ``(id_list, papers)`` tuple from searchArticle() is unpacked, the
    comparison uses the reference title instead of an undefined name, the
    record ID comes from the article itself rather than an undefined global,
    and errors are reported instead of being hidden by a bare ``except``.
    """
    header = ['record_id', 'title', 'abstract', 'doi', 'final_included']
    problem_msg = ('There might be some problem with the reference below. '
                   'Please check and add the reference if it is not on the list')
    manual_msg = '!!!! The reference could not be found automaticaly, please add it to the list manually'

    # newline='' is what the csv module expects; without it rows get an extra
    # carriage return on platforms that translate line endings.
    with open(csv_path, 'w', newline='') as csvfile:
        cw = csv.writer(csvfile, delimiter=',')
        cw.writerow(header)

        # Loop over the references collected from the tables of the docx file
        for entry in referencesFromTables:
            ref = (entry.get('p_title') or entry.get('title') or '').strip()

            if ref == '' or ref.lower().startswith('referen'):
                print(problem_msg, ref)
                continue

            wanted = ref.strip(punctuation).strip()
            row = None
            try:
                _, papers = searchArticle(ref)

                for paper in papers['PubmedArticle']:
                    article = paper['MedlineCitation']['Article']
                    paperTitle = article['ArticleTitle'].strip(punctuation).strip()
                    if paperTitle != wanted:
                        continue

                    doi = ''
                    for article_id in paper['PubmedData']['ArticleIdList']:
                        if article_id.attributes.get('IdType') == 'doi':
                            doi = str(article_id)

                    abstract = ''
                    if 'Abstract' in article:
                        abstract = ' '.join(str(x) for x in article['Abstract']['AbstractText'])

                    # Take the ID from the record itself, so it cannot drift
                    # out of step with the position in the search ID list.
                    row = [str(paper['MedlineCitation']['PMID']), paperTitle, abstract, doi, 1]
                    break

            except Exception as err:
                # Best effort: report the failure and carry on with the next one.
                print(manual_msg)
                print(ref, '({}: {})'.format(type(err).__name__, err))
                continue

            if row is None:
                print(manual_msg)
                print(ref)
            else:
                cw.writerow(row)

In [67]:
def writeFromEndnoteTOCsv(referenceList):
    """Look up each reference-list entry on PubMed and append matches to CSV.

    Each entry is parsed with three patterns: '<authors> et al. <title>.',
    '<authors> <4-digit year> "<title>"', and 'doi: <doi>'. The DOI is used as
    the search query when present, otherwise the parsed title. An article is
    accepted when the Simhash distance between its title and the parsed title
    is at most 5; accepted articles are appended to the module-level
    ``csv_path`` as ``[record_id, title, abstract, doi, 1]``.

    Blank entries and entries starting with 'referen' are skipped. Entries
    matching neither title pattern and starting with 'who', 'lci', 'nvn' or
    'nice' are skipped as well. Everything else that cannot be matched is
    printed so it can be added manually.

    Parameters
    ----------
    referenceList : iterable of str
        Raw reference strings, e.g. the output of ``collectFromEndnote``.

    Fixes compared with the previous version: the ``(id_list, papers)`` tuple
    from searchArticle() is unpacked, the record ID comes from the article
    itself rather than an undefined global, regex literals are raw strings,
    an empty query is no longer sent to PubMed, and errors are reported
    instead of being hidden by a bare ``except``.
    """
    manual_msg = '!!!! The reference could not be found automaticaly, please add it to the list manually'
    book_prefixes = ('who', 'lci', 'nvn', 'nice')

    # Loop over the reference list in the docx file
    for ref in referenceList:
        ref = ref.strip()
        if ref == '' or ref.lower().startswith('referen'):
            continue

        search_item = {
            'p_authors': '',
            'p_title': '',
            'p_doi': ''
        }
        x1 = re.findall(r"(.*et al.?)([^.]+).", ref)
        x2 = re.findall(r'(.*)\d{4}.?\s?(".*")', ref)
        x3 = re.findall(r'doi:?\s?(.+).?', ref)

        if x3:
            search_item['p_doi'] = x3[0].strip(punctuation).strip()

        if x1:
            search_item['p_authors'] = x1[0][0].strip(punctuation).strip()
            search_item['p_title'] = x1[0][1].strip(punctuation).strip()

        elif x2:
            search_item['p_authors'] = x2[0][0].strip(punctuation).strip()
            search_item['p_title'] = x2[0][1].strip(punctuation).strip()

        elif ref.lower().startswith(book_prefixes):
            # This reference is a book / guideline, not a PubMed article.
            continue

        search_query = search_item['p_doi'] if search_item['p_doi'] else search_item['p_title']
        if not search_query:
            # Nothing could be parsed; searching an empty term cannot succeed.
            print(manual_msg, ref)
            continue

        print('####search_query', search_query)
        row = None
        try:
            _, papers = searchArticle(search_query)

            # NOTE(review): when only a DOI was parsed, p_title is '' and this
            # similarity test is unlikely to pass — confirm whether DOI hits
            # should be accepted without a title check.
            wanted = Simhash(search_item['p_title'])

            for paper in papers['PubmedArticle']:
                article = paper['MedlineCitation']['Article']
                paperTitle = article['ArticleTitle'].strip(punctuation).strip()
                if Simhash(paperTitle).distance(wanted) > 5:
                    continue

                doi = ''
                for article_id in paper['PubmedData']['ArticleIdList']:
                    if article_id.attributes.get('IdType') == 'doi':
                        doi = str(article_id)

                abstract = ''
                if 'Abstract' in article:
                    abstract = ' '.join(str(x) for x in article['Abstract']['AbstractText'])

                # Take the ID from the record itself, so it cannot drift out
                # of step with the position in the search ID list.
                row = [str(paper['MedlineCitation']['PMID']), paperTitle, abstract, doi, 1]
                break

        except Exception as err:
            # Best effort: report the failure and carry on with the next one.
            print(manual_msg, ref, '({}: {})'.format(type(err).__name__, err))
            continue

        if row is None:
            print(manual_msg, ref)
            continue

        # newline='' is what the csv module expects; without it rows get an
        # extra carriage return on platforms that translate line endings.
        with open(csv_path, 'a', newline='') as csvfile:
            cw = csv.writer(csvfile, delimiter=',')
            cw.writerow(row)