In [1]:
# 0.1 Install dependencies for handling PDF, DOCX, and Excel files
!pip install pdfplumber python-docx openpyxl xlrd



In [3]:
# 0.2 Import required libraries
import pandas as pd
import os
import re
import pdfplumber
import docx
import openpyxl
import xlrd
import random
from pathlib import Path
from collections import defaultdict

In [5]:
# 0.3 Set path variables
# Fixed file and folder names used by the notebook (all relative to the working directory).
CSV_path = "Cases per paper.csv"
input_dir = "manual_edited"
output_dir = "preprocess_outputs"

# Both review workbooks are located directly inside the input folder.
matched_path, group_path = (
    os.path.join(input_dir, workbook_name)
    for workbook_name in ("matched_final.xlsx", "study_groups_review_final.xlsx")
)

In [7]:
# 1.1 Load matched study-group pairs and group-file metadata
# matched_df: later cells read its "Study Name" and "Matched Group Name" columns.
matched_df = pd.read_excel(io=matched_path)
# group_df: later cells read its "Study Group Key", "Full Path" and "Filename" columns.
group_df = pd.read_excel(io=group_path)

In [9]:
# 1.2 Build the lookup: study name -> list of file records (one dict per group_df row)
#     whose "Study Group Key" equals the study's matched group
study_to_file = defaultdict(list)

group_keys = group_df["Study Group Key"]
for study_name, matched_group in zip(matched_df["Study Name"], matched_df["Matched Group Name"]):
    # A group with no matching rows yields an empty list for the study.
    matched_entries = group_df.loc[group_keys.eq(matched_group)]
    # Plain assignment: if a study name occurs in several rows, the last row wins.
    study_to_file[study_name] = matched_entries.to_dict(orient="records")

In [11]:
# 1.3 Report how many studies were mapped and how many file records they reference in total
total_studies = len(study_to_file)
total_files = sum(map(len, study_to_file.values()))

print(
    f"Total Number of Studies：{total_studies}",
    f"Total Number of Files to be Read：{total_files}",
    sep="\n",
)

Total Number of Studies：652
Total Number of Files to be Read：664


In [13]:
# 1.4 List every study whose number of linked files is not exactly one (for manual inspection)
for study, entries in study_to_file.items():
    if len(entries) == 1:
        continue                                   # exactly one file: nothing to review
    print(f"{study}: {len(entries)} files")

### Manual inspection of the studies listed by this cell found that all files were correctly associated with their corresponding studies.

Ho, Chan - Wong , 2018: 2 files
Schankin -Straube, 2016: 2 files
Florance, Davis - Dalmau, 2009: 2 files
Iizuka - Nishiyama, 2016: 2 files
Mohammad - Dale, 2016: 2 files
Dalmau-Lynch, 2007: 2 files
Irani, Bera - Vincent, 2010: 2 files
Gabilondo, 2011: 2 files
Shin - Chu, 2018: 2 files
Dale, Brilot - Lim, 2014: 4 files


In [15]:
# 2.1 Define the functions to extract text content from different document formats

# 2.1.1 Function to extract text data from a PDF file
def extract_text_from_pdf(filepath):
    """
    Read a PDF with pdfplumber and return the text of its pages.

    Pages are visited in order. Each page that yields text contributes that
    text followed by a newline; pages that yield nothing are skipped.

    Parameters:
        filepath (str): The path to the PDF file.
    Returns:
        str: The collected page text. If reading raises, a warning is printed
             and whatever was collected before the error is returned
             (an empty string when nothing was read).
    """
    page_chunks = []
    try:
        with pdfplumber.open(filepath) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if not page_text:
                    continue                                               ### Skip pages without extractable text
                page_chunks.append(page_text + "\n")
    except Exception as e:
        print(f" ⚠️ Failed to read PDF: {filepath}, Error: {e}")          ### Print an error message if reading fails
    return "".join(page_chunks)

# 2.1.2 Function to extract text data from a DOCX (Word) file
def extract_text_from_docx(filepath):
    """
    Extract the text content of a DOCX (Word) file, including table cells.

    Body paragraphs come first, one per line (empty paragraphs produce empty
    lines). Text from top-level tables follows: one line per table row, the
    non-empty cells stripped and joined by tabs, rows without any text skipped.
    For a document without tables the result is just the paragraph text.

    Parameters:
        filepath (str): The path to the DOCX file.
    Returns:
        str: Extracted text, every line terminated by a newline. If reading
             raises, a warning is printed and whatever was collected before
             the error is returned (an empty string when nothing was read).
    """
    lines = []
    try:
        doc = docx.Document(filepath)
        lines.extend(para.text for para in doc.paragraphs)
        # doc.paragraphs does not include text stored in table cells, so tables
        # (e.g. supplementary data tables) have to be walked separately.
        # NOTE(review): python-docx is documented to report a merged cell once per
        # grid column it spans, so merged-cell text may repeat within a row;
        # tables nested inside cells are not visited separately - confirm whether either matters.
        for table in doc.tables:
            for row in table.rows:
                cell_texts = [cell.text.strip() for cell in row.cells]
                row_text = [cell_text for cell_text in cell_texts if cell_text]
                if row_text:
                    lines.append("\t".join(row_text))
    except Exception as e:
        print(f" ⚠️ Failed to read DOCX: {filepath}, Error: {e}")        ### Print an error message if reading fails
    return "".join(line + "\n" for line in lines)

# 2.1.3 Function to extract text data from an XLS (Excel) file
def extract_text_from_xls(filepath):
    """
    Extract all text content from an XLS (Excel) file.

    Every sheet is read row by row. Within a row, empty cells are dropped and
    the remaining cell values are converted with str() and joined by tabs;
    rows with no remaining cells are skipped.

    Parameters:
        filepath (str): The path to the XLS file.
    Returns:
        str: Extracted text from all sheets and rows, cells joined by tabs and
             rows terminated by newlines. If reading raises, a warning is
             printed and whatever was collected before the error is returned
             (an empty string when nothing was read).
    """
    lines = []
    try:
        book = xlrd.open_workbook(filepath)
        for sheet in book.sheets():
            for row_idx in range(sheet.nrows):
                # Drop only genuinely empty cells. A plain truthiness test would
                # also discard numeric zeros (0 / 0.0) and silently lose data values.
                row_text = [
                    str(cell)
                    for cell in sheet.row_values(row_idx)
                    if cell is not None and cell != ""
                ]
                if row_text:
                    lines.append("\t".join(row_text))
    except Exception as e:
        print(f" ⚠️ Failed to read XLS: {filepath}, Error: {e}")         ### Print an error message if reading fails
    return "".join(line + "\n" for line in lines)

In [17]:
# 2.2 Extract all text in every file associated with each study
study_to_text = {}

### Lower-cased file extension -> extraction function for that file type
extractors = {
    ".pdf": extract_text_from_pdf,
    ".docx": extract_text_from_docx,
    ".xls": extract_text_from_xls,
}

for i, (study_name, entry_list) in enumerate(study_to_file.items(), 1):
    print(f"\nProcessing Study #{i}: {study_name}")

    ### A study without any associated file gets a warning and an empty string
    if not entry_list:
        print(f" ⚠️ No study group found for study: {study_name}")
        study_to_text[study_name] = ""
        continue

    text_parts = []
    for entry in entry_list:
        path_data = entry["Full Path"]
        file_name = entry["Filename"]
        file_path = os.path.join(input_dir, path_data)                             ### Paths in the metadata are resolved relative to input_dir

        ### A missing file or invalid path is reported and skipped
        if not os.path.exists(file_path):
            print(f" ⚠️ Invalid file path: {file_path}")
            continue

        print(f" Reading: {file_name}")

        ### Pick the extractor by lower-cased extension; unknown types are reported and skipped
        extractor = extractors.get(Path(file_path).suffix.lower())
        if extractor is None:
            print(f" ⚠️ Skipped unsupported file type: {file_name}")
            continue

        text_parts.append(extractor(file_path) + "\n")

    ### Store the combined text of all files (surrounding whitespace removed) under the study name
    study_to_text[study_name] = "".join(text_parts).strip()


Processing Study #1: Tabata - Hara, 2014
 Reading: 2014, Tabata.pdf

Processing Study #2: Day, 2011
 Reading: 2011, Day.pdf

Processing Study #3: Reyes-Botero
 Reading: 2011, Reyes-Botero.pdf

Processing Study #4: Warren - Blum, 2017
 Reading: 2017, Warren - Blum.pdf

Processing Study #5: Lekoubou - Honnorat, 2012
 Reading: 2012, Lekoubou.pdf

Processing Study #6: Mitani
 Reading: 2013, Mitani.docx

Processing Study #7: Iriondo -Aguilera, 2017 
 Reading: 2017, Iriondo -Aguilera.pdf

Processing Study #8: Alsaadi - Hamid, 2015
 Reading: 2015, Alsaadi - Hamid.pdf

Processing Study #9: Mohammad, Wallace - Dale, 2014
 Reading: 2014, Mohammad, Wallace - Dale.pdf

Processing Study #10: Luca, 2011
 Reading: 2011, Luca.pdf

Processing Study #11: Ueda - Kohara, 2017
 Reading: 2017, Ueda - Kohara.pdf

Processing Study #12: Eker
 Reading: 2008, Eker.pdf

Processing Study #13: Broderick - Nanji, 2014 
 Reading: 2014, Broderick - Nanji.pdf

Processing Study #14: Kramina -Viksna, 2015
 Reading: 2015

Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #25: Azizyan - Moser, 2014 
 Reading: 2014, Azizyan - Mose.pdf

Processing Study #26: Hansen
 Reading: 2013, Hansen.pdf

Processing Study #27: Osei-Lah - Kirkham, 2014
 Reading: 2014, Osei-Lah - Kirkham.pdf

Processing Study #28: Huang - Guo, 2015
 Reading: 2015, Huang - Guo.pdf

Processing Study #29: Pillai-Dale, 2010
 Reading: 2010, Pillai.pdf

Processing Study #30: Greiner -Krueger, 2011
 Reading: 2011, Greiner.pdf

Processing Study #31: Behrendt - Kleiter, 2016 
 Reading: 2016, Behrendt - Kleiter.pdf

Processing Study #32: Chia, 2011
 Reading: 2011, Chia.pdf

Processing Study #33: Hung
 Reading: 2011, Hung.pdf

Processing Study #34: Henry - de Broucker, 2009
 Reading: 2009, Henry.docx

Processing Study #35: Kumar-Dalmau, 2010
 Reading: 2010, Kumar.pdf

Processing Study #36: Xu
 Reading: 2016, Xu - Dai.pdf

Processing Study #37: Maqbool
 Reading: 2011, Maqbool.pdf

Processing Study #38: Frechette
 Reading: 2011, Frechette.pdf

Processing Study #39: Tsuyusaki
 Readi

Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #95: Kaplan -Probasco, 2017 
 Reading: 2017, Kaplan -Probasco.pdf

Processing Study #96: Cassa
 Reading: 2013, Cassa.pdf

Processing Study #97: Chanson  - Rosenberg, 2016
 Reading: 2016, Chanson  - Rosenberg.pdf

Processing Study #98: Mann
 Reading: 2012, Mann.pdf

Processing Study #99: Tatencloux -Deiva, 2015
 Reading: 2015, Tatencloux -Deiva.pdf

Processing Study #100: Bowes - Shannon Weickert, 2015 
 Reading: 2015, Bowes - Shannon Weickert.pdf

Processing Study #101: Ishiura
 Reading: 2008, Ishiura.pdf

Processing Study #102: Schankin -Straube, 2016
 Reading: 2016, Schankin -Straube.pdf
 Reading: 2016, Schankin -Straube Supplementary.pdf

Processing Study #103: Yilmaz -Tuzun, 2014 
 Reading: 2014, Yilmaz -Tuzun.pdf

Processing Study #104: Rodriguez-Osorio - Arias, 2014
 Reading: 2014, Rodriguez-Osorio.pdf

Processing Study #105: Suri-Suri
 Reading: 2013, Suri.docx

Processing Study #106: Goldberg -Cellucci, 2017
 Reading: 2017, Goldberg -Cellucci.pdf

Processing St

Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value



Processing Study #109: Gough - Nilforooshan, 2016
 Reading: 2016, Gough - Nilforooshan.pdf


Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray stroke color because /'P3' is an invalid float value
Cannot set gray stroke color because /'P4' is an invalid float value
Cannot set gray stroke color because /'P5' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value



Processing Study #110: Nazif, 2012
 Reading: 2012, Nazif.pdf

Processing Study #111: Hacohen - Lin, 2016
 Reading: 2016, Hacohen - Lin.pdf

Processing Study #112: Beatty - Khot, 2014
 Reading: 2014, Beatty - Khot.pdf

Processing Study #113: Roberts
 Reading: 2012, Roberts.pdf

Processing Study #114: Haberlandt  -Rostásy, 2017
 Reading: 2017, Haberlandt  -Rostásy.pdf

Processing Study #115: Di Capua
 Reading: 2013, Di Capua.pdf

Processing Study #116: Jandu - Vidgeon, 2016
 Reading: 2016, Jandu - Vidgeon.pdf

Processing Study #117: Leypoldt-Wandinger
 Reading: 2013, Leypoldt.pdf

Processing Study #118: Voice - Lakhi, 2017
 Reading: 2017, Voice - Lakhi.pdf

Processing Study #119: See
 Reading: 2012, See.pdf

Processing Study #120: Lin, Lin - Wang, 2014 
 Reading: 2014, Lin, Lin -Wang.pdf

Processing Study #121: Ikeguchi 
 Reading: 2012, Ikeguchi.pdf

Processing Study #122: Iglesias-Alonso - Iglesias-García, 2017
 Reading: 2017, Iglesias-Alonso - Iglesias-García.pdf

Processing Study #

Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #130: Shimoyama - Minami, 2016
 Reading: 2016, Shimoyama - Minami.pdf

Processing Study #131: Scott
 Reading: 2013, Scott.pdf

Processing Study #132: Bost - Honnorat, 2018 
 Reading: 2018, Bost - Honnorat.pdf

Processing Study #133: Nosadini - Sartori, 2014
 Reading: 2014, Nosadini - Sartori.pdf

Processing Study #134: Li, Liu - Liu, 2017
 Reading: 2017, Li, Liu - Liu.pdf

Processing Study #135: Guo - Lin, 2014
 Reading: 2014, Guo - Lin.pdf

Processing Study #136: Hara
 Reading: 2011, Hara.pdf

Processing Study #137: Martinez-Hernandez, 2011
 Reading: 2011, Martinez-Hernandez.pdf

Processing Study #138: Thomas
 Reading: 2013, Thomas.pdf

Processing Study #139: Sveinsson -  Piehl, 2017
 Reading: 2017, Sveinsson -  Piehl.pdf

Processing Study #140: Dengler - Seifi, 2017 
 Reading: 2017, Dengler - Seifi.pdf

Processing Study #141: Heine -Harms, 2016
 Reading: 2016, Heine -Harms.pdf

Processing Study #142: Salvucci - Sheth, 2014
 Reading: 2014, Salvucci - Sheth.pdf

Proce

Cannot set gray non-stroke color because /'P0' is an invalid float value



Processing Study #221: Chan - Lynch, 2015
 Reading: 2015.03, Chan - N Z Med J.pdf

Processing Study #222: Tituler, Höftberger - Dalmau, 2014
 Reading: 2014, Tituler, Höftberger - Dalmau.pdf

Processing Study #223: Uruha
 Reading: 2011, Uruha.pdf

Processing Study #224: Wang - Luo, 2015
 Reading: 2015, Wang - Luo.pdf

Processing Study #225: Hinkle -Heffelfinger, 2016
 Reading: 2016, Hinkle -Heffelfinger.pdf

Processing Study #226:  Viaccoz - Honnorat, 2014
 Reading: 2014, Viaccoz - Honnorat.pdf

Processing Study #227: Rainey -Cheesman, 2014
 Reading: 2014, Rainey -Cheesman.pdf

Processing Study #228: Dalmau-Saiz
 Reading: 2013, Dalmau-Saiz.pdf

Processing Study #229: AlHakeem -Tabarki, 2016 
 Reading: 2016, AlHakeem -Tabark.pdf

Processing Study #230: Gurcharran - Karkare, 2017
 Reading: 2017, Gurcharran - Karkare.pdf

Processing Study #231: Kirkpatrick 
 Reading: 2011, Kirkpatrick.pdf

Processing Study #232: Pham, 2011
 Reading: 2011, Pham.pdf

Processing Study #233: Beecher
 Reading

Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value



Processing Study #247: Wang , Li - Liu, 2017 
 Reading: 2017, Wang , Li.pdf


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P2' is an invalid float value
Cannot set gray stroke color because /'P3' is an invalid float value
Cannot set gray stroke color because /'P4' is an invalid float value
Cannot set gray stroke color because /'P5' is an invalid float value
Cannot set gray stroke color because /'P6' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value



Processing Study #248: Pascual-Ramírez, 2011
 Reading: 2011, Pascual-Ramirez.pdf

Processing Study #249: Hopkins - Chan, 2013
 Reading: 2013, Hopkins.pdf

Processing Study #250: Kawano
 Reading: 2011, Kawano.pdf

Processing Study #251: Neiman - Chokroverty, 2015 
 Reading: 2015, Neiman - Chokroverty.pdf

Processing Study #252: Peng - Wang, 2017 
 Reading: 2017, Peng - Wang.pdf

Processing Study #253: Marques - Sales, 2014 
 Reading: 2014, Marques.pdf

Processing Study #254: Agrawal - Wassmer, 2010
 Reading: 2010, Agrawal.pdf

Processing Study #255: Thomas - Honnorat, 2014
 Reading: 2014, Thomas - Honnorat.pdf

Processing Study #256: Miyauchi - Yamagata, 2016
 Reading: 2016, Miyauchi - Yamagat.pdf

Processing Study #257: Tamma
 Reading: 2011, Tamma.pdf

Processing Study #258: Ramanathan - Fung, 2013
 Reading: 2013, Ramanathan.pdf

Processing Study #259: Gitiaux - Kaminska, 2013
 Reading: 2013, Gitiaux.pdf

Processing Study #260: Hernaez-Goni - Tirapu-Ustarroz, 2017
 Reading: 2017, Hern

Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #267: Houtrow - Neufeld, 2012
 Reading: 2012, Houtrow.pdf

Processing Study #268: Imai - Sumi, 2015 
 Reading: 2015, Imai - Sumi.pdf

Processing Study #269: Seward, 2018
 Reading: 2018, Seward.pdf

Processing Study #270: Jones - Lockwood, 2014
 Reading: 2014, Jones - Lockwood.pdf

Processing Study #271: Sachs - Burdette, 2018
 Reading: 2018, Sachs - Burdette.pdf

Processing Study #272: Yuan - Glezer, 2013
 Reading: 2013, Yuan.pdf

Processing Study #273: Castellano -Robinson, 2017
 Reading: 2017, Castellano - Robinson.pdf

Processing Study #274: Clara - Oller, 2016
 Reading: 2016, Clara - Oller.pdf

Processing Study #275: Kurian
 Reading: 2012, Kurian.pdf

Processing Study #276: Jagota - Bhidayasiri, 2014 
 Reading: 2014, Jagota - Bhidayasiri.pdf

Processing Study #277: Niehusmann, 2009
 Reading: 2009, Niehusmann.pdf

Processing Study #278: Maggina
 Reading: 2012, Maggina.pdf


Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #279: Makuch - Irani, 2018 
 Reading: 2018, Makuch - Irani.pdf

Processing Study #280: Bustos - Ortiz, 2017
 Reading: 2017, Bustos - Ortiz.pdf

Processing Study #281: Tantipalakorn - Tongsong, 2016
 Reading: 2016, Tantipalakorn - Tongsong.pdf

Processing Study #282: Zhang, Wang, Wang - Guo, 2018
 Reading: 2018, Zhang, Wang, Wang - Guo.pdf

Processing Study #283: Howard -Guntupalli, 2014 
 Reading: 2014, Howard - Guntupalli.pdf


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value



Processing Study #284: Zubair - Majid, 2018
 Reading: 2018, Zubair - Majid.pdf


Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value



Processing Study #285: Reyna-Villasmil – Herrera-Moya, 2017 
 Reading: 2017, Reyna-Villasmil – Herrera-Moya.pdf

Processing Study #286: Liba - Sediva, 2016
 Reading: 2016, Liba - Sediva.pdf


Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #287: Abdul-Rahman -Palmer, 2016 
 Reading: 2016, Abdul-Rahman -Palmer.pdf


Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #288: Lwanga - Lastra, 2018 
 Reading: 2018, Lwanga - Lastra.pdf

Processing Study #289: Mythri -Mathew, 2016
 Reading: 2016, Mythri -Mathew.docx

Processing Study #290: Simabukuro -  Anghinah. 2014 
 Reading: 2014, Simabukuro -  Anghinah.pdf

Processing Study #291: Qin, Wu - Zheng, 2017 
 Reading: 2017, Qin, Wu - Zheng.pdf


Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #292: Frawley
 Reading: 2011, Frawley.pdf

Processing Study #293: Shindo, 2009
 Reading: 2009, Shindo.pdf

Processing Study #294: Ozelle - Riquin, 2017 
 Reading: 2017, Ozelle - Riquin.pdf

Processing Study #295: Matsumoto -  Takahashi, 2017 
 Reading: 2017, Matsumoto -  Takahashi.pdf

Processing Study #296: Scheibe - Meise, 2017
 Reading: 2017, Scheibe - Meise.pdf

Processing Study #297: Bseikri, 2012
 Reading: 2012, Bseikri.pdf

Processing Study #298: Frunza-Stefan - Malek, 2018
 Reading: 2018, Frunza-Stefan - Malek.pdf

Processing Study #299: Shi, 2017 
 Reading: 2017, Shi.pdf

Processing Study #300: Gold
 Reading: 2010, Gold.pdf

Processing Study #301: Lee - Kim, 2014
 Reading: 2014, Lee - Kim.pdf

Processing Study #302: Sanmaneechai
 Reading: 2013, Sanmaneechai.pdf

Processing Study #303: Boeck - Stangel, 2013
 Reading: 2013, Boeck.pdf

Processing Study #304: Dogan Onugoren -Bien, 2016
 Reading: 2016, Dogan Onugoren - Bien.pdf

Processing Study #305: Brenton -Sch

Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value



Processing Study #316: Tojo
 Reading: 2011, Tojo.pdf


Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value



Processing Study #317: Benjumea-Cuartas - Kaminska, 2017
 Reading: 2017, Benjumea-Cuartas - Kaminska.pdf

Processing Study #318: Shimazaki
 Reading: 2008, Shimazaki.pdf

Processing Study #319: Simon, 2014
 Reading: 2014.12, Simon - AANA J.pdf

Processing Study #320: Mathis -Neau, 2015
 Reading: 2015, Mathis -Neau.pdf

Processing Study #321: Wegner - Nabavi, 2014
 Reading: 2014, Wegner - Nabavi.pdf


Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #322: Chourasia - Kamdar, 2018
 Reading: 2018, Chourasia - Kamdar.pdf

Processing Study #323: Byrne - King, 2014
 Reading: 2014, Byrne - King.pdf

Processing Study #324: Atmaca - Gurses, 2017
 Reading: 2017, Atmaca - Gurses.pdf

Processing Study #325: Thilagavathi -Cheong , 2013
 Reading: 2013, Thilagavathi -Cheong.pdf

Processing Study #326: Asai, 2011
 Reading: 2011, Asai.pdf

Processing Study #327: Sudan - Patil, 2016 
 Reading: 2016, Sudan - Patil.pdf

Processing Study #328: Mirza
 Reading: 2011, Mirza.pdf

Processing Study #329: Zhang, Li - Wang, 2017
 Reading: 2017, Zhang, Li - Wang.pdf

Processing Study #330: Schmiedeskamp , 2010
 Reading: 2010, Schmiedeskamp.pdf

Processing Study #331: Kort
 Reading: 2009, Kort.pdf

Processing Study #332: Hole
 Reading: 2014, Hole.pdf

Processing Study #333: Gabilondo, 2011
 Reading: 2011, Gabilondo.pdf
 Reading: 2011, Gabilondo Supplementary Appendix_e-1.docx

Processing Study #334: Ding - Son, 2017
 Reading: 2017, Ding - Son

Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #335: Sawamura -Tsuji, 2014
 Reading: 2014, Sawamura -Tsuji.pdf

Processing Study #336: Suleman - Javed, 2018
 Reading: 2018, Suleman - Javed.pdf

Processing Study #337: Armangue, Titulaer - Dalmau, 2013
 Reading: 2013, Armangue.pdf

Processing Study #338: Zhang - Zhou, 2018
 Reading: 2018, Zhang.pdf

Processing Study #339: Creten
 Reading: 2011, Creten.pdf

Processing Study #340: Sommeling -Santens, 2014
 Reading: 2014, Sommeling -Santens.pdf

Processing Study #341: Hallowell - Hand, 2017
 Reading: 2017, Hallowell - Hand.pdf

Processing Study #342: Shahani 
 Reading: 2015, Shahani.pdf

Processing Study #343: Kung
 Reading: 2011, Kung.pdf

Processing Study #344: Tachibana
 Reading: 2010, Tachibana.pdf

Processing Study #345: Foff - Quigg, 2017
 Reading: 2017, Foff - Quigg.pdf

Processing Study #346: Filatenkov - Rajaram , 2017
 Reading: 2017, Filatenkov - Rajaram.pdf

Processing Study #347: Prüss-Wandinger, 2010 and Finke, 2012
 Reading: 2010, Prüss.pdf

Processing S

Cannot set gray non-stroke color because /'Pattern1' is an invalid float value
Cannot set gray non-stroke color because /'Pattern2' is an invalid float value
Cannot set gray non-stroke color because /'Pattern3' is an invalid float value
Cannot set gray non-stroke color because /'Pattern4' is an invalid float value
Cannot set gray non-stroke color because /'Pattern5' is an invalid float value
Cannot set gray non-stroke color because /'Pattern6' is an invalid float value
Cannot set gray non-stroke color because /'Pattern7' is an invalid float value
Cannot set gray non-stroke color because /'Pattern8' is an invalid float value
Cannot set gray non-stroke color because /'Pattern9' is an invalid float value
Cannot set gray non-stroke color because /'Pattern10' is an invalid float value
Cannot set gray non-stroke color because /'Pattern11' is an invalid float value



Processing Study #392: Motta
 Reading: 2012, Motta.docx

Processing Study #393: Arshad - Zahid, 2018
 Reading: 2018, Arshad - Zahid.pdf

Processing Study #394: Arboleya - Julià, 2016
 Reading: 2016, Arboleya - Julià.pdf

Processing Study #395: Sakpichaisakul -Suwannachote, 2018
 Reading: 2018, Sakpichaisakul -Suwannachote.pdf

Processing Study #396: Sands, 2015
 Reading: 2015, Sands.pdf

Processing Study #397: Chatterjee - Mitra, 2017
 Reading: 2017, Chatterjee - Mitra.pdf

Processing Study #398: Steriade -Rae-Grant, 2018
 Reading: 2018, Steriade -Rae-Grant.pdf

Processing Study #399: Tapin
 Reading: 2013, Tapin.pdf

Processing Study #400: Cundiff - Shehata, 2015
 Reading: 2015, Cundiff - Shehata.pdf

Processing Study #401: Bushman
 Reading: 2011, Bushman.pdf

Processing Study #402: Punja - Schwartz, 2013
 Reading: 2013, Punja.pdf

Processing Study #403: Garcia – Lorenzo-Bosquet, 2015
 Reading: 2015, Garcia – Lorenzo-Bosquet.pdf

Processing Study #404: Borlot, 2012
 Reading: 2012, Bo

Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #423: Singh - Prabhakar, 2016
 Reading: 2016, Singh - Prabhakar.docx

Processing Study #424: Consoli
 Reading: 2011, Consoli.pdf


Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #425: McKeon - Robinson, 2016 
 Reading: 2016, McKeon - Robinson.pdf

Processing Study #426: Casanova-Gracia - Cortina-Lacambra, 2012
 Reading: 2012, Casanova-Gracia.pdf

Processing Study #427: Mitra
 Reading: 2018, Mitra.pdf

Processing Study #428: Berg - Coffey, 2015
 Reading: 2015, Berg - Coffey.pdf

Processing Study #429: Colley - Smith, 2014
 Reading: 2014, Colley - Smith.pdf

Processing Study #430: Leel - Bouhadiba, 2018 
 Reading: 2018, Leel - Bouhadiba.pdf

Processing Study #431: Dou
 Reading: 2012, Dou.pdf

Processing Study #432: Kalam - Singh-Curry, 2018
 Reading: 2018, Kalam - Singh-Curry.pdf

Processing Study #433: Cleverly  -Navaratnarajah, 2014
 Reading: 2014, Cleverly  -Navaratnarajah.pdf

Processing Study #434: Wong - Fries, 2014
 Reading: 2014, Wong - Fries.pdf

Processing Study #435: Llorens
 Reading: 2010, Llorens.pdf

Processing Study #436: Fields
 Reading: 2013, Fields.pdf

Processing Study #437: Ryan
 Reading: 2013, Ryan.pdf

Processing Study #43

Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats



Processing Study #445: Abe -Yamamoto, 2016
 Reading: 2016, Abe -Yamamoto.pdf


Could get FontBBox from font descriptor because None cannot be parsed as 4 floats



Processing Study #446: Raha, 2012
 Reading: 2012, Raha.pdf

Processing Study #447: Xu - Dai, 2016
 Reading: 2016, Xu - Dai.pdf

Processing Study #448: Labate
 Reading: 2013, Labate.pdf

Processing Study #449: Bravo-Oro - Campos-Guevara, 2013
 Reading: 2013, Bravo-Oro - Campos-Guevara.pdf

Processing Study #450: Menon - Thomas, 2018
 Reading: 2018, Menon - Thomas.pdf

Processing Study #451: Novillo-López, Graus
 Reading: 2008, Novillo-López.pdf

Processing Study #452: Motoyama - Tanaka, 2010
 Reading: 2010, Motoyama.docx

Processing Study #453: Almuslamani - Mahmood, 2015
 Reading: 2015, Almuslamani - Mahmood.pdf

Processing Study #454: van Vliet
 Reading: 2012, van Vliet.docx

Processing Study #455: Wu - Zhang, 2016
 Reading: 2016, Wu - Zhang.pdf

Processing Study #456: Padma
 Reading: 2011, Padma.docx

Processing Study #457: Kim, Kang - Day, 2018
 Reading: 2018, Kim, Kang - Day.pdf

Processing Study #458: Appu, Noetzel, 2014
 Reading: 2014, Appu - Noetzel.pdf

Processing Study #459:

Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #468: Probasco - Kaplan, 2014 
 Reading: 2014, Probasco - Kaplan.pdf

Processing Study #469: Moura - Talina, 2016
 Reading: 2016, Moura - Talina.pdf

Processing Study #470: Joe - Desai, 2016 
 Reading: 2016, Joe - Desai.pdf

Processing Study #471: Shin - Chu, 2018
 Reading: 2018, Shin.pdf
 Reading: 2018, Shin. Supplementary acn3557-sup-0001-tables1-s2.docx

Processing Study #472: Aung - Grageda, 2017
 Reading: 2017, Aung - Grageda.pdf

Processing Study #473: Tanyi, 2012
 Reading: 2012, Tanyi.pdf

Processing Study #474: Palakkuzhiyil
 Reading: 2018, Palakkuzhiyil.pdf

Processing Study #475: Bravo-Oro - Reyes-Vaca, 2015
 Reading: 2015, Bravo-Oro - Reyes-Vaca.pdf

Processing Study #476: Wali
 Reading: 2011, Wali.pdf

Processing Study #477: Yu
 Reading: 2011, Yu.pdf

Processing Study #478: Kohler - Fassbender, 2015
 Reading: 2015, Kohler - Fassbende.pdf

Processing Study #479: Yang - Guan, 2015
 Reading: 2015, Guan - Jia.pdf

Processing Study #480: Hachiya
 Reading: 2013,

Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #504: Fan , Xu - Cui, 2018 
 Reading: 2018, Fan , Xu - Cu.pdf

Processing Study #505: Tzang
 Reading: 2018, Tzang.pdf

Processing Study #506: Batra
 Reading: 2012, Batra.pdf

Processing Study #507: Day - Munoz, 2014
 Reading: 2014, Day - Munoz.pdf

Processing Study #508: Dabner, 2012
 Reading: 2012, Dabner.pdf

Processing Study #509: Zandi
 Reading: 2009, Zandi.pdf

Processing Study #510: Phillips - DiFazio, 2017
 Reading: 2017, Phillips - DiFazio.pdf

Processing Study #511: Doden - Ikeda, 2017
 Reading: 2017, Doden - Ikeda.pdf

Processing Study #512: Nunez-Enamorado
 Reading: 2012, Nunez-Enamorado.pdf

Processing Study #513: Sakamoto - Nakamura, 2013
 Reading: 2013, Sakamoto.pdf

Processing Study #514: Dulcey
 Reading: 2012, Dulcey.pdf

Processing Study #515: Afanasiev - Psimaras, 2016
 Reading: 2016, Afanasiev - Psimaras.pdf

Processing Study #516: Schumacher -  MacKenzie, 2016
 Reading: 2016, Schumacher -  MacKenzie.pdf

Processing Study #517: Sameshima
 Reading: 2

Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #520: Noble - Lancaster, 2018
 Reading: 2018, Noble - Lancaster.pdf

Processing Study #521: Okanishi - Enoki, 2018
 Reading: 2018, Okanishi - Enoki.pdf

Processing Study #522: Peacock -Syed, 2016
 Reading: 2016, Peacock -Syed.pdf

Processing Study #523: Sato - Arai, 2018 
 Reading: 2018, Sato - Arai.pdf

Processing Study #524: Toral - Dumas, 2018
 Reading: 2018, Toral - Dumas.pdf

Processing Study #525: Rypulak - Czuczwar, 2016 
 Reading: 2016, Rypulak - Czuczwar.pdf

Processing Study #526: Schimmel
 Reading: 2009, Schimmel.pdf

Processing Study #527: Finke - Ruprecht, 2014
 Reading: 2014, Finke - Ruprecht.pdf

Processing Study #528: Allen
 Reading: 2012, Allen.pdf

Processing Study #529: Naeije
 Reading: 2010, Naeije.pdf

Processing Study #530: Xiao, Gui - Zhou, 2017
 Reading: 2017, Xiao, Gui - Zhou.pdf

Processing Study #531: Barros - Figueiroa, 2014
 Reading: 2014, Barros - Figueiroa.pdf

Processing Study #532: Heresco-Levy - Mori, 2015
 Reading: 2015, Heresco-Levy

Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #609: Li - Ren, 2015
 Reading: 2015, Li - Ren.pdf

Processing Study #610: Li - Zhao, 2015
 Reading: 2015, Li - Zhao.pdf

Processing Study #611: Lim - Yip, 2017
 Reading: 2017, Lim - Yip.pdf

Processing Study #612: Liu - Han, 2015
 Reading: 2015, Lu - Han.pdf


Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported
Cannot set non-stroke color because 2 components are specified but only 1 (grayscale), 3 (rgb) and 4 (cmyk) are supported



Processing Study #613: Liu - Liu, 2015
 Reading: 2015, Liu - Liu.pdf

Processing Study #614: Lu - Lu, 2016
 Reading: 2016, Lu - Lu.pdf

Processing Study #615: Martín-Viota
 Reading: 2012, Martín-Viota.pdf

Processing Study #616: Martínez
 Reading: 2012, Martínez.pdf

Processing Study #617: Mathai - Janssen, 2016
 Reading: 2017, Mathai - Janssen.pdf

Processing Study #618: Millichap
 Reading: 2011, Millichap.pdf

Processing Study #619: Monteiro - das Neves, 2015
 Reading: 2015, Monteiro - das Neves.pdf


Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value



Processing Study #620: Murdie - Ferguson, 2016
 Reading: 2016, Murdie - Ferguson.pdf

Processing Study #621: Nagata - Mitsuo, 2018
 Reading: 2018, Nagata - Mitsuo.pdf

Processing Study #622: Ng -Mirsattari, 2018 
 Reading: 2018, Ng -Mirsattari.pdf

Processing Study #623: Parfene - Gordon-Elliott, 2016 
 Reading: 2016, Parfene - Gordon-Elliott.pdf

Processing Study #624: Passareli - Rocha, 2016
 Reading: 2016, Passareli - Rocha.pdf

Processing Study #625: Player - Croi, 2015x
 Reading: 2015, Player - Croi.pdf

Processing Study #626: Poloni, 2010
 Reading: 2009, Poloni.pdf

Processing Study #627: Pruss - Ebinger, 2014 
 Reading: 2014, Pruss - Ebinger.pdf

Processing Study #628: Prüss-Wandinger, 2010
 Reading: 2010, Prüss.pdf

Processing Study #629: Reid
 Reading: 2013, Reid.pdf

Processing Study #630: Rutledge - Tubridy, 2016
 Reading: 2016, Rutledge - Tubridy.pdf

Processing Study #631: Sacré, 2011
 Reading: 2011, Sacré.pdf

Processing Study #632: Safadieh
 Reading: 2013, Safadieh.pd

In [19]:
# 2.3 Drop the studies whose source files are duplicated or unreadable

### Study names whose content also appears under another study name
duplicate_studies = [
    "Tituler, Höftberger - Dalmau, 2014\n\nKruer, 2010",
    "Tituler, Höftberger - Dalmau, 2014\n\nYamamoto, 2013\n\nKokubun, 2016",
    "Tituler, Höftberger - Dalmau, 2014\n\nSakamoto, 2013",
    "Prüss-Wandinger, 2010 and Finke, 2012",
]
### Study names linked to files that could not be read
unreadable_file = ["Raynor -Berkowitz, 2016"]

for study_key in [*duplicate_studies, *unreadable_file]:
    if study_key not in study_to_text:
        ### Nothing to delete: report the missing key and move on
        print(f"⚠️ Not found: {study_key!r}")
        continue
    study_to_text.pop(study_key)
    print(f" Removed: {study_key!r}")

### Number of studies left in study_to_text after the removals above
print(f" Total number of studies in the final dataset: {len(study_to_text)}")

 Removed: 'Tituler, Höftberger - Dalmau, 2014\n\nKruer, 2010'
 Removed: 'Tituler, Höftberger - Dalmau, 2014\n\nYamamoto, 2013\n\nKokubun, 2016'
 Removed: 'Tituler, Höftberger - Dalmau, 2014\n\nSakamoto, 2013'
 Removed: 'Prüss-Wandinger, 2010 and Finke, 2012'
 Removed: 'Raynor -Berkowitz, 2016'
 Total number of studies in the final dataset: 647


In [21]:
# 2.4 Save the extracted and structured text data as JSON
import json  ### Not among the imports in cell 0.2; imported here so this cell runs on a fresh kernel

os.makedirs(output_dir, exist_ok=True)  ### open(..., 'w') fails if the output folder does not exist yet
with open(os.path.join(output_dir, 'study_to_text.json'), 'w', encoding='utf-8') as f:
    ### ensure_ascii=False writes non-ASCII characters (e.g. "ö", "ü") as-is instead of \u escapes
    json.dump(study_to_text, f, ensure_ascii=False, indent=2)

In [23]:
# 3.1 Read the cases-per-study CSV and expose its "0" column, cast to string, as "Study Name"
cases_df = pd.read_csv(CSV_path)
cases_df = cases_df.assign(**{"Study Name": cases_df["0"].astype(str)})

In [25]:
# 3.2 Clean the cases-per-study table: pool duplicated study names into one row
#     and drop the study linked to the unreadable file

# 3.2.1 Target study name -> all name variants whose case counts are pooled under it
merge_mapping = {
    "Tituler, Höftberger - Dalmau, 2014": [
        "Tituler, Höftberger - Dalmau, 2014",
        "Tituler, Höftberger - Dalmau, 2014\n\nKruer, 2010",
        "Tituler, Höftberger - Dalmau, 2014\n\nYamamoto, 2013\n\nKokubun, 2016",
        "Tituler, Höftberger - Dalmau, 2014\n\nSakamoto, 2013",
    ],
    "Prüss-Wandinger, 2010": [
        "Prüss-Wandinger, 2010",
        "Prüss-Wandinger, 2010 and Finke, 2012",
    ],
}

# 3.2.2 One row per target name, holding the summed "count" of all its variants
merged_df = pd.DataFrame([
    {
        "Study Name": target_name,
        "count": cases_df.loc[cases_df["Study Name"].isin(variant_names), "count"].sum(),
    }
    for target_name, variant_names in merge_mapping.items()
])
print(" The merged studies and their sum of case number:")
print(merged_df)

# 3.2.3 Drop every variant row, then append the merged rows at the end
all_variant_names = [name for variants in merge_mapping.values() for name in variants]
is_variant_row = cases_df["Study Name"].isin(all_variant_names)
cases_df = pd.concat([cases_df[~is_variant_row], merged_df], ignore_index=True)

# 3.2.4 Drop the row of the study linked to the unreadable file
cases_df = cases_df.loc[cases_df["Study Name"].ne("Raynor -Berkowitz, 2016")]

# 3.2.5 Report how many studies remain in the cases-per-study table
print(f"\n Final dataset size: {len(cases_df)}")

 The merged studies and their sum of case number:
                           Study Name  count
0  Tituler, Höftberger - Dalmau, 2014     22
1               Prüss-Wandinger, 2010      6

 Final dataset size: 647


In [27]:
# 3.3 Split study names into single-case reports (count == 1) and multi-case series (count > 1)
case_counts = cases_df["count"]
one_case_reports = cases_df.loc[case_counts == 1, "Study Name"].tolist()
multi_case_series = cases_df.loc[case_counts > 1, "Study Name"].tolist()

### Report the size of each category
print(f" Number of case reports with only one case: {len(one_case_reports)}")
print(f" Number of case series with multiple cases: {len(multi_case_series)}")

 Number of case reports with only one case: 470
 Number of case series with multiple cases: 177


In [29]:
# 3.4 Randomly sample studies for pilot and development sets
random.seed(42)                                                                  ### Fixed seed so the sampling below is repeatable

pilot_sample = random.sample(one_case_reports, 5)

### Exclude the pilot sample from the rest of the single-case reports.
### The remainder is built as an order-preserving, de-duplicated list instead of
### list(set(...) - set(...)): the iteration order of a set of strings depends on
### per-process hash randomization, so sampling from it could return different
### studies after every kernel restart even with the seed fixed.
pilot_names = set(pilot_sample)
remaining_one_case = [name for name in dict.fromkeys(one_case_reports) if name not in pilot_names]

development_one_case_sample = random.sample(remaining_one_case, 20)
development_multi_cases_sample = random.sample(multi_case_series, 10)

In [31]:
# 3.5 Build the pilot, development, and validation sets from the study-text dictionary
pilot_set = {name: study_to_text[name] for name in pilot_sample}

### Development set = sampled single-case reports followed by sampled multi-case series
development_keys = [*development_one_case_sample, *development_multi_cases_sample]
development_set = {name: study_to_text[name] for name in development_keys}

### Every study not picked for the pilot or development set goes to the validation set
all_selected_keys = set(pilot_sample) | set(development_keys)
validation_set = {
    name: text
    for name, text in study_to_text.items()
    if name not in all_selected_keys
}

In [33]:
# 3.6 Write the pilot, development, and validation sets to JSON files in output_dir
for file_name, dataset in (
    ('pilot_set.json', pilot_set),
    ('development_set.json', development_set),
    ('validation_set.json', validation_set),
):
    with open(os.path.join(output_dir, file_name), 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=2)