# Libraries

In [None]:
#pip install pandas

In [None]:
#pip install PyPDF2

In [None]:
#pip install pdfplumber

In [None]:
#pip install Pillow

# Extract Text

In [130]:
import os
import pandas as pd
import PyPDF2
import re

def pdf_extract(folder):
    """Extract and clean the text of every PDF file directly inside ``folder``.

    For each PDF the text of all pages is concatenated, whitespace is
    collapsed to single spaces, and journal boilerplate (journal name,
    volume/number markers, "<month> <year>" dates, pipe characters, the
    "[ musculoskeletal imaging ]" section tag and a leading number) is
    removed case-insensitively.

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        pandas.DataFrame with one row per PDF and the columns
        ``file_name`` and ``extracted_text``. The columns are present even
        when the folder contains no PDFs.

    Raises:
        OSError: If ``folder`` or one of the PDF files cannot be read.
    """
    # Boilerplate patterns, compiled once instead of once per file.
    patterns = [
        re.compile(pattern, flags=re.IGNORECASE)
        for pattern in (
            r'journal of orthopaedic & sports physical therapy',
            r'\b(volume|number)\s+\d+',
            r'\b(january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{4}',
            r'\|',
            r'\[\s*musculoskeletal imaging\s*\]',
            r'^\s*\d+\s+',
        )
    ]

    data = []
    for filename in os.listdir(folder):
        # Case-insensitive so "REPORT.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(folder, filename)
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Join pages with a space so the last word of one page does not
            # fuse with the first word of the next; must run while the file
            # is still open because pages are read on demand.
            extracted_text = ' '.join(page.extract_text() or '' for page in pdf_reader.pages)

        # Normalize whitespace before matching so the patterns see single spaces.
        extracted_text = re.sub(r'\s+', ' ', extracted_text).strip()

        for pattern in patterns:
            extracted_text = pattern.sub('', extracted_text)

        # Remove spaced hyphens left over from the source layout.
        extracted_text = extracted_text.replace(" - ", "")

        # Removing boilerplate leaves runs of spaces behind; collapse them again.
        extracted_text = re.sub(r'\s+', ' ', extracted_text).strip()

        data.append({"file_name": filename, "extracted_text": extracted_text})

    # Explicit columns keep the schema stable when no PDF was found.
    return pd.DataFrame(data, columns=["file_name", "extracted_text"])


In [131]:
# Example usage: run the text extraction over every PDF in the
# "sample_data" folder (path is relative to the notebook's working
# directory) and print the resulting DataFrame.
folder_path = "sample_data"
text_df = pdf_extract(folder_path)
print(text_df)

                    file_name  \
0          mcinerney_2107.pdf   
1             kosnik_2018.pdf   
2   cpa_valuept_erdept-en.pdf   
3             beneck_2017.pdf   
4          hernandez_2016.pdf   
5            bittner_2018.pdf   
6              carow_2013.pdf   
7           arumugam_2018.pdf   
8                cho_2013.pdf   
9            halfpap_2016.pdf   
10            callan_2016.pdf   
11           gilotra_2016.pdf   
12           glensek_2013.pdf   

                                       extracted_text  
0   A 57-year-old man with insidious onset of prog...  
1   A 13-year-old male gymnast presented via direc...  
2   Physiotherapists working in the emergency depa...  
3   A 57-year-old woman was recruited for a resear...  
4   the patient was a 20-year-old Division I femal...  
5   A 69-year-old man was referred to physical the...  
6   The patient was a 25-year-old man who was curr...  
7   A 36-year-old man with insidious onset of post...  
8   The patient was a 21-year-old

In [132]:
# Spot check: print the full cleaned text of the row labelled 0 in text_df.
print(text_df.loc[0, 'extracted_text'])

A 57-year-old man with insidious onset of progressive bilateral upper extremity weakness over approximately 1 year was referred to physical therapy by his primary care physician. Visual assessment of the patient revealed atrophy throughout the bilateral upper extremities, with an inability to reach the arms above shoulder height. Neurological exam findings revealed intact sensation, reflexes, cranial nerve function, coordination, and lower extremity strength. Babinski sign, clonus, and Hoffmann’s reflex were negative. The patient reported an absence of pain and demonstrated 2+/5 strength in the bilateral C5-T1 myotomes. Due to progressive bilateral weakness, the physical therapist referred the patient back to his primary care physician with a request for cervical magnetic resonance imaging (MRI) and neurology consultation. Differential diagnosis included cervical myelopathy. The American College of Radiology Appropriateness Criteria recommend MRI in cases of myelopathy with slow, progr

# Extract Images

In [85]:
import os
import base64
import pandas as pd
import pdfplumber
import io
from PIL import Image

def extract_images(folder):
    """Collect the embedded images of every PDF directly inside ``folder``.

    Each image that Pillow can decode is re-encoded as JPEG and stored as a
    base64 string under the keys ``image_1``, ``image_2``, ... (numbered per
    PDF, in page order).

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        pandas.DataFrame with one row per PDF and the columns ``file_name``
        and ``extracted_images`` (a dict mapping image key to base64 JPEG
        string; empty when the PDF has no decodable images). The columns are
        present even when the folder contains no PDFs.

    Raises:
        OSError: If ``folder`` cannot be listed or a PDF cannot be opened.
    """
    # One dictionary per PDF, turned into a DataFrame at the end.
    data_list = []

    for filename in os.listdir(folder):
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(folder, filename)

        # Base64-encoded images of this PDF, keyed "image_<n>".
        images_dict = {}

        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                # page.images is the list of image objects on the page (empty
                # when there are none). The previous guard looked for a key
                # "images" in page.objects, which never matched, so no image
                # was ever extracted.
                for img_dict in page.images:
                    # The image bytes live on the object's PDF stream.
                    raw_bytes = img_dict['stream'].get_data()
                    try:
                        im = Image.open(io.BytesIO(raw_bytes))
                        # JPEG cannot store alpha or palette modes.
                        if im.mode not in ("L", "RGB", "CMYK"):
                            im = im.convert("RGB")
                        buffered = io.BytesIO()
                        im.save(buffered, format="JPEG")
                    except (OSError, ValueError):
                        # Best effort: streams that are not in a format
                        # Pillow recognizes (e.g. bare pixel data) are skipped.
                        continue

                    img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')
                    images_dict[f'image_{len(images_dict) + 1}'] = img_str

        data_list.append({
            'file_name': filename,
            'extracted_images': images_dict
        })

    # Explicit columns keep the schema stable when no PDF was found.
    return pd.DataFrame(data_list, columns=['file_name', 'extracted_images'])

In [86]:
# Run the image extraction over the same "sample_data" folder; the result
# has one row per PDF with its images stored as a dict of base64 strings.
image_df = extract_images('sample_data')