In [4]:
%pip install PyMuPDF

Note: you may need to restart the kernel to use updated packages.


In [5]:
import os
import tensorflow as tf
import fitz
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# Hide all CUDA devices from this process so inference runs on the CPU.
# NOTE(review): this assignment executes after `import tensorflow` above;
# setting the variable before the import is the safer ordering — confirm it
# still takes effect in the target environment.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

def extract_images_from_pdf(pdf_path, output_folder="extracted_images"):
    """Extract embedded images from a PDF and save them as PNG files.

    Images narrower or shorter than 100 pixels are skipped. Images whose
    colorspace has four or more colour components (e.g. CMYK, with or
    without alpha) are converted to RGB first, since PNG cannot store them.
    Files are named ``page_<page>_img_<index>.png`` using 1-based numbers.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives the PNG files; it is created
            when missing.

    Returns:
        The number of PNG files written. A failure on a single image is
        reported and skipped; a failure of the whole document (for example
        an unreadable file) is reported and the count written so far
        (0 if nothing was written) is returned.
    """
    os.makedirs(output_folder, exist_ok=True)

    extracted_images = 0

    try:
        # The context manager closes the document even if processing fails.
        with fitz.open(pdf_path) as pdf_document:
            for page_num, page in enumerate(pdf_document, start=1):
                image_list = page.get_images(full=True)

                for img_index, img in enumerate(image_list, start=1):
                    try:
                        xref = img[0]  # first item of each entry is the image xref
                        pix = fitz.Pixmap(pdf_document, xref)

                        # Skip small images (icons, rules, decorations).
                        if pix.width < 100 or pix.height < 100:
                            continue

                        # Four or more colour components (ignoring alpha)
                        # cannot be written as PNG: convert to RGB.
                        if pix.n - pix.alpha >= 4:
                            pix = fitz.Pixmap(fitz.csRGB, pix)

                        # Encode before opening the file so a failed
                        # encode does not leave an empty PNG behind.
                        png_bytes = pix.tobytes("png")

                        img_name = f"page_{page_num}_img_{img_index}.png"
                        img_path = os.path.join(output_folder, img_name)
                        with open(img_path, "wb") as img_file:
                            img_file.write(png_bytes)
                        extracted_images += 1

                    except Exception as exc:
                        # Best effort: one bad image must not stop the run.
                        print(f"Warning: skipped image {img_index} on page "
                              f"{page_num}: {exc}")
                        continue

    except Exception as exc:
        # Best effort at the document level too, but say what went wrong.
        print(f"Warning: could not process PDF {pdf_path!r}: {exc}")

    return extracted_images

def load_model(model_path='medical_classifier.keras'):
    """Load the medical image classifier inside a CPU device scope.

    Args:
        model_path: Path of the saved Keras model.

    Returns:
        The loaded model, or None when loading fails. The reason for the
        failure is printed so that a missing or corrupt model file can be
        diagnosed.
    """
    try:
        with tf.device('/CPU:0'):
            return tf.keras.models.load_model(model_path)
    except Exception as exc:
        # Callers treat None as "no model"; keep that contract but do not
        # hide why loading failed.
        print(f"Could not load model from {model_path!r}: {exc}")
        return None

def preprocess_image(image_path, target_size=(224, 224)):
    """Load an image file and turn it into a single-image model input batch.

    The image is converted to RGB (RGBA images are composited onto a white
    background using their alpha channel), resized to ``target_size`` with
    Lanczos resampling, scaled to the range [0, 1] as float32 and given a
    leading axis of length 1.

    Args:
        image_path: Path of the image file to read.
        target_size: ``(width, height)`` passed to the resize call.

    Returns:
        A float32 NumPy array with a leading axis of length 1, or None when
        the image cannot be read or converted (the reason is printed).
    """
    try:
        # The context manager releases the file handle that Image.open
        # would otherwise keep open.
        with Image.open(image_path) as source:
            if source.mode == 'RGBA':
                # Flatten transparency onto white instead of dropping alpha.
                rgb = Image.new('RGB', source.size, (255, 255, 255))
                rgb.paste(source, mask=source.split()[3])
            elif source.mode != 'RGB':
                rgb = source.convert('RGB')
            else:
                rgb = source

            # Resize while the file is still open: pixel data is loaded lazily.
            resized = rgb.resize(target_size, Image.Resampling.LANCZOS)

        img_array = np.array(resized, dtype=np.float32) / 255.0
        return np.expand_dims(img_array, axis=0)
    except Exception as exc:
        # Callers treat None as "skip this image"; report the cause.
        print(f"Could not preprocess {image_path!r}: {exc}")
        return None

def _show_medical_image(image_path, image_file, confidence):
    """Display one image with a title giving its file name and confidence."""
    # Close the file handle once the pixels have been handed to matplotlib.
    with Image.open(image_path) as img:
        fig = plt.figure(figsize=(6, 6))
        plt.imshow(img)
        plt.axis('off')
        plt.title(f"MEDICAL IMAGE: {image_file}\nConfidence: {confidence:.3f}",
                  fontsize=12, fontweight='bold', color='green')
        plt.show()
    # Release the figure so figures do not accumulate over many images.
    plt.close(fig)


def classify_and_display_images(model_path='medical_classifier.keras', images_folder="extracted_images"):
    """Classify every image in a folder and display those judged medical.

    Each file with a supported extension is preprocessed and scored by the
    model. A score of 0.5 or lower is treated as "medical", with confidence
    ``1 - score``.
    NOTE(review): this assumes the model's single output is higher for
    non-medical images — confirm against the training label encoding.

    Args:
        model_path: Path of the saved Keras model.
        images_folder: Directory holding the images to classify.

    Returns:
        A list of dicts with keys ``'filename'``, ``'path'`` and
        ``'confidence'`` (a float), one per image classified as medical.
        Returns an empty list when the model cannot be loaded or the
        folder does not exist.
    """
    model = load_model(model_path)
    if model is None:
        print("Failed to load model")
        return []

    # isdir also rejects a regular file of that name, which listdir cannot read.
    if not os.path.isdir(images_folder):
        print("No images folder found")
        return []

    # Sorted so that processing order and the returned list are deterministic.
    image_files = sorted(
        f for f in os.listdir(images_folder)
        if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))
    )

    medical_images = []

    print(f"Classifying {len(image_files)} extracted images...")
    print("="*50)

    for image_file in image_files:
        image_path = os.path.join(images_folder, image_file)
        img_array = preprocess_image(image_path)

        if img_array is None:
            print(f"Skipping unreadable image: {image_file}")
            continue

        try:
            with tf.device('/CPU:0'):
                prediction = float(model.predict(img_array, verbose=0)[0][0])

            if prediction <= 0.5:  # Medical classification
                confidence = 1.0 - prediction
                medical_images.append({
                    'filename': image_file,
                    'path': image_path,
                    'confidence': confidence
                })

                _show_medical_image(image_path, image_file, confidence)

                print(f"✓ MEDICAL: {image_file} (confidence: {confidence:.3f})")

        except Exception as exc:
            # Best effort: keep going, but report which image failed and why.
            print(f"Warning: could not classify {image_file}: {exc}")
            continue

    print(f"\nFound {len(medical_images)} medical images total")
    return medical_images

def extract_medical_images_from_pdf(pdf_path, model_path='medical_classifier.keras',
                                    images_folder="extracted_images"):
    """Extract the images of a PDF, classify them and display the medical ones.

    Args:
        pdf_path: Path of the PDF file to process.
        model_path: Path of the saved Keras classifier.
        images_folder: Directory used both to store the extracted images and
            to read them back for classification. Passing it explicitly to
            both steps keeps them on the same folder instead of relying on
            their separate defaults matching.

    Returns:
        The list produced by the classification step (one dict per medical
        image), or an empty list when no image was extracted.
    """
    print(f"Processing PDF: {pdf_path}")

    # Extract all images
    extracted_count = extract_images_from_pdf(pdf_path, images_folder)
    print(f"Extracted {extracted_count} images from PDF")

    if extracted_count == 0:
        print("No images found in PDF")
        return []

    # Classify and display medical images
    return classify_and_display_images(model_path, images_folder)

# Usage: run the full pipeline on one PDF and print a summary of the hits.
if __name__ == "__main__":
    pdf_file = "/content/a-case-study-pneumonia-2329-6879-1000242.pdf"
    medical_images = extract_medical_images_from_pdf(pdf_file)

    # Header and count are emitted together, one per line.
    print(
        f"\n🎯 FINAL RESULTS:",
        f"Found {len(medical_images)} medical images:",
        sep="\n",
    )
    # One bullet per image that was classified as medical.
    for entry in medical_images:
        print(f"- {entry['filename']} (confidence: {entry['confidence']:.3f})")


ModuleNotFoundError: No module named 'tensorflow'