In [2]:
import fitz 
import re
from pathlib import Path
from collections import Counter
import json

# Location of the CoCoA coding guide, relative to the working directory.
pdf_path = Path("../data/CoCoA.pdf")

# Open the document once; the handle is reused by every analysis step of the
# script and released by the final `doc.close()`.
doc = fitz.open(pdf_path)
print(f"Total pages: {len(doc)}")
print(f"Metadata: {doc.metadata}")
print(f"\nPDF is {'encrypted' if doc.is_encrypted else 'not encrypted'}")
# `is_pdf` only tells whether the opened file is a PDF at all (PyMuPDF can
# also open other formats); it carries no information about PDF/A
# conformance, so the label must not claim any.
print(f"Document format: {'PDF' if doc.is_pdf else 'not a PDF'}")


# Zero-based page indices to preview, spread over the whole document.
sample_pages = [0, 1, 2, 5, 10, 50, 100, 200, 500, 800]

for page_num in sample_pages:
    # Indices past the end of the document are silently skipped.
    if page_num >= len(doc):
        continue

    page = doc[page_num]
    text = page.get_text()
    separator = '=' * 80

    # Banner, then the first 1500 characters of the page's extracted text.
    print("\n" + separator)
    print(f"PAGE {page_num}")
    print(separator)
    print(text[:1500])


print("\n\n" + "="*80)
print("ANALYZING CIM-10 CODE PATTERNS")
print("="*80)


# Regular expressions for the code shapes looked for in the guide.
patterns = {
    # A12 or A12.3. The decimal part is an all-or-nothing group: a dot is
    # only consumed together with its digit, so "A12.34" yields "A12"
    # instead of the truncated "A12." and "A123" is no longer accepted.
    'standard': r'\b[A-Z]\d{2}(?:\.\d)?\b',
    'with_dash': r'\b[A-Z]\d{2}-[A-Z]\d{2}\b',  # A10-A15 (ranges)
    'detailed': r'\b[A-Z]\d{2}\.\d{1,2}\b',  # A12.3 or A12.34
}

# Compile once, outside the page loop. `patterns` itself keeps plain strings
# so that other code can still pass them to `re` functions.
compiled_patterns = {name: re.compile(regex) for name, regex in patterns.items()}

# Pattern name -> every match found, duplicates included.
all_codes = {key: [] for key in patterns}

# first 200 pages
for page_num in range(min(200, len(doc))):
    page = doc[page_num]
    text = page.get_text()

    for pattern_name, regex in compiled_patterns.items():
        # None of the patterns has a capturing group, so findall() returns
        # the full matched strings.
        all_codes[pattern_name].extend(regex.findall(text))

print("\nCode Pattern Analysis:")
for pattern_name, codes in all_codes.items():
    unique_codes = set(codes)
    print(f"\n{pattern_name}: {len(codes)} total, {len(unique_codes)} unique")
    # Sorted so the sample is reproducible: set iteration order is not.
    print(f"Sample: {sorted(unique_codes)[:10]}")

# Most common codes
all_found = all_codes['standard']
print("\n\nMost common CIM-10 codes (first 200 pages):")
for code, count in Counter(all_found).most_common(30):
    print(f"  {code}: {count} occurrences")

# Cell 5: Detect document structure
print("\n\n" + "=" * 80)
print("DOCUMENT STRUCTURE ANALYSIS")
print("=" * 80)

# Each structural keyword (lowercase French) maps to the list of page indices
# whose lowercased text contains it as a substring.
structure_keywords = {
    keyword: []
    for keyword in (
        'sommaire',
        'table des matières',
        'chapitre',
        'section',
        'introduction',
        'règle',
        'règles générales',
        'règles spécifiques',
        'codage',
        'exemple',
        'note',
        'attention',
        'inclus',
        'exclus',
        'à ne pas coder',
    )
}

# first 100 pages
for page_num in range(min(100, len(doc))):
    page = doc[page_num]
    text = page.get_text()
    text_lower = text.lower()

    for keyword, hit_pages in structure_keywords.items():
        if keyword in text_lower:
            # A page is recorded at most once per keyword.
            hit_pages.append(page_num)

print("\nKeyword Distribution (first 100 pages):")
for keyword, pages in structure_keywords.items():
    if not pages:
        # Keywords that never appeared are left out of the report.
        continue
    print(f"  '{keyword}': {len(pages)} occurrences on pages {pages[:10]}")


print("\n\n" + "="*80)
print("IDENTIFYING KEY SECTIONS")
print("="*80)


# Table of contents: look for its French headings in the first 20 pages.
toc_pages = []
for page_num in range(min(20, len(doc))):
    page = doc[page_num]
    text = page.get_text()
    text_lower = text.lower()

    if 'sommaire' in text_lower or 'table des matières' in text_lower:
        toc_pages.append(page_num)
        print(f"\n--- TABLE OF CONTENTS found on page {page_num} ---")
        print(text[:2000])


# Introduction / general rules: scan pages 20-99, clamped to the document
# length so that a document shorter than 100 pages does not fail on an
# out-of-range page index (the range is simply empty below 21 pages).
intro_pages = []
for page_num in range(20, min(100, len(doc))):
    page = doc[page_num]
    text = page.get_text()
    text_lower = text.lower()

    # The length threshold skips pages with little extracted text.
    if ('introduction' in text_lower or 'règles générales' in text_lower) and len(text) > 500:
        intro_pages.append(page_num)
        if len(intro_pages) <= 2:  # only preview the first two hits
            print(f"\n--- INTRODUCTION/RULES on page {page_num} ---")
            print(text[:1000])


print("\n\n" + "=" * 80)
print("ANALYZING CODE PRESENTATION FORMAT")
print("=" * 80)


# Entries are (page index, number of codes found, first five codes) for every
# page in the 100-299 window holding at least five 'standard' matches.
code_dense_pages = []
for page_num in range(100, min(300, len(doc))):
    page = doc[page_num]
    text = page.get_text()
    codes = re.findall(patterns['standard'], text)

    if len(codes) < 5:
        # Too few codes for the page to count as code-dense.
        continue
    code_dense_pages.append((page_num, len(codes), codes[:5]))

print(f"\nFound {len(code_dense_pages)} code-dense pages (100-300 range)")
print("\nSample code-dense pages:")


# Show the beginning of the first three code-dense pages.
for page_num, code_count, sample_codes in code_dense_pages[:3]:
    page = doc[page_num]
    text = page.get_text()
    divider = '=' * 80
    print("\n" + divider)
    print(f"Page {page_num} - {code_count} codes - Sample: {sample_codes}")
    print(divider)
    print(text[:1500])


print("\n\n" + "="*80)
print("EXTRACTING CHAPTER STRUCTURE")
print("="*80)

chapters = []
# Only the word "chapitre" is matched case-insensitively (scoped inline
# flag). The numeral keeps its case: uppercase Roman or Arabic digits.
# Matching used to run on a lowercased copy of the text, where the uppercase
# Roman class could never match. The trailing \b stops an ordinary word that
# merely starts with a Roman-numeral letter (e.g. "Chapitre Maladies") from
# being read as a chapter number.
chapter_pattern = r'(?i:chapitre)\s+([IVXLCDM]+|[0-9]+)\b'
chapter_regex = re.compile(chapter_pattern)

for page_num in range(min(500, len(doc))):
    page = doc[page_num]
    text = page.get_text()

    # Search the original text, so the match offsets used for the context
    # slice refer to the same string that is being sliced.
    for match in chapter_regex.finditer(text):
        chapters.append({
            'page': page_num,
            'chapter': match.group(1),
            # 50 characters before the match and 200 after it.
            'context': text[max(0, match.start()-50):match.end()+200]
        })

if chapters:
    print(f"\nFound {len(chapters)} chapter references")
    print("\nFirst 10 chapters:")
    for ch in chapters[:10]:
        print(f"\nPage {ch['page']}: Chapter {ch['chapter']}")
        print(f"Context: {ch['context'][:150]}...")


print("\n\n" + "=" * 80)
print("ANALYZING CODING RULES FORMAT")
print("=" * 80)

# Lowercase French phrases searched for in the lowercased page text.
rule_indicators = [
    'ne pas coder',
    'coder en premier',
    'coder également',
    'utiliser un code supplémentaire',
    'si nécessaire',
    'inclus',
    'exclus'
]

# One entry per (page, indicator) pair where the indicator occurs.
rule_examples = []

for page_num in range(50, min(300, len(doc))):
    page = doc[page_num]
    text = page.get_text()
    text_lower = text.lower()

    for indicator in rule_indicators:
        # Only the first occurrence on the page is considered.
        idx = text_lower.find(indicator)
        if idx == -1:
            continue

        # 100 characters before the hit and 300 from its start onwards.
        context = text[max(0, idx - 100):idx + 300]
        rule_examples.append(
            dict(page=page_num, indicator=indicator, context=context)
        )

        # Echo only the first ten examples collected.
        if len(rule_examples) <= 10:
            print(f"\n--- Page {page_num}: '{indicator}' ---")
            print(context)

# Summary
print("\n\n" + "=" * 80)
print("ANALYSIS SUMMARY & CHUNKING RECOMMENDATIONS")
print("=" * 80)

# Gather the headline figures of the previous steps into one JSON-friendly
# mapping (integers and lists of page indices only).
summary = dict(
    total_pages=len(doc),
    estimated_codes=len(set(all_codes['standard'])),
    toc_pages=toc_pages,
    intro_pages=intro_pages,
    code_dense_pages=len(code_dense_pages),
    chapters_found=len(chapters),
    rule_examples=len(rule_examples),
)

summary_json = json.dumps(summary, indent=2)
print(summary_json)

# Release the PDF handle now that the analysis is finished.
doc.close()

Total pages: 1040
Metadata: {'format': 'PDF 1.7', 'title': 'Breizh CoCoA 2023', 'author': 'CoCoA & ABIMES', 'subject': 'CIM10', 'keywords': 'CIM10 PMSI codage diagnostics', 'creator': 'Microsoft® Word 2016', 'producer': 'iLovePDF', 'creationDate': "D:20230624153056+02'00'", 'modDate': "D:20230625185453+02'00'", 'trapped': '', 'encryption': None}

PDF is not encrypted
PDF is PDF/A compliant

PAGE 0


PAGE 1


PAGE 2
 
2023 - RG – Présentation 1 
 
pour le codage  
des maladies  
et des problèmes de santé connexes  
dans le cadre  
du PMSI-MCO  
et du PMSI-SSR 
Édition du 21 juin 2023 
 
 


PAGE 5
 
2023 – RG – Présentation 4 
 
 
 
 
 
Cette édition 2023 a été entièrement revue et corrigée par un groupe d’irréductibles – et 
sympathiques – experts armoricains de l’Association Bretonne d’Information Médicale des 
Etablissements de Santé [ABIMES]. 
 
Elle a été intégralement conçue et réalisée par intelligence humaine, collective et bénévole. 
Elle n’est donc pas exempte de défauts, ce q