<a href="https://colab.research.google.com/github/KwangHyunNam/health-data/blob/main/summarize_basic_medical_information.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install pdfplumber pandas

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import pdfplumber
import pandas as pd

# Zero-based positions of the fields read from each extracted table row.
_START_DATE_COL = 1
_DIAGNOSIS_COL = 6
_TOTAL_COST_COL = 8
_PATIENT_COST_COL = 10


def _parse_amount(cell):
    """Convert a comma-grouped amount string such as "12,300" to an int.

    Raises:
        ValueError: if the text left after removing commas is not an integer.
    """
    return int(cell.replace(",", "").strip())


def _parse_expense_row(row):
    """Build one expense record from a table row, or return None if unusable.

    A row is unusable when it is too short, when any required cell is None,
    or when either amount cell is not numeric.
    """
    try:
        cells = (
            row[_START_DATE_COL],
            row[_DIAGNOSIS_COL],
            row[_TOTAL_COST_COL],
            row[_PATIENT_COST_COL],
        )
    except IndexError:
        return None

    # extract_table() can yield None for empty cells; calling .strip() on it
    # would raise AttributeError and abort the whole summary, so skip the row.
    if any(cell is None for cell in cells):
        return None

    start_date, diagnosis, total_cost, patient_cost = cells
    try:
        return {
            "진료시작일": start_date.strip(),
            "주상병명": diagnosis.strip(),
            "총 진료비": _parse_amount(total_cost),
            "내가 낸 의료비": _parse_amount(patient_cost),
        }
    except ValueError:
        return None


def summarize_medical_expenses(pdf_path):
    """Summarize medical expenses per diagnosis from the tables in a PDF.

    For every page, the table returned by ``page.extract_table()`` is read,
    its first row is dropped as a header, and each remaining row contributes
    the start date (column 1), diagnosis name (column 6), total cost
    (column 8) and patient-paid cost (column 10). Rows that are too short,
    contain empty (None) required cells, or hold non-numeric amounts are
    skipped.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A DataFrame with one row per diagnosis name, containing the sorted,
        comma-joined distinct start dates and the summed costs, ordered by
        total cost descending. Returns None (after printing a message) when
        no valid row was found.
    """
    records = []

    # Extract the table from each page of the PDF.
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            if not table:
                continue

            # The first row of every page's table is treated as a header.
            # NOTE(review): if continuation pages carry no header row, their
            # first data row is dropped here — confirm against the real PDF.
            for row in table[1:]:
                record = _parse_expense_row(row)
                if record is not None:
                    records.append(record)

    # Build the data frame from the collected records.
    df = pd.DataFrame(records)

    if df.empty:
        print("유효한 진료 정보가 없습니다.")
        return None

    # Aggregate per diagnosis: distinct sorted dates, summed costs.
    summary = df.groupby("주상병명").agg({
        "진료시작일": lambda x: ", ".join(sorted(set(x))),
        "총 진료비": "sum",
        "내가 낸 의료비": "sum"
    }).reset_index()

    # Order by total cost, highest first.
    summary = summary.sort_values(by="총 진료비", ascending=False).reset_index(drop=True)

    return summary

In [None]:
import os

# 1. GitHub 저장소 주소 설정
github_repo_url = "https://github.com/KwangHyunNam/health-data.git"  # 실제 저장소 주소로 변경

# 2. 저장소 클론
!git clone {github_repo_url}

# 3. PDF 파일 경로 찾기
repo_name = github_repo_url.split("/")[-1].replace(".git", "")
pdf_file_path = None
for root, dirs, files in os.walk(repo_name):
    for file in files:
        if file.endswith(".pdf"):
            pdf_file_path = os.path.join(root, file)
            break
    if pdf_file_path:
        break

# 4. PDF 파일 처리 및 요약
if pdf_file_path:
    summary_df = summarize_medical_expenses(pdf_file_path)
    if summary_df is not None:
        display(summary_df)
else:
    print("PDF 파일을 찾을 수 없습니다.")