# Pydantic 라이브러리 실습
**학습목표**: Python의 `Pydantic` 라이브러리를 사용해 데이터 검증, 파싱, 직렬화를 효율적으로 처리하는 방법을 배운다.  
**Last updated**: 2025-11-03  
**Reference**: *Pydantic Documentation* (https://docs.pydantic.dev/)

### Pydantic의 기본: BaseModel

In [2]:
from pydantic import BaseModel

class User(BaseModel):
    """Minimal Pydantic model with three required fields."""

    id: int
    name: str
    email: str  # plain str; no e-mail format check is declared here

# Validation runs when the model is constructed from keyword arguments.
u = User(id=1, name="Ada", email="ada@example.com")
u

User(id=1, name='Ada', email='ada@example.com')

#### Type 강제와 에러

In [None]:
from pydantic import ValidationError

# Coercion: in the default (non-strict) mode the numeric string "2" becomes int 2.
print(User(id="2", name="Bob", email="b@example.com"))

# Error example: "oops" cannot be parsed as an int and 123 is not a str.
# Both failures are collected and reported together in one ValidationError.
try:
    User(id="oops", name=123, email="e@example.com")
except ValidationError as e:
    print(e)

id=2 name='Bob' email='b@example.com'
2 validation errors for User
id
  Input should be a valid integer, unable to parse string as an integer [type=int_parsing, input_value='oops', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/int_parsing
name
  Input should be a valid string [type=string_type, input_value=123, input_type=int]
    For further information visit https://errors.pydantic.dev/2.12/v/string_type


### Field (기본값 설정 및 메타데이터 입력)

In [4]:
from pydantic import Field

class Product(BaseModel):
    """Catalog entry identified by its SKU, with defaulted price and stock flag."""

    # Required: no default is supplied.
    sku: str
    # Defaults to 0.0 and must be >= 0; the description is field metadata.
    price: float = Field(default=0.0, ge=0, description="Unit price in USD")
    # Defaults to True when omitted.
    in_stock: bool = True


# Only the required field is given; the other two fall back to their defaults.
Product(sku="X-1")

Product(sku='X-1', price=0.0, in_stock=True)

### Strict types (Coercion 해제)

In [8]:
class StrictExample(BaseModel):
    """Model whose fields opt out of type coercion via ``strict=True``."""

    qty: int = Field(..., strict=True)
    flag: bool = Field(..., strict=True)


# Valid: both values already have exactly the declared types, so this prints.
# (Passing qty="3" here would raise immediately and the line below would
# never be reached.)
print(StrictExample(qty=3, flag=True))
# Next line should fail: "3" is not int when strict.
# flag is kept valid so the reported error isolates the qty violation.
StrictExample(qty="3", flag=True)

ValidationError: 1 validation error for StrictExample
qty
  Input should be a valid integer [type=int_type, input_value='3', input_type=str]
    For further information visit https://errors.pydantic.dev/2.12/v/int_type

### Nested model

In [11]:
class LineItem(BaseModel):
    """One line of an order: an item id, a quantity and a unit price."""

    item_id: str
    qty: int = Field(..., ge=1)  # required; must be at least 1
    unit_price: float = Field(..., ge=0)  # required; must be non-negative

class Order(BaseModel):
    """Order consisting of an id and a list of nested LineItem models."""

    order_id: str
    items: list[LineItem]


# Plain dicts are accepted for the nested field and validated into LineItem
# instances; the int unit_price 10 is stored as the float 10.0.
o = Order(
    order_id="A100",
    items=[
        {"item_id": "X-1", "qty": 2, "unit_price": 3.5},
        {"item_id": "Y-9", "qty": 1, "unit_price": 10},
    ],
)
o

Order(order_id='A100', items=[LineItem(item_id='X-1', qty=2, unit_price=3.5), LineItem(item_id='Y-9', qty=1, unit_price=10.0)])

### Optional, Union, Literal

In [14]:
from typing import Optional, Union, Literal

class Payment(BaseModel):
    """Payment record demonstrating Literal, Optional and Union annotations."""

    method: Literal["card","cash","bank"]  # only these three strings are accepted
    note: Optional[str] = None  # may be omitted; defaults to None
    receipt: Union[str, int]  # id as str or int

# note is omitted and defaults to None; the int receipt is kept as an int.
Payment(method="card", receipt=12345)

Payment(method='card', note=None, receipt=12345)

### Computed field

In [15]:
from pydantic import computed_field

class Rectangle(BaseModel):
    """Rectangle described by its width and height."""

    width: float
    height: float

    @computed_field
    @property
    def area(self) -> float:
        """Width multiplied by height, exposed as a computed field."""
        w, h = self.width, self.height
        return w * h


# The int inputs are validated as floats, so the result is a float.
Rectangle(width=3, height=4).area

12.0

### Field validator (커스텀 검증 Logic)

In [16]:
from pydantic import field_validator

class Person(BaseModel):
    """Person record with a normalized name and a bounded age."""

    name: str
    age: int

    @field_validator("name")
    @classmethod
    def name_not_empty(cls, v: str):
        """Trim surrounding whitespace, reject blank names, then title-case.

        Raises:
            ValueError: if nothing remains after stripping whitespace.
        """
        trimmed = v.strip()
        if trimmed:
            return trimmed.title()
        raise ValueError("name cannot be empty")

    @field_validator("age")
    @classmethod
    def age_range(cls, v: int):
        """Accept only ages in the closed interval 0..130.

        Raises:
            ValueError: if the age lies outside that interval.
        """
        if v < 0 or v > 130:
            raise ValueError("age must be 0..130")
        return v


# The validator's return value replaces the input: "  ada  " is stored as "Ada".
Person(name="  ada  ", age=36)

Person(name='Ada', age=36)

### Model validator (cross-field checks)

In [17]:
from pydantic import model_validator

class Signup(BaseModel):
    """Sign-up form whose two e-mail entries must be identical."""

    email: str
    email_confirm: str

    @model_validator(mode="after")
    def emails_match(self):
        """Cross-field check performed on the already-validated instance.

        Raises:
            ValueError: if ``email`` and ``email_confirm`` differ.
        """
        if self.email == self.email_confirm:
            return self
        raise ValueError("emails do not match")


Signup(email="a@x.com", email_confirm="a@x.com")

Signup(email='a@x.com', email_confirm='a@x.com')

## 종합 실습: 생물정보학 데이터 모델링

Pydantic의 모든 기능을 활용한 RNA-seq 데이터 분석 파이프라인 모델을 구현해보겠습니다.

In [9]:
from pydantic import BaseModel, Field, field_validator, model_validator, computed_field
from typing import Literal, Optional
from datetime import datetime

# One gene's expression entry
class GeneExpr(BaseModel):
    """Expression measurement for a single gene.

    ``value`` must be non-negative and ``unit`` is restricted to one of
    "counts", "TPM" or "FPKM".
    """

    gene_id: str                  # e.g., "ENSG00000141510"
    gene_symbol: str              # e.g., "TP53"
    value: float = Field(..., ge=0)
    unit: Literal["counts","TPM","FPKM"]

    @field_validator("gene_id")
    @classmethod
    def ensembl_format(cls, v: str):
        """Require the ``ENSG`` prefix followed by one or more ASCII digits.

        The number of digits is not checked.

        Raises:
            ValueError: if the prefix is missing, or the remainder is empty
                or contains anything other than the characters 0-9.
        """
        suffix = v[4:]
        # str.isdigit() on its own also accepts non-ASCII digit characters
        # (for example the superscript "²"), so require ASCII as well.
        if not v.startswith("ENSG") or not (suffix.isascii() and suffix.isdigit()):
            raise ValueError("gene_id should look like ENSG#########")
        return v

# One RNA-seq sample with many genes
class RnaSeqSample(BaseModel):
    """Single RNA-seq sample: metadata plus its per-gene expression entries."""

    sample_id: str
    species: Literal["human","mouse","rat"] = "human"
    tissue: str
    condition: Literal["control","treated","disease","baseline"]
    platform: Literal["bulk","single_cell"] = "bulk"
    library_prep: Optional[str] = None
    run_at: datetime
    expressions: list[GeneExpr]

    @model_validator(mode="after")
    def non_empty_expr(self):
        """Reject a sample whose expression list is empty.

        Raises:
            ValueError: if ``expressions`` contains no entries.
        """
        if self.expressions:
            return self
        raise ValueError("expressions must have at least one gene")

    @computed_field
    @property
    def detected_genes(self) -> int:
        """Number of entries whose value is strictly positive."""
        return len([entry for entry in self.expressions if entry.value > 0])

    @computed_field
    @property
    def library_size(self) -> float:
        """Sum of every expression value, whatever its unit.

        Teaching shortcut: a real pipeline would normalize per unit rather
        than add values of different units together.
        """
        total = 0
        for entry in self.expressions:
            total += entry.value
        return float(total)

# A small study with multiple samples
class RnaSeqStudy(BaseModel):
    """Collection of samples that share one declared normalization."""

    study_id: str
    normalization: Literal["none","CPM","TPM","FPKM"] = "none"
    samples: list[RnaSeqSample]

    @model_validator(mode="after")
    def consistent_units(self):
        """Check every expression unit against the study's normalization.

        "none" accepts any unit, "CPM" accepts only "counts", and "TPM" and
        "FPKM" each accept only their own unit.

        NOTE(review): an earlier comment suggested raw counts might also be
        acceptable under TPM/FPKM; the table below does not allow that.
        Confirm which is intended.

        Raises:
            ValueError: naming the first unit found that is not permitted.
        """
        units_by_normalization = {
            "none": {"counts","TPM","FPKM"},
            "CPM": {"counts"},
            "TPM": {"TPM"},
            "FPKM": {"FPKM"},
        }
        permitted = units_by_normalization[self.normalization]
        # Walk samples, then their entries, in order; stop at the first offender.
        offending = next(
            (
                entry.unit
                for sample in self.samples
                for entry in sample.expressions
                if entry.unit not in permitted
            ),
            None,
        )
        if offending is not None:
            raise ValueError(f"unit {offending} not allowed under {self.normalization}")
        return self

    @computed_field
    @property
    def total_samples(self) -> int:
        """Number of samples in the study."""
        return len(self.samples)

# ----- Example data (two samples) -----
# The study declares normalization="TPM", so every expression entry below
# must use unit "TPM" to pass RnaSeqStudy.consistent_units.
study = RnaSeqStudy(
    study_id="TXP_2025_11",
    normalization="TPM",
    samples=[
        RnaSeqSample(
            sample_id="S1",
            species="human",
            tissue="liver",
            condition="treated",
            platform="bulk",
            # ISO-8601 string; validated into a datetime for the run_at field.
            run_at="2025-11-01T10:00:00",
            # Plain dicts are validated into GeneExpr instances.
            expressions=[
                {"gene_id":"ENSG00000012048","gene_symbol":"BRCA1","value":12.4,"unit":"TPM"},
                {"gene_id":"ENSG00000141510","gene_symbol":"TP53","value":4.1,"unit":"TPM"},
                # value 0.0 is allowed (ge=0) but does not count as a detected gene.
                {"gene_id":"ENSG00000139618","gene_symbol":"BRCA2","value":0.0,"unit":"TPM"},
            ],
        ),
        RnaSeqSample(
            sample_id="S2",
            species="human",
            tissue="liver",
            condition="control",
            platform="bulk",
            run_at="2025-11-01T10:05:00",
            expressions=[
                {"gene_id":"ENSG00000012048","gene_symbol":"BRCA1","value":8.7,"unit":"TPM"},
                {"gene_id":"ENSG00000141510","gene_symbol":"TP53","value":6.3,"unit":"TPM"},
            ],
        ),
    ],
)

# Cell output: the study itself plus three computed fields (the study's sample
# count, then the first sample's detected-gene count and library size).
study, study.total_samples, study.samples[0].detected_genes, study.samples[0].library_size


(RnaSeqStudy(study_id='TXP_2025_11', normalization='TPM', samples=[RnaSeqSample(sample_id='S1', species='human', tissue='liver', condition='treated', platform='bulk', library_prep=None, run_at=datetime.datetime(2025, 11, 1, 10, 0), expressions=[GeneExpr(gene_id='ENSG00000012048', gene_symbol='BRCA1', value=12.4, unit='TPM'), GeneExpr(gene_id='ENSG00000141510', gene_symbol='TP53', value=4.1, unit='TPM'), GeneExpr(gene_id='ENSG00000139618', gene_symbol='BRCA2', value=0.0, unit='TPM')], detected_genes=2, library_size=16.5), RnaSeqSample(sample_id='S2', species='human', tissue='liver', condition='control', platform='bulk', library_prep=None, run_at=datetime.datetime(2025, 11, 1, 10, 5), expressions=[GeneExpr(gene_id='ENSG00000012048', gene_symbol='BRCA1', value=8.7, unit='TPM'), GeneExpr(gene_id='ENSG00000141510', gene_symbol='TP53', value=6.3, unit='TPM')], detected_genes=2, library_size=15.0)], total_samples=2),
 2,
 2,
 16.5)