In [4]:
import pandas as pd
import pandera as pa
from pandera import DataFrameModel, Field, check_types, infer_schema, Check, check_input
from pandera.typing import DataFrame, Series
import pandera.extensions as extensions

In [2]:
# Read the medical cost dataset; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv("medical_cost.csv")

# Bare trailing expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,Id,age,sex,bmi,children,smoker,region,charges
0,1,19,female,27.900,0,yes,southwest,16884.92400
1,2,18,male,33.770,1,no,southeast,1725.55230
2,3,28,male,33.000,3,no,southeast,4449.46200
3,4,33,male,22.705,0,no,northwest,21984.47061
4,5,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...,...
1333,1334,50,male,30.970,3,no,northwest,10600.54830
1334,1335,18,female,31.920,0,no,northeast,2205.98080
1335,1336,18,female,36.850,0,no,southeast,1629.83350
1336,1337,21,female,25.800,0,no,southwest,2007.94500


In [None]:
@extensions.register_check_method()
def validate_charges(df: pd.DataFrame) -> pd.Series:
    """Flag, row by row, whether ``charges`` sits inside an age/BMI-derived band.

    For each row the accepted band is
    ``[age * 100 + bmi * 10, age * 200 + bmi * 20]``, with both ends inclusive.

    Args:
        df: Frame providing the ``age``, ``bmi`` and ``charges`` columns.

    Returns:
        Boolean Series aligned with ``df``: True where ``charges`` is within
        the band for that row, False otherwise.
    """
    age = df['age']
    bmi = df['bmi']
    lower_bound = age * 100 + bmi * 10
    upper_bound = age * 200 + bmi * 20
    return df['charges'].between(lower_bound, upper_bound)

In [9]:
class MedCost(DataFrameModel):
    """Pandera schema describing the medical-cost DataFrame.

    Column-level rules are declared per field below; the frame-wide
    ``validate_charges`` check is switched on in ``Config``.
    """

    # The loaded frame spells this column "Id" (capital I). Without the alias
    # the schema would look for a column literally named "id" and fail.
    id: Series[int] = Field(alias="Id", gt=0, nullable=False, unique=True)
    age: Series[int] = Field(ge=0, nullable=False)
    sex: Series[str] = Field(isin=["female", "male"])
    # No value constraints on bmi; only the float dtype is enforced.
    bmi: Series[float]
    children: Series[int] = Field(ge=0)
    smoker: Series[str] = Field(isin=["yes", "no"])
    region: Series[str] = Field(isin=["southwest", "southeast", "northwest", "northeast"])
    charges: Series[float] = Field(ge=0)

    class Config:
        # Enables the custom check registered under the name `validate_charges`
        # as a dataframe-level check; `()` means no extra arguments are passed.
        # The cell that registers the check must have been executed before this
        # class is defined.
        validate_charges = ()