In [24]:
import pandas as pd
from pandera import DataFrameModel, Field, check_types, infer_schema, Check
from pandera.typing import DataFrame, Series

In [17]:
df_heart = pd.read_csv("heart_failure_clinical_records_dataset.csv")


In [18]:
schema_script = infer_schema(df).to_script()

with open("schema_heart.py", "w") as file:
    file.write(schema_script)

In [19]:
from altered_schema_heart import schema

In [20]:
schema.validate(df_heart, lazy=True)

SchemaErrors: {
    "DATA": {
        "DATAFRAME_CHECK": [
            {
                "schema": null,
                "column": "creatinine_phosphokinase",
                "check": "less_than_or_equal_to(5000.0)",
                "error": "Column 'creatinine_phosphokinase' failed element-wise validator number 1: less_than_or_equal_to(5000.0) failure cases: 7861, 7702, 5882, 5209"
            }
        ]
    }
}

Maneira de gerar um schema usando typing do Pydantic

In [22]:
df

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


In [25]:
import pandera as pa
from pandera import DataFrameModel, Field, Check

class HeartFailure(DataFrameModel):
    age: int = Field(ge=0)
    anaemia: int = Field(isin=[0, 1])
    creatinine_phosphokinase: int
    diabetes: int = Field(isin=[0, 1])
    ejection_fraction: int
    high_blood_pressure: int = Field(isin=[0, 1])
    platelets: float
    serum_creatinine: float
    serum_sodium: int
    sex: int = Field(isin=[0, 1])
    smoking: int = Field(isin=[0, 1])
    time: int
    DEATH_EVENT: int = Field(isin=[0, 1])

    class Config:
        strict = True
        coerce = True

@check_types
def return_males(df: DataFrame[HeartFailure]) -> DataFrame[HeartFailure]:
    pass

# Example usage
# schema = HeartFailureSchema.to_schema()
# df = ...  # your dataframe
# validated_df = schema.validate(df)
