In [3]:
import pandas as pd
import pandera as pa
from pandera import DataFrameModel, Field, check_types, infer_schema, Check, check_input, Column, DataFrameSchema
from pandera.typing import DataFrame, Series
import pandera.extensions as extensions

In [66]:
# Load the dataset from "medical_cost.csv"; the path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv("medical_cost.csv")

In [60]:
class MedCost(DataFrameModel):
    """Pandera schema for the medical-cost DataFrame.

    Column-level constraints are declared through ``Field``; two custom checks
    add an upper bound on ``bmi`` and a row-wise plausibility window for
    ``charges``.
    """

    # Unique, strictly positive, non-null row identifier.
    Id: Series[int] = Field(gt=0, nullable=False, unique=True)
    # Non-negative, non-null age.
    age: Series[int] = Field(ge=0, nullable=False)
    # Restricted to the two labels listed here.
    sex: Series[str] = Field(isin=["female", "male"])
    # No Field constraint; the upper bound is enforced by check_bmi below.
    bmi: Series[float]
    # Non-negative count.
    children: Series[int] = Field(ge=0)
    # "yes" / "no" flag.
    smoker: Series[str] = Field(isin=["yes", "no"])
    # One of the four region labels listed here.
    region: Series[str] = Field(isin=["southwest", "southeast", "northwest", "northeast"])
    # Non-negative charges.
    charges: Series[float] = Field(ge=0)

    @pa.check("bmi", name="check_bmi")
    def check_bmi(cls, bmi: Series[float]) -> Series[bool]:
        """Column check on ``bmi``: every value must be strictly below 100."""
        return bmi.lt(100)

    @pa.dataframe_check
    def validate_charges(cls, df: pd.DataFrame) -> pd.Series:
        """Row-wise check that ``charges`` lies in a window derived from age and bmi.

        For each row the window is ``[(age + bmi) * 15, (age + bmi) * 900]``,
        with both ends inclusive (the default of ``Series.between``).

        Returns:
            Boolean Series, True where the row's charges fall inside its window.
        """
        age_plus_bmi = df["age"] + df["bmi"]
        lower_bound = age_plus_bmi * 15
        upper_bound = age_plus_bmi * 900
        return df["charges"].between(lower_bound, upper_bound)


In [65]:
# Validate the loaded frame against the MedCost schema; as the last expression of the
# cell, the value returned by validate() is what gets displayed below.
MedCost.validate(df)

Unnamed: 0,Id,age,sex,bmi,children,smoker,region,charges
0,1,19,female,27.900,0,yes,southwest,16884.92400
1,2,18,male,33.770,1,no,southeast,1725.55230
2,3,28,male,33.000,3,no,southeast,4449.46200
3,4,33,male,22.705,0,no,northwest,21984.47061
4,5,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...,...
1333,1334,50,male,30.970,3,no,northwest,10600.54830
1334,1335,18,female,31.920,0,no,northeast,2205.98080
1335,1336,18,female,36.850,0,no,southeast,1629.83350
1336,1337,21,female,25.800,0,no,southwest,2007.94500


In [44]:
@check_types
def select_males(df: DataFrame[MedCost]) -> DataFrame[MedCost]:
    """Return the rows of ``df`` whose ``sex`` column equals ``'male'``.

    The ``check_types`` decorator, together with the ``DataFrame[MedCost]``
    annotations, has pandera validate both the argument and the returned
    frame against ``MedCost``.
    """
    is_male = df[MedCost.sex] == 'male'
    return df[is_male]

In [45]:
# Preview the first five rows of the male-only subset.
select_males(df).head(5)

Unnamed: 0,Id,age,sex,bmi,children,smoker,region,charges
1,2,18,male,33.77,1,no,southeast,1725.5523
2,3,28,male,33.0,3,no,southeast,4449.462
3,4,33,male,22.705,0,no,northwest,21984.47061
4,5,32,male,28.88,0,no,northwest,3866.8552
8,9,37,male,29.83,2,no,northeast,6406.4107


In [10]:
class FemaleMedCost(MedCost):
    """``MedCost`` variant with a different rule for ``charges``.

    ``validate_charges`` is redefined under the same name as in ``MedCost``,
    replacing the age/bmi window with a cap that depends on the number of
    children. Note that a later cell defines ``FemaleMedCost`` again, adding
    ``drop_invalid_rows``.
    """

    # Re-declared with the same constraint as the parent schema (non-negative).
    charges: Series[float] = Field(ge=0)

    @pa.dataframe_check
    def validate_charges(cls, df: pd.DataFrame) -> pd.Series:
        """Row-wise check: ``charges`` must not exceed ``children * 1000 + 5000``.

        Returns:
            Boolean Series, True where the row's charges are within the cap.
        """
        charge_ceiling = 5000 + df["children"] * 1000
        return df["charges"].le(charge_ceiling)

Descartando as linhas que não passam na validação

In [25]:
class FemaleMedCost(MedCost):
    """``MedCost`` variant that caps ``charges`` by number of children and drops bad rows.

    ``validate_charges`` is redefined under the same name as in ``MedCost``,
    replacing the age/bmi window with a cap that depends on ``children``.
    """

    # Re-declared with the same constraint as the parent schema (non-negative).
    charges: Series[float] = Field(ge=0)

    class Config:
        # Ask pandera to drop rows that fail validation instead of raising.
        # NOTE(review): pandera documents this option as requiring lazy
        # validation (lazy=True) - confirm against the installed version.
        drop_invalid_rows = True

    @pa.dataframe_check
    def validate_charges(cls, df: pd.DataFrame) -> pd.Series:
        """Row-wise check: ``charges`` must not exceed ``children * 1000 + 5000``.

        Returns:
            Boolean Series, True where the row's charges are within the cap.
        """
        charge_ceiling = 5000 + df["children"] * 1000
        return df["charges"].le(charge_ceiling)

In [26]:
@check_types(lazy=True)
def select_females(df: DataFrame[FemaleMedCost]) -> DataFrame[FemaleMedCost]:
    """Return the rows of ``df`` whose ``sex`` column equals ``'female'``.

    Input and output are validated against ``FemaleMedCost`` by
    ``check_types``; ``lazy=True`` is passed so validation runs in lazy mode,
    which goes together with the schema's ``drop_invalid_rows`` setting.
    """
    is_female = df[FemaleMedCost.sex] == 'female'
    return df[is_female]

In [27]:
# Run the female selection on the full frame and display the result.
select_females(df)

Unnamed: 0,Id,age,sex,bmi,children,smoker,region,charges
5,6,31,female,25.740,0,no,southeast,3756.62160
7,8,37,female,27.740,3,no,northwest,7281.50560
21,22,30,female,32.400,1,no,southwest,4149.73600
31,32,18,female,26.315,0,no,northeast,2198.18985
32,33,19,female,28.600,5,no,southwest,4687.79700
...,...,...,...,...,...,...,...,...
1311,1312,33,female,26.695,0,no,northwest,4571.41305
1316,1317,19,female,20.600,0,no,southwest,1731.67700
1334,1335,18,female,31.920,0,no,northeast,2205.98080
1335,1336,18,female,36.850,0,no,southeast,1629.83350


TESTE DE HIPÓTESE

In [28]:
from scipy.stats import ttest_ind

class SmokerMedCost(MedCost):
    """``MedCost`` variant that adds a hypothesis test on smokers' charges.

    The inherited ``validate_charges`` window is neutralised, and a
    frame-level check asserts that smokers are charged significantly more
    than non-smokers.
    """

    @pa.dataframe_check
    def validate_charges(cls, df: pd.DataFrame) -> bool:
        """Redefine the parent's ``validate_charges`` so that it always passes."""
        return True

    @pa.dataframe_check(name="smoker_vs_non_smoker_charges")
    def validate_smoker_charges(cls, df: pd.DataFrame) -> bool:
        """One-sided Welch t-test: are smokers' charges greater than non-smokers'?

        Returns:
            True when the null hypothesis is rejected at the 5% level
            (p-value < 0.05), i.e. smokers have significantly higher charges.
            A NaN p-value (for example when one group is empty) compares as
            False, so the check fails.
        """
        smokers = df.loc[df["smoker"] == "yes", "charges"]
        non_smokers = df.loc[df["smoker"] == "no", "charges"]

        # equal_var=False selects Welch's test; alternative="greater" tests
        # mean(smokers) > mean(non_smokers).
        _, p_value = ttest_ind(smokers, non_smokers, equal_var=False, alternative="greater")

        # The verdict concerns the whole frame, so return a scalar bool. A
        # one-element Series would be read by pandera as a row-wise mask that
        # does not align with the validated frame's index.
        return bool(p_value < 0.05)


In [29]:
# Validate the full frame against SmokerMedCost, which includes the smoker vs non-smoker
# hypothesis check; the value returned by validate() is displayed below.
SmokerMedCost.validate(df)

Unnamed: 0,Id,age,sex,bmi,children,smoker,region,charges
0,1,19,female,27.900,0,yes,southwest,16884.92400
1,2,18,male,33.770,1,no,southeast,1725.55230
2,3,28,male,33.000,3,no,southeast,4449.46200
3,4,33,male,22.705,0,no,northwest,21984.47061
4,5,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...,...
1333,1334,50,male,30.970,3,no,northwest,10600.54830
1334,1335,18,female,31.920,0,no,northeast,2205.98080
1335,1336,18,female,36.850,0,no,southeast,1629.83350
1336,1337,21,female,25.800,0,no,southwest,2007.94500


PARSERS

In [74]:
class MaleMedCost(MedCost):
    """``MedCost`` variant whose ``charges`` column is transformed by a parser."""

    @pa.parser("charges")
    def add_tax(cls, series: pd.Series) -> pd.Series:
        """Parser for ``charges``: return the column with 200 added to every value."""
        taxed_charges = series + 200
        return taxed_charges

@check_types
def select_males(df: DataFrame[MedCost]) -> DataFrame[MaleMedCost]:
    """Return the rows of ``df`` whose ``sex`` column equals ``'male'``.

    The argument is validated against ``MedCost`` and the result against
    ``MaleMedCost``, whose ``add_tax`` parser adds 200 to ``charges``; the
    displayed charges are therefore 200 higher than in the input frame.
    This definition reuses the name of an earlier ``select_males`` cell.
    """
    is_male = df["sex"] == 'male'
    return df[is_male]

# Example usage: preview the first five male rows after MaleMedCost output validation.
select_males(df).head(5)


Unnamed: 0,Id,age,sex,bmi,children,smoker,region,charges
1,2,18,male,33.77,1,no,southeast,1925.5523
2,3,28,male,33.0,3,no,southeast,4649.462
3,4,33,male,22.705,0,no,northwest,22184.47061
4,5,32,male,28.88,0,no,northwest,4066.8552
8,9,37,male,29.83,2,no,northeast,6606.4107
