In [7]:
import pandas as pd
import autoschema
# Print the signature and docstring of auto_schema as a quick API reference.
help(autoschema.auto_schema)

Help on function auto_schema in module autoschema.autoschema:

auto_schema(df: pandas.core.frame.DataFrame, standardize_names: bool = True, write_schema: bool = True, schema_file_name: Optional[str] = None) -> pandas.core.frame.DataFrame
    Generate schema from a dataframe.
    Convert all columns to the proper data types before calling this function.
    
    Parameters
    -----------
    df: (pd.DataFrame) dataframe to apply schema creation
    standardize_names: (bool) True/False whether or not to standardize column names
    write_schema: (bool) True/False whether to write out schema to excel
    schema_file_name: (str | None) name of output schema file. You can input an entire path instead of just a file name if you wish to write to a specific location. Defaults to schema.xlsx in the current working directory.
    
    Examples
    ----------
    >>> import pandas as pd
    >>> data = pd.read_csv('data.csv')
    >>> schema = auto_schema(data, write_schema=True, schema_file_name=

In [2]:
# Tiny demo frame: every column has exactly one missing entry so the
# fill-value behaviour can be observed later on.
sample_columns = {
    "name": ["Alice", None, "Bob"],
    "age": [25, 30, None],
    "income": [50000, None, 60000],
}
sample_df = pd.DataFrame(sample_columns)
sample_df

Unnamed: 0,name,age,income
0,Alice,25.0,50000.0
1,,30.0,
2,Bob,,60000.0


In [6]:
# Infer a schema from the demo frame and, because write_schema is True,
# also write it out to schema.xlsx so it can be reviewed and edited by hand.
# NOTE(review): re-running this cell presumably replaces schema.xlsx,
# discarding any manual edits made to it -- confirm before "Run All".
schema_options = {
    'standardize_names': True,
    'write_schema': True,
    'schema_file_name': 'schema.xlsx',
}
schema = autoschema.auto_schema(sample_df, **schema_options)
print(schema)

  column_name data_type  description            examples  default_fill_value  \
0        name    object          NaN        [Alice, Bob]                 NaN   
1         age   float64          NaN        [25.0, 30.0]                 NaN   
2      income   float64          NaN  [50000.0, 60000.0]                 NaN   

   required  
0       NaN  
1       NaN  
2       NaN  


In [3]:
# Prerequisite: open schema.xlsx and fill it in by hand first. The rules
# printed below show the hand-entered default_fill_value / required values.
validator = autoschema.SchemaValidator(
    schema_file='schema.xlsx',
    use_standardized_names=True,
)
# Fit against the same frame the schema was generated from.
validator = validator.fit(sample_df)
print(validator.validation_rules)

  column_name data_type  description            examples  default_fill_value  \
0        name    object          NaN    ['Alice', 'Bob']                 NaN   
1         age   float64          NaN        [25.0, 30.0]                20.0   
2      income   float64          NaN  [50000.0, 60000.0]             40000.0   

   required  
0      True  
1      True  
2      True  


In [4]:
# Validate a new frame - negative example where the schema check fails:
# the frame has no 'name' column, and the edited schema marks it as required.
new_df = pd.DataFrame({
    'age': [25, 30, None],
    'income': [50000, None, 60000],
})

try:
    new_df = validator.transform(new_df)
except ValueError as exc:
    # The failure is the demonstration. Catch and show it so that
    # "Restart Kernel -> Run All" can continue past this cell.
    print(f"{type(exc).__name__}: {exc}")

ValueError: Missing required column: name

In [5]:
# Positive example: the column labels are deliberately untidy (trailing
# space, upper case, hyphen) and the ages are given as strings. The printed
# result shows tidy column names, numeric ages, and the gaps in age/income
# filled with the schema's default_fill_value entries.
messy_columns = {
    'name ': ['Alice', None, 'Bob'],
    'AGE': ['25', '30', None],
    'in-come': [50000, None, 60000],
}
new_df = validator.transform(pd.DataFrame(messy_columns))
print(new_df)

    name   age   income
0  Alice  25.0  50000.0
1   None  30.0  40000.0
2    Bob  20.0  60000.0
