In [1]:
import numpy as np
import pandas as pd
from custom_ml_toolkit.preprocessor.formatter import DataFrameFormatter


In [2]:
# Demo input: each column mixes plain values with awkward ones (None, NaN,
# empty or blank strings, non-numeric text, a truncated date).
df = pd.DataFrame(
    data={
        # Boolean column with both None and NaN as missing markers.
        'bool_1': [True, False, None, np.nan],
        # Integer candidates: digit strings, a fractional string, a NaN.
        'int_1': ['1', '2', '3', 4],
        'int_2': ['1', '2.2', '3', 4],
        'int_3': ['1', '2.2', '3', np.nan],
        # Float candidates: missing values, '', ' ' and free text.
        'float_1': ['1.1', 2.2, None, np.nan],
        'float_2': [1.0, 2.2, 3.0, ''],
        'float_3': [1.0, 2.2, 3.0, ' '],
        'float_4': [1.0, 2.2, 3.0, 'abc'],
        # String candidates holding a mix of numbers and text.
        'str_1': ['1.1', 2.2, None, np.nan],
        'str_2': [1, 2.2, 'abc', ''],
        'str_3': [1.0, 2.2, 3.0, ' '],
        'str_4': [1.0, 2.2, 3.0, 'abc'],
        # Date strings; row 1 is varied (truncated year, None, '', ' ').
        'dt_1': ['31/01/22', '31/01/2', '31/01/22', '31/01/22'],
        'dt_2': ['31/01/22', None, '31/01/22', '31/01/22'],
        'dt_3': ['31/01/22', '', '31/01/22', '31/01/22'],
        'dt_4': ['31/01/22', ' ', '31/01/22', '31/01/22'],
        'dt_5': ['31/01/22', '31/01/2', '31/01/22', '31/01/22'],
        # Colour labels; 'yellow', '', ' ' and None fall outside the
        # red/green/blue list passed as literal_col_values below.
        'lit_1': ['red', 'green', 'blue', 'red'],
        'lit_2': ['red', 'green', None, ''],
        'lit_3': ['red', 'green', 'blue', 'yellow'],
        'lit_4': ['red', 'green', 'blue', ' '],
    }
)

DFV = DataFrameFormatter(
    # 'missing_1' is deliberately not a column of df.
    required_cols=['missing_1', 'bool_1'],
    bool_cols=['bool_1'],
    int_cols=['int_1', 'int_2', 'int_3'],
    float_cols=['float_1', 'float_2', 'float_3', 'float_4'],
    str_cols=['str_1', 'str_2', 'str_3', 'str_4'],
    dt_col_formats={
        'dt_1': '%d/%m/%y',
        'dt_2': '%d/%m/%y',
        'dt_3': '%d/%m/%y',
        'dt_4': '%d/%m/%y',
        # Same data as dt_1, but declared with a four-digit-year format.
        'dt_5': '%d/%m/%Y',
    },
    literal_col_values={
        'lit_1': ['red', 'green', 'blue'],
        'lit_2': ['red', 'green', 'blue'],
        'lit_3': ['red', 'green', 'blue'],
        'lit_4': ['red', 'green', 'blue'],
    },
    non_nullable_cols=['float_1'],
    distinct_keys=['int_2'],
)

# format() returns four values; df is rebound to the first of them.
(df, errors, type_formatted_cols, type_unformatted_cols) = DFV.format(df=df)

# Blank separator line, then one "<key> >> <value>" line per entry of errors.
# NOTE(review): assumes errors is a dict-like mapping — confirm against format().
print('')
for error, affected in errors.items():
    print(error, '>>', affected)

df


Missing required columns. >> ['missing_1']
Not required columns. >> ['dt_2', 'int_1', 'str_2', 'float_2', 'str_1', 'int_2', 'lit_2', 'float_1', 'int_3', 'float_4', 'dt_3', 'dt_1', 'dt_5', 'str_4', 'dt_4', 'str_3', 'lit_1', 'lit_4', 'lit_3', 'float_3']
Null values in non-nullable columns. >> ['float_1', 'int_3']
Loss of precision (integer). >> ['int_2']
Invalid data type (integer). >> ['int_3']
Invalid data type (float). >> ['float_3', 'float_4']
Datetime formatting issues (%d/%m/%y). >> ['dt_1', 'dt_4']
Datetime formatting issues (%d/%m/%Y). >> ['dt_5']
Invalid literal value ['red', 'green', 'blue']. >> ['lit_3', 'lit_4']




Unnamed: 0,bool_1,int_1,int_2,int_3,float_1,float_2,float_3,float_4,str_1,str_2,...,str_4,dt_1,dt_2,dt_3,dt_4,dt_5,lit_1,lit_2,lit_3,lit_4
0,True,1,1,1.0,1.1,1.0,1.0,1,1.1,1,...,1.0,31/01/22,2022-01-31,2022-01-31,31/01/22,31/01/22,red,red,red,red
1,False,2,2,2.2,2.2,2.2,2.2,2.2,2.2,2.2,...,2.2,31/01/2,NaT,NaT,,31/01/2,green,green,green,green
2,False,3,3,3.0,,3.0,3.0,3,,abc,...,3.0,31/01/22,2022-01-31,2022-01-31,31/01/22,31/01/22,blue,,blue,blue
3,True,4,4,,,,,abc,,,...,abc,31/01/22,2022-01-31,2022-01-31,31/01/22,31/01/22,red,,yellow,
