In [13]:
from ydata_quality import DataQuality
import pandas as pd
# Location of the census sample CSV inside the ydata-quality GitHub repository
CENSUS_CSV_URL = 'https://raw.githubusercontent.com/ydataai/ydata-quality/master/datasets/transformed/census_10k.csv'
# Download and parse the CSV into the raw DataFrame used by every cell below
df = pd.read_csv(CENSUS_CSV_URL)

In [14]:
# Create the main class that holds all quality modules, built on the raw census frame `df`
dq = DataQuality(df=df)
# Run the tests; the printed summary that follows groups the warnings by priority.
# The value returned by evaluate() is kept in `results` for later inspection.
results = dq.evaluate()


[38;5;209m[1mPriority 1[0m - [1mheavy impact expected[0m:
	[38;5;209m*[0m [1m[DUPLICATES[0m - [4mDUPLICATE COLUMNS][0m Found 1 columns with exactly the same feature values as other columns.
[38;5;11m[1mPriority 2[0m - [1musage allowed, limited human intelligibility[0m:
	[38;5;11m*[0m [1m[DUPLICATES[0m - [4mEXACT DUPLICATES][0m Found 3 instances with exact duplicate feature values.
	[38;5;11m*[0m [1m[ERRONEOUS DATA[0m - [4mPREDEFINED ERRONEOUS DATA][0m Found 1960 ED values in the dataset.
	[38;5;11m*[0m [1m[DATA RELATIONS[0m - [4mHIGH COLLINEARITY - CATEGORICAL][0m Found 10 categorical variables with significant collinearity (p-value < 0.05). The variables listed in results are highly collinear with other variables in the dataset and sorted descending according to propensity. These will make model explainability harder and potentially give way to issues like overfitting.Depending on your end goal you might want to remove variables following the provide

In [15]:
# Ask the engine for the warnings raised by the "Duplicate Columns" test,
# which the evaluation summary listed under Priority 1
dq.get_warnings(test="Duplicate Columns")



In [16]:
from ydata_quality.bias_fairness import BiasFairness
# Bias & Fairness engine on the raw frame: 'race' and 'sex' are declared as the
# sensitive features and 'income' as the label column
bf = BiasFairness(
    df=df,
    sensitive_features=['race', 'sex'],
    label='income',
)
# Execute the Bias & Fairness tests and keep whatever evaluate() returns
bf_results = bf.evaluate()



[38;5;11m[1mPriority 2[0m - [1musage allowed, limited human intelligibility[0m:
	[38;5;11m*[0m [1m[BIAS&FAIRNESS[0m - [4mSENSITIVE ATTRIBUTE REPRESENTATIVITY][0m Found 2 values of 'race'                             sensitive attribute with low representativity in the dataset (below 1.00%).
	[38;5;11m*[0m [1m[BIAS&FAIRNESS[0m - [4mPROXY IDENTIFICATION][0m Found 1 feature pairs of correlation to sensitive attributes with values higher than defined threshold (0.5).



In [17]:
# Ask for the warnings raised by the 'Proxy Identification' test; the output shows
# the feature/sensitive-attribute pair together with its association value
bf.get_warnings(test='Proxy Identification')

 relationship_sex    0.650656
 Name: association, dtype: float64)]

In [18]:
def improve_quality(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the data based on the Data Quality issues found previously.

    Steps, applied in order:
      1. Bias & Fairness: replace the gender-based 'Husband'/'Wife' values of the
         'relationship' column with the generic 'Married'.
      2. Duplicates: drop the duplicated 'workclass2' column, then drop rows whose
         feature values are exact duplicates of an earlier row.

    The function is idempotent: applying it to an already-cleaned frame (where
    'workclass2' no longer exists) returns an equal frame instead of raising.

    Args:
        df: The DataFrame to clean. It is not modified; every step returns a new frame.

    Returns:
        A new, cleaned DataFrame. The original row index labels are kept
        (the index is not reset after duplicates are removed).
    """
    # Bias & Fairness
    # Substitute gender-based 'Husband'/'Wife' for generic 'Married'
    df = df.replace({'relationship': {'Husband': 'Married', 'Wife': 'Married'}})

    # Duplicates
    # errors='ignore' keeps the step re-runnable once the column has been removed
    df = df.drop(columns=['workclass2'], errors='ignore')  # Remove the duplicated column
    df = df.drop_duplicates()                              # Remove exact feature value duplicates

    return df

# Run the cleaning routine on a copy of the raw frame, leaving `df` itself as loaded
clean_df = df.copy().pipe(improve_quality)

In [24]:
# Same quality engine as before, this time built on the cleaned frame
better_dq = DataQuality(df=clean_df) # main class on cleaned data
# NOTE: this rebinds `results`, replacing the value stored from the evaluation of the raw frame
results = better_dq.evaluate() # run the tests



[38;5;11m[1mPriority 2[0m - [1musage allowed, limited human intelligibility[0m:
	[38;5;11m*[0m [1m[ERRONEOUS DATA[0m - [4mPREDEFINED ERRONEOUS DATA][0m Found 1360 ED values in the dataset.
	[38;5;11m*[0m [1m[DATA RELATIONS[0m - [4mHIGH COLLINEARITY - CATEGORICAL][0m Found 9 categorical variables with significant collinearity (p-value < 0.05). The variables listed in results are highly collinear with other variables in the dataset and sorted descending according to propensity. These will make model explainability harder and potentially give way to issues like overfitting.Depending on your end goal you might want to remove variables following the provided order.



In [25]:
# Bias & Fairness engine rebuilt on the cleaned frame, with the same sensitive
# features ('race', 'sex') and label ('income') as the first run
better_bf = BiasFairness(
    df=clean_df,
    sensitive_features=['race', 'sex'],
    label='income',
)
# Only the printed summary matters here, so the return value is bound to a throwaway name
_ = better_bf.evaluate()



[38;5;11m[1mPriority 2[0m - [1musage allowed, limited human intelligibility[0m:
	[38;5;11m*[0m [1m[BIAS&FAIRNESS[0m - [4mSENSITIVE ATTRIBUTE REPRESENTATIVITY][0m Found 2 values of 'race'                             sensitive attribute with low representativity in the dataset (below 1.00%).



In [26]:
# Lowering the threshold detects more possible proxies: the earlier evaluation
# reported a threshold of 0.5, while th=0.45 also surfaces the weaker associations
# listed in the output below
better_bf.proxy_identification(th=0.45)

features
relationship_sex      0.475097
marital-status_sex    0.459768
Name: association, dtype: float64