<h1>Function to build the data quality reports</h1>
<p>Here we create functions that help us easily build the data quality reports, along with various plots to help visualize the data.</p>

In [1]:
def createQuantitativeReport(df):
    """
    Summarise every quantitative variable in a data quality report.

    The report has one row per quantitative variable and one column per
    summary statistic:

        'Count'      total number of rows in ``df``
        'Miss. (%)'  percentage of missing values
        'Card.'      cardinality, i.e. number of distinct non-missing values
        'Min.'       minimum
        '1st Qrt.'   1st quartile (linear interpolation)
        'Mean'       arithmetic mean
        'Median'     median (0.5 quantile, linear interpolation)
        '3rd Qrt.'   3rd quartile (linear interpolation)
        'Max.'       maximum
        'Std. Dev.'  sample standard deviation (ddof=1)

    Missing values are ignored when computing the statistics. Much of
    this information is also available through pandas.DataFrame.info and
    pandas.DataFrame.describe.

    Args:
        df: pandas.core.frame.DataFrame
            The data to investigate, one sample point per row. The
            columns must use hierarchical indexing whose first level
            separates the variables into 'categorical' and
            'quantitative' groups; only the 'quantitative' group is
            read here.

    Returns:
        pandas.core.frame.DataFrame: The data quality report for the
            quantitative variables. Its ``name`` attribute holds the
            report title.
    """
    quantitative = df['quantitative']
    n_rows = df.shape[0]

    def quantile_at(q):
        # All quartiles share the same interpolation rule.
        return quantitative.quantile(q=q, interpolation='linear')

    # Scalars are broadcast over the index; each Series is already
    # indexed by the quantitative variable names.
    report = pd.DataFrame(
        {
            'Count': n_rows,
            'Miss. (%)': (quantitative.isna().sum() / n_rows) * 100,
            'Card.': quantitative.nunique(dropna=True),
            'Min.': quantitative.min(skipna=True),
            '1st Qrt.': quantile_at(0.25),
            'Mean': quantitative.mean(skipna=True),
            'Median': quantile_at(0.5),
            '3rd Qrt.': quantile_at(0.75),
            'Max.': quantitative.max(skipna=True),
            'Std. Dev.': quantitative.std(ddof=1, skipna=True),
        },
        index=quantitative.columns,
    )
    report.name = 'Quantitative Qualilty Report'
    return report

In [2]:
def createCategoricalReport(df):
    """
    Build a data quality report for categorical variables.

    The report has one row per categorical variable and one column per
    summary statistic:

        'Count'           total number of rows in ``df``
        'Miss. (%)'       percentage of missing values
        'Card.'           cardinality, i.e. number of distinct
                          non-missing levels
        'Mode'            most frequent level
        'Mode Freq.'      number of occurrences of the mode
        'Mode (%)'        mode frequency as a percentage of the
                          non-missing values
        '2nd Mode'        second most frequent level
        '2nd Mode Freq.'  number of occurrences of the second mode
        '2nd Mode (%)'    second mode frequency as a percentage of the
                          non-missing values

    A variable with fewer than two distinct levels has missing values in
    the '2nd Mode' columns; a variable with no non-missing values at all
    has missing values in the 'Mode' columns as well. Much of this
    information is also available through pandas.DataFrame.info and
    pandas.DataFrame.describe.

    Args:
        df: pandas.core.frame.DataFrame
            The data to investigate, one sample point per row. The
            columns must use hierarchical indexing whose first level
            separates the variables into 'categorical' and
            'quantitative' groups; only the 'categorical' group is
            read here.

    Returns:
        pandas.core.frame.DataFrame: The data quality report for the
            categorical variables. Its ``name`` attribute holds the
            report title.
    """
    df_categorical = df['categorical']
    n_rows = df.shape[0]
    mode_columns = ['Mode', 'Mode Freq.', 'Mode (%)',
                    '2nd Mode', '2nd Mode Freq.', '2nd Mode (%)']

    # Collect the mode statistics as plain records and build the frame in
    # one go. Writing strings/floats/None cell by cell into columns that
    # were initialised as integers relies on silent dtype upcasting,
    # which pandas has deprecated.
    records = []
    for variable in df_categorical.columns:
        values = df_categorical[variable]
        # value_counts sorts by descending frequency, so position 0 is the
        # mode and position 1 the second mode. Compute it once per variable.
        counts = values.value_counts(dropna=True)
        n_valid = values.count()
        record = dict.fromkeys(mode_columns)
        for rank, label in enumerate(('Mode', '2nd Mode')):
            if rank >= len(counts):
                # Not enough distinct levels (possibly none at all):
                # leave this mode and any later one missing.
                break
            frequency = counts.iloc[rank]
            record[label] = counts.index[rank]
            record[f'{label} Freq.'] = frequency
            record[f'{label} (%)'] = (frequency / n_valid) * 100
        records.append(record)

    df_qr = pd.DataFrame(records, index=df_categorical.columns, columns=mode_columns)
    df_qr.insert(0, 'Count', n_rows)
    df_qr.insert(1, 'Miss. (%)', (df_categorical.isna().sum() / n_rows) * 100)
    df_qr.insert(2, 'Card.', df_categorical.nunique(dropna=True))

    df_qr.name = 'Categorical Qualilty Report'
    return df_qr

In [3]:
def createDataQualityReports(df):
    """
    Build the quantitative and the categorical data quality reports.

    This is a convenience wrapper that calls createQuantitativeReport()
    and createCategoricalReport() on the same DataFrame, in that order.

    Args:
        df: pandas.core.frame.DataFrame
            The data to investigate, one sample point per row. The
            columns must use hierarchical indexing whose first level
            separates the variables into 'categorical' and
            'quantitative' groups.

    Returns:
        pandas.core.frame.DataFrame, pandas.core.frame.DataFrame: A
            tuple holding the quantitative report first and the
            categorical report second. See createQuantitativeReport()
            and createCategoricalReport() for the layout of each.
    """
    return createQuantitativeReport(df), createCategoricalReport(df)

In [4]:
def plotQuantitativeVariables(df, width=15, height=6, approach='equal-width', bins=10):
    """
    Draw a frequency plot for each quantitative variable in the
    pandas DataFrame.

    The plots are stacked vertically in a single matplotlib figure, one
    subplot per variable. For variables with cardinality (ie, the number
    of distinct values per variable) less than 10, we use bar plots.
    Otherwise, we use histograms. These histograms are created using
    either equal-width or equal-frequency binning.

    Args:
        df: pandas.core.frame.DataFrame
            The DataFrame should hold the data that we want to
            investigate. Each row contains a value for each variable,
            which represents a sample point. The columns must use
            hierarchical indexing whose first level separates the
            variables into 'categorical' and 'quantitative' groups;
            only the 'quantitative' group is plotted here.
        width: float, default 15
            The width of each subplot.
        height: float, default 6
            The height of each subplot.
        approach: str {'equal-width', 'equal-frequency'}, default 'equal-width'
            If 'equal-width', then the histograms are created using equal-width
            binning. If 'equal-frequency', then the histograms are created
            using equal-frequency binning.
        bins: int, default 10
            Number of bins used to create the histograms.

    Returns:
        None. The plots are drawn on a newly created matplotlib figure.

    Raises:
        AssertionError: If ``approach`` is not one of the supported values.
    """
    assert approach in {'equal-width', 'equal-frequency'}, f'approach = {approach}'
    fontdict = {'fontsize': 18, 'fontweight': 'medium'}
    df_quantitative = df['quantitative']
    numberOfSubplots = df_quantitative.shape[1]
    # squeeze=False always returns a 2-D array of Axes, so indexing also
    # works when there is only a single quantitative variable (by default
    # plt.subplots returns a bare Axes object in that case).
    fig, axes = plt.subplots(numberOfSubplots, 1, squeeze=False,
                             constrained_layout=True,
                             figsize=(width, height*numberOfSubplots))
    axes = axes[:, 0]
    columns = df_quantitative.columns.values.tolist()
    sr_cardinality = df_quantitative.nunique(dropna=True)
    for index, column in enumerate(columns):
        ax = axes[index]
        cardinality = sr_cardinality.loc[column]
        if cardinality < 10:
            # Few distinct values: one bar per value, ordered by value.
            df_quantitative[column].value_counts().sort_index().plot(kind='bar', ax=ax)
            ax.set_title(f'Bar plot of the \'{column}\' variable', fontdict=fontdict)
            ax.tick_params(axis="x", labelrotation=0)
        elif approach == 'equal-width':
            df_quantitative[column].plot(kind='hist', bins=bins, ax=ax)
            ax.set_ylabel(None)
            ax.set_title(f'Equal-width histogram of the \'{column}\' variable', fontdict=fontdict)
        else:
            # Bin the variable being plotted (not a fixed column) into
            # quantile-based buckets and show the size of each bucket.
            buckets = pd.qcut(df_quantitative[column], q=bins).value_counts().sort_index()
            buckets.plot(kind='bar', ax=ax)
            ax.set_title(f'Equal-frequency histogram of the \'{column}\' variable', fontdict=fontdict)
            ax.tick_params(axis="x", labelsize=10, labelrotation=45)

In [5]:
def plotCategoricalVariables(df, width=15, height=4):
    """
    Draw a bar plot for each categorical variable in the pandas
    DataFrame.

    The plots are stacked vertically in a single matplotlib figure, one
    subplot per variable. Each bar shows the number of occurrences of a
    level, with the levels sorted by label.

    Args:
        df: pandas.core.frame.DataFrame
            The DataFrame should hold the data that we want to
            investigate. Each row contains a value for each variable,
            which represents a sample point. The columns must use
            hierarchical indexing whose first level separates the
            variables into 'categorical' and 'quantitative' groups;
            only the 'categorical' group is plotted here.
        width: float, default 15
            The width of each subplot.
        height: float, default 4
            The height of each subplot.

    Returns:
        None. The plots are drawn on a newly created matplotlib figure.
    """
    fontdict = {'fontsize': 18, 'fontweight': 'medium'}
    df_categorical = df['categorical']
    numberOfSubplots = df_categorical.shape[1]
    # squeeze=False always returns a 2-D array of Axes, so indexing also
    # works when there is only a single categorical variable (by default
    # plt.subplots returns a bare Axes object in that case).
    fig, axes = plt.subplots(numberOfSubplots, 1, squeeze=False,
                             constrained_layout=True,
                             figsize=(width, height*numberOfSubplots))
    axes = axes[:, 0]
    columns = df_categorical.columns.values.tolist()
    for index, column in enumerate(columns):
        ax = axes[index]
        df_categorical[column].value_counts().sort_index().plot(kind='bar', ax=ax)
        ax.set_title(f'Bar plot of the \'{column}\' variable', fontdict=fontdict)
        ax.tick_params(axis="x", labelrotation=0)