## Data Quality Dashboard in Python

**Description**: Create a basic dashboard using a Python library (e.g., Plotly Dash) to visualize data quality metrics for a given dataset.

In [1]:
import dash
from dash import dcc, html, Input, Output
import dash_bootstrap_components as dbc
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from io import StringIO

# Demo CSV that pre-fills the input box (paste your own data to replace it).
# One header line followed by eight records; several fields are left blank
# (e.g. the age in record 2, the email in record 3) so that the quality
# metrics have gaps to report.
sample_data = "\n".join((
    "id,name,age,email,join_date,salary,department,is_active",
    "1,John Doe,32,john@example.com,2020-01-15,75000,Marketing,True",
    "2,Jane Smith,,jane@example.com,2019-05-22,82000,Sales,True",
    "3,Bob Johnson,45,,2018-11-10,91000,IT,False",
    "4,Alice Brown,28,alice@example.com,,78000,Marketing,True",
    "5,Charlie Davis,51,charlie@example.com,2021-03-30,,Sales,True",
    "6,,37,eve@example.com,2020-07-14,88000,IT,False",
    "7,Grace Wilson,29,grace@example.com,2022-02-18,76000,None,True",
    "8,Henry Miller,42,henry@example.com,2021-09-05,95000,Finance,True",
))

# Create the Dash application, styled with the Bootstrap theme that ships
# with dash-bootstrap-components, and set the page title.
app = dash.Dash(
    name=__name__,
    external_stylesheets=[dbc.themes.BOOTSTRAP],
)
app.title = "Data Quality Dashboard"

# Page layout, top to bottom: title, CSV input next to the summary panel,
# completeness/uniqueness charts, distribution chart, outlier chart, and a
# preview of the parsed data. Component ids are the hooks used by callbacks.
app.layout = dbc.Container(
    fluid=True,
    children=[
        # Title
        dbc.Row(
            dbc.Col(
                html.H1("Data Quality Dashboard", className="text-center my-4")
            )
        ),

        # Left: raw CSV input and the trigger button. Right: summary panel.
        dbc.Row(
            className="mb-4",
            children=[
                dbc.Col(
                    width=6,
                    children=[
                        dcc.Textarea(
                            id='input-data',
                            value=sample_data,
                            style={'width': '100%', 'height': 200},
                        ),
                        dbc.Button(
                            "Analyze Data",
                            id="analyze-button",
                            color="primary",
                            className="mt-2",
                        ),
                    ],
                ),
                dbc.Col(
                    width=6,
                    children=[
                        html.Div(id='data-stats', className="p-3 border rounded"),
                    ],
                ),
            ],
        ),

        # Two half-width charts side by side
        dbc.Row(
            children=[
                dbc.Col(width=6, children=dcc.Graph(id='completeness-graph')),
                dbc.Col(width=6, children=dcc.Graph(id='uniqueness-graph')),
            ],
        ),

        # Full-width charts
        dbc.Row(
            children=[
                dbc.Col(width=12, children=dcc.Graph(id='distribution-graph')),
            ],
        ),
        dbc.Row(
            children=[
                dbc.Col(width=12, children=dcc.Graph(id='outliers-graph')),
            ],
        ),

        # Preview table of the loaded data
        dbc.Row(
            className="mt-4",
            children=[
                dbc.Col(
                    width=12,
                    children=html.Div(id='data-sample', className="p-3 border rounded"),
                ),
            ],
        ),
    ],
)

def _count_iqr_outliers(series, whisker=1.5):
    """Return how many values of *series* fall outside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``. Missing
    values never compare as smaller/greater, so they are not counted.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr
    # int() turns the numpy integer into a plain Python int.
    return int(((series < lower_bound) | (series > upper_bound)).sum())


# Callbacks for interactivity
@app.callback(
    [Output('data-stats', 'children'),
     Output('completeness-graph', 'figure'),
     Output('uniqueness-graph', 'figure'),
     Output('distribution-graph', 'figure'),
     Output('outliers-graph', 'figure'),
     Output('data-sample', 'children')],
    [Input('analyze-button', 'n_clicks')],
    [dash.dependencies.State('input-data', 'value')]
)
def update_dashboard(n_clicks, input_data):
    """Parse the CSV text and refresh every panel of the dashboard.

    Args:
        n_clicks: Click count of the "Analyze Data" button; ``None`` means the
            button has not been clicked yet, in which case nothing is updated.
        input_data: CSV text taken from the input textarea.

    Returns:
        A 6-tuple matching the declared outputs: summary children,
        completeness figure, uniqueness figure, distribution figure, outliers
        figure and data-sample children. If the text cannot be parsed, or it
        parses to zero rows, only the summary panel is replaced (with an error
        message) and the other five outputs are left untouched.
    """
    if n_clicks is None:
        return dash.no_update

    # Load the data. Any parser failure is reported in the UI rather than
    # raised, since this is the boundary between user input and the app.
    try:
        df = pd.read_csv(StringIO(input_data))
    except Exception as e:
        return (html.Div(f"Error loading data: {str(e)}"),) + (dash.no_update,) * 5

    # A header-only CSV parses to zero rows; every ratio below would then be
    # a division by zero, so report it instead of drawing empty charts.
    if df.empty:
        return (html.Div("Error loading data: the dataset contains no rows"),) + (dash.no_update,) * 5

    # Calculate data quality metrics
    # 1. Completeness: share of non-missing cells per column, in percent.
    completeness = df.notna().mean().round(4) * 100
    completeness_df = completeness.reset_index()
    completeness_df.columns = ['Column', 'Completeness (%)']

    # 2. Uniqueness: distinct non-missing values relative to the row count,
    #    in percent (nunique() ignores missing values by default).
    uniqueness = df.nunique() / len(df) * 100
    uniqueness_df = uniqueness.reset_index()
    uniqueness_df.columns = ['Column', 'Uniqueness (%)']

    # 3. Data types, stringified so the table cells are plain text instead of
    #    numpy dtype objects.
    data_types = df.dtypes.astype(str).reset_index()
    data_types.columns = ['Column', 'Data Type']

    # 4. Potential outliers (for numeric columns), computed once and reused
    #    by both the text summary and the bar chart.
    numeric_cols = df.select_dtypes(include=np.number).columns
    outliers_counts = [_count_iqr_outliers(df[col]) for col in numeric_cols]
    outliers_info = [
        f"{col}: {count} outliers"
        for col, count in zip(numeric_cols, outliers_counts)
    ]

    # Create visualizations
    # 1. Completeness graph
    completeness_fig = px.bar(
        completeness_df,
        x='Column',
        y='Completeness (%)',
        title='Data Completeness by Column',
        color='Completeness (%)',
        color_continuous_scale='Blues'
    )
    completeness_fig.update_layout(yaxis_range=[0, 100])

    # 2. Uniqueness graph
    uniqueness_fig = px.bar(
        uniqueness_df,
        x='Column',
        y='Uniqueness (%)',
        title='Data Uniqueness by Column',
        color='Uniqueness (%)',
        color_continuous_scale='Greens'
    )
    uniqueness_fig.update_layout(yaxis_range=[0, 100])

    # 3. Distribution graph: one box plot per numeric column, with the
    #    individual points drawn beside each box.
    distribution_fig = go.Figure()
    for col in numeric_cols:
        distribution_fig.add_trace(go.Box(
            y=df[col],
            name=col,
            boxpoints='all',
            jitter=0.3,
            pointpos=-1.8
        ))
    distribution_fig.update_layout(title='Distribution of Numeric Columns')

    # 4. Outliers graph
    outliers_fig = px.bar(
        x=numeric_cols,
        y=outliers_counts,
        title='Potential Outliers in Numeric Columns',
        labels={'x': 'Column', 'y': 'Number of Outliers'}
    )

    # Create stats summary
    stats_summary = [
        html.H4("Dataset Summary"),
        html.P(f"Number of rows: {len(df)}"),
        html.P(f"Number of columns: {len(df.columns)}"),
        html.Hr(),
        html.H5("Data Types"),
        dbc.Table.from_dataframe(data_types, striped=True, bordered=True, hover=True),
        html.Hr(),
        html.H5("Potential Outliers"),
        html.Ul([html.Li(item) for item in outliers_info])
    ]

    # Create data sample display
    data_sample = [
        html.H4("Data Sample (First 5 Rows)"),
        dbc.Table.from_dataframe(df.head(), striped=True, bordered=True, hover=True)
    ]

    return stats_summary, completeness_fig, uniqueness_fig, distribution_fig, outliers_fig, data_sample

# Run the app
if __name__ == '__main__':
    # Newer Dash releases start the development server with `app.run`, which
    # superseded `app.run_server`; prefer it when available and fall back to
    # `run_server` on older installations that lack `run`.
    start_server = app.run if hasattr(app, "run") else app.run_server
    start_server(debug=True)

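ModuleNotFoundError: No module named 'dash_bootstrap_components'

(The cell failed because the `dash-bootstrap-components` package is not installed. Install it with `pip install dash-bootstrap-components` and re-run the cell.)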