## Build a Data Quality Dashboard

**Description**: Create a simple dashboard that displays data quality metrics using a library like `dash` or `streamlit`.

**Steps:**
1. Install Streamlit: `pip install streamlit`
2. Create a Python script named `dashboard.py`.
3. Run the dashboard: `streamlit run dashboard.py`

In [2]:
import streamlit as st
import pandas as pd
import os

# Columns every input CSV must provide: load_data rejects a file lacking any
# of them, and detect_errors checks them for missing values.
REQUIRED_COLUMNS = ['id', 'name', 'age', 'email']  # example required columns

def calculate_dqi(total, valid, errors):
    """Return the Data Quality Index: the fraction of entries that are valid.

    Args:
        total: Number of entries examined.
        valid: Number of entries counted as valid.
        errors: Number of entries counted as errors. Accepted so callers can
            pass all three counts; it does not enter the computation.

    Returns:
        ``valid / total``, or ``0.0`` when ``total`` is zero.
    """
    # An empty dataset has no meaningful ratio; report 0.0 instead of
    # dividing by zero.
    return valid / total if total != 0 else 0.0

def load_data(file_path):
    """Read the CSV at *file_path* and confirm it has the required columns.

    Every failure is reported to the page with ``st.error`` instead of being
    raised.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        The loaded DataFrame, or ``None`` when the path does not exist, the
        file cannot be read, or a column from ``REQUIRED_COLUMNS`` is absent.
    """
    if not os.path.exists(file_path):
        st.error(f"File '{file_path}' does not exist.")
        return None

    try:
        frame = pd.read_csv(file_path)
    except Exception as exc:
        # Any read/parse failure is shown to the user rather than crashing
        # the dashboard.
        st.error(f"Error reading file: {exc}")
        return None

    # Collect absent columns in REQUIRED_COLUMNS order so the message is stable.
    absent = []
    for column in REQUIRED_COLUMNS:
        if column not in frame.columns:
            absent.append(column)

    if not absent:
        return frame

    st.error(f"Missing required columns: {absent}")
    return None

def detect_errors(df, required_columns=None):
    """Count the rows of *df* that are flagged as data-quality errors.

    A row is flagged when it has a missing value in any required column, or
    when ``DataFrame.duplicated`` marks it as a repeat of an earlier row.
    A row that meets both conditions is counted once.

    Args:
        df: DataFrame to inspect. It must contain every required column.
        required_columns: Column names to check for missing values. When
            omitted, the module-level ``REQUIRED_COLUMNS`` is used.

    Returns:
        The number of flagged rows as a built-in ``int`` (not a NumPy
        scalar), so the value can be displayed or serialised directly.
    """
    if required_columns is None:
        columns = REQUIRED_COLUMNS
    else:
        columns = list(required_columns)

    # True for each row with a null in at least one required column.
    missing_mask = df[columns].isnull().any(axis=1)

    # With its defaults, duplicated() compares all columns and marks only the
    # second and later occurrences, so the first copy is not an error.
    duplicate_mask = df.duplicated()

    # OR the masks so a row that is both incomplete and repeated counts once;
    # int() converts the NumPy scalar that Series.sum() returns.
    return int((missing_mask | duplicate_mask).sum())

def main():
    """Render the data quality dashboard page.

    Asks for a CSV path, loads it with ``load_data`` and, when loading
    succeeds, shows the entry counts, the Data Quality Index, a preview of
    the first rows and a bar chart of the DQI percentage and error count.
    """
    st.title("Enhanced Data Quality Dashboard")

    csv_path = st.text_input("Enter CSV file path:", "data.csv")
    frame = load_data(csv_path)
    if frame is None:
        # load_data has already reported the problem with st.error.
        return

    n_total = len(frame)
    n_errors = detect_errors(frame)
    n_valid = n_total - n_errors
    quality = calculate_dqi(n_total, n_valid, n_errors)

    # Headline figures, rendered in display order.
    headline = (
        ("Total Entries", n_total),
        ("Valid Entries", n_valid),
        ("Error Entries", n_errors),
        ("Data Quality Index (DQI)", f"{quality:.2%}"),
    )
    for caption, figure in headline:
        st.metric(label=caption, value=figure)

    st.write("### Sample Data")
    st.dataframe(frame.head())

    # One bar per series: the DQI as a percentage and the raw error count.
    chart_series = {"DQI (%)": [quality * 100], "Errors": [n_errors]}
    st.bar_chart(chart_series)

# Basic unit tests
def test_functions():
    """Run quick sanity checks on ``calculate_dqi`` and ``detect_errors``.

    Raises ``AssertionError`` on the first failing check; prints a
    confirmation line when every check passes.
    """
    # calculate_dqi: an ordinary ratio and the empty-dataset guard.
    dqi_cases = (
        ((10, 8, 2), 0.8),
        ((0, 0, 0), 0.0),
    )
    for arguments, expected in dqi_cases:
        assert calculate_dqi(*arguments) == expected

    # detect_errors: the second, third and fourth rows each have a missing
    # required value; the third row is also an exact repeat of the second,
    # but a row is only counted once, so the expected total is 3.
    sample = pd.DataFrame({
        'id': [1, 2, 2, 4],
        'name': ['A', 'B', 'B', None],
        'age': [30, None, None, 40],
        'email': ['a@example.com', 'b@example.com', 'b@example.com', 'd@example.com'],
    })
    flagged = detect_errors(sample)
    assert flagged == 3
    print("All tests passed!")

# Script entry point: run the self-checks first, then render the dashboard.
if __name__ == "__main__":
    test_functions()
    main()




All tests passed!
