## Build a Data Quality Dashboard

**Description**: Create a simple dashboard that displays data quality metrics using a library like `dash` or `streamlit`.

**Steps:**
1. Install Streamlit: pip install streamlit
2. Create a Python script dashboard.py.
3. Run the dashboard: streamlit run dashboard.py

In [6]:
# dashboard.py

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt

# Set Streamlit page configuration
st.set_page_config(page_title="Data Quality Dashboard", layout="centered")

# Dashboard Title
st.title("📊 Data Quality Dashboard")

# Upload CSV
uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])

# When a file is uploaded, process and display data
if uploaded_file is not None:
    try:
        # Read the CSV file
        df = pd.read_csv(uploaded_file)
        st.success("✅ File successfully loaded!")

        # Display preview of the dataset
        st.subheader("🔍 Dataset Preview")
        st.dataframe(df.head())

        # Calculate Data Quality Index (DQI)
        total_cells = df.size
        total_missing = df.isnull().sum().sum()
        dqi = round((1 - total_missing / total_cells) * 100, 2)

        # Show Data Quality Metrics
        st.subheader("📈 Data Quality Metrics")
        st.metric(label="Total Rows", value=df.shape[0])
        st.metric(label="Total Columns", value=df.shape[1])
        st.metric(label="Total Missing Values", value=total_missing)
        st.metric(label="Data Quality Index (DQI)", value=f"{dqi} %")

        # Display a bar plot for DQI and Errors
        st.subheader("📊 DQI vs Errors (%)")
        fig, ax = plt.subplots()
        ax.bar(["DQI", "Errors"], [dqi, 100 - dqi], color=["green", "red"])
        ax.set_ylabel("Percentage")
        ax.set_ylim([0, 100])
        for i, val in enumerate([dqi, 100 - dqi]):
            ax.text(i, val + 2, f"{val:.2f}%", ha='center')
        st.pyplot(fig)

        # Display missing values per column
        st.subheader("🧮 Missing Values per Column")
        missing_per_column = df.isnull().sum()
        st.bar_chart(missing_per_column)

    except pd.errors.ParserError:
        st.error("❌ The file appears to be malformed. Please check the CSV format.")
    except UnicodeDecodeError:
        st.error("❌ The file contains unsupported characters. Try another encoding.")
    except ValueError:
        st.error("❌ Unexpected value found in the file. Ensure proper format.")
    except Exception as e:
        st.error(f"❌ An unexpected error occurred: {e}")
else:
    st.info("📁 Please upload a CSV file to begin.")




In [7]:
# test_dashboard.py

import unittest
import pandas as pd
import io
from dashboard import *  # Import the Streamlit app or functions directly

class TestDQIDashboard(unittest.TestCase):

    def test_dqi_computation(self):
        """Test if DQI is calculated correctly."""
        # Simulate a small DataFrame with missing values
        df = pd.DataFrame({
            'name': ['Alice', None],
            'age': [25, 30]
        })
        total_cells = df.size
        total_missing = df.isnull().sum().sum()
        dqi = round((1 - (total_missing / total_cells)) * 100, 2)
        self.assertEqual(dqi, 75.0)

    def test_empty_file(self):
        """Test if the script handles empty CSV files."""
        # Simulate an empty file upload
        df = pd.DataFrame()
        self.assertEqual(df.shape[0], 0)

    def test_missing_values(self):
        """Test if missing values are counted correctly."""
        df = pd.DataFrame({
            'col1': [1, 2, None],
            'col2': [None, 2, 3]
        })
        missing_values = df.isnull().sum()
        self.assertEqual(missing_values['col1'], 1)
        self.assertEqual(missing_values['col2'], 1)

    def test_invalid_csv(self):
        """Test the script's error handling for invalid CSV."""
        invalid_csv = io.StringIO("col1,col2\n1,2\n3,4")
        with self.assertRaises(pd.errors.ParserError):
            df = pd.read_csv(invalid_csv)  # Simulate malformed CSV

if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)


..F.
FAIL: test_invalid_csv (__main__.TestDQIDashboard)
Test the script's error handling for invalid CSV.
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_775/1243765372.py", line 41, in test_invalid_csv
    with self.assertRaises(pd.errors.ParserError):
AssertionError: ParserError not raised

----------------------------------------------------------------------
Ran 4 tests in 0.009s

FAILED (failures=1)


In [8]:
import pandas as pd

data = {
    "id": [1,2,3,4,5,6,7,8,9,10],
    "name": ["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace", "Hank", "Ivy", "Jack"],
    "age": [25, None, 30, 27, None, 22, 29, 35, 28, 31],
    "email": ["alice@example.com", "bob[at]example.com", None, "david@example.com", "eve@example.com",
              "frank@example.com", "grace@example.com", "hank@example.com", "ivy@example.com", "jack@example.com"],
    "gender": ["F", "M", "M", "M", "F", "M", "F", "M", "F", "M"],
    "grade": ["A", "B", "C", "A", "B", "D", "E", "F", "G", "H"]
}

df = pd.DataFrame(data)
df.to_csv("data_quality_sample.csv", index=False)
