## Architecture to Monitor Data Quality Over Time

**Description**: Design a monitoring system in Python that checks and logs data quality metrics (accuracy, completeness) for a dataset over time.

**Steps to follow:**
1. Implement a Scheduled Script:
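    - Use the `schedule` library to run the script periodically, e.g. `schedule.every().hour.do(calculate_quality_metrics, "data.csv")` followed by a loop that calls `schedule.run_pending()`.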
2. Script to Calculate Metrics:
    - For simplicity, use a single function, `calculate_quality_metrics()`, that calculates and logs metrics such as the missing rate and the mismatch rate.
3. Store Logs:
    - Use Python's `logging` module to save these metrics over time.

In [4]:
import logging
import os
import tempfile
import unittest
from io import StringIO

import pandas as pd

# Logging configuration for the quality monitor.
# LOG_LEVEL is the single knob for verbosity; every record is written to
# data_quality.log with a timestamp so the metrics can be tracked over time.
LOG_LEVEL = logging.DEBUG
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=LOG_LEVEL,
    filename="data_quality.log",
)

def calculate_quality_metrics(file_path):
    """
    Load a CSV dataset and compute its data quality metrics.

    Metrics:
    - Overall missing data rate (%): the mean of the per-column share of
      null cells.
    - Mismatch rate (%) between 'value_1' and 'value_2': the share of rows
      whose two values differ. Only computed when both columns exist. A row
      where both values are missing is not counted as a mismatch (that gap
      is already reflected in the missing rate); a row where only one of the
      two values is missing is counted.

    The metrics are logged at INFO level. Load failures are logged at ERROR
    level and a dataset without data at WARNING level.

    Args:
        file_path: Location of the CSV file; passed to ``pandas.read_csv``.

    Returns:
        A ``(missing_rate, mismatch_rate)`` tuple, where ``mismatch_rate`` is
        ``None`` when either value column is absent. ``None`` is returned
        instead of a tuple when the file cannot be loaded or holds no data.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        logging.error("File not found: %s", file_path)
        return None
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; report it explicitly
        # rather than as an "unexpected" failure.
        logging.error("Empty file: no columns to parse: %s", file_path)
        return None
    except pd.errors.ParserError:
        logging.error(
            "Parsing error: File format not supported or corrupted: %s",
            file_path,
        )
        return None
    except Exception as e:
        # Last-resort boundary: a monitoring run must log the failure and
        # carry on rather than crash.
        logging.error("Unexpected error: %s", e)
        return None

    if df.empty:
        # Averages over zero cells are NaN; logging "nan%" would look like a
        # real measurement, so flag the condition instead.
        logging.warning(
            "Dataset contains no data; quality metrics are undefined: %s",
            file_path,
        )
        return None

    missing_rate = df.isnull().mean().mean() * 100

    mismatch_rate = None
    if "value_1" in df.columns and "value_2" in df.columns:
        left, right = df["value_1"], df["value_2"]
        # NaN != NaN evaluates to True, so rows missing both values must be
        # excluded explicitly or they would inflate the mismatch rate.
        both_missing = left.isnull() & right.isnull()
        mismatch_rate = ((left != right) & ~both_missing).mean() * 100

    log_msg = f"Missing Rate: {missing_rate:.2f}%"
    if mismatch_rate is not None:
        log_msg += f", Mismatch Rate: {mismatch_rate:.2f}%"

    logging.info(log_msg)
    return missing_rate, mismatch_rate

# Unit test class
class TestQualityMetrics(unittest.TestCase):
    """Tests for calculate_quality_metrics().

    Every test calls calculate_quality_metrics() itself rather than
    repeating its formulas, so a regression in the function fails the suite.
    CSV fixtures are temporary files that are removed even when an assertion
    fails.
    """

    def _write_csv(self, content):
        """Write *content* to a temporary CSV file and return its path."""
        # delete=False so the file can be reopened by path after it is closed.
        with tempfile.NamedTemporaryFile(
            "w", suffix=".csv", delete=False, encoding="utf-8", newline=""
        ) as handle:
            handle.write(content)
        # Registered before any assertion runs, so cleanup happens on failure too.
        self.addCleanup(os.remove, handle.name)
        return handle.name

    def test_missing_rate(self):
        path = self._write_csv("value_1,value_2\n1,1\n,2\n3,")
        missing_rate, _ = calculate_quality_metrics(path)
        # 2 of the 6 cells are empty.
        self.assertAlmostEqual(missing_rate, 33.33, places=1)

    def test_mismatch_rate(self):
        path = self._write_csv("value_1,value_2\n1,1\n2,3\n3,3\n4,4")
        _, mismatch_rate = calculate_quality_metrics(path)
        # Only the row "2,3" differs: 1 of 4 rows.
        self.assertAlmostEqual(mismatch_rate, 25.0)

    def test_file_not_found(self):
        # A path inside a fresh temporary directory is guaranteed not to exist.
        with tempfile.TemporaryDirectory() as tmp_dir:
            missing_path = os.path.join(tmp_dir, "non_existent_file.csv")
            result = calculate_quality_metrics(missing_path)
        self.assertIsNone(result)

    def test_valid_file(self):
        # Round-trip a small DataFrame through a CSV file on disk.
        df_test = pd.DataFrame({
            "value_1": [1, 2, None],
            "value_2": [1, 3, None]
        })
        path = self._write_csv(df_test.to_csv(index=False))
        result = calculate_quality_metrics(path)
        self.assertIsNotNone(result)
        missing_rate, mismatch_rate = result
        # One empty cell out of three in each column.
        self.assertAlmostEqual(missing_rate, 33.33, places=1)
        self.assertIsNotNone(mismatch_rate)

# Run unit tests in Jupyter/IPython.
# argv is replaced with a dummy program name so unittest does not try to parse
# the interpreter's own sys.argv (in a notebook, presumably the kernel's launch
# arguments); exit=False keeps unittest.main from calling sys.exit() after the
# run, so execution continues past this line.
unittest.main(argv=['first-arg-is-ignored'], exit=False)


....
----------------------------------------------------------------------
Ran 4 tests in 0.012s

OK


<unittest.main.TestProgram at 0x756721103670>