In [None]:
# Ques_6.ipynb — Architecture to Monitor Data Quality Over Time

import pandas as pd
import datetime
import os

# -------- CONFIGURATION --------
# CSV dataset that is read and checked on every monitoring run.
DATA_FILE = "monitored_dataset.csv"

# CSV log that receives one row of quality metrics per run.
LOG_FILE = "data_quality_log.csv"

# Columns whose completeness is measured and logged.
CRITICAL_COLUMNS = [
    "id",
    "value",
    "timestamp",
]

# Optional reference CSV; accuracy is only computed when this file exists.
REFERENCE_FILE = "reference_data.csv"

# -------- QUALITY METRIC FUNCTIONS --------
def calculate_completeness(df, fields):
    """Return the percentage of non-null values for the selected column(s).

    Args:
        df: DataFrame to inspect.
        fields: Column label, or list of column labels, selected from ``df``.

    Returns:
        For a list of labels, a Series indexed by column label holding each
        column's non-null share as a percentage rounded to one decimal place.
        For a single label, one such value.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    non_null_share = 1 - df[fields].isnull().mean()
    # Scale to percent *before* rounding: rounding the fraction first and then
    # multiplying by 100 leaks float noise into the result
    # (0.333 * 100 == 33.300000000000004).
    return (non_null_share * 100).round(1)

def calculate_accuracy(df, ref_df, key="id", col_to_check="value"):
    """Return the percentage of joined rows whose value equals the reference.

    ``df`` and ``ref_df`` are joined on ``key`` with ``pd.merge`` (default
    join type), so only keys present in both frames are compared. Columns
    that exist in both frames keep their name for ``df`` and gain a ``_ref``
    suffix for ``ref_df``.

    Args:
        df: DataFrame holding the values under test.
        ref_df: DataFrame holding the reference values.
        key: Column to join on.
        col_to_check: Column compared against its ``_ref`` counterpart.

    Returns:
        The share of matching rows as a percentage rounded to two decimals,
        or ``None`` when the join produces no rows.
    """
    reference_column = f"{col_to_check}_ref"
    joined = pd.merge(df, ref_df, on=key, suffixes=('', '_ref'))

    # Element-wise equality: a row counts as correct only when both sides match.
    is_match = joined[col_to_check] == joined[reference_column]
    matched_rows = is_match.sum()
    row_count = len(joined)

    if row_count > 0:
        return round((matched_rows / row_count) * 100, 2)
    return None

# -------- MONITORING FUNCTION --------
def monitor_data_quality():
    """Run one data-quality check and append the result to the log file.

    Reads ``DATA_FILE``, computes completeness for every column listed in
    ``CRITICAL_COLUMNS`` and, when ``REFERENCE_FILE`` exists, accuracy against
    it. The metrics are stamped with the current local time, appended as one
    row to ``LOG_FILE`` and printed.

    If the data file is missing, empty, or lacks a critical column, a message
    is printed and nothing is logged.

    Returns:
        None.
    """
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print(f"Data file '{DATA_FILE}' not found.")
        return
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; report it like a missing file.
        print(f"Data file '{DATA_FILE}' is empty.")
        return

    # Report absent columns with a readable message instead of letting the
    # column selection below fail with a KeyError.
    missing_columns = [col for col in CRITICAL_COLUMNS if col not in df.columns]
    if missing_columns:
        print(f"Data file '{DATA_FILE}' is missing critical columns: {missing_columns}")
        return

    # Completeness
    completeness_scores = calculate_completeness(df, CRITICAL_COLUMNS)

    # Accuracy (if reference available)
    if os.path.exists(REFERENCE_FILE):
        ref_df = pd.read_csv(REFERENCE_FILE)
        accuracy_score = calculate_accuracy(df, ref_df)
    else:
        accuracy_score = None

    # Log result: one column per critical field plus the overall accuracy.
    log_entry = {
        "timestamp": now,
        **{f"completeness_{col}": completeness_scores[col] for col in CRITICAL_COLUMNS},
        "accuracy": accuracy_score,
    }

    # Save log entry
    log_df = pd.DataFrame([log_entry])

    # Write the header whenever the log has no content yet. Checking only for
    # existence would leave a pre-created, empty log file without a header row.
    needs_header = not os.path.exists(LOG_FILE) or os.path.getsize(LOG_FILE) == 0
    log_df.to_csv(LOG_FILE, mode='a', header=needs_header, index=False)

    print("Data quality check completed and logged.")
    print(log_df)

# -------- RUN MONITOR --------
# Performs a single quality check as soon as this cell/script is executed.
monitor_data_quality()