In [None]:
# Step 1: Import Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import os

# Step 2: Define Data Quality Metric Functions
def calculate_missing_percentage(df):
    """Return the percentage of missing values in each column of ``df``.

    The result is a pandas Series indexed by column name, with one value in
    the range 0-100 per column. It is not collapsed to a single number;
    callers that want one overall figure take ``.mean()`` of the result.
    """
    # True wherever a cell is missing; the column-wise mean of that boolean
    # mask is the fraction of missing cells in each column.
    missing_mask = df.isna()
    missing_fraction = missing_mask.mean()
    return missing_fraction * 100

def calculate_duplicate_percentage(df):
    """Return the percentage (0-100) of rows in ``df`` that are duplicates.

    A row counts as a duplicate when an identical row appears earlier in the
    frame, so the first occurrence of each distinct row is not counted.
    """
    # Boolean Series: True for every repeat of an earlier row.
    repeated_rows = df.duplicated()
    duplicate_fraction = repeated_rows.mean()
    return duplicate_fraction * 100

# Step 3: Initialize Tracking Lists
data_directory = 'data_snapshots/'  # Replace with your folder path

# Parallel lists: index i of each list describes the i-th snapshot processed.
dates = []            # snapshot labels taken from the file names
missing_trend = []    # mean of the per-column missing-value percentages
duplicate_trend = []  # percentage of duplicated rows

# Step 4: Loop Over Time-Based Files
# Snapshots are visited in sorted file-name order, so the file names are
# assumed to sort chronologically (e.g. ISO dates such as 2024-01-31.csv).
csv_suffix = '.csv'
for file in sorted(os.listdir(data_directory)):
    if not file.endswith(csv_suffix):
        continue

    file_path = os.path.join(data_directory, file)
    if not os.path.isfile(file_path):
        # A directory whose name ends in .csv cannot be read as a snapshot.
        continue

    df = pd.read_csv(file_path)

    # Strip only the trailing extension; str.replace would also remove any
    # '.csv' that appears earlier in the file name.
    date = file[:-len(csv_suffix)]  # assumes file name is date
    dates.append(date)

    avg_missing = calculate_missing_percentage(df).mean()
    dup_percent = calculate_duplicate_percentage(df)

    missing_trend.append(avg_missing)
    duplicate_trend.append(dup_percent)

# Step 5: Plot Data Quality Trends
# Both metrics share one figure so they can be compared snapshot by snapshot.
plt.figure(figsize=(12, 6))
for trend_values, trend_label, trend_marker in (
    (missing_trend, 'Missing Values (%)', 'o'),
    (duplicate_trend, 'Duplicate Rows (%)', 'x'),
):
    plt.plot(dates, trend_values, marker=trend_marker, label=trend_label)

plt.title('Data Quality Trends Over Time')
plt.xlabel('Date')
plt.ylabel('Percentage')
plt.grid(True)
plt.legend()
plt.xticks(rotation=45)
# Run the layout pass last, after the labels, legend and tick rotation are set.
plt.tight_layout()
plt.show()

# Step 6: Optional - Trigger Alerts for Threshold Violations
threshold_missing = 5.0  # percent
threshold_duplicates = 2.0  # percent

# Only the most recent snapshot (the last entry of each list) is checked.
# The lists are empty when no CSV snapshot was processed, and indexing [-1]
# on an empty list raises IndexError, so guard the checks.
if dates:
    if missing_trend[-1] > threshold_missing:
        print(f"[ALERT] Missing data exceeded {threshold_missing}% on {dates[-1]}")

    if duplicate_trend[-1] > threshold_duplicates:
        print(f"[ALERT] Duplicate data exceeded {threshold_duplicates}% on {dates[-1]}")
else:
    print("[WARN] No snapshots were processed; skipping threshold checks.")