In [None]:
# data_monitoring.py

import pandas as pd
import numpy as np
import time
from sklearn.ensemble import IsolationForest
import streamlit as st

# Simulate Real-Time Data Loading
@st.cache_data
def load_data(path="sample_data.csv"):
    """Load the monitoring dataset from a CSV file.

    The function is wrapped in ``st.cache_data``, so Streamlit reuses the
    loaded frame across script reruns instead of re-reading the file.

    Args:
        path: Location of the CSV file, forwarded to ``pd.read_csv``.
            Defaults to ``"sample_data.csv"``, the path that used to be
            hard-coded here, so ``load_data()`` behaves as before.

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.
    """
    # You can replace this with real-time sources like Kafka, APIs, or databases
    return pd.read_csv(path)

# Data Quality Checks
def check_quality(df):
    """Summarise basic data-quality metrics for ``df``.

    Args:
        df: DataFrame to inspect.

    Returns:
        A DataFrame with one row per column of ``df`` and three columns:
        ``"Total Rows"``, ``"Missing Values (%)"`` and ``"Duplicate Rows"``.
        The row and duplicate counts are frame-wide scalars, so the same
        value is repeated on every row; only the missing-value percentage
        varies per column.
    """
    row_count = len(df)
    # Mean of the boolean null mask is the null fraction per column.
    missing_pct = df.isnull().mean() * 100
    duplicate_count = df.duplicated().sum()

    # Key order fixes the column order of the report.
    return pd.DataFrame(
        {
            "Total Rows": row_count,
            "Missing Values (%)": missing_pct,
            "Duplicate Rows": duplicate_count,
        }
    )

# AI-based Anomaly Detection
def detect_anomalies(df, features, *, contamination=0.05, random_state=42):
    """Label each row as ``'Normal'`` or ``'Anomaly'`` with an Isolation Forest.

    Rows that have a missing value in any of ``features`` are dropped before
    the model is fitted. The caller's frame is not modified; a new frame is
    returned.

    Args:
        df: Input DataFrame.
        features: Column names used both to filter out incomplete rows and
            to fit the model.
        contamination: Forwarded to ``IsolationForest``. Defaults to 0.05,
            the value that used to be hard-coded here.
        random_state: Seed forwarded to ``IsolationForest``. Defaults to 42,
            the value that used to be hard-coded here.

    Returns:
        A copy of the rows that survived the missing-value filter, with an
        extra ``'anomaly'`` column holding ``'Normal'`` or ``'Anomaly'``.
        When no row survives, the returned frame is empty but still has the
        ``'anomaly'`` column, so downstream column selection keeps working.
    """
    # Explicit copy: adding the label column below must never write through
    # to the caller's frame or trigger a chained-assignment warning.
    clean = df.dropna(subset=features).copy()

    # The model cannot be fitted on zero samples; return an empty, correctly
    # shaped result instead of letting the fit call raise.
    if clean.empty:
        clean["anomaly"] = pd.Series(index=clean.index, dtype=object)
        return clean

    model = IsolationForest(contamination=contamination, random_state=random_state)
    # The estimator reports 1 for inliers and -1 for outliers.
    predictions = model.fit_predict(clean[features])
    clean["anomaly"] = pd.Series(predictions, index=clean.index).map(
        {1: "Normal", -1: "Anomaly"}
    )
    return clean

# Streamlit Dashboard
def main():
    st.title("AI-Based Data Quality & Real-Time Monitoring")

    df = load_data()

    st.subheader("Raw Data Snapshot")
    st.write(df.head())

    st.subheader("Data Quality Report")
    quality_df = check_quality(df)
    st.write(quality_df)

    st.subheader("Anomaly Detection")
    features = st.multiselect("Select features for anomaly detection:", options=df.select_dtypes(include=np.number).columns.tolist())

    if features:
        result_df = detect_anomalies(df, features)
        st.write(result_df[['anomaly'] + features].value_counts().reset_index(name='count'))
        st.write(result_df[result_df['