NeurArk · NeurArk · May 20, 2025 · May 20, 2025
diff --git a/TODO.md b/TODO.md
@@ -32,16 +32,16 @@ After completing a milestone, create a pull request with your changes for review
 
 ## PR3: Exploratory Data Analysis
 
-- [ ] Create summary statistics generator
-- [ ] Implement data quality assessment
-- [ ] Create correlation analysis functionality
-- [ ] Add distribution analysis for numeric variables
-- [ ] Implement categorical variable analysis
-- [ ] Add missing value visualization
-- [ ] Create data profile report generator
-- [ ] Implement data insights summary
-- [ ] Write tests for all EDA functions
-- [ ] Create test cases with different data types and edge cases
+- [x] Create summary statistics generator
+- [x] Implement data quality assessment
+- [x] Create correlation analysis functionality
+- [x] Add distribution analysis for numeric variables
+- [x] Implement categorical variable analysis
+- [x] Add missing value visualization
+- [x] Create data profile report generator
+- [x] Implement data insights summary
+- [x] Write tests for all EDA functions
+- [x] Create test cases with different data types and edge cases
 
 ## PR4: Data Visualization Module
 

diff --git a/app.py b/app.py
@@ -3,6 +3,7 @@
 import streamlit as st
 from utils import config
 from utils import data as data_utils
+from utils import eda
 
 st.set_page_config(page_title="PredictStream", layout="wide")
 
@@ -53,10 +54,22 @@ def main() -> None:
         end = start + page_size
         st.dataframe(data.iloc[start:end])
 
-        st.subheader("Data Summary")
-        summary = data_utils.data_summary(data)
+        st.subheader("Summary Statistics")
+        summary = eda.summary_statistics(data)
         st.dataframe(summary)
 
+        st.subheader("Data Quality")
+        quality = eda.data_quality_assessment(data)
+        st.dataframe(quality)
+
+        st.subheader("Correlation Matrix")
+        corr = eda.correlation_matrix(data)
+        st.dataframe(corr)
+
+        st.subheader("Insights")
+        for insight in eda.data_insights_summary(data):
+            st.write(f"- {insight}")
+
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/tests/test_eda.py b/tests/test_eda.py
@@ -0,0 +1,61 @@
+import pandas as pd
+from utils import eda
+
+
+def sample_df():
+    """Build a 5-row frame: two identical int columns and a string column with one None."""
+    ramp = [1, 2, 3, 4, 5]
+    return pd.DataFrame(
+        {'num1': ramp, 'num2': list(ramp), 'cat': ['a', 'b', 'a', None, 'b']}
+    )
+
+
+def test_summary_statistics():
+    """Both the numeric and the non-numeric column appear in the summary."""
+    described = eda.summary_statistics(sample_df())
+    assert 'num1' in described.columns
+    assert 'cat' in described.columns
+
+
+def test_data_quality_assessment():
+    """The single None in 'cat' is counted as missing; 'num1' has none."""
+    missing = eda.data_quality_assessment(sample_df())['missing']
+    assert missing['cat'] == 1
+    assert missing['num1'] == 0
+
+
+def test_correlation_matrix():
+    """Identical columns correlate at 1.0; compared with a tolerance, not exact equality."""
+    corr = eda.correlation_matrix(sample_df())
+    assert abs(corr.loc['num1', 'num2'] - 1.0) < 1e-9
+
+
+def test_numeric_distributions():
+    """Every row of the NaN-free 'num1' column lands in one of the two bins."""
+    hists = eda.numeric_distributions(sample_df(), bins=2)
+    assert 'num1' in hists
+    assert hists['num1'].sum() == len(sample_df())
+
+
+def test_categorical_analysis():
+    """The value 'a' occurs twice in the 'cat' column."""
+    cat_counts = eda.categorical_analysis(sample_df())['cat']
+    assert cat_counts['a'] == 2
+
+
+def test_missing_value_matrix():
+    """Exactly one cell of 'cat' is flagged as missing."""
+    flags = eda.missing_value_matrix(sample_df())
+    assert flags['cat'].sum() == 1
+
+
+def test_profile_report():
+    """The report exposes its three sections by name."""
+    report = eda.profile_report(sample_df())
+    assert all(key in report for key in ('summary', 'quality', 'correlation'))
+
+
+def test_data_insights_summary():
+    """At least one insight mentions the missing values in the sample."""
+    messages = eda.data_insights_summary(sample_df())
+    assert any('missing values' in message for message in messages)
diff --git a/utils/__init__.py b/utils/__init__.py
@@ -2,5 +2,6 @@
 
 from . import config
 from . import data
+from . import eda
 
-__all__ = ["config", "data"]
+__all__ = ["config", "data", "eda"]
diff --git a/utils/eda.py b/utils/eda.py
@@ -0,0 +1,83 @@
+"""Exploratory data analysis utilities."""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+import numpy as np
+import pandas as pd
+
+
+def summary_statistics(df: pd.DataFrame) -> pd.DataFrame:
+    """Return ``describe`` statistics covering numeric and non-numeric columns alike."""
+    return df.describe(include="all")
+
+
+def data_quality_assessment(df: pd.DataFrame) -> pd.DataFrame:
+    """Return per-column dtype, missing count/percent and unique count (NaN counted as a value)."""
+    missing_mask = df.isna()  # computed once; feeds both missing metrics
+    return pd.DataFrame({
+        "dtype": df.dtypes,
+        "missing": missing_mask.sum(),
+        "missing_percent": missing_mask.mean() * 100,
+        "unique": df.nunique(dropna=False),
+    })
+
+
+def correlation_matrix(df: pd.DataFrame, method: str = "pearson") -> pd.DataFrame:
+    """Correlate the numeric columns of ``df`` pairwise using ``method``."""
+    # Non-numeric columns are dropped first so ``corr`` only sees numbers.
+    return df.select_dtypes(include="number").corr(method=method)
+
+
+def numeric_distributions(df: pd.DataFrame, bins: int = 10) -> Dict[str, pd.Series]:
+    """Bin each numeric column into ``bins`` intervals and count rows per interval."""
+    numbers_only = df.select_dtypes(include="number")
+    return {
+        name: pd.cut(numbers_only[name], bins=bins).value_counts().sort_index()
+        for name in numbers_only.columns
+    }
+
+
+def categorical_analysis(df: pd.DataFrame, top_n: int = 10) -> Dict[str, pd.Series]:
+    """Return the ``top_n`` most frequent values (NaN included) per non-numeric column."""
+    non_numeric = df.select_dtypes(exclude="number")
+    return {
+        name: non_numeric[name].value_counts(dropna=False).head(top_n)
+        for name in non_numeric.columns
+    }
+
+
+def missing_value_matrix(df: pd.DataFrame) -> pd.DataFrame:
+    """Return a same-shaped boolean frame that is True wherever ``df`` has a missing value."""
+    return df.isna()
+
+
+def profile_report(df: pd.DataFrame) -> Dict[str, Any]:
+    """Bundle the summary, quality and correlation tables into one dict."""
+    report: Dict[str, Any] = {}
+    report["summary"] = summary_statistics(df)
+    report["quality"] = data_quality_assessment(df)
+    report["correlation"] = correlation_matrix(df)
+    return report
+
+
+def data_insights_summary(df: pd.DataFrame) -> List[str]:
+    """Return messages on missing-value columns and column pairs with |corr| > 0.8."""
+    insights: List[str] = []
+    quality = data_quality_assessment(df)
+    missing_cols = quality[quality["missing"] > 0].index.tolist()
+    if missing_cols:  # labels may be ints or tuples, so stringify before joining
+        insights.append("Columns with missing values: " + ", ".join(map(str, missing_cols)))
+
+    corr = correlation_matrix(df).abs()
+    if not corr.empty:  # k=1 keeps the strict upper triangle: each pair reported once
+        upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
+        strong = upper.stack().loc[lambda s: s > 0.8]
+        if not strong.empty:
+            pairs = [f"{i} & {j}" for i, j in strong.index]
+            insights.append("Strong correlations detected: " + ", ".join(pairs))
+
+    if not insights:
+        insights.append("No notable data issues detected.")
+    return insights