Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,16 @@ After completing a milestone, create a pull request with your changes for review

## PR3: Exploratory Data Analysis

- [ ] Create summary statistics generator
- [ ] Implement data quality assessment
- [ ] Create correlation analysis functionality
- [ ] Add distribution analysis for numeric variables
- [ ] Implement categorical variable analysis
- [ ] Add missing value visualization
- [ ] Create data profile report generator
- [ ] Implement data insights summary
- [ ] Write tests for all EDA functions
- [ ] Create test cases with different data types and edge cases
- [x] Create summary statistics generator
- [x] Implement data quality assessment
- [x] Create correlation analysis functionality
- [x] Add distribution analysis for numeric variables
- [x] Implement categorical variable analysis
- [x] Add missing value visualization
- [x] Create data profile report generator
- [x] Implement data insights summary
- [x] Write tests for all EDA functions
- [x] Create test cases with different data types and edge cases

## PR4: Data Visualization Module

Expand Down
19 changes: 16 additions & 3 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import streamlit as st
from utils import config
from utils import data as data_utils
from utils import eda

st.set_page_config(page_title="PredictStream", layout="wide")

Expand Down Expand Up @@ -53,10 +54,22 @@ def main() -> None:
end = start + page_size
st.dataframe(data.iloc[start:end])

st.subheader("Data Summary")
summary = data_utils.data_summary(data)
st.subheader("Summary Statistics")
summary = eda.summary_statistics(data)
st.dataframe(summary)

st.subheader("Data Quality")
quality = eda.data_quality_assessment(data)
st.dataframe(quality)

st.subheader("Correlation Matrix")
corr = eda.correlation_matrix(data)
st.dataframe(corr)

st.subheader("Insights")
for insight in eda.data_insights_summary(data):
st.write(f"- {insight}")


if __name__ == "__main__":
main()
main()
61 changes: 61 additions & 0 deletions tests/test_eda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import pandas as pd
from utils import eda


def sample_df():
    """Build the small mixed-type frame shared by the EDA tests.

    Two identical integer columns plus one string column that contains a
    single missing entry.
    """
    columns = {
        'num1': [1, 2, 3, 4, 5],
        'num2': [1, 2, 3, 4, 5],
        'cat': ['a', 'b', 'a', None, 'b'],
    }
    return pd.DataFrame(columns)


def test_summary_statistics():
    """The summary table has a column for numeric and non-numeric inputs."""
    stats = eda.summary_statistics(sample_df())
    for expected in ('num1', 'cat'):
        assert expected in stats.columns


def test_data_quality_assessment():
    """Missing counts are reported per column."""
    report = eda.data_quality_assessment(sample_df())
    missing = report['missing']
    assert missing.loc['cat'] == 1
    assert missing.loc['num1'] == 0


def test_correlation_matrix():
    """Identical numeric columns are perfectly correlated."""
    df = sample_df()
    corr = eda.correlation_matrix(df)
    # The coefficient is a computed float; compare with a tolerance rather
    # than exact equality so rounding in the last bit cannot fail the test.
    assert abs(corr.loc['num1', 'num2'] - 1.0) < 1e-9


def test_numeric_distributions():
    """Histogram counts for a numeric column add up to the row count."""
    frame = sample_df()
    histograms = eda.numeric_distributions(frame, bins=2)
    assert 'num1' in histograms
    total_binned = histograms['num1'].sum()
    assert total_binned == len(frame)


def test_categorical_analysis():
    """Value counts are produced for the string column."""
    per_column = eda.categorical_analysis(sample_df())
    cat_counts = per_column['cat']
    assert cat_counts['a'] == 2


def test_missing_value_matrix():
    """The missing-value mask flags exactly one cell in ``cat``."""
    mask = eda.missing_value_matrix(sample_df())
    flagged = mask['cat'].sum()
    assert flagged == 1


def test_profile_report():
    """The profile report exposes its three expected sections."""
    profile = eda.profile_report(sample_df())
    assert all(
        section in profile
        for section in ('summary', 'quality', 'correlation')
    )


def test_data_insights_summary():
    """At least one insight mentions the missing values in the sample."""
    messages = eda.data_insights_summary(sample_df())
    mentioning_missing = [msg for msg in messages if 'missing values' in msg]
    assert mentioning_missing
3 changes: 2 additions & 1 deletion utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@

from . import config
from . import data
from . import eda

__all__ = ["config", "data"]
__all__ = ["config", "data", "eda"]
83 changes: 83 additions & 0 deletions utils/eda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""Exploratory data analysis utilities."""

from __future__ import annotations

from typing import Any, Dict, List

import numpy as np
import pandas as pd


def summary_statistics(df: pd.DataFrame) -> pd.DataFrame:
    """Return summary statistics for all columns.

    Delegates to ``DataFrame.describe(include="all")`` so numeric and
    non-numeric columns both appear in the result.

    Args:
        df: Frame to summarise.

    Returns:
        The ``describe`` table with one column per input column, or an
        empty frame when ``df`` has no columns.
    """
    # ``describe`` raises ValueError for a frame without columns; hand back
    # an empty summary so callers can still render the (empty) result.
    if df.shape[1] == 0:
        return pd.DataFrame()
    return df.describe(include="all")


def data_quality_assessment(df: pd.DataFrame) -> pd.DataFrame:
    """Return data quality metrics for each column.

    Args:
        df: Frame to assess.

    Returns:
        A frame indexed by the column labels of ``df`` with the columns
        ``dtype``, ``missing`` (number of missing cells),
        ``missing_percent`` (mean of the missing mask times 100) and
        ``unique`` (distinct values, with missing counted as a value).
    """
    # Build the missing-value mask once and reuse it for both metrics.
    is_missing = df.isna()
    return pd.DataFrame({
        "dtype": df.dtypes,
        "missing": is_missing.sum(),
        "missing_percent": is_missing.mean() * 100,
        "unique": df.nunique(dropna=False),
    })


def correlation_matrix(df: pd.DataFrame, method: str = "pearson") -> pd.DataFrame:
    """Compute pairwise correlations between the numeric columns of ``df``.

    Non-numeric columns are dropped before the correlation is taken;
    ``method`` is forwarded unchanged to ``DataFrame.corr``.
    """
    return df.select_dtypes(include="number").corr(method=method)


def numeric_distributions(df: pd.DataFrame, bins: int = 10) -> Dict[str, pd.Series]:
    """Return histogram counts for numeric columns.

    Each numeric column is split into ``bins`` equal-width intervals with
    ``pd.cut`` and the number of values per interval is counted. Missing
    values are ignored.

    Args:
        df: Frame whose numeric columns are binned.
        bins: Number of equal-width intervals per column.

    Returns:
        A mapping from column label to a Series of counts ordered by
        interval. A column with no observed values maps to an empty Series.
    """
    histograms: Dict[str, pd.Series] = {}
    for column, values in df.select_dtypes(include="number").items():
        observed = values.dropna()
        if observed.empty:
            # ``pd.cut`` cannot derive bin edges without data and raises;
            # report "no counts" for such columns instead of failing.
            histograms[column] = pd.Series(dtype="int64")
            continue
        histograms[column] = pd.cut(observed, bins=bins).value_counts().sort_index()
    return histograms


def categorical_analysis(df: pd.DataFrame, top_n: int = 10) -> Dict[str, pd.Series]:
    """Count the most frequent values of every non-numeric column.

    Missing values are counted as their own value, and only the first
    ``top_n`` entries of each column's counts are kept.
    """
    non_numeric = df.select_dtypes(exclude="number")
    return {
        name: non_numeric[name].value_counts(dropna=False).head(top_n)
        for name in non_numeric.columns
    }


def missing_value_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Build a boolean mask, shaped like ``df``, that is True where a cell is missing."""
    mask = df.isna()
    return mask


def profile_report(df: pd.DataFrame) -> Dict[str, Any]:
    """Bundle the summary, quality and correlation tables into one report.

    The result maps ``"summary"``, ``"quality"`` and ``"correlation"`` to
    the outputs of the corresponding helpers in this module.
    """
    report: Dict[str, Any] = {}
    report["summary"] = summary_statistics(df)
    report["quality"] = data_quality_assessment(df)
    report["correlation"] = correlation_matrix(df)
    return report


def data_insights_summary(df: pd.DataFrame) -> List[str]:
    """Generate simple insights from the data.

    Two checks are made: which columns contain missing values, and which
    pairs of numeric columns have an absolute correlation above 0.8.

    Args:
        df: Frame to inspect.

    Returns:
        A list of human-readable messages. When neither check finds
        anything, the list holds a single "no issues" message.
    """
    insights: List[str] = []

    quality = data_quality_assessment(df)
    missing_cols = quality[quality["missing"] > 0].index.tolist()
    if missing_cols:
        # Column labels are not guaranteed to be strings (integer labels
        # are common); ``str.join`` raises TypeError on non-string items.
        insights.append(
            "Columns with missing values: " + ", ".join(map(str, missing_cols))
        )

    corr = correlation_matrix(df).abs()
    if not corr.empty:
        # Keep only the strict upper triangle so the diagonal is ignored
        # and each column pair is reported once.
        upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
        strong = upper.stack().loc[lambda s: s > 0.8]
        if not strong.empty:
            pairs = [f"{i} & {j}" for i, j in strong.index]
            insights.append("Strong correlations detected: " + ", ".join(pairs))

    if not insights:
        insights.append("No notable data issues detected.")
    return insights