In [1]:
# ========================================================
# 04 - Analysis & Report Generation
# Automatic Data Cleaning & Analysis Agent
# ========================================================

import pandas as pd
import numpy as np
import os
from pprint import pprint


In [2]:
def compute_descriptive_stats(df: pd.DataFrame):
    """
    Summarise the numeric columns of ``df`` via ``DataFrame.describe``.

    Returns a ``{column: {statistic: value}}`` mapping, or an empty dict
    when the numeric selection is empty.
    """
    numeric_only = df.select_dtypes(include=np.number)
    # Nothing numeric to describe -> hand back an empty mapping.
    return {} if numeric_only.empty else numeric_only.describe().to_dict()


In [3]:
def compute_correlations(df: pd.DataFrame, threshold=0.6):
    """
    Correlate the numeric columns of ``df`` and list the strongly related pairs.

    A pair is reported once, with ``col1`` being the label that compares
    lower (``col1 < col2``), whenever the absolute correlation reaches
    ``threshold``. The result always carries two keys:
    ``"correlation_matrix"`` (nested dict) and ``"strong_correlations"``
    (list of ``{"col1", "col2", "correlation"}`` dicts).
    """
    numeric_only = df.select_dtypes(include=np.number)
    if numeric_only.empty:
        return {"correlation_matrix": {}, "strong_correlations": []}

    pairwise = numeric_only.corr()
    labels = pairwise.columns

    # Walk the full grid in column order; the `a < b` filter keeps exactly one
    # orientation of each pair and drops the diagonal.
    strong = [
        {"col1": a, "col2": b, "correlation": float(pairwise.loc[a, b])}
        for a in labels
        for b in labels
        if a < b and abs(pairwise.loc[a, b]) >= threshold
    ]

    return {
        "correlation_matrix": pairwise.to_dict(),
        "strong_correlations": strong,
    }


In [4]:
def summarize_distributions(df: pd.DataFrame):
    """
    Describe the shape of each numeric column's distribution.

    Missing values are dropped per column before computing. Each column maps
    to a dict with the keys ``min``, ``max``, ``mean``, ``median``,
    ``skewness``, ``q1`` and ``q3`` (all plain floats).
    """
    # (report key, how to compute it from a NaN-free Series), in output order.
    measures = (
        ("min", lambda s: s.min()),
        ("max", lambda s: s.max()),
        ("mean", lambda s: s.mean()),
        ("median", lambda s: s.median()),
        ("skewness", lambda s: s.skew()),
        ("q1", lambda s: s.quantile(0.25)),
        ("q3", lambda s: s.quantile(0.75)),
    )

    numeric_only = df.select_dtypes(include=np.number)
    result = {}
    for name in numeric_only.columns:
        observed = numeric_only[name].dropna()
        result[name] = {key: float(fn(observed)) for key, fn in measures}
    return result


In [5]:
def generate_insights(descriptive, correlations, distributions, cleaning_report):
    """
    Turn analysis results into short Markdown-formatted insight sentences.

    Parameters
    ----------
    descriptive : dict
        Accepted to keep the call signature stable; not consulted here.
    correlations : dict
        Must contain ``"strong_correlations"``: a list of dicts with
        ``"col1"``, ``"col2"`` and ``"correlation"``.
    distributions : dict
        ``{column: {..., "skewness": float}}``.
    cleaning_report : dict
        Must contain ``"imputations"`` (``{column: {"n_imputed": int}}``) and
        ``"outliers"`` (``{"details": {column: {"n_outliers": int}}}``).

    Returns
    -------
    list[str]
        One sentence per finding, ordered: correlations, skewed columns,
        imputations, outliers.
    """
    insights = []

    # Strong correlations
    for pair in correlations["strong_correlations"]:
        insights.append(
            f"Strong correlation ({pair['correlation']:.2f}) between **{pair['col1']}** and **{pair['col2']}**."
        )

    # Skewed distributions (|skewness| above 1; NaN skewness never qualifies)
    for col, dist in distributions.items():
        if abs(dist["skewness"]) > 1:
            insights.append(f"Column **{col}** has a skewed distribution (skewness={dist['skewness']:.2f}).")

    # Columns where at least one value was imputed
    for col, info in cleaning_report["imputations"].items():
        if info["n_imputed"] > 0:
            insights.append(f"Column **{col}** had {info['n_imputed']} missing values imputed.")

    # Outliers still present. Zero-count entries are skipped, mirroring the
    # imputation filter above, so the report never says "contains 0 outliers".
    for col, out in cleaning_report["outliers"]["details"].items():
        if out["n_outliers"] > 0:
            insights.append(f"Column **{col}** contains {out['n_outliers']} outliers.")

    return insights


In [6]:
def generate_warnings(cleaning_report, imputation_threshold=5):
    """
    Derive data-quality warnings from the cleaning report.

    Parameters
    ----------
    cleaning_report : dict
        Must contain ``"dropped_columns"`` (iterable of column labels),
        ``"imputations"`` (``{column: {"n_imputed": int}}``) and
        ``"outliers"`` (``{"n_rows_removed": int,
        "details": {column: {"n_outliers": int}}}``).
    imputation_threshold : int, default 5
        A column is flagged when strictly more than this many of its values
        were imputed.

    Returns
    -------
    list[str]
        Warning sentences; empty when nothing needs attention.
    """
    messages = []

    # Dropped columns. Labels may be non-strings (e.g. integer column names),
    # so stringify them before joining.
    dropped = cleaning_report["dropped_columns"]
    if dropped:
        messages.append(
            f"Columns dropped due to high missing values: {', '.join(map(str, dropped))}"
        )

    # Too many imputations
    for col, info in cleaning_report["imputations"].items():
        if info["n_imputed"] > imputation_threshold:
            messages.append(f"High number of imputations for **{col}** ({info['n_imputed']}).")

    # Outliers not removed. Only warn when some column actually reports a
    # non-zero outlier count, not merely because detail entries exist.
    outliers = cleaning_report["outliers"]
    any_detected = any(d["n_outliers"] > 0 for d in outliers["details"].values())
    if outliers["n_rows_removed"] == 0 and any_detected:
        messages.append("Outliers detected but not removed.")

    return messages


In [7]:
def build_markdown_report(
    profile, cleaning_report, descriptive, correlations, distributions, insights, warnings
):
    """
    Assemble all analysis artefacts into one Markdown document.

    Parameters
    ----------
    profile : dict
        Needs ``"shape"`` and ``"columns"`` (iterable of column labels).
    cleaning_report : dict
        Needs ``"shape_before"``, ``"shape_after"``, ``"dropped_columns"``,
        ``"duplicates"`` (with ``"n_removed"``) and ``"imputations"``
        (``{column: {"strategy": ..., "n_imputed": ...}}``).
    descriptive : dict
        ``{column: {statistic: value}}``; may be empty.
    correlations : dict
        Needs ``"strong_correlations"`` (list of dicts with ``"col1"``,
        ``"col2"``, ``"correlation"``).
    distributions : dict
        ``{column: {statistic: value}}``.
    insights, warnings : iterable of str
        Pre-formatted sentences, rendered as bullet points.

    Returns
    -------
    str
        The report, with lines joined by ``"\\n"``.
    """
    # Emoji and arrows are written as escape sequences so the emitted text is
    # correct even if this source is re-saved with the wrong encoding; the
    # literals had previously been corrupted into mojibake.
    chart = "\U0001F4CA"          # bar-chart emoji
    arrow = "\u2192"              # rightwards arrow
    both_ways = "\u2194"          # left-right arrow
    warn_sign = "\u26A0\uFE0F"    # warning sign, emoji presentation

    def keycap(digit):
        # Keycap emoji: digit + variation selector-16 + combining enclosing keycap.
        return f"{digit}\uFE0F\u20E3"

    # Column labels are not guaranteed to be strings (e.g. integer headers).
    column_list = ", ".join(map(str, profile["columns"]))

    md = []
    md.append(f"# {chart} Data Analysis Report\n")

    md.append(f"## {keycap(1)} Dataset Overview")
    md.append(f"- Shape: **{profile['shape']}**")
    md.append(f"- Columns: {column_list}\n")

    md.append(f"## {keycap(2)} Cleaning Summary")
    md.append(f"- Shape before: {cleaning_report['shape_before']}")
    md.append(f"- Shape after: {cleaning_report['shape_after']}")
    md.append(f"- Dropped columns: {cleaning_report['dropped_columns']}")
    md.append(f"- Duplicates removed: {cleaning_report['duplicates']['n_removed']}\n")

    md.append("### Imputations")
    for col, info in cleaning_report["imputations"].items():
        md.append(f"- **{col}** {arrow} {info['strategy']} (imputed {info['n_imputed']} values)")

    md.append(f"\n## {keycap(3)} Descriptive Statistics")
    if descriptive:
        for col, stats in descriptive.items():
            md.append(f"### {col}")
            for s, val in stats.items():
                md.append(f"- {s}: {val}")
    else:
        md.append("No numeric columns.\n")

    md.append(f"\n## {keycap(4)} Correlations")
    if correlations["strong_correlations"]:
        md.append("### Strong correlations:")
        for pair in correlations["strong_correlations"]:
            md.append(
                f"- **{pair['col1']}** {both_ways} **{pair['col2']}** {arrow} {pair['correlation']:.2f}"
            )
    else:
        md.append("No strong correlations found.\n")

    md.append(f"\n## {keycap(5)} Distribution Summaries")
    for col, stats in distributions.items():
        md.append(f"### {col}")
        for s, val in stats.items():
            md.append(f"- {s}: {val}")

    md.append(f"\n## {keycap(6)} Insights")
    for i in insights:
        md.append(f"- {i}")

    md.append(f"\n## {keycap(7)} Warnings")
    if warnings:
        for w in warnings:
            md.append(f"- {warn_sign} {w}")
    else:
        md.append("- No warnings detected.")

    return "\n".join(md)


In [8]:
def save_markdown_report(md_content: str, filename="../reports/analysis_report.md"):
    """
    Write ``md_content`` to ``filename`` as UTF-8, creating parent folders.

    Parameters
    ----------
    md_content : str
        The Markdown text to persist.
    filename : str
        Destination path. May be a bare file name, in which case the file is
        written to the current working directory.
    """
    parent = os.path.dirname(filename)
    # dirname() is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(md_content)
    print(f"Report saved to {filename}")


In [9]:
import sys
import os

# Add parent/src folder to python path
sys.path.append(os.path.abspath("../src"))

from cleaning_tools import clean_dataset, display_cleaning_report
from data_tools import profile_dataset  # Importing profile_dataset from data_tools.py




Folder 'data' ready.


In [10]:
import numpy as np

In [13]:
from IPython.display import Markdown, display

# --- Input: read the raw CSV and run the cleaning pipeline on it ----------
path = "../data/test_messy.csv"
df_raw = pd.read_csv(path)
df_clean, cleaning_report = clean_dataset(df_raw)

# --- Profile of the raw frame (computed after clean_dataset has run) ------
profile = profile_dataset(df_raw)

# --- Numeric analysis of the cleaned frame --------------------------------
descriptive = compute_descriptive_stats(df_clean)
correlations = compute_correlations(df_clean)
distributions = summarize_distributions(df_clean)

# --- Human-readable findings ----------------------------------------------
insights = generate_insights(descriptive, correlations, distributions, cleaning_report)
warnings = generate_warnings(cleaning_report)

# --- Render, persist, and show the report inline --------------------------
md = build_markdown_report(
    profile=profile,
    cleaning_report=cleaning_report,
    descriptive=descriptive,
    correlations=correlations,
    distributions=distributions,
    insights=insights,
    warnings=warnings,
)
save_markdown_report(md)
display(Markdown(md))

Report saved to ../reports/analysis_report.md


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed[col].fillna(value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed[col].fillna(value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

# 📊 Data Analysis Report

## 1️⃣ Dataset Overview
- Shape: **(20, 4)**
- Columns: id, age, income, city

## 2️⃣ Cleaning Summary
- Shape before: (20, 4)
- Shape after: (20, 4)
- Dropped columns: []
- Duplicates removed: 0

### Imputations
- **age** → median (imputed 5 values)
- **income** → median (imputed 6 values)
- **city** → mode (imputed 6 values)

## 3️⃣ Descriptive Statistics
### id
- count: 20.0
- mean: 10.5
- std: 5.916079783099616
- min: 1.0
- 25%: 5.75
- 50%: 10.5
- 75%: 15.25
- max: 20.0
### age
- count: 20.0
- mean: 34.05
- std: 5.145002685283939
- min: 25.0
- 25%: 31.5
- 50%: 34.0
- 75%: 36.25
- max: 44.0
### income
- count: 20.0
- mean: 4005.0
- std: 430.0856707802834
- min: 3000.0
- 25%: 3875.0
- 50%: 4050.0
- 75%: 4225.0
- max: 4700.0

## 4️⃣ Correlations
### Strong correlations:
- **id** ↔ **income** → 0.82
- **age** ↔ **id** → 0.88
- **age** ↔ **income** → 0.87

## 5️⃣ Distribution Summaries
### id
- min: 1.0
- max: 20.0
- mean: 10.5
- median: 10.5
- skewness: 0.0
- q1: 5.75
- q3: 15.25
### age
- min: 25.0
- max: 44.0
- mean: 34.05
- median: 34.0
- skewness: 0.10715522387337376
- q1: 31.5
- q3: 36.25
### income
- min: 3000.0
- max: 4700.0
- mean: 4005.0
- median: 4050.0
- skewness: -0.7188934095275396
- q1: 3875.0
- q3: 4225.0

## 6️⃣ Insights
- Strong correlation (0.82) between **id** and **income**.
- Strong correlation (0.88) between **age** and **id**.
- Strong correlation (0.87) between **age** and **income**.
- Column **age** had 5 missing values imputed.
- Column **income** had 6 missing values imputed.
- Column **city** had 6 missing values imputed.
- Column **age** contains 1 outliers.
- Column **income** contains 2 outliers.

## 7️⃣ Warnings
- ⚠️ High number of imputations for **income** (6).
- ⚠️ High number of imputations for **city** (6).
- ⚠️ Outliers detected but not removed.

In [None]:
import pandas as pd

# Persist the cleaned frame produced by clean_dataset() in the analysis cell.
# The name is `df_clean`; `df_cleaned` is never defined in this notebook and
# would raise NameError on a fresh Restart & Run All.
df_clean.to_csv("df_clean.csv", index=False)
print("\u2714 Cleaned dataset saved as df_clean.csv")