# In [1]:
import os
import pandas as pd
import seaborn as sns
from datetime import datetime
from tabulate import tabulate

def _recent_plot_files(directory, limit=3):
    """Return (name, size, modified-time string) for the newest plot PNGs in *directory*."""
    plot_prefixes = ('heatmap_', 'boxplot_', 'pairplot_')
    found = []
    for name in os.listdir(directory):
        if name.endswith('.png') and name.startswith(plot_prefixes):
            info = os.stat(os.path.join(directory, name))
            found.append((info.st_mtime, name, info.st_size))
    # Sort on the modification time (newest first); sorting on the file name
    # would not yield the most recent files.
    found.sort(reverse=True)
    return [
        (name, size, datetime.fromtimestamp(mtime).strftime("%Y-%m-%d %H:%M:%S"))
        for mtime, name, size in found[:limit]
    ]


def _describe_as_markdown(df, dtypes, placeholder, **describe_kwargs):
    """Return ``df.describe(**describe_kwargs)`` as markdown, or *placeholder* if no column has one of *dtypes*."""
    # DataFrame.describe raises ValueError when nothing matches ``include``
    # (and falls back to object columns when no numeric column exists), so
    # check for matching columns first.
    if df.select_dtypes(include=dtypes).shape[1] == 0:
        return placeholder
    return df.describe(**describe_kwargs).to_markdown()


def generate_data_report(df, visualization_dir="plots", report_dir="reports"):
    """Write a plain-text analysis report for *df* and return the report path.

    The report contains a cleaning summary (row/column counts, missing values,
    duplicate rows), ``describe()`` statistics for numerical and categorical
    columns, a table of the most recently modified plot files found in
    *visualization_dir*, and the first five rows of the data.

    Args:
        df: pandas DataFrame to summarise.
        visualization_dir: Directory where plot PNGs are looked up (created
            if missing).
        report_dir: Directory the report file is written to (created if
            missing).

    Returns:
        Path of the generated ``data_report_<timestamp>.txt`` file.

    Note:
        Plots are produced by a module-level ``visualize_data(df, directory)``
        function, which is expected to be defined elsewhere (e.g. in an
        earlier notebook cell). If no such callable exists, plot generation
        is skipped and the report lists whatever plots are already on disk.
    """
    # Create directories if they don't exist
    os.makedirs(visualization_dir, exist_ok=True)
    os.makedirs(report_dir, exist_ok=True)

    # Get current timestamp
    timestamp = datetime.now()
    timestamp_str = timestamp.strftime("%Y-%m-%d %H:%M:%S")
    filename_timestamp = timestamp.strftime("%Y%m%d_%H%M%S")

    # 1. Data Cleaning Summary
    cleaning_summary = {
        "Total Rows": len(df),
        "Total Columns": len(df.columns),
        "Missing Values": df.isna().sum().sum(),
        "Duplicate Rows": df.duplicated().sum(),
        "Numerical Columns": len(df.select_dtypes(include=['number']).columns),
        "Categorical Columns": len(df.select_dtypes(exclude=['number']).columns)
    }

    # 2. Generate Visualizations. The function must actually be *called*;
    # it is looked up at run time because it is not defined in this block.
    plotter = globals().get("visualize_data")
    if callable(plotter):
        plotter(df, visualization_dir)
    else:
        print("visualize_data() is not defined; skipping plot generation.")

    # 3. Get visualization file info (3 most recently modified plots)
    viz_files = _recent_plot_files(visualization_dir, limit=3)

    # 4. Generate report content
    cleaning_table = tabulate(cleaning_summary.items(), tablefmt="grid")
    numeric_stats = _describe_as_markdown(
        df, ['number'], "(no numerical columns)"
    )
    categorical_stats = _describe_as_markdown(
        df, ['object', 'category'], "(no categorical columns)",
        include=['object', 'category'],
    )
    viz_table = tabulate(
        viz_files,
        headers=["Filename", "Size (bytes)", "Modified Time"],
        tablefmt="grid",
    )
    report_content = f"""
DATA ANALYSIS REPORT
====================
Generated: {timestamp_str}

1. DATA CLEANING SUMMARY
-----------------------
{cleaning_table}

2. STATISTICAL SUMMARY
----------------------
Numerical Columns:
{numeric_stats}

Categorical Columns:
{categorical_stats}

3. VISUALIZATIONS GENERATED
--------------------------
Files saved to: {os.path.abspath(visualization_dir)}

{viz_table}

4. DATA SAMPLE
--------------
First 5 rows:
{df.head().to_markdown()}
"""

    # 5. Save report to file. An explicit encoding keeps non-ASCII column
    # names/values from failing on platforms whose default is not UTF-8.
    report_filename = f"data_report_{filename_timestamp}.txt"
    report_path = os.path.join(report_dir, report_filename)

    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report_content)

    print(f"Report generated successfully: {report_path}")
    return report_path

# Example usage
if __name__ == "__main__":
    # Demo run: fetch seaborn's bundled iris dataset via the module alias
    # imported at the top of the file, then build a report for it using the
    # default output directories.
    data = sns.load_dataset('iris')
    generate_data_report(data)


# Output:
# Report generated successfully: reports\data_report_20250616_000744.txt
