In [None]:
# 📊 Lakehouse Data Summary Notebook

This notebook demonstrates how to:
1. Read data from a Microsoft Fabric Lakehouse
2. Explore and summarize the data
3. Generate basic statistics and visualizations

**Prerequisites:**
- Attach this notebook to a Lakehouse in your Fabric workspace
- Ensure you have data tables available in the Lakehouse

In [None]:
# Cell 1: Import Required Libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, sum, min, max, desc
import pandas as pd

## Step 1: List Available Tables in Lakehouse

First, let's discover what tables are available in the attached Lakehouse.

In [None]:
# Run SHOW TABLES through Spark SQL and render the result, so a table name
# can be picked for the TABLE_NAME setting used in the next step.
tables_df = spark.sql("SHOW TABLES")
display(tables_df)

## Step 2: Read Data from Lakehouse

Read a Delta table from the Lakehouse into a Spark DataFrame. Replace `your_table_name` with an actual table from your Lakehouse.

In [None]:
# --- Configuration -----------------------------------------------------------
# Name of the Lakehouse table to summarize; the placeholder must be replaced
# with a real table name before this cell is run.
TABLE_NAME = "your_table_name"  # Replace with your actual table name

# --- Load --------------------------------------------------------------------
# Read the table by name (recommended for Delta tables).
# Equivalent alternatives, should you prefer them:
#   * by Delta path: spark.read.format("delta").load("Tables/" + TABLE_NAME)
#   * through SQL:   spark.sql(f"SELECT * FROM {TABLE_NAME}")
df = spark.read.table(TABLE_NAME)

# --- Confirm -----------------------------------------------------------------
# Report the load, then the row count, then preview the first ten rows.
print(f"‚úÖ Successfully loaded table: {TABLE_NAME}")
record_count = df.count()
print(f"üìä Total records: {record_count:,}")
display(df.limit(10))

## Step 3: Explore Data Schema

Understanding the structure of your data is crucial for analysis.

In [None]:
# Tree-style schema dump, as rendered by Spark itself
print("üìã Data Schema:")
print("-" * 50)
df.printSchema()

# Flat listing underneath: one "name: type" line per column.
# Iterating the schema object walks its fields in declaration order.
print("\nüìù Column Summary:")
print("-" * 50)
for field in df.schema:
    print(f"  ‚Ä¢ {field.name}: {field.dataType}")

## Step 4: Generate Statistical Summary

Get descriptive statistics for the numeric and string columns in the dataset.

In [None]:
# Spark-side summary (count, mean, stddev, min, max), computed on the cluster
print("üìà Statistical Summary:")
print("-" * 50)
display(df.describe())

# Additional summary using pandas for more detailed stats.
# toPandas() materializes every selected row in driver memory, so the number of
# rows pulled is capped; an uncapped call can exhaust the driver on a large table.
PANDAS_MAX_ROWS = 100_000  # tune to the memory available on your driver
pandas_df = df.limit(PANDAS_MAX_ROWS).toPandas()

print("\nüìä Detailed Statistics (via Pandas):")
if len(pandas_df) >= PANDAS_MAX_ROWS:
    # limit() takes the first rows Spark returns; it is not a random sample.
    print(f"(based on the first {PANDAS_MAX_ROWS:,} rows only - raise PANDAS_MAX_ROWS to include more)")
display(pandas_df.describe(include='all').T)

## Step 5: Data Quality Check

Check for missing values and data quality issues.

In [None]:
# Check for missing values in each column
from pyspark.sql.functions import col, count, when, isnan
from pyspark.sql.types import DoubleType, FloatType

print("üîç Null Value Analysis:")
print("-" * 50)

# isnan() only accepts float/double input. Applying it to every column makes the
# query fail analysis as soon as the table has a date, timestamp, boolean, ...
# column, so the NaN test is added for floating-point columns only.
float_columns = {
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, (FloatType, DoubleType))
}

# Missing values per column: NULL everywhere, plus NaN for floating-point columns
null_counts = df.select([
    count(
        when(
            (col(c).isNull() | isnan(col(c))) if c in float_columns else col(c).isNull(),
            c,
        )
    ).alias(c)
    for c in df.columns
])

display(null_counts)

# Calculate null percentages (NULL only, NaN is not included in these figures)
total_rows = df.count()
print(f"\nüìä Total Rows: {total_rows:,}")
print("\nüéØ Null Percentage by Column:")

# A single aggregation covers every column; the previous filter().count() per
# column launched one Spark job for each column of the table.
null_only_row = df.select([
    count(when(col(c).isNull(), c)).alias(c)
    for c in df.columns
]).first()

# Read the result by position so the lookup does not depend on column-name quirks
for position, column in enumerate(df.columns):
    null_count = null_only_row[position]
    null_pct = (null_count / total_rows) * 100 if total_rows > 0 else 0
    print(f"  ‚Ä¢ {column}: {null_pct:.2f}% ({null_count:,} nulls)")

## Step 6: Generate Summary Report

Create a comprehensive summary of the dataset.

In [None]:
# Generate comprehensive data summary report
from pyspark.sql.types import NumericType, StringType

def generate_data_summary(dataframe, table_name, max_numeric_cols=5):
    """Print a summary report for a Spark DataFrame.

    The report covers the record and column counts, a breakdown of numeric /
    string / other column types, a rough size estimate, and the min / max /
    average of the leading numeric columns.

    Args:
        dataframe: Spark DataFrame to summarize.
        table_name: Label shown in the report header.
        max_numeric_cols: Maximum number of numeric columns included in the
            statistics section. Defaults to 5, the previously hard-coded limit.

    Returns:
        None. The report is written to stdout.
    """

    print("=" * 60)
    print(f"üìä DATA SUMMARY REPORT: {table_name}")
    print("=" * 60)

    # count() runs a full Spark job, so it is executed once and reused below.
    row_count = dataframe.count()
    col_count = len(dataframe.columns)

    # Basic Info
    print("\nüìå Basic Information:")
    print(f"   ‚Ä¢ Total Records: {row_count:,}")
    print(f"   ‚Ä¢ Total Columns: {col_count}")

    # Column Types
    numeric_cols = [f.name for f in dataframe.schema.fields if isinstance(f.dataType, NumericType)]
    string_cols = [f.name for f in dataframe.schema.fields if isinstance(f.dataType, StringType)]

    print("\nüìã Column Types:")
    print(f"   ‚Ä¢ Numeric Columns: {len(numeric_cols)}")
    print(f"   ‚Ä¢ String Columns: {len(string_cols)}")
    print(f"   ‚Ä¢ Other Columns: {col_count - len(numeric_cols) - len(string_cols)}")

    # Memory estimate: a flat 8 bytes per cell, a rough heuristic only
    print("\nüíæ Estimated Memory Usage:")
    estimated_mb = (row_count * col_count * 8) / (1024 * 1024)
    print(f"   ‚Ä¢ Approximate: {estimated_mb:.2f} MB")

    # Numeric summaries
    selected_cols = numeric_cols[:max_numeric_cols]
    if selected_cols:
        # Imported here so the aggregates never depend on notebook-level names
        # (min / max / sum) that shadow Python's builtins.
        from pyspark.sql import functions as F

        print("\nüìà Numeric Column Statistics:")

        # One aggregation for all selected columns instead of one job per column.
        aggregates = []
        for col_name in selected_cols:
            aggregates.extend([F.min(col_name), F.max(col_name), F.avg(col_name)])
        stats = dataframe.agg(*aggregates).collect()[0]

        for position, col_name in enumerate(selected_cols):
            # Three consecutive values per column, in the order built above
            low, high, mean = stats[3 * position:3 * position + 3]
            # The average is None for an empty or all-null column; formatting
            # None with ':.2f' would raise TypeError.
            mean_text = f"{mean:.2f}" if mean is not None else "None"
            print(f"   ‚Ä¢ {col_name}:")
            print(f"      Min: {low}, Max: {high}, Avg: {mean_text}")

    print("\n" + "=" * 60)
    print("‚úÖ Summary report generated successfully!")
    print("=" * 60)

# Print the report for the DataFrame loaded in Step 2, labelled with its table name
generate_data_summary(df, TABLE_NAME)

## Step 7: Save Summary Results (Optional)

Optionally save the summary statistics back to the Lakehouse as a new table.

In [None]:
# Name of the Delta table that receives the summary statistics
SUMMARY_TABLE_NAME = f"{TABLE_NAME}_summary"

# Create a summary DataFrame: one row per statistic, one column per described column
summary_stats = df.describe()

# Save to Lakehouse as Delta table.
# mode("overwrite") replaces the contents of an existing summary table.
# overwriteSchema also lets its schema be replaced; without it a re-run fails with
# a schema mismatch once the source table has gained, lost or renamed columns.
(
    summary_stats.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(SUMMARY_TABLE_NAME)
)

print(f"‚úÖ Summary statistics saved to table: {SUMMARY_TABLE_NAME}")
print(f"üìç Location: Tables/{SUMMARY_TABLE_NAME}")