# Polars Tutorial - Part 6: Lazy Evaluation

In this notebook, we'll explore Polars' lazy evaluation system:
- Understanding lazy vs eager execution
- Query optimization
- Execution plans
- Best practices for lazy DataFrames
- Advanced lazy operations

In [None]:
import polars as pl
import time
import os

# Directory (relative to this notebook) that every cell below joins file names onto
# when reading or writing data files
DATA_DIR = '../data/'

# Print the installed Polars release so the outputs below can be tied to a version
print(f"Polars version: {pl.__version__}")

## 1. Eager vs Lazy Execution

### 1.1 Eager Execution (Default)

In [None]:
# Eager mode: every call below runs to completion and hands back a DataFrame
# before the next call starts
df_sales = pl.read_csv(os.path.join(DATA_DIR, 'sales_data.csv'))

print("Eager execution:")
# Step 1 - keep the rows whose revenue is above 500 (runs right away)
high_revenue_rows = df_sales.filter(pl.col('revenue').gt(500))
# Step 2 - add up the revenue inside each category (runs right away)
revenue_by_category = high_revenue_rows.group_by('category').agg(
    total_revenue=pl.col('revenue').sum()
)
# Step 3 - order the categories from largest to smallest total (runs right away)
result_eager = revenue_by_category.sort('total_revenue', descending=True)

print(result_eager)
print(f"Type: {type(result_eager)}")

### 1.2 Lazy Execution

In [None]:
# Lazy mode: the chain below only describes the work; nothing runs until .collect()
lazy_query = (
    df_sales.lazy()  # wrap the in-memory DataFrame in a LazyFrame
    .filter(pl.col('revenue').gt(500))  # recorded, not run
    .group_by('category')  # recorded, not run
    .agg(total_revenue=pl.col('revenue').sum())  # recorded, not run
    .sort('total_revenue', descending=True)  # recorded, not run
)

# Printing a LazyFrame shows the recorded query, not data
print("Lazy query (not executed yet):")
print(f"Type: {type(lazy_query)}")
print(lazy_query)

# .collect() is the call that actually runs the query and returns a DataFrame
result_lazy = lazy_query.collect()
print("\nAfter .collect():")
print(result_lazy)

## 2. Query Optimization

### 2.1 Viewing the Execution Plan

In [None]:
# Build a multi-step lazy query so the optimizer has something to rewrite
complex_query = (
    df_sales.lazy()
    .filter(pl.col('revenue') > 100)
    .select(['date', 'product', 'category', 'revenue', 'region'])
    .filter(pl.col('category') == 'Electronics')
    .group_by('region')
    .agg([
        pl.sum('revenue').alias('total_revenue'),
        # pl.len() is the row-count expression; the zero-argument pl.count()
        # spelling is deprecated in current Polars releases
        pl.len().alias('num_sales')
    ])
    .filter(pl.col('num_sales') > 1)  # keep only regions with more than one sale
)

# explain() returns the plan as text (the optimized plan by default)
print("Optimized Execution Plan:")
print(complex_query.explain())
print("\n" + "="*60)

### 2.2 Predicate Pushdown

In [None]:
# Predicate pushdown demo: the filter is written last, yet the optimizer is
# free to move it closer to the CSV scan
sales_scan = pl.scan_csv(os.path.join(DATA_DIR, 'sales_data.csv'))
three_columns = sales_scan.select('product', 'revenue', 'region')
query_with_pushdown = three_columns.filter(pl.col('revenue').gt(1000))  # candidate for pushdown

# Show the plan so the position of the filter can be inspected
print("Query with Predicate Pushdown:")
print(query_with_pushdown.explain())
print("\nNotice how the filter is applied early in the plan!")

### 2.3 Projection Pushdown

In [None]:
# Projection pushdown demo: only the columns the query refers to need to be
# read from the file
csv_scan = pl.scan_csv(os.path.join(DATA_DIR, 'sales_data.csv'))
over_500 = csv_scan.filter(pl.col('revenue').gt(500))
query_projection = over_500.select('product', 'revenue')  # final output columns

# Show the plan so the projected columns can be inspected
print("Query with Projection Pushdown:")
print(query_projection.explain())
print("\nOnly the required columns are read from the file!")

## 3. Lazy Reading Methods

### 3.1 scan_csv()

In [None]:
# scan_csv() returns a LazyFrame pointing at the file instead of reading it
lf_sales = pl.scan_csv(os.path.join(DATA_DIR, 'sales_data.csv'))

print("Lazy CSV scan (no data loaded yet):")
print(type(lf_sales))

# Describe the filter first, then run it with .collect()
furniture_only = lf_sales.filter(pl.col('category').eq('Furniture'))
result = furniture_only.collect()
print("\nFiltered results:")
print(result)

### 3.2 scan_parquet()

In [None]:
# scan_parquet() is the lazy counterpart for Parquet files
lf_transactions = pl.scan_parquet(os.path.join(DATA_DIR, 'transactions.parquet'))

# Completed transactions only, two columns, first five rows
completed = lf_transactions.filter(pl.col('status').eq('completed'))
name_and_amount = completed.select('customer_name', 'amount')
result = name_and_amount.head(5).collect()

print("Lazy Parquet scan results:")
print(result)

## 4. Performance Comparison: Eager vs Lazy

In [None]:
# Synthetic benchmark table: 100,000 rows with cycling category labels
n_rows = 100000
category_cycle = ['A', 'B', 'C', 'D']
subcategory_cycle = ['X', 'Y', 'Z']

large_df = pl.DataFrame({
    'id': range(n_rows),
    'value': [i * 1.5 for i in range(n_rows)],
    # A, B, C, D repeating for the whole column
    'category': [category_cycle[i % 4] for i in range(n_rows)],
    # X, Y, Z repeating; the final row (index 99999) lands on 'X'
    'subcategory': [subcategory_cycle[i % 3] for i in range(n_rows)],
})

# Persist it as CSV so the benchmark cells can read it back from disk
large_csv_path = os.path.join(DATA_DIR, 'large_test.csv')
large_df.write_csv(large_csv_path)

print("Dataset created with 100,000 rows")

In [None]:
# Eager benchmark: read the whole CSV into memory, then run each step in turn.
# time.perf_counter() is used because it is a monotonic, high-resolution clock
# intended for measuring short intervals (time.time() is the wall clock)
start = time.perf_counter()
eager_result = (
    pl.read_csv(os.path.join(DATA_DIR, 'large_test.csv'))
    .filter(pl.col('value') > 50000)
    .filter(pl.col('category').is_in(['A', 'B']))
    .select(['id', 'value', 'category'])
    .group_by('category')
    # pl.len() counts rows per group; zero-argument pl.count() is deprecated
    .agg([pl.len().alias('count')])
)
eager_time = time.perf_counter() - start  # elapsed seconds for read + query

print(f"Eager execution time: {eager_time:.4f} seconds")
print(eager_result)

In [None]:
# Lazy benchmark: same steps as the eager cell, but described on a scan and
# run once by .collect().
# time.perf_counter() is a monotonic, high-resolution clock, so the elapsed
# time below cannot come out as 0 because of coarse wall-clock resolution
# (which would make the speedup division fail)
start = time.perf_counter()
lazy_result = (
    pl.scan_csv(os.path.join(DATA_DIR, 'large_test.csv'))
    .filter(pl.col('value') > 50000)
    .filter(pl.col('category').is_in(['A', 'B']))
    .select(['id', 'value', 'category'])
    .group_by('category')
    # pl.len() counts rows per group; zero-argument pl.count() is deprecated
    .agg([pl.len().alias('count')])
    .collect()
)
lazy_time = time.perf_counter() - start  # elapsed seconds for scan + query

print(f"Lazy execution time: {lazy_time:.4f} seconds")
print(lazy_result)

# eager_time comes from the eager benchmark cell, which must have been run first
print(f"\nSpeedup: {eager_time/lazy_time:.2f}x")

## 5. Advanced Lazy Operations

### 5.1 Chaining Multiple Operations

In [None]:
# Rank every sale inside its category, then keep the three best ranks
sales_lf = pl.scan_csv(os.path.join(DATA_DIR, 'sales_data.csv'))

# Window expression: rank revenue from highest to lowest within each category
ranked_sales = sales_lf.with_columns(
    revenue_rank=pl.col('revenue').rank(descending=True).over('category')
)

advanced_query = (
    ranked_sales
    .filter(pl.col('revenue_rank').le(3))
    .sort(['category', 'revenue_rank'])
    .select('category', 'product', 'revenue', 'revenue_rank')
)

print("Top 3 products per category:")
print(advanced_query.collect())

### 5.2 Using sink_csv() for Lazy Writing

In [None]:
# sink_csv() runs the lazy query and writes its result straight to a CSV file
output_file = os.path.join(DATA_DIR, 'processed_output.csv')

high_value_sales = (
    pl.scan_csv(os.path.join(DATA_DIR, 'sales_data.csv'))
    .filter(pl.col('revenue').gt(1000))
    .select('product', 'category', 'revenue', 'region')
)
high_value_sales.sink_csv(output_file)

print(f"Processed data written to {output_file}")

# Read the file back to check what was written
verification = pl.read_csv(output_file)
print(f"Rows written: {verification.height}")
print(verification.head())

## 6. Joining Lazy DataFrames

In [None]:
# Small lookup table: one row per region with its code and manager
df_regions = pl.DataFrame({
    'region': ['North', 'South', 'East', 'West'],
    'region_code': ['N', 'S', 'E', 'W'],
    'manager': ['Alice', 'Bob', 'Charlie', 'Diana'],
})
regions_path = os.path.join(DATA_DIR, 'regions.csv')
df_regions.write_csv(regions_path)

# Both sides of the join are lazy scans, so the whole join is planned as one query
sales_side = pl.scan_csv(os.path.join(DATA_DIR, 'sales_data.csv'))
regions_side = pl.scan_csv(regions_path)

lazy_join = (
    sales_side
    .join(regions_side, on='region', how='left')  # keep every sale, attach region info
    .select('product', 'revenue', 'region', 'manager')
    .filter(pl.col('revenue').gt(1000))
)

print("Lazy join execution plan:")
print(lazy_join.explain())

print("\nResults:")
print(lazy_join.collect())

## 7. Caching Intermediate Results

In [None]:
# Use .cache() to mark an intermediate result that several queries build on
lazy_cached = (
    pl.scan_csv(os.path.join(DATA_DIR, 'sales_data.csv'))
    .filter(pl.col('revenue') > 500)
    .cache()  # Cache this intermediate result
)

# Two queries that branch off the same cached node
by_category = lazy_cached.group_by('category').agg([pl.sum('revenue')])
# pl.len() replaces the deprecated zero-argument pl.count(); the alias keeps
# the output column named 'count'
by_region = lazy_cached.group_by('region').agg([pl.len().alias('count')])

# Run both queries in ONE execution. A cache node only lives for the duration
# of a single run, so calling .collect() on each query separately would scan
# and filter the file twice; pl.collect_all() lets them share the work.
result1, result2 = pl.collect_all([by_category, by_region])

print("Result 1 (by category):")
print(result1)
print("\nResult 2 (by region):")
print(result2)

## 8. Streaming Execution

For datasets larger than memory:

In [None]:
# Describe an aggregation that will be run with the streaming engine
streaming_query = (
    pl.scan_csv(os.path.join(DATA_DIR, 'sales_data.csv'))
    .filter(pl.col('revenue') > 500)
    .group_by('category')
    .agg([
        pl.sum('revenue').alias('total_revenue'),
        # pl.len() counts rows per group; zero-argument pl.count() is deprecated
        pl.len().alias('count')
    ])
)

# Collect with streaming enabled
# NOTE(review): newer Polars releases spell this collect(engine="streaming") and
# deprecate the streaming=True flag - confirm against the installed version
result = streaming_query.collect(streaming=True)

print("Streaming execution result:")
print(result)
print("\nNote: Streaming processes data in chunks, using constant memory")

## 9. Common Patterns and Best Practices

### 9.1 Filter Early, Select Late

In [None]:
# Recommended shape: narrow the rows first, pick the output columns last
all_sales = pl.scan_csv(os.path.join(DATA_DIR, 'sales_data.csv'))
big_sales = all_sales.filter(pl.col('revenue').gt(1000))  # first row filter
big_electronics = big_sales.filter(pl.col('category').eq('Electronics'))  # second row filter
good_query = big_electronics.select('product', 'revenue')  # only the columns needed

print("Optimized query plan:")
print(good_query.explain())

### 9.2 Avoid Collecting Too Early

In [None]:
# Anti-pattern (left commented out on purpose): materialising halfway through
# intermediate = pl.scan_csv('file.csv').collect()  # loads everything first
# result = intermediate.lazy().filter(...).collect()  # then goes lazy again - wasteful

# Preferred: stay lazy for the whole pipeline and call .collect() exactly once
revenue_per_category = (
    pl.scan_csv(os.path.join(DATA_DIR, 'sales_data.csv'))
    .filter(pl.col('revenue').gt(500))
    .group_by('category')
    .agg(pl.col('revenue').sum())
)
result = revenue_per_category.collect()  # the single materialisation point

print("Efficient lazy execution:")
print(result)

### 9.3 Use with_columns() for Multiple Transformations

In [None]:
# Add all derived columns in a single with_columns() call rather than one call each
derived_columns = (
    pl.scan_csv(os.path.join(DATA_DIR, 'sales_data.csv'))
    .with_columns(
        revenue_with_tax=pl.col('revenue') * 1.1,
        discounted_price=pl.col('price') * 0.9,
        product_upper=pl.col('product').str.to_uppercase(),
    )
)
efficient_transform = derived_columns.collect()

print("Multiple transformations:")
print(efficient_transform.head())

## 10. Debugging Lazy Queries

In [None]:
# Preview a lazy query by limiting it to a few rows before collecting
lazy_debug = (
    pl.scan_csv(os.path.join(DATA_DIR, 'sales_data.csv'))
    .filter(pl.col('revenue') > 500)
    .select(['product', 'revenue', 'category'])
)

# head(3).collect() returns the first 3 rows of the query result.
# LazyFrame.fetch() is deprecated; it limited rows at the scan instead, so the
# filtered preview could contain fewer rows than requested.
preview = lazy_debug.head(3).collect()

print("Preview (first 3 rows):")
print(preview)

# explain() returns the optimized plan by default; it replaces the removed
# describe_optimized_plan() and matches the other cells in this notebook
print("\nOptimized plan description:")
print(lazy_debug.explain())

## 11. Summary

In this notebook, we explored:
- ✅ Eager vs lazy execution
- ✅ Query optimization techniques
- ✅ Predicate and projection pushdown
- ✅ Lazy reading methods (scan_csv, scan_parquet)
- ✅ Performance comparisons
- ✅ Advanced lazy operations
- ✅ Streaming execution
- ✅ Best practices and common patterns

### Key Takeaways:
1. **Lazy evaluation enables optimization** - Polars can optimize the entire query before execution
2. **Use scan_* methods** - For large files and multi-step pipelines, prefer scan_csv/scan_parquet over read_csv/read_parquet so filters and column selections can be pushed into the reader
3. **Filter early** - Apply filters before aggregations and joins
4. **Select only needed columns** - Projection pushdown reduces I/O
5. **Collect at the end** - Keep queries lazy as long as possible
6. **Use streaming for big data** - Pass streaming=True to .collect() (as in Section 8) for datasets larger than memory

### When to Use Lazy:
- ✅ Reading large files
- ✅ Complex multi-step transformations
- ✅ When you need optimal performance
- ✅ ETL pipelines
- ✅ Data larger than available RAM (with streaming)

### When to Use Eager:
- ✅ Small datasets
- ✅ Interactive exploration
- ✅ Quick prototyping
- ✅ When you need results immediately

**Next:** In the final notebook, we'll compare Polars with Pandas!