---

## 1. Environment Setup

### 1.1 Import Standard Libraries

Import every required library, standard-library and third-party alike, in the first code cell. Group imports logically and follow PEP 8.

In [None]:
"""
Genesis 22 Canonical Notebook Template - Imports

This cell demonstrates proper import organization:
- Standard library imports first
- Third-party imports second
- Local/custom imports last
- Grouped and alphabetized within each section
"""

# Standard Library
import os
import sys
import warnings
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Third-Party: Data Manipulation
import numpy as np
import pandas as pd
from scipy import stats

# Third-Party: Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Third-Party: Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Third-Party: Utilities
from tqdm.notebook import tqdm

# Silence deprecation-related chatter emitted by the imported libraries
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', DeprecationWarning)

# Record the runtime environment so a later run can be compared against it
print(f"Python: {sys.version}")
print(f"NumPy: {np.__version__}")
print(f"Pandas: {pd.__version__}")
print(f"Matplotlib: {plt.matplotlib.__version__}")
print(f"Seaborn: {sns.__version__}")

# Timestamp the execution (local, naive datetime in ISO 8601 form)
print(f"Notebook executed at: {datetime.now().isoformat()}")

In [None]:
"""Project configuration and constants."""

# --- Filesystem layout (pathlib keeps path handling platform-neutral) ---
PROJECT_ROOT: Path = Path.cwd()
DATA_DIR: Path = PROJECT_ROOT.joinpath("data")
RAW_DATA_DIR: Path = DATA_DIR.joinpath("raw")
PROCESSED_DATA_DIR: Path = DATA_DIR.joinpath("processed")
MODELS_DIR: Path = PROJECT_ROOT.joinpath("models")
OUTPUTS_DIR: Path = PROJECT_ROOT.joinpath("outputs")

# Ensure every working directory is present; existing ones are left as they are
for directory in (DATA_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR, MODELS_DIR, OUTPUTS_DIR):
    directory.mkdir(parents=True, exist_ok=True)

# --- Analysis parameters ---
RANDOM_STATE: int = 42          # seed shared by every stochastic step
TEST_SIZE: float = 0.2          # fraction of rows held out for testing
CONFIDENCE_LEVEL: float = 0.95  # level used for interval estimates

# --- Visualization ---
FIGURE_DPI: int = 300
COLOR_PALETTE: List[str] = [
    '#1f77b4',
    '#ff7f0e',
    '#2ca02c',
    '#d62728',
    '#9467bd',
]

# Echo the resolved settings so the reader can verify them at a glance
print("📁 Directory Structure:")
print(f"  Project Root: {PROJECT_ROOT}")
print(f"  Data Directory: {DATA_DIR}")
print(f"  Outputs Directory: {OUTPUTS_DIR}")
print(f"\n⚙️  Configuration:")
print(f"  Random State: {RANDOM_STATE}")
print(f"  Test Size: {TEST_SIZE}")
print(f"  Confidence Level: {CONFIDENCE_LEVEL}")

In [None]:
"""Generate synthetic dataset for demonstration."""

def generate_sample_data(n_samples: int = 1000, random_state: int = RANDOM_STATE) -> pd.DataFrame:
    """
    Generate a synthetic regression dataset for demonstration.

    Four numeric features are drawn from different distributions and
    combined linearly, with Gaussian noise, into a target column. A random
    categorical label and an hourly timestamp are attached to each row.

    Parameters
    ----------
    n_samples : int, default=1000
        Number of rows to generate
    random_state : int, default=RANDOM_STATE
        Random seed for reproducibility

    Returns
    -------
    pd.DataFrame
        Dataset with columns ``feature_1`` (normal), ``feature_2``
        (exponential), ``feature_3`` (uniform), ``feature_4`` (Poisson),
        ``target``, ``category`` (one of 'A', 'B', 'C') and ``timestamp``
        (hourly, starting 2024-01-01)

    Notes
    -----
    The function seeds NumPy's *global* random generator, so random draws
    made after this call are affected by ``random_state`` as well.
    """
    np.random.seed(random_state)

    # Generate features (the order of the draws determines the values, keep it)
    feature_1 = np.random.normal(loc=50, scale=15, size=n_samples)
    feature_2 = np.random.exponential(scale=2, size=n_samples)
    feature_3 = np.random.uniform(low=0, high=100, size=n_samples)
    feature_4 = np.random.poisson(lam=5, size=n_samples)

    # Generate target as a linear combination of the features plus noise
    noise = np.random.normal(loc=0, scale=10, size=n_samples)
    target = (
        2.5 * feature_1 +
        1.8 * feature_2 -
        0.5 * feature_3 +
        3.2 * feature_4 +
        noise
    )

    # Create DataFrame
    data = pd.DataFrame({
        'feature_1': feature_1,
        'feature_2': feature_2,
        'feature_3': feature_3,
        'feature_4': feature_4,
        'target': target,
        'category': np.random.choice(['A', 'B', 'C'], size=n_samples),
        # Lowercase 'h' is the current hourly alias; uppercase 'H' is
        # deprecated in recent pandas releases
        'timestamp': pd.date_range(start='2024-01-01', periods=n_samples, freq='h'),
    })

    return data

# Build the demonstration dataset with the default seed
df_raw = generate_sample_data(n_samples=1000)

# Report its dimensions, then preview the leading rows
print(f"✅ Dataset generated: {len(df_raw)} rows × {len(df_raw.columns)} columns")
print(f"\n📊 First few rows:")
display(df_raw.head())

In [None]:
"""Comprehensive data validation and quality assessment."""

def validate_data(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Summarize the size, completeness and column types of a DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to validate; it is only inspected, never modified

    Returns
    -------
    Dict[str, Any]
        Validation report with these keys:

        - ``n_rows``, ``n_columns`` : frame dimensions
        - ``memory_usage_mb`` : deep memory footprint in MiB (float)
        - ``missing_values`` : null count per column
        - ``missing_percentage`` : null share per column in percent
          (0.0 for every column when the frame has no rows)
        - ``n_duplicates`` : number of fully duplicated rows (int)
        - ``dtypes`` : dtype name per column
        - ``numeric_columns`` : names of the numeric columns
    """
    n_rows = len(df)

    # Count nulls once and reuse the result for both missing-value entries
    null_counts = df.isnull().sum()

    if n_rows:
        missing_percentage = (null_counts / n_rows * 100).to_dict()
    else:
        # Avoid the 0/0 -> NaN result an empty frame would otherwise produce
        missing_percentage = {col: 0.0 for col in df.columns}

    numeric_cols = df.select_dtypes(include=[np.number]).columns

    report: Dict[str, Any] = {
        # Basic statistics
        'n_rows': n_rows,
        'n_columns': len(df.columns),
        # Converted to a built-in float so the report holds plain Python values
        'memory_usage_mb': float(df.memory_usage(deep=True).sum() / 1024**2),
        # Missing values
        'missing_values': null_counts.to_dict(),
        'missing_percentage': missing_percentage,
        # Duplicates (built-in int rather than a NumPy scalar)
        'n_duplicates': int(df.duplicated().sum()),
        # Data types
        'dtypes': df.dtypes.astype(str).to_dict(),
        # Numeric columns
        'numeric_columns': list(numeric_cols),
    }

    return report

# Run the checks on the raw data and keep the report for the summary below
validation_report = validate_data(df_raw)

# Header and headline figures
print("🔍 Data Validation Report")
print("=" * 60)
print(f"Dimensions: {validation_report['n_rows']:,} rows × {validation_report['n_columns']} columns")
print(f"Memory Usage: {validation_report['memory_usage_mb']:.2f} MB")
print(f"Duplicates: {validation_report['n_duplicates']}")

# One line per column with its dtype
print(f"\n📋 Data Types:")
for col, dtype in validation_report['dtypes'].items():
    print(f"  {col}: {dtype}")

# List only the columns that contain nulls; otherwise confirm there are none
print(f"\n❌ Missing Values:")
columns_with_gaps = [
    name
    for name, n_missing in validation_report['missing_values'].items()
    if n_missing > 0
]
for col in columns_with_gaps:
    count = validation_report['missing_values'][col]
    pct = validation_report['missing_percentage'][col]
    print(f"  {col}: {count} ({pct:.2f}%)")
if not columns_with_gaps:
    print("  ✅ No missing values detected")

# Summary statistics for the raw data
print("\n📊 Descriptive Statistics:")
display(df_raw.describe())

### 2.2 Data Validation and Quality Checks

Perform comprehensive validation to ensure data integrity.

---

## 2. Data Loading & Validation

### 2.1 Generate Sample Dataset

For demonstration purposes, we'll generate a synthetic dataset. In real projects, replace this with actual data loading.

### 1.3 Define Constants and Configuration

Centralize all configuration values and magic numbers as named constants.

In [None]:
"""Configure display settings for optimal notebook experience."""

# Pandas display options: cap the rendered rows/columns and widen the output
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 1000)
pd.set_option('display.precision', 3)
# The bound str.format method is used as the float formatter (3 decimals).
# NOTE(review): presumably this takes precedence over 'display.precision'
# for float output — confirm whether both options are needed.
pd.set_option('display.float_format', '{:.3f}'.format)

# Matplotlib style and settings
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# Matplotlib release — confirm the minimum supported version.
plt.style.use('seaborn-v0_8-darkgrid')
# Individual rcParams are set after the style sheet so they apply on top of it
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['figure.dpi'] = 100
plt.rcParams['font.size'] = 11
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.rcParams['legend.fontsize'] = 10

# Seaborn settings
# NOTE(review): seaborn's set_context presumably rewrites font-size rcParams,
# which may override the sizes assigned above — confirm the intended order.
sns.set_palette("husl")
sns.set_context("notebook", font_scale=1.1)

# Enable inline plotting
# (IPython magic: this line is not plain Python and only runs in a notebook)
%matplotlib inline

# Enable autoreload for development
# (IPython magics: load the autoreload extension, then select its mode 2)
%load_ext autoreload
%autoreload 2

print("✅ Display settings configured successfully")

### 1.2 Configure Display Settings

Set up notebook display preferences for optimal readability and presentation.

# Genesis 22 Canonical Notebook Template

**Author**: Genesis 22 Project  
**Date**: 2025-10-11  
**Version**: 1.0.0  
**Python Version**: 3.12+

This notebook serves as the **flagship example** and **canonical template** for all Jupyter notebook work in the Genesis 22 project.

## Features Demonstrated

- ✅ Proper notebook structure and organization
- ✅ Clear documentation and markdown usage
- ✅ Type hints and code quality standards
- ✅ Reproducible data analysis workflow
- ✅ Professional visualization practices
- ✅ Error handling and validation
- ✅ Memory-efficient coding patterns

## 📋 Table of Contents

1. [Environment Setup](#1-environment-setup)
2. [Data Loading & Validation](#2-data-loading--validation)
3. [Exploratory Data Analysis](#3-exploratory-data-analysis)
4. [Statistical Analysis](#4-statistical-analysis)
5. [Visualization](#5-visualization)
6. [Results & Conclusions](#6-results--conclusions)
7. [Cleanup & Best Practices](#7-cleanup--best-practices)