In [None]:
# notebooks/1_data_understanding.ipynb
def comprehensive_data_analysis(df):
    """
    Print a comprehensive data analysis and quality report for a DataFrame.

    The report is written to stdout and contains these sections:
      1. Dataset overview (shape and deep memory usage in MB)
      2. Data type of every column
      3. Missing-value counts and percentages (only columns with gaps)
      4. ``describe()`` summary of the numerical columns
      5. Date range of the 'Order Date' column, when that column exists
      6. Unique-value counts for object-dtype columns (values are listed
         when a column has at most 10 distinct values)

    Args:
        df: The pandas DataFrame to analyse. It is never modified.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Local import: only `np` is referenced at module level by this block, so
    # `pd` is brought into scope here rather than assumed to exist globally.
    import pandas as pd

    print("=" * 60)
    print("📊 COMPREHENSIVE DATA ANALYSIS REPORT")
    print("=" * 60)

    # Basic Information
    print("\n1. DATASET OVERVIEW:")
    print(f"   • Shape: {df.shape}")
    print(f"   • Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Data Types
    print("\n2. DATA TYPES:")
    # Iterating df.dtypes (instead of df[col].dtype) also works when column
    # labels are duplicated, where df[col] would return a DataFrame.
    for col, dtype in df.dtypes.items():
        print(f"   • {col}: {dtype}")

    # Missing Values
    print("\n3. MISSING VALUES ANALYSIS:")
    missing_data = df.isnull().sum()
    missing_data = missing_data[missing_data > 0]
    if missing_data.empty:
        # Say so explicitly; an empty section looks like a truncated report.
        print("   • No missing values found")
    else:
        total_rows = len(df)  # > 0 here, since at least one value is missing
        for col, missing_count in missing_data.items():
            percentage = (missing_count / total_rows) * 100
            print(f"   • {col}: {missing_count} missing ({percentage:.2f}%)")

    # Statistical Summary
    print("\n4. STATISTICAL SUMMARY (Numerical Columns):")
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    if len(numerical_cols) > 0:
        print(df[numerical_cols].describe())
    else:
        # describe() raises ValueError on a frame without columns.
        print("   • No numerical columns found")

    # Date Range Analysis
    if 'Order Date' in df.columns:
        print("\n5. DATE RANGE ANALYSIS:")
        order_dates = df['Order Date']
        if not pd.api.types.is_datetime64_any_dtype(order_dates):
            # Dates loaded as text cannot be subtracted; parse a local copy so
            # the caller's DataFrame stays untouched.
            parsed_dates = pd.to_datetime(order_dates, errors='coerce')
            unparseable = int(parsed_dates.isna().sum() - order_dates.isna().sum())
            if unparseable > 0:
                print(f"   • Unparseable Dates: {unparseable}")
            order_dates = parsed_dates
        start_date = order_dates.min()
        end_date = order_dates.max()
        print(f"   • Start Date: {start_date}")
        print(f"   • End Date: {end_date}")
        if pd.isna(start_date) or pd.isna(end_date):
            # No valid dates at all (empty column or nothing parseable).
            print("   • Total Days: N/A")
        else:
            print(f"   • Total Days: {(end_date - start_date).days}")

    # Unique Values
    print("\n6. CATEGORICAL ANALYSIS:")
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        unique_count = df[col].nunique()
        print(f"   • {col}: {unique_count} unique values")
        if unique_count <= 10:  # Show values for small categories
            print(f"     Values: {list(df[col].unique())}")

    return df

# Run analysis: prints the report for `df` (expected to be defined before this
# cell runs). The function returns the DataFrame it was given, so `df_analysis`
# refers to the same object as `df`, not a copy.
df_analysis = comprehensive_data_analysis(df)