# Phase 1 Testing: Data Infrastructure & EDA

This notebook tests the Phase 1 implementation, including:
- Data loading pipeline
- Data quality assessment
- Data leakage detection
- Comprehensive visualizations

In [None]:
import sys
sys.path.append('../')

from src.data_pipeline import DataPipeline
import pandas as pd
import numpy as np
from config import *

## Test 1: Data Loading

In [None]:
# Create the pipeline instance that this and the following cells exercise.
pipeline = DataPipeline()

# load_data() is unpacked into two objects: the training and the test set.
train_data, test_data = pipeline.load_data()

# Sanity-check output: dimensions of each split first ...
print(f"Training data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")

# ... then the training column names, and finally the per-value counts of the
# target column (TARGET_COLUMN is expected to come from the config import).
train_columns = list(train_data.columns)
print(f"Training columns: {train_columns}")
target_counts = train_data[TARGET_COLUMN].value_counts()
print(f"Target distribution (train): {target_counts}")

## Test 2: Data Quality Assessment

In [None]:
# Ask the pipeline for its quality report. The lookups below treat it as a
# nested mapping keyed first by split ('train' / 'test') and then by metric.
quality_report = pipeline.assess_data_quality()

print("Data Quality Summary:")

# Duplicate counts for both splits, followed by memory usage for both splits.
# Each split's sub-report is fetched right before its first use.
train_quality = quality_report['train']
print(f"Training duplicates: {train_quality['duplicates']}")
test_quality = quality_report['test']
print(f"Test duplicates: {test_quality['duplicates']}")
print(f"Training memory usage: {train_quality['memory_usage']:.2f} MB")
print(f"Test memory usage: {test_quality['memory_usage']:.2f} MB")

# Keep only the training features whose missing-value count is above zero.
missing_features = {
    feature: n_missing
    for feature, n_missing in train_quality['missing_values'].items()
    if n_missing > 0
}
print(f"\nFeatures with missing values: {missing_features}")

## Test 3: Data Leakage Detection

In [None]:
# Run the leakage checks; the result is indexed by the four keys read below.
leakage_report = pipeline.detect_data_leakage()

print("Data Leakage Analysis:")

# Each finding is looked up immediately before it is printed, so a missing key
# still surfaces only after the preceding lines have been shown.
correlation_findings = leakage_report['high_correlations']
print(f"High correlations: {correlation_findings}")

temporal_findings = leakage_report['temporal_leakage']
print(f"Temporal leakage: {temporal_findings}")

target_findings = leakage_report['target_leakage']
print(f"Target leakage: {target_findings}")

flagged_features = leakage_report['suspicious_features']
print(f"Suspicious features: {flagged_features}")

## Test 4: Visualization Generation

In [None]:
# Run the pipeline's visualization step. Its return value is not used here;
# the check below assumes it writes PNG files under REPORTS_DIR/visualizations
# (TODO confirm against DataPipeline).
pipeline.create_comprehensive_visualizations()

# Collect the PNGs found in the expected output folder. glob() does not
# promise any particular ordering (it depends on the filesystem), so the
# matches are sorted to keep the printed listing stable between runs and
# machines, which also avoids spurious diffs in saved notebook output.
viz_dir = REPORTS_DIR / "visualizations"
viz_files = sorted(viz_dir.glob("*.png"))

print(f"Generated {len(viz_files)} visualization files:")
for viz_file in viz_files:
    print(f"  - {viz_file.name}")

## Test 5: EDA Report Generation

In [None]:
# Trigger report generation; the return value is not inspected.
pipeline.generate_eda_report()

# Look for the markdown report at the location this notebook expects and
# print either its path and on-disk size, or a not-found notice.
report_file = REPORTS_DIR / "eda_report.md"
if not report_file.exists():
    print("EDA report not found!")
else:
    print(f"EDA report created: {report_file}")
    print(f"Report size: {report_file.stat().st_size} bytes")

## Test 6: Complete Phase 1 Pipeline

In [None]:
# Run the whole Phase 1 flow in one call and keep its summary mapping.
results = pipeline.run_phase1()

print("Phase 1 Results:")
print(f"Training shape: {results['train_shape']}")
print(f"Test shape: {results['test_shape']}")
print(f"Phase status: {results['phase_status']}")

# Only the number of flagged features is reported here, not the features.
flagged = results['leakage_analysis']['suspicious_features']
print(f"Suspicious features: {len(flagged)}")

# Paths this notebook expects to exist under REPORTS_DIR after the run:
# three report files plus the visualizations folder.
deliverables = [
    REPORTS_DIR / artifact_name
    for artifact_name in (
        "data_quality_report.csv",
        "data_leakage_report.txt",
        "eda_report.md",
        "visualizations",
    )
]

print("\nDeliverables Check:")
for artifact in deliverables:
    # Handle the missing case first, then fall through to the success mark.
    if not artifact.exists():
        print(f"✗ {artifact.name}")
        continue
    print(f"✓ {artifact.name}")