# Data Configuration & Preparation Interface

This notebook provides an interactive interface for configuring and preparing market data for quantitative strategy development.

## Workflow Integration

This notebook corresponds to **Phase 1** of the development workflow:
- Data Collection & Preparation
- Data Splitting Strategy
- Data Quality Validation

The prepared datasets will be used in subsequent phases for strategy development, optimization, and validation.

In [None]:
# Import necessary libraries
import sys
import os
from pathlib import Path
from datetime import datetime, timezone, timedelta
import polars as pl
import json

# Resolve the project root as two levels above the current working directory
# and append it to sys.path so the `src` package can be imported
project_root = Path.cwd().parent.parent
sys.path.append(str(project_root))

# Import our modules
from src.data.config.data_config import DataConfig, DataSplitConfig
from src.data.pipeline.data_preparation import DataPreparationPipeline
from src.data.query.questdb_market_data_query import QuestDBMarketDataQuery

# Confirm the imports succeeded and show the root that was appended to sys.path
print("📚 Libraries imported successfully!")
print(f"📁 Project root: {project_root}")

## 1. Explore Available Data

Let's first explore what data is available in our QuestDB database.

In [None]:
# Create the query service that the following cells reuse
query_service = QuestDBMarketDataQuery()

print("🔍 Fetching available symbols...")
try:
    available_symbols = query_service.get_available_symbols()
    symbol_count = len(available_symbols)
    print(f"✅ Found {symbol_count} symbols")

    # Only preview the first entries so the output stays short
    print("\n📋 Available symbols (first 10):")
    for position, symbol in enumerate(available_symbols[:10], start=1):
        print(f"   {position}. {symbol}")

    remaining = symbol_count - 10
    if remaining > 0:
        print(f"   ... and {remaining} more")

except Exception as e:
    # Report the failure and fall back to an empty list; the next cell
    # checks for that before querying a data range
    print(f"❌ Error fetching symbols: {e}")
    available_symbols = []

In [None]:
# Inspect the stored range for one symbol: the first entry of available_symbols
if available_symbols:
    sample_symbol = available_symbols[0]
    print(f"📊 Checking data range for {sample_symbol}...")

    try:
        data_range = query_service.get_data_range(sample_symbol)
        print("✅ Data range information:")
        # (label, key) pairs printed verbatim, in display order
        for label, key in (
            ("Symbol", "symbol"),
            ("Exchange", "exchange"),
            ("Start", "start_time"),
            ("End", "end_time"),
        ):
            print(f"   {label}: {data_range[key]}")
        # The record count gets thousands separators, so it is printed separately
        print(f"   Records: {data_range['total_records']:,}")
    except Exception as e:
        print(f"❌ Error getting data range: {e}")

## 2. Configure Data Parameters

Now let's configure the data parameters for our strategy development.

In [None]:
# =============================================================================
# DATA CONFIGURATION PARAMETERS
# =============================================================================
# All tunable values for this notebook are defined in this cell; the cells
# below read these constants.

# Basic Data Selection
SYMBOL = "BTC-USDT-SWAP"  # Change this to your desired symbol
EXCHANGE = "OKX"
TIMEFRAME = "1h"  # Options: "1m", "5m", "15m", "30m", "1h", "4h", "1d"

# Date Range (adjust these dates based on your needs); both ends are
# timezone-aware UTC datetimes
START_DATE = datetime(2023, 1, 1, tzinfo=timezone.utc)
END_DATE = datetime(2024, 6, 30, tzinfo=timezone.utc)

# Data Splitting Configuration
TRAIN_PCT = 0.6        # 60% for training
VALIDATION_PCT = 0.2   # 20% for validation (set to None if not needed)
TEST_PCT = 0.2         # 20% for testing (set to None if not needed)
PURGE_DAYS = 1         # Days to purge between splits

# Data Quality Parameters
MIN_DATA_POINTS = 1000
MAX_GAP_MINUTES = 120  # Maximum acceptable gap in minutes
OUTLIER_THRESHOLD = 5.0  # Standard deviations for outlier detection

# Configuration Name and Description
CONFIG_NAME = "btc_usdt_swap_1h_2023_2024"
DESCRIPTION = "BTC-USDT-SWAP 1h data for 2023-2024 strategy development"


def format_split_pct(fraction):
    """Format a split fraction as a whole percentage for display.

    Args:
        fraction: Split size as a fraction (e.g. 0.2), or None when the
            split is disabled.

    Returns:
        A string such as "20%", or "n/a" when ``fraction`` is None.
    """
    # The validation/test splits may be None, and None cannot be formatted
    # with the ``%`` presentation type (it raises TypeError)
    return "n/a" if fraction is None else f"{fraction:.0%}"


print("📝 Configuration parameters set!")
print(f"   Symbol: {SYMBOL}")
print(f"   Exchange: {EXCHANGE}")
print(f"   Timeframe: {TIMEFRAME}")
print(f"   Period: {START_DATE.strftime('%Y-%m-%d')} to {END_DATE.strftime('%Y-%m-%d')}")
print(
    f"   Splits: Train {format_split_pct(TRAIN_PCT)}"
    f" | Val {format_split_pct(VALIDATION_PCT)}"
    f" | Test {format_split_pct(TEST_PCT)}"
)

In [None]:
# Build the split settings first; they are nested inside the data configuration
split_config = DataSplitConfig(
    train_pct=TRAIN_PCT, validation_pct=VALIDATION_PCT, test_pct=TEST_PCT,
    purge_days=PURGE_DAYS,
)

# Assemble the full configuration from the constants defined above
data_config = DataConfig(
    # What to fetch
    symbol=SYMBOL, exchange=EXCHANGE, timeframe=TIMEFRAME,
    start_date=START_DATE, end_date=END_DATE,
    # How to split it
    split_config=split_config,
    # Quality thresholds
    min_data_points=MIN_DATA_POINTS,
    max_gap_minutes=MAX_GAP_MINUTES,
    outlier_std_threshold=OUTLIER_THRESHOLD,
    # Labels for the configuration
    config_name=CONFIG_NAME,
    description=DESCRIPTION,
)

print("✅ Data configuration created!")
print(f"\n{data_config!s}")

In [None]:
# Print the start/end of every split returned by the configuration
split_dates = data_config.get_split_dates()

print("📅 Data Split Schedule:")
print("=" * 50)

stamp_format = '%Y-%m-%d %H:%M'
for split_name, (start, end) in split_dates.items():
    duration = end - start
    window = f"{start.strftime(stamp_format)} to {end.strftime(stamp_format)}"
    # Split name is padded to 12 characters so the dates line up
    print(f"{split_name.upper():12} {window} ({duration.days} days)")

expected_points = data_config.get_expected_data_points()
print(f"\n📊 Expected data points: {expected_points:,}")

## 3. Preview Data Sample

Let's fetch a small sample of data to verify everything is working correctly.

In [None]:
# Fetch a small sample for preview: one week from START_DATE, clamped so the
# window never extends past the configured END_DATE when the date range is
# shorter than a week
sample_end = min(START_DATE + timedelta(days=7), END_DATE)

print(f"📥 Fetching sample data from {START_DATE.strftime('%Y-%m-%d')} to {sample_end.strftime('%Y-%m-%d')}...")

try:
    sample_data = query_service.get_market_data(
        symbol=SYMBOL,
        start_date=START_DATE,
        end_date=sample_end,
        timeframe=TIMEFRAME,
        exchange=EXCHANGE
    )

    print(f"✅ Sample data retrieved: {len(sample_data)} records")

    if len(sample_data) > 0:
        print("\n📊 Sample data preview:")
        print(sample_data.head(10))

        # One-row summary of the price and volume columns
        print("\n📈 Basic statistics:")
        print(sample_data.select([
            pl.col("open").mean().alias("avg_open"),
            pl.col("high").max().alias("max_high"),
            pl.col("low").min().alias("min_low"),
            pl.col("close").mean().alias("avg_close"),
            pl.col("volume").mean().alias("avg_volume")
        ]))
    else:
        print("❌ No sample data found")

except Exception as e:
    # Fall back to an empty frame so the quality-check cell can detect the
    # failure with a simple length test
    print(f"❌ Error fetching sample data: {e}")
    sample_data = pl.DataFrame()

## 4. Data Quality Preview

Let's check the quality of our sample data before proceeding with the full preparation.

In [None]:
# Run the validation report on the preview sample only
if len(sample_data) == 0:
    print("⚠️  No sample data available for quality check")
else:
    # Same settings as the main configuration, but limited to the sample
    # window and with a lower minimum record count
    sample_config = DataConfig(
        symbol=SYMBOL, exchange=EXCHANGE, timeframe=TIMEFRAME,
        start_date=START_DATE, end_date=sample_end,
        split_config=split_config,
        min_data_points=50,  # Lower threshold for sample
        max_gap_minutes=MAX_GAP_MINUTES,
        outlier_std_threshold=OUTLIER_THRESHOLD,
        config_name="sample_check",
    )

    sample_pipeline = DataPreparationPipeline(sample_config)

    print("🔍 Checking sample data quality...")
    sample_pipeline.print_validation_report(sample_data)

## 5. Full Data Preparation

Now let's prepare the complete dataset for strategy development.

⚠️ **Warning**: This may take several minutes depending on the data size.

In [None]:
# Summarise what the next cell is about to do
day_format = '%Y-%m-%d'
print("🚨 READY TO PREPARE FULL DATASET")
print("=" * 40)
print(f"Symbol: {SYMBOL}")
print(f"Period: {START_DATE.strftime(day_format)} to {END_DATE.strftime(day_format)}")
print(f"Expected records: ~{data_config.get_expected_data_points():,}")
print(f"Configuration: {CONFIG_NAME}")
print("\n💡 This will:")
for step in (
    "   1. Download full dataset from QuestDB",
    "   2. Validate data quality",
    "   3. Clean and process data",
    "   4. Split into train/validation/test sets",
    "   5. Save datasets to disk",
):
    print(step)
print("\n⏱️  This may take several minutes...")

# Execution gate for the next cell: True runs the full preparation,
# set it to False to skip that step
PROCEED_WITH_PREPARATION = True

print(
    "\n🚀 Starting full data preparation..."
    if PROCEED_WITH_PREPARATION
    else "\n⏸️  Preparation paused. Set PROCEED_WITH_PREPARATION = True to continue."
)

In [None]:
# Runs only when the gate in the previous cell is enabled
if PROCEED_WITH_PREPARATION:
    pipeline = DataPreparationPipeline(data_config)

    try:
        # Run the pipeline and ask it to save the resulting datasets to disk
        prepared_datasets = pipeline.prepare_data(save_to_disk=True)

        print("\n🎉 Data preparation completed successfully!")
        print(f"\n📁 Datasets saved to: data/processed/{CONFIG_NAME}/")

        # Per-split summary; empty splits are skipped
        for split_name, dataset in prepared_datasets.items():
            if len(dataset) == 0:
                continue
            print(f"\n📊 {split_name.upper()} SET:")
            print(f"   Records: {len(dataset):,}")
            print(f"   Period: {dataset['timestamp'].min()} to {dataset['timestamp'].max()}")
            print(f"   Columns: {', '.join(dataset.columns)}")

    except Exception as e:
        # Show a short message, then re-raise so the failure is not hidden
        print(f"\n❌ Error during preparation: {e}")
        raise

## 6. Verification and Next Steps

Let's verify the prepared datasets and discuss next steps.

In [None]:
# Enumerate every dataset name the pipeline class reports
print("📋 All prepared datasets:")
datasets = DataPreparationPipeline.list_prepared_datasets()

if datasets:
    for index, dataset_name in enumerate(datasets, start=1):
        print(f"   {index}. {dataset_name}")
else:
    print("   No datasets found.")

In [None]:
# Load and verify the prepared dataset
if CONFIG_NAME in datasets:
    print(f"\n🔍 Verifying prepared dataset: {CONFIG_NAME}")

    # Build a pipeline here instead of reusing `pipeline` from the preparation
    # cell: that name is never defined when the preparation step is skipped,
    # which would raise NameError even though the dataset exists on disk
    verification_pipeline = DataPreparationPipeline(data_config)

    # Get dataset info
    dataset_info = verification_pipeline.get_dataset_info(CONFIG_NAME)

    print(f"\n📊 Dataset Information:")
    print(f"   Symbol: {dataset_info['symbol']}")
    print(f"   Exchange: {dataset_info['exchange']}")
    print(f"   Timeframe: {dataset_info['timeframe']}")
    print(f"   Prepared: {dataset_info['prepared_at']}")

    print(f"\n📈 Dataset Splits:")
    for split_name, split_info in dataset_info['datasets'].items():
        print(f"   {split_name.upper()}:")
        print(f"     Records: {split_info['records']:,}")
        print(f"     Period: {split_info['start_date']} to {split_info['end_date']}")
        print(f"     File: {split_info['filename']}")

    # Test loading a split to confirm the saved data can be read back
    train_data = verification_pipeline.load_prepared_dataset(CONFIG_NAME, "train")
    print(f"\n✅ Successfully loaded training set: {len(train_data):,} records")
    print(f"   Columns: {', '.join(train_data.columns)}")

else:
    print(f"\n❌ Dataset {CONFIG_NAME} not found in prepared datasets")

## 7. Next Steps in Development Workflow

Now that your data is prepared, here are the next steps in the development workflow:

### Phase 1: Complete ✅
- ✅ Data Collection & Preparation
- ✅ Data Quality Validation
- ✅ Data Splitting Strategy

### Phase 2: Strategy Development & Optimization
Navigate to the following notebooks:
1. **Feature Engineering**: `notebooks/01_data_exploration/feature_exploration.ipynb`
2. **Strategy Development**: `notebooks/02_strategy_development/strategy_prototyping.ipynb`
3. **Parameter Optimization**: `notebooks/03_optimization/parameter_optimization.ipynb`

### Using Your Prepared Data
In subsequent notebooks, you can load your prepared data like this (`data_config` must be a `DataConfig` describing the same dataset, built as in Section 2 above):

```python
from src.data.pipeline.data_preparation import DataPreparationPipeline

pipeline = DataPreparationPipeline(data_config)
train_data = pipeline.load_prepared_dataset("your_config_name", "train")
validation_data = pipeline.load_prepared_dataset("your_config_name", "validation")
test_data = pipeline.load_prepared_dataset("your_config_name", "test")
```

In [None]:
# Persist the configuration (when one was created) and print it as JSON
if "data_config" in locals():
    config_path = data_config.save_config()
    print(f"💾 Configuration saved to: {config_path}")

    print("\n📋 Configuration Summary:")
    # default=str stringifies values json cannot encode natively
    config_summary = json.dumps(data_config.to_dict(), indent=2, default=str)
    print(config_summary)

print("\n🎯 Ready to proceed to Phase 2: Strategy Development!")