
#  📊 Data Ingestion and Quality Checks
# 
#  Notebook Overview
# This notebook performs the initial data ingestion and quality assessment for the Stock Price Indicator project.
# 
#  Objectives:
# 1. **API Testing**: Verify yfinance connectivity with single ticker
# 2. **Data Collection**: Download historical data for multiple tickers
# 3. **Quality Assessment**: Check for missing values, anomalies, and data consistency
# 4. **Initial Exploration**: Basic statistics and time series visualization
# 5. **Data Storage**: Save processed data for subsequent analysis
# 
#  Target Stocks:
# - **AAPL** (Apple Inc.)
# - **GOOGL** (Alphabet Inc.)
# - **MSFT** (Microsoft Corporation)
# - **TSLA** (Tesla Inc.)
# - **AMZN** (Amazon.com Inc.)
# 
# ---


#  1. Setup and Configuration

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
import warnings
from datetime import datetime, timedelta
import os
import sys
from pathlib import Path

# Pandas display: never truncate columns, cap printed rows at 100
for option_name, option_value in (('display.max_columns', None),
                                  ('display.max_rows', 100)):
    pd.set_option(option_name, option_value)

# Plot style sheet used by every figure in this notebook
plt.style.use('seaborn-v0_8')

# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so random draws are repeatable
np.random.seed(42)

print("✅ Libraries imported successfully")
execution_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"📅 Notebook execution date: {execution_stamp}")

# %%
# Central settings read by the cells below
PROJECT_CONFIG = dict(
    tickers=['AAPL', 'GOOGL', 'MSFT', 'TSLA', 'AMZN'],
    start_date='2015-01-01',
    end_date=datetime.now().strftime('%Y-%m-%d'),  # "today" at execution time
    data_interval='1d',  # Daily data
    target_column='Adj Close',
    prediction_horizons=[1, 7, 14, 28],  # Days ahead to predict
)

# Output locations, all relative to the parent of the working directory
BASE_DIR = Path('../')
DATA_DIR = BASE_DIR / 'data'
RAW_DATA_DIR = DATA_DIR / 'raw'
PROCESSED_DATA_DIR = DATA_DIR / 'processed'
FIGURES_DIR = BASE_DIR.joinpath('experiments', 'figures')

# Make sure every output folder exists (no error if it already does)
for directory in (RAW_DATA_DIR, PROCESSED_DATA_DIR, FIGURES_DIR):
    directory.mkdir(parents=True, exist_ok=True)

print("📁 Directory structure initialized")
print(f"📊 Target tickers: {PROJECT_CONFIG['tickers']}")
print(
    f"📅 Date range: {PROJECT_CONFIG['start_date']} to {PROJECT_CONFIG['end_date']}")

#  2. API Connectivity Test
# 
# First, let's test the yfinance API with a single ticker to ensure connectivity and understand the data structure.

In [None]:
# Smoke-test the yfinance API with a single ticker before the bulk download
print("🔍 Testing yfinance API connectivity...")

try:
    # Download sample data for AAPL
    test_ticker = 'AAPL'
    test_data = yf.download(
        test_ticker,
        start='2024-01-01',
        end='2024-12-31',
        progress=False
    )

    # An empty frame is treated as a failure, mirroring the `data.empty`
    # check in the bulk download loop; without it a failed request would
    # still be reported as a successful connection.
    if test_data.empty:
        raise RuntimeError(f"No data returned for {test_ticker}")

    print("✅ Successfully connected to yfinance API")
    print(f"📈 Sample data shape for {test_ticker}: {test_data.shape}")
    print(f"📊 Available columns: {list(test_data.columns)}")
    print(f"📅 Date range: {test_data.index.min()} to {test_data.index.max()}")

    # Display sample data
    print("\n📋 Sample data (first 5 rows):")
    print(test_data.head())

    # Per-column count of missing cells in the sample
    missing_values = test_data.isnull().sum()
    print("\n🔍 Missing values in test data:")
    for col, missing in missing_values.items():
        if missing > 0:
            print(f"  {col}: {missing} ({missing/len(test_data)*100:.2f}%)")
        else:
            print(f"  {col}: ✅ No missing values")

except Exception as e:
    # Any failure here makes the rest of the notebook pointless, so stop.
    print(f"❌ Error connecting to yfinance API: {str(e)}")
    sys.exit(1)

#  3. Multi-Ticker Data Download
# 
# Now let's download historical data for all target tickers.

In [None]:
def download_stock_data(tickers, start_date, end_date, save_individual=True):
    """
    Download stock data for multiple tickers and optionally save to CSV files.

    Tickers that return no rows or raise during download are reported and
    skipped; they do not appear in either return value.

    Parameters:
    -----------
    tickers : list
        List of stock ticker symbols
    start_date : str
        Start date in YYYY-MM-DD format
    end_date : str
        End date in YYYY-MM-DD format
    save_individual : bool
        Whether to save an individual ``<ticker>_raw.csv`` file per ticker
        in RAW_DATA_DIR

    Returns:
    --------
    tuple : (stock_data, summary_df)
        stock_data is a dict mapping ticker -> DataFrame (with an added
        'Ticker' column); summary_df is a DataFrame with one summary row
        per successfully downloaded ticker.
    """
    stock_data = {}
    download_summary = []

    print(f"📥 Downloading data for {len(tickers)} tickers...")
    print(f"📅 Date range: {start_date} to {end_date}\n")

    for ticker in tickers:
        try:
            print(f"⏳ Downloading {ticker}...")

            # Request unadjusted prices explicitly: later cells read
            # 'Adj Close', which yfinance only returns when auto_adjust is
            # False (newer releases default it to True - confirm against
            # the installed version).
            data = yf.download(
                ticker,
                start=start_date,
                end=end_date,
                progress=False,
                auto_adjust=False,
            )

            if data.empty:
                print(f"⚠️ No data found for {ticker}")
                continue

            # Some yfinance versions return (field, ticker) MultiIndex columns
            # even for a single symbol; keep only the field level so that
            # data['Volume'], data['Low'], ... are plain Series below.
            if isinstance(data.columns, pd.MultiIndex):
                data.columns = data.columns.get_level_values(0)

            # Add ticker column for identification
            data['Ticker'] = ticker

            # Store an independent copy in the result dictionary
            stock_data[ticker] = data.copy()

            # Save individual CSV if requested
            if save_individual:
                csv_path = RAW_DATA_DIR / f"{ticker}_raw.csv"
                data.to_csv(csv_path)
                print(f"💾 Saved {ticker} data to {csv_path}")

            # Collect summary statistics
            summary = {
                'Ticker': ticker,
                'Start_Date': data.index.min(),
                'End_Date': data.index.max(),
                'Total_Days': len(data),
                'Missing_Values': data.isnull().sum().sum(),
                'Avg_Volume': data['Volume'].mean(),
                'Price_Range': f"${data['Low'].min():.2f} - ${data['High'].max():.2f}"
            }
            download_summary.append(summary)

            print(
                f"✅ {ticker}: {len(data)} records from {data.index.min().date()} to {data.index.max().date()}")

        except Exception as e:
            # Best effort: one failing ticker must not abort the others.
            print(f"❌ Error downloading {ticker}: {str(e)}")
            continue

    # Create summary DataFrame
    summary_df = pd.DataFrame(download_summary)

    print("\n📊 Download Summary:")
    print(summary_df.to_string(index=False))

    return stock_data, summary_df


# Download every configured ticker over the configured date range.
# save_individual is left at its default (True), so one raw CSV per ticker
# is written as a side effect.
stock_data, download_summary = download_stock_data(
    PROJECT_CONFIG['tickers'],
    PROJECT_CONFIG['start_date'],
    PROJECT_CONFIG['end_date']
)

#  4. Data Quality Assessment
# 
# Let's perform comprehensive quality checks on our downloaded data.

In [None]:
def assess_data_quality(stock_data):
    """
    Run per-ticker quality checks and collect the findings.

    For each ticker the checks cover: missing cells per column, weekdays
    absent from the index, non-positive prices and >20% day-over-day moves
    per price column, volume statistics, and OHLC ordering violations.
    A short summary is printed for every ticker.

    Parameters:
    -----------
    stock_data : dict
        Dictionary containing DataFrames for each ticker

    Returns:
    --------
    dict : ticker -> dict with keys 'total_records', 'date_range_days',
        'missing_dates', 'missing_values', 'price_anomalies',
        'volume_stats' and 'hl_inconsistent'
    """
    price_fields = ['Open', 'High', 'Low', 'Close', 'Adj Close']
    ohlc_fields = ['High', 'Low', 'Open', 'Close']

    def missing_entry(series, n_rows):
        # Count and share (in percent) of null cells in one column
        n_missing = series.isnull().sum()
        return {'count': n_missing, 'percentage': (n_missing / n_rows) * 100}

    def anomaly_entry(series):
        # Non-positive prices and absolute day-over-day moves above 20%
        return {
            'zero_negative': (series <= 0).sum(),
            'extreme_changes': (series.pct_change().abs() > 0.20).sum(),
        }

    report = {}

    print("🔍 Performing Data Quality Assessment...\n")

    for ticker, frame in stock_data.items():
        print(f"📊 Analyzing {ticker}...")

        n_rows = len(frame)
        first_day, last_day = frame.index.min(), frame.index.max()
        span_days = (last_day - first_day).days

        # Missing cells for every column except the identifier column
        missing_values = {
            col: missing_entry(frame[col], n_rows)
            for col in frame.columns
            if col != 'Ticker'
        }

        # Weekdays between first and last observation that have no row.
        # bdate_range knows nothing about exchange holidays, so those are
        # counted as missing as well.
        calendar = pd.bdate_range(start=first_day, end=last_day)
        observed = set(frame.index)
        absent_days = sum(day not in observed for day in calendar)

        # Price checks, only for the price columns that are present
        price_anomalies = {
            col: anomaly_entry(frame[col])
            for col in price_fields
            if col in frame.columns
        }

        # Volume checks; all zeros when there is no Volume column
        if 'Volume' in frame.columns:
            volume = frame['Volume']
            volume_stats = {
                'zero_volume_days': (volume == 0).sum(),
                'avg_volume': volume.mean(),
                'volume_spikes': (volume > volume.quantile(0.99)).sum(),
            }
        else:
            volume_stats = {
                'zero_volume_days': 0,
                'avg_volume': 0,
                'volume_spikes': 0,
            }

        # Rows where High is not the maximum or Low is not the minimum
        hl_inconsistent = 0
        if all(name in frame.columns for name in ohlc_fields):
            high, low = frame['High'], frame['Low']
            opening, closing = frame['Open'], frame['Close']
            flagged = high < low
            for mask in (high < opening, high < closing,
                         low > opening, low > closing):
                flagged = flagged | mask
            hl_inconsistent = flagged.sum()

        report[ticker] = {
            'total_records': n_rows,
            'date_range_days': span_days,
            'missing_dates': absent_days,
            'missing_values': missing_values,
            'price_anomalies': price_anomalies,
            'volume_stats': volume_stats,
            'hl_inconsistent': hl_inconsistent
        }

        # Per-ticker console summary
        print(f"  📈 Total records: {n_rows:,}")
        print(f"  📅 Date range: {span_days:,} days")
        print(f"  🗓️ Missing business days: {absent_days}")
        print(f"  ⚠️ OHLC inconsistencies: {hl_inconsistent}")
        print(f"  📊 Zero volume days: {volume_stats['zero_volume_days']}")
        print("")

    return report


# Run the quality checks on every downloaded ticker; the result is a dict
# keyed by ticker symbol
quality_results = assess_data_quality(stock_data)

#  5. Missing Values Analysis and Handling

In [None]:
def analyze_missing_values(stock_data):
    """
    Tabulate and plot missing values across all tickers.

    Every (ticker, column) pair with at least one null cell becomes one row
    of the summary; the 'Ticker' column itself is never inspected. When any
    rows exist, the table is printed and two bar charts (totals by ticker
    and by column) are shown and saved to FIGURES_DIR.

    Returns the summary DataFrame, or None when nothing is missing.
    """
    print("🔍 Missing Values Analysis\n")

    # One record per (ticker, column) that has null cells
    records = []
    for ticker, frame in stock_data.items():
        n_rows = len(frame)
        for col in frame.columns:
            if col == 'Ticker':
                continue
            n_missing = frame[col].isnull().sum()
            if n_missing <= 0:
                continue
            records.append({
                'Ticker': ticker,
                'Column': col,
                'Missing_Count': n_missing,
                'Missing_Percentage': (n_missing / n_rows) * 100,
                'Total_Records': n_rows
            })

    # Nothing to report: say so and stop before any plotting
    if not records:
        print("✅ No missing values found in any dataset!")
        return None

    missing_df = pd.DataFrame(records)
    print("📋 Missing Values Summary:")
    print(missing_df.to_string(index=False))

    # Two side-by-side bar charts: (grouping column, title, bar colour)
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    panels = (
        ('Ticker', 'Total Missing Values by Ticker', 'coral'),
        ('Column', 'Total Missing Values by Column', 'lightblue'),
    )
    for axis, (group_key, title, colour) in zip(axes, panels):
        totals = missing_df.groupby(group_key)['Missing_Count'].sum()
        totals.plot(kind='bar', ax=axis, color=colour)
        axis.set_title(title)
        axis.set_ylabel('Missing Count')
        axis.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / 'missing_values_analysis.png',
                dpi=300, bbox_inches='tight')
    plt.show()

    return missing_df


# Summarise missing values across all tickers; None means nothing was missing
missing_analysis = analyze_missing_values(stock_data)

#  6. Basic Time Series Visualization
# 
# Let's create visualizations to understand the time series characteristics of our data.

In [None]:
def create_price_visualizations(stock_data, save_plots=True):
    """
    Create comprehensive price visualizations for all tickers.

    Four figures are shown: adjusted close comparison, normalized adjusted
    close (first observation = 100), a per-ticker price/volume grid, and
    overlaid daily-return histograms.

    Parameters:
    -----------
    stock_data : dict
        Dictionary containing DataFrames for each ticker; each frame needs
        'Adj Close' and 'Volume' columns
    save_plots : bool
        Whether to also save each figure as a PNG in FIGURES_DIR

    Returns:
    --------
    None
    """
    print("📊 Creating Price Visualizations...\n")

    # A zero-row subplot grid cannot be created, so bail out early.
    if not stock_data:
        print("⚠️ No stock data available - skipping visualizations")
        return

    # Derive the period shown in the title from the data itself; the end
    # date is "today", so a hard-coded year range goes stale.
    first_year = min(data.index.min() for data in stock_data.values()).year
    last_year = max(data.index.max() for data in stock_data.values()).year
    if first_year == last_year:
        period_label = str(first_year)
    else:
        period_label = f'{first_year}-{last_year}'

    # 1. Adjusted Close Price Comparison
    fig, ax = plt.subplots(figsize=(15, 8))

    for ticker, data in stock_data.items():
        ax.plot(data.index, data['Adj Close'], label=ticker, linewidth=2)

    ax.set_title(f'Adjusted Close Prices Comparison ({period_label})',
                 fontsize=16, fontweight='bold')
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Adjusted Close Price ($)', fontsize=12)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)

    if save_plots:
        plt.savefig(FIGURES_DIR / 'adj_close_comparison.png',
                    dpi=300, bbox_inches='tight')
    plt.show()

    # 2. Normalized Price Comparison (Base = 100)
    fig, ax = plt.subplots(figsize=(15, 8))

    for ticker, data in stock_data.items():
        # Rebase each series so its first observation equals 100
        normalized_price = (data['Adj Close'] /
                            data['Adj Close'].iloc[0]) * 100
        ax.plot(data.index, normalized_price, label=ticker, linewidth=2)

    ax.set_title('Normalized Adjusted Close Prices (Base = 100 at Start)',
                 fontsize=16, fontweight='bold')
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Normalized Price (Base = 100)', fontsize=12)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)
    ax.axhline(y=100, color='black', linestyle='--', alpha=0.5)

    if save_plots:
        plt.savefig(FIGURES_DIR / 'normalized_price_comparison.png',
                    dpi=300, bbox_inches='tight')
    plt.show()

    # 3. Individual ticker subplots with volume
    # squeeze=False keeps `axes` two-dimensional even for a single ticker.
    fig, axes = plt.subplots(
        len(stock_data), 2, figsize=(20, 4*len(stock_data)), squeeze=False)

    for i, (ticker, data) in enumerate(stock_data.items()):
        # Price plot
        axes[i, 0].plot(data.index, data['Adj Close'],
                        color='blue', linewidth=1.5)
        axes[i, 0].set_title(
            f'{ticker} - Adjusted Close Price', fontweight='bold')
        axes[i, 0].set_ylabel('Price ($)')
        axes[i, 0].grid(True, alpha=0.3)

        # Volume plot
        axes[i, 1].bar(data.index, data['Volume'],
                       alpha=0.7, color='orange', width=1)
        axes[i, 1].set_title(f'{ticker} - Trading Volume', fontweight='bold')
        axes[i, 1].set_ylabel('Volume')
        axes[i, 1].grid(True, alpha=0.3)

        # Format x-axis
        axes[i, 0].tick_params(axis='x', rotation=45)
        axes[i, 1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    if save_plots:
        plt.savefig(FIGURES_DIR / 'individual_price_volume.png',
                    dpi=300, bbox_inches='tight')
    plt.show()

    # 4. Daily Returns Distribution
    fig, ax = plt.subplots(figsize=(12, 8))

    for ticker, data in stock_data.items():
        daily_returns = data['Adj Close'].pct_change().dropna()
        ax.hist(daily_returns, bins=50, alpha=0.6, label=ticker, density=True)

    ax.set_title('Daily Returns Distribution', fontsize=16, fontweight='bold')
    ax.set_xlabel('Daily Return', fontsize=12)
    ax.set_ylabel('Density', fontsize=12)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)
    ax.axvline(x=0, color='black', linestyle='--', alpha=0.5)

    if save_plots:
        plt.savefig(FIGURES_DIR / 'daily_returns_distribution.png',
                    dpi=300, bbox_inches='tight')
    plt.show()


# Render all charts; save_plots is left at its default (True), so each
# figure is also written to FIGURES_DIR
create_price_visualizations(stock_data)

#  7. Statistical Summary and Correlation Analysis

In [None]:
def compute_statistical_summary(stock_data):
    """
    Build per-ticker price/return statistics and a return correlation matrix.

    Prints the statistics table and the correlation matrix, draws and saves
    a correlation heatmap to FIGURES_DIR, and sets pandas' global
    'display.float_format' option to four decimals.

    Returns a tuple (summary DataFrame with one row per ticker,
    correlation matrix of daily returns).
    """
    print("📊 Computing Statistical Summary...\n")

    def describe(ticker, frame):
        # One summary row plus the daily return series for a single ticker
        prices = frame['Adj Close']
        returns = prices.pct_change().dropna()
        first_price, last_price = prices.iloc[0], prices.iloc[-1]
        growth = last_price / first_price
        mean_return, return_std = returns.mean(), returns.std()
        running_peak = prices.expanding().max()
        volume = frame['Volume']

        if return_std > 0:
            # Annualised with 252 periods per year
            sharpe = (mean_return / return_std) * np.sqrt(252)
        else:
            sharpe = 0

        row = {
            'Ticker': ticker,
            'Start_Price': first_price,
            'End_Price': last_price,
            'Min_Price': prices.min(),
            'Max_Price': prices.max(),
            'Mean_Price': prices.mean(),
            'Std_Price': prices.std(),
            'Total_Return': (growth - 1) * 100,
            'Annualized_Return': ((growth ** (252 / len(prices))) - 1) * 100,
            'Daily_Return_Mean': mean_return * 100,
            'Daily_Return_Std': return_std * 100,
            'Sharpe_Ratio': sharpe,
            # Worst percentage drop below the running maximum
            'Max_Drawdown': ((prices / running_peak - 1).min()) * 100,
            'Avg_Volume': volume.mean(),
            'Volume_Std': volume.std()
        }
        return row, returns

    rows = []
    returns_by_ticker = {}
    for ticker, frame in stock_data.items():
        row, returns = describe(ticker, frame)
        rows.append(row)
        returns_by_ticker[ticker] = returns

    summary_df = pd.DataFrame(rows)

    # Console table; the float format stays set globally afterwards
    print("📋 Statistical Summary:")
    pd.set_option('display.float_format', '{:.4f}'.format)
    print(summary_df.to_string(index=False))

    # Pairwise correlation of daily returns, aligned on the date index
    correlation_matrix = pd.DataFrame(returns_by_ticker).corr()

    print(f"\n🔗 Daily Returns Correlation Matrix:")
    print(correlation_matrix.round(4))

    # Heatmap of the correlation matrix
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='RdYlBu_r', center=0,
                square=True, fmt='.3f', cbar_kws={'label': 'Correlation'})
    ax.set_title('Daily Returns Correlation Matrix',
                 fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.savefig(FIGURES_DIR / 'correlation_heatmap.png',
                dpi=300, bbox_inches='tight')
    plt.show()

    return summary_df, correlation_matrix


# Per-ticker statistics table and daily-return correlation matrix; both are
# reused by the export and summary cells below
summary_stats, correlation_matrix = compute_statistical_summary(stock_data)

#  8. Anomaly Detection
# 
# Let's identify potential anomalies in the data that might affect our modeling.

In [None]:
def detect_anomalies(stock_data):
    """
    Detect various types of anomalies in stock data.

    For every ticker the following are counted: daily returns more than
    three standard deviations from the mean, volume above its 99th
    percentile, overnight gaps (open vs. previous close) above 5%, zero
    volume days, and rows violating High/Low ordering. Details of the first
    few extreme moves and gaps are printed.

    Parameters:
    -----------
    stock_data : dict
        Dictionary containing DataFrames for each ticker; each frame needs
        'Open', 'High', 'Low', 'Close', 'Adj Close' and 'Volume' columns

    Returns:
    --------
    DataFrame : one row per ticker with the columns 'Ticker',
        'Extreme_Returns_Count', 'Max_Single_Day_Gain',
        'Max_Single_Day_Loss', 'Volume_Spikes_Count', 'Price_Gaps_Count',
        'Zero_Volume_Days' and 'OHLC_Inconsistencies'
    """
    print("🔍 Detecting Data Anomalies...\n")

    anomaly_report = []

    for ticker, data in stock_data.items():
        print(f"📊 Analyzing {ticker} for anomalies...")

        # Daily returns without the leading NaN, so the emptiness guards
        # below really fire for frames with fewer than two rows.
        daily_returns = data['Adj Close'].pct_change().dropna()

        # 1. Extreme price movements (> 3 standard deviations)
        returns_mean = daily_returns.mean()
        extreme_threshold = 3 * daily_returns.std()
        extreme_moves = daily_returns[
            (daily_returns - returns_mean).abs() > extreme_threshold]

        # 2. Volume spikes (> 99th percentile)
        volume_99th = data['Volume'].quantile(0.99)
        volume_spikes = data[data['Volume'] > volume_99th]

        # 3. Price gaps (> 5% overnight gap), computed for all rows at once.
        # The first row has no previous close; its NaN never exceeds 0.05.
        prev_close = data['Close'].shift(1)
        gap_fraction = ((data['Open'] - prev_close) / prev_close).abs()
        price_gaps = gap_fraction[gap_fraction > 0.05]

        # 4. Zero volume days
        zero_volume_days = data[data['Volume'] == 0]

        # 5. OHLC inconsistencies
        ohlc_issues = data[
            (data['High'] < data['Low']) |
            (data['High'] < data['Open']) |
            (data['High'] < data['Close']) |
            (data['Low'] > data['Open']) |
            (data['Low'] > data['Close'])
        ]

        # Compile report
        anomalies = {
            'Ticker': ticker,
            'Extreme_Returns_Count': len(extreme_moves),
            'Max_Single_Day_Gain': daily_returns.max() * 100 if not daily_returns.empty else 0,
            'Max_Single_Day_Loss': daily_returns.min() * 100 if not daily_returns.empty else 0,
            'Volume_Spikes_Count': len(volume_spikes),
            'Price_Gaps_Count': len(price_gaps),
            'Zero_Volume_Days': len(zero_volume_days),
            'OHLC_Inconsistencies': len(ohlc_issues)
        }
        anomaly_report.append(anomalies)

        # Print details for significant anomalies (first three of each)
        if len(extreme_moves) > 0:
            print(f"  ⚠️ {len(extreme_moves)} extreme price movements detected")
            for date, return_val in extreme_moves.head(3).items():
                print(f"    📅 {date.date()}: {return_val*100:.2f}% return")

        if len(price_gaps) > 0:
            print(f"  📈 {len(price_gaps)} significant price gaps detected")
            for gap_date, gap in price_gaps.head(3).items():
                print(
                    f"    📅 {gap_date.date()}: {gap * 100:.2f}% gap")

        if len(zero_volume_days) > 0:
            print(f"  📊 {len(zero_volume_days)} zero volume days detected")

        print("")

    # Create anomaly summary DataFrame
    anomaly_df = pd.DataFrame(anomaly_report)
    print("📋 Anomaly Summary:")
    print(anomaly_df.to_string(index=False))

    return anomaly_df


# One anomaly-count row per ticker; the totals are reported in the final summary
anomaly_report = detect_anomalies(stock_data)

#  9. Data Export and Preparation for Next Steps

In [None]:
def _quality_check_passed(quality_results):
    """Return True when no ticker shows OHLC inconsistencies or non-positive prices."""
    for result in (quality_results or {}).values():
        if result.get('hl_inconsistent', 0) > 0:
            return False
        for anomaly in result.get('price_anomalies', {}).values():
            if anomaly.get('zero_negative', 0) > 0:
                return False
    return True


def export_processed_data(stock_data, summary_stats, quality_results):
    """
    Export processed data and analysis results for next steps.

    Writes into PROCESSED_DATA_DIR: a combined CSV of all tickers, one
    processed CSV per ticker (with derived return/range/volume columns),
    the statistical summary CSV and a JSON metadata file.

    Parameters:
    -----------
    stock_data : dict
        Dictionary containing DataFrames for each ticker
    summary_stats : DataFrame
        Statistical summary to save as 'statistical_summary.csv'
    quality_results : dict
        Per-ticker quality assessment; used to set the metadata flag
        'quality_check_passed' (False when any ticker has OHLC
        inconsistencies or zero/negative prices)

    Returns:
    --------
    DataFrame : the combined dataset, indexed by 'Date'
    """
    import json

    print("💾 Exporting processed data and analysis results...\n")

    # 1. Create combined dataset with all tickers
    combined_data = []
    for ticker, data in stock_data.items():
        ticker_data = data.copy()
        ticker_data['Ticker'] = ticker
        # Carry the date as a column so it survives ignore_index below
        ticker_data['Date'] = ticker_data.index
        combined_data.append(ticker_data)

    all_stocks_df = pd.concat(combined_data, ignore_index=True)
    all_stocks_df = all_stocks_df.set_index('Date')

    # Save combined dataset
    combined_path = PROCESSED_DATA_DIR / 'all_stocks_combined.csv'
    all_stocks_df.to_csv(combined_path)
    print(f"📊 Combined dataset saved to: {combined_path}")
    print(f"   📈 Shape: {all_stocks_df.shape}")

    # 2. Save individual processed datasets
    for ticker, data in stock_data.items():
        # Add some basic derived features
        processed_data = data.copy()
        processed_data['Daily_Return'] = processed_data['Adj Close'].pct_change()
        processed_data['Log_Return'] = np.log(
            processed_data['Adj Close'] / processed_data['Adj Close'].shift(1))
        processed_data['Price_Range'] = processed_data['High'] - \
            processed_data['Low']
        processed_data['Volume_MA_20'] = processed_data['Volume'].rolling(
            window=20).mean()

        ticker_path = PROCESSED_DATA_DIR / f'{ticker}_processed.csv'
        processed_data.to_csv(ticker_path)
        print(f"📊 {ticker} processed data saved to: {ticker_path}")

    # 3. Save analysis results
    summary_path = PROCESSED_DATA_DIR / 'statistical_summary.csv'
    summary_stats.to_csv(summary_path, index=False)
    print(f"📋 Statistical summary saved to: {summary_path}")

    # 4. Save metadata
    metadata = {
        'data_ingestion_date': datetime.now().isoformat(),
        'tickers': PROJECT_CONFIG['tickers'],
        'date_range': f"{PROJECT_CONFIG['start_date']} to {PROJECT_CONFIG['end_date']}",
        'total_records_per_ticker': {ticker: len(data) for ticker, data in stock_data.items()},
        # Derived from the assessment instead of being hard-coded to True
        'quality_check_passed': _quality_check_passed(quality_results),
        'next_steps': [
            'Exploratory Data Analysis (02_eda_and_viz.ipynb)',
            'Feature Engineering (03_feature_engineering.ipynb)',
            'Baseline Modeling (04_baseline_models.ipynb)'
        ]
    }

    metadata_path = PROCESSED_DATA_DIR / 'data_metadata.json'
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)

    print(f"📝 Metadata saved to: {metadata_path}")

    return all_stocks_df


# Write the combined CSV, per-ticker processed CSVs, summary CSV and metadata
# JSON; the combined frame is kept for the final summary
combined_dataset = export_processed_data(
    stock_data, summary_stats, quality_results)

#  10. Summary and Key Findings

In [None]:
# 10. Summary and Key Findings

banner = "=" * 80
print(banner)
print("📊 DATA INGESTION AND QUALITY ASSESSMENT SUMMARY")
print(banner)

# What was downloaded and written
ticker_count = len(stock_data)
saved_file_count = ticker_count * 2 + 3
print(f"\n✅ SUCCESSFULLY COMPLETED:")
print(
    f"   📥 Downloaded data for {ticker_count} tickers: {', '.join(stock_data)}")
print(
    f"   📅 Date range: {PROJECT_CONFIG['start_date']} to {PROJECT_CONFIG['end_date']}")
print(f"   💾 Saved {saved_file_count} files to data directories")

# Row counts across all tickers
print(f"\n📊 DATA OVERVIEW:")
total_records = sum(map(len, stock_data.values()))
average_records = total_records // ticker_count
print(f"   📈 Total records across all tickers: {total_records:,}")
print(
    f"   🗓️ Average trading days per ticker: {average_records:,}")

# Missing values: the analysis returns None when nothing was missing
print(f"\n🔍 QUALITY ASSESSMENT:")
has_missing = False if missing_analysis is None else len(missing_analysis) > 0
missing_verdict = ('Found issues - see analysis above' if has_missing
                   else 'No missing values detected')
print(
    f"   ✅ Missing values: {missing_verdict}")

# Anomaly totals summed over all tickers
(total_extreme_returns,
 total_price_gaps,
 total_zero_volume,
 total_ohlc_issues) = (
    anomaly_report[column].sum()
    for column in ('Extreme_Returns_Count', 'Price_Gaps_Count',
                   'Zero_Volume_Days', 'OHLC_Inconsistencies')
)

print(
    f"   ⚠️ Extreme returns (>3σ): {total_extreme_returns} across all tickers")
print(f"   📈 Price gaps (>5%): {total_price_gaps} across all tickers")
print(f"   📊 Zero volume days: {total_zero_volume} across all tickers")
print(f"   🔧 OHLC inconsistencies: {total_ohlc_issues} across all tickers")

print(f"\n📈 PERFORMANCE HIGHLIGHTS:")
# Pick the summary rows holding the extreme value of each metric
total_return_column = summary_stats['Total_Return']
volatility_column = summary_stats['Daily_Return_Std']
sharpe_column = summary_stats['Sharpe_Ratio']

best_performer = summary_stats.loc[total_return_column.idxmax()]
worst_performer = summary_stats.loc[total_return_column.idxmin()]
highest_volatility = summary_stats.loc[volatility_column.idxmax()]
best_sharpe = summary_stats.loc[sharpe_column.idxmax()]

print(
    f"   🏆 Best performer: {best_performer['Ticker']} ({best_performer['Total_Return']:.1f}% total return)")
print(
    f"   📉 Worst performer: {worst_performer['Ticker']} ({worst_performer['Total_Return']:.1f}% total return)")
print(
    f"   📊 Most volatile: {highest_volatility['Ticker']} ({highest_volatility['Daily_Return_Std']:.2f}% daily std)")
print(
    f"   ⚡ Best Sharpe ratio: {best_sharpe['Ticker']} ({best_sharpe['Sharpe_Ratio']:.2f})")

print(f"\n🔗 CORRELATION INSIGHTS:")
# Every unordered ticker pair with its daily-return correlation,
# sorted from the strongest to the weakest
tickers = list(stock_data.keys())
corr_values = [
    (first, second, correlation_matrix.loc[first, second])
    for position, first in enumerate(tickers)
    for second in tickers[position + 1:]
]
corr_values.sort(key=lambda x: x[2], reverse=True)

if corr_values:
    highest_corr = corr_values[0]
    lowest_corr = corr_values[-1]

    print(
        f"   📊 Highest correlation: {highest_corr[0]} - {highest_corr[1]} ({highest_corr[2]:.3f})")
    print(
        f"   📊 Lowest correlation: {lowest_corr[0]} - {lowest_corr[1]} ({lowest_corr[2]:.3f})")
    # Mean of the strict upper triangle, i.e. each pair counted once
    avg_correlation = correlation_matrix.values[np.triu_indices_from(
        correlation_matrix.values, k=1)].mean()
    print(f"   📊 Average pairwise correlation: {avg_correlation:.3f}")
else:
    # Fewer than two tickers downloaded: there are no pairs to compare
    print("   ⚠️ At least two tickers are required for correlation insights")

print(f"\n💾 OUTPUT FILES CREATED:")
# Counts come from the in-memory data, except the charts, which are
# counted from the PNG files currently present in FIGURES_DIR
csv_count = len(stock_data)
combined_rows = combined_dataset.shape[0]
chart_count = sum(1 for _ in FIGURES_DIR.glob('*.png'))

print(
    f"   📁 Raw data: {csv_count} individual CSV files in {RAW_DATA_DIR}")
print(
    f"   📁 Processed data: {csv_count} individual CSV files in {PROCESSED_DATA_DIR}")
print(
    f"   📄 Combined dataset: all_stocks_combined.csv ({combined_rows:,} rows)")
print(f"   📊 Statistical summary: statistical_summary.csv")
print(f"   📋 Metadata: data_metadata.json")
print(
    f"   📊 Visualizations: {chart_count} charts in {FIGURES_DIR}")

print(f"\n🔍 DATA QUALITY VERDICT:")
# (condition, message) pairs; extreme moves only count as an issue above
# an average of five per ticker
quality_checks = (
    (has_missing, "Missing values detected"),
    (total_extreme_returns > len(stock_data) * 5,
     "High number of extreme price movements"),
    (total_zero_volume > 0, "Zero volume days found"),
    (total_ohlc_issues > 0, "OHLC price inconsistencies found"),
)
quality_issues = [message for triggered, message in quality_checks if triggered]

if not quality_issues:
    print("   ✅ EXCELLENT: Data quality is high with no significant issues detected")
    print("   ✅ All datasets are ready for feature engineering and modeling")
else:
    # One or two issues are reported as minor, three or more as a caution
    is_minor = len(quality_issues) <= 2
    if is_minor:
        print("   ⚠️ GOOD: Minor data quality issues identified:")
    else:
        print("   ⚠️ CAUTION: Multiple data quality issues identified:")
    for issue in quality_issues:
        print(f"      - {issue}")
    if is_minor:
        print("   ✅ Data is suitable for modeling with standard preprocessing")
    else:
        print("   🔧 Additional data cleaning may be required before modeling")

print(f"\n🎯 RECOMMENDED NEXT STEPS:")
print(f"   1️⃣ Run Exploratory Data Analysis (02_eda_and_viz.ipynb)")
print(f"   2️⃣ Implement Feature Engineering (03_feature_engineering.ipynb)")
print(f"   3️⃣ Develop Baseline Models (04_baseline_models.ipynb)")
print(f"   4️⃣ Consider additional data sources for external factors")

print(f"\n📝 KEY TAKEAWAYS:")
# Report what was actually downloaded: tickers that failed or returned no
# data are skipped by the download step and are absent from stock_data.
downloaded_count = len(stock_data)
requested_count = len(PROJECT_CONFIG['tickers'])
if downloaded_count == requested_count:
    print(
        f"   • All {requested_count} target stocks successfully downloaded")
else:
    print(
        f"   • {downloaded_count} of {requested_count} target stocks successfully downloaded")

# Calendar days between the configured start and end dates
span_days = (pd.to_datetime(PROJECT_CONFIG['end_date'])
             - pd.to_datetime(PROJECT_CONFIG['start_date'])).days
print(
    f"   • {total_records:,} total data points spanning ~{span_days} days")
print(f"   • Strong foundation established for time series prediction modeling")
print(f"   • Data exhibits typical financial time series characteristics")
print(f"   • Ready to proceed with advanced analysis and feature engineering")

print("\n" + "=" * 80)
print("🎉 DATA INGESTION PHASE COMPLETED SUCCESSFULLY!")
print("=" * 80)