# MLOps Data Exploration Notebook

This notebook is intended for exploratory data analysis (EDA) related to MLOps tasks, such as:
- Investigating raw data characteristics.
- Analyzing processed data and features.
- Exploring model predictions and errors.
- Debugging data-related issues in pipelines.

It complements the automated monitoring dashboards by providing a flexible environment for ad-hoc analysis.

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import yaml

# Configure plotting style: a matplotlib style sheet for the figures plus the
# seaborn "notebook" context for element scaling.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release -- confirm against the pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_context('notebook')

# Configure logging (optional for notebooks, but can be useful).
# getLogger() with no name returns the root logger, so every cell below logs
# through the same object.
import logging
logger = logging.getLogger()
# NOTE(review): basicConfig() is a no-op when the root logger already has
# handlers; if INFO messages do not show up in the notebook, check whether the
# environment installed a handler first.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define base paths.
# PROJECT_ROOT is resolved as two directory levels above the current working
# directory, so this cell assumes the notebook is launched from a folder two
# levels below the FinAI_algo root (presumably something like MLOps/notebooks/
# -- TODO confirm). Adjust the '../..' below if the notebook lives elsewhere.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '../..'))
MLOPS_CONFIG_DIR = os.path.join(PROJECT_ROOT, 'MLOps/config')
DATA_DIR = os.path.join(PROJECT_ROOT, 'data')  # i.e. <PROJECT_ROOT>/data/
RESULTS_DIR = os.path.join(PROJECT_ROOT, 'MLOps/results')

# Log every derived path so a wrong working directory is visible immediately,
# before any later cell fails with a "file not found" error.
logger.info(f"PROJECT_ROOT set to: {PROJECT_ROOT}")
logger.info(f"MLOPS_CONFIG_DIR set to: {MLOPS_CONFIG_DIR}")
logger.info(f"DATA_DIR set to: {DATA_DIR}")
logger.info(f"RESULTS_DIR set to: {RESULTS_DIR}")

## 2. Load Configuration Files (Optional)

In [None]:
def load_yaml_config(file_path):
    """Load a YAML configuration file on a best-effort basis.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file, or ``None``
        when the file does not exist or cannot be read or parsed. Failures
        are logged rather than raised, so callers must check for ``None``.
    """
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale default, so the same config parses the same way everywhere.
        with open(file_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        logger.error(f"Config file not found: {file_path}")
        return None
    except Exception as e:
        # Deliberately broad: this helper reports and returns None for any
        # failure (permissions, decoding, malformed YAML) instead of aborting
        # the notebook run.
        logger.error(f"Error loading config {file_path}: {e}")
        return None
    else:
        logger.info(f"Successfully loaded config: {file_path}")
        return config

# Example: Load global vars
# global_vars_path = os.path.join(MLOPS_CONFIG_DIR, 'common/global_vars.yaml')
# global_vars = load_yaml_config(global_vars_path)
# if global_vars:
#     print("Global Vars:", global_vars)

## 3. Load Data

Load datasets relevant to your exploration. This could be:
- Raw data from `data/raw/`
- Processed data from `data/processed/` (potentially DVC-tracked)
- Backtesting results from `MLOps/results/backtesting/`
- Model predictions or outputs.

In [None]:
# Example: Load processed data (replace with actual file name)
# processed_data_path = os.path.join(DATA_DIR, 'processed/sample_processed_data.csv')
# try:
#     df_processed = pd.read_csv(processed_data_path, parse_dates=['date'])
#     logger.info(f"Loaded processed data from {processed_data_path}. Shape: {df_processed.shape}")
#     print(df_processed.head())
# except FileNotFoundError:
#     logger.error(f"Processed data file not found: {processed_data_path}. Please ensure it exists.")
#     df_processed = pd.DataFrame() # Create empty df to avoid errors later

# Create dummy data for demonstration if no real dataset was loaded above
# (df_processed was never defined, or was set to an empty frame after a failed load).
if 'df_processed' not in locals() or df_processed.empty:
    logger.warning("Creating dummy processed data for notebook demonstration.")
    n_rows = 100  # single source of truth for the length of every column
    # Seeded generator: the demo data (and every plot built from it) is
    # identical on each run, which keeps the notebook output reproducible.
    rng = np.random.default_rng(42)
    dates = pd.date_range(start="2023-01-01", periods=n_rows, freq="B")
    df_processed = pd.DataFrame({
        'date': dates,
        'tic': rng.choice(['AAPL', 'MSFT'], n_rows),
        'price': rng.random(n_rows) * 100 + 100,  # uniform in [100, 200)
        'volume': rng.integers(100000, 1000000, n_rows),  # upper bound exclusive
        'sentiment_score': rng.normal(0, 0.3, n_rows),
        'feature1_tech': rng.random(n_rows),
        'feature2_tech': rng.random(n_rows),
    })
    print("Dummy Processed Data Head:")
    print(df_processed.head())

## 4. Exploratory Data Analysis (EDA)

Perform your EDA here. Examples:
- Summary statistics
- Data distributions (histograms, density plots)
- Time series plots
- Correlation analysis
- Missing value analysis

In [None]:
# Guard first: with no rows there is nothing to summarise or plot.
if df_processed.empty:
    logger.warning("df_processed is empty. Skipping EDA plots.")
else:
    # Tabular overview: descriptive statistics for every column, then the
    # per-column count of missing values.
    print("\n--- Summary Statistics ---")
    print(df_processed.describe(include='all'))

    print("\n--- Missing Values ---")
    print(df_processed.isna().sum())

    # One histogram (with KDE overlay) per numerical feature of interest.
    # Each entry is (column name, figure title, x-axis label); columns that
    # are absent from the frame are skipped.
    for _col, _title, _xlabel in (
        ('price', 'Distribution of Price', 'Price'),
        ('sentiment_score', 'Distribution of Sentiment Score', 'Sentiment Score'),
    ):
        if _col not in df_processed.columns:
            continue
        plt.figure(figsize=(10, 6))
        sns.histplot(df_processed[_col], kde=True)
        plt.title(_title)
        plt.xlabel(_xlabel)
        plt.ylabel('Frequency')
        plt.show()

    # Price over time for a single ticker; needs all three columns.
    if all(_name in df_processed.columns for _name in ('tic', 'price', 'date')):
        aapl_data = df_processed[df_processed['tic'] == 'AAPL']
        if not aapl_data.empty:
            plt.figure(figsize=(12, 6))
            plt.plot(aapl_data['date'], aapl_data['price'])
            plt.title('AAPL Price Over Time')
            plt.xlabel('Date')
            plt.ylabel('Price')
            plt.xticks(rotation=45)  # tilt date labels so they do not overlap
            plt.tight_layout()
            plt.show()

## 5. Specific MLOps Investigations

### 5.1 Data Drift Investigation
If `check_data_drift.py` indicated drift, you can load the reference and current datasets here for a deeper dive.

In [None]:
# Example: Load reference and current data for drift analysis.
# The template below is inactive; uncomment it and point the two paths at real
# files to use it.
# ref_data_path = os.path.join(DATA_DIR, 'processed/reference_training_data.csv')
# current_data_path = os.path.join(DATA_DIR, 'processed/recent_production_batch.csv')

# try:
#     df_ref = pd.read_csv(ref_data_path)
#     df_curr = pd.read_csv(current_data_path)
#     logger.info("Loaded reference and current data for drift investigation.")
#     # Perform comparative analysis, e.g., plot distributions side-by-side
#     if 'feature1_tech' in df_ref.columns and 'feature1_tech' in df_curr.columns:
#         plt.figure(figsize=(12, 6))
#         sns.kdeplot(df_ref['feature1_tech'], label='Reference Data', fill=True)
#         sns.kdeplot(df_curr['feature1_tech'], label='Current Data', fill=True)
#         plt.title('Distribution Comparison for feature1_tech')
#         plt.legend()
#         plt.show()
# except FileNotFoundError:
#     logger.warning("Reference or current data for drift not found. Skipping drift investigation section.")

# With the template commented out, this cell only records that it ran.
logger.info("Placeholder for Data Drift Investigation section.")

### 5.2 Model Prediction Analysis
Load model predictions and actuals to analyze errors, biases, or specific scenarios.

In [None]:
# Example: Load predictions.
# The template below is inactive; uncomment it and adjust the path to use it.
# predictions_path = os.path.join(RESULTS_DIR, 'model_predictions/sentiment_model_preds_on_eval.csv')
# try:
#     df_preds = pd.read_csv(predictions_path)
#     logger.info(f"Loaded predictions from {predictions_path}")
#     # Analyze prediction errors, confusion matrix, etc.
# except FileNotFoundError:
#     logger.warning(f"Predictions file not found: {predictions_path}. Skipping prediction analysis.")

# With the template commented out, this cell only records that it ran.
logger.info("Placeholder for Model Prediction Analysis section.")

## 6. Conclusions & Next Steps

Summarize findings from the exploration and outline any actions or further investigations needed.

In [None]:
# Emit a final log line so the end of a full top-to-bottom run is visible in the log output.
logger.info("Notebook execution placeholder finished.")