# Z1 Data Exploration and Loading

This notebook demonstrates how to load and explore Federal Reserve Z1 (Flow of Funds) data using the project's data loading infrastructure.

## 1. Setup and Imports

In [None]:
# Standard imports
import json
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Install a blanket "ignore" filter so no warnings are shown for the rest of
# the session.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# later cells — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Project imports
import sys
# The appended path is relative, so it resolves against the kernel's working
# directory (presumably the folder containing this notebook — confirm).
sys.path.append('..')  # Add parent directory to path

# Both classes come from the project's `src` package, which is importable
# only because of the sys.path entry added above.
from src.data.cached_fed_data_loader import CachedFedDataLoader
from src.data.data_processor import DataProcessor

# Set plotting style
# Apply the matplotlib style sheet first, then set seaborn's color palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

## 2. Load Z1 Data with Caching

In [None]:
# Create the project's caching loader. Both directories are given relative to
# the notebook's working directory.
loader = CachedFedDataLoader(
    cache_directory="../data/cache",
    base_directory="../data/fed_data",
)

# Request the Z1 source from the loader (presumably served from the cache
# directory when a cached copy exists — confirm against the loader).
print("Loading Z1 data...")
z1_data = loader.load_single_source('Z1')

# A None result is treated as a failed load and is only reported here;
# otherwise print the frame's shape, index range and column count.
if z1_data is None:
    print("Failed to load Z1 data. Please check data availability.")
else:
    print(
        f"\nLoaded Z1 data shape: {z1_data.shape}",
        f"Date range: {z1_data.index.min()} to {z1_data.index.max()}",
        f"Number of series: {len(z1_data.columns)}",
        sep="\n",
    )

## 3. Data Structure Exploration

In [None]:
# Headline dimensions of the loaded frame, followed by the number of columns
# per dtype.
n_rows, n_cols = z1_data.shape
print("Dataset Info:", "=" * 50, sep="\n")
print(f"Total observations: {n_rows}", f"Total series: {n_cols}", sep="\n")
print("\nData types:")
dtype_counts = z1_data.dtypes.value_counts()
print(dtype_counts)

# Preview the top-left 5x5 corner of the frame. As the last expression of the
# cell it is rendered by the notebook rather than printed.
print("\nFirst 5 rows and columns:")
z1_data.head(5).iloc[:, :5]

In [None]:
# Collect every column name of the Z1 frame.
series_names = z1_data.columns.tolist()

# Bucket the names by category key: the text before the first '.' when the
# name contains one, otherwise its first two characters.
series_categories = {}
for name in series_names:
    head, dot, _ = name.partition('.')
    key = head if dot else name[:2]
    series_categories.setdefault(key, []).append(name)

# One summary line per category, in sorted key order.
print("Series Categories:")
print("=" * 50)
for category in sorted(series_categories):
    print(f"{category}: {len(series_categories[category])} series")

# Print up to three member names for each of the four listed prefixes,
# skipping any prefix that did not occur as a category key.
print("\nExample series from major categories:")
for category in ('FL', 'FA', 'FU', 'LM'):
    if category not in series_categories:
        continue
    print(f"\n{category} examples:")
    for example in series_categories[category][:3]:
        print(f"  - {example}")

## 4. Data Quality Assessment

In [None]:
# Count missing values once per series and derive the percentage of the
# frame's rows that each count represents.
missing_count = z1_data.isnull().sum()
missing_summary = pd.DataFrame({
    'missing_count': missing_count,
    'missing_pct': (missing_count / len(z1_data) * 100).round(2)
})

# Show series with different levels of completeness
print("Data Completeness Summary:")
print("=" * 50)
print(f"Series with no missing values: {(missing_summary['missing_count'] == 0).sum()}")
print(f"Series with <10% missing: {(missing_summary['missing_pct'] < 10).sum()}")
print(f"Series with >50% missing: {(missing_summary['missing_pct'] > 50).sum()}")

# Visualize missing data patterns
plt.figure(figsize=(12, 6))
# include_lowest=True closes the first interval on the left, so series with
# exactly 0% missing land in the '0-10%' bucket. With the default (open left
# edge) they match no bin, become NaN and silently drop out of the chart.
missing_pct_bins = pd.cut(missing_summary['missing_pct'],
                          bins=[0, 10, 25, 50, 75, 100],
                          labels=['0-10%', '10-25%', '25-50%', '50-75%', '75-100%'],
                          include_lowest=True)
# sort_index() orders the bars by bucket rather than by frequency.
missing_pct_bins.value_counts().sort_index().plot(kind='bar')
plt.title('Distribution of Missing Data Across Series')
plt.xlabel('Percentage of Missing Values')
plt.ylabel('Number of Series')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 5. Time Series Characteristics

In [None]:
# Restrict the candidates to series that have no missing values at all.
is_complete = missing_summary['missing_count'].eq(0)
complete_series = missing_summary.loc[is_complete].index

# For each of the four prefixes keep the first complete series whose name
# starts with it; prefixes without a match contribute nothing.
selected_series = []
for prefix in ('FL', 'FA', 'FU', 'LM'):
    first_match = next((s for s in complete_series if s.startswith(prefix)), None)
    if first_match is not None:
        selected_series.append(first_match)

print(f"Selected {len(selected_series)} series for detailed analysis:")
for chosen in selected_series:
    print(f"  - {chosen}")

In [None]:
# Plot each selected series in its own stacked panel.
if not selected_series:
    # plt.subplots() rejects a zero-row grid, so report the empty selection
    # instead of raising when no series qualified.
    print("No series selected; nothing to plot.")
else:
    # squeeze=False always returns a 2-D array of axes, so the single-series
    # case needs no special handling.
    fig, axes = plt.subplots(len(selected_series), 1,
                             figsize=(12, 3 * len(selected_series)),
                             squeeze=False)

    for ax, series in zip(axes[:, 0], selected_series):
        z1_data[series].plot(ax=ax, linewidth=1.5)
        ax.set_title(f'{series}')
        ax.set_xlabel('')  # drop the x-axis label that pandas adds from the index
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 6. Statistical Summary

In [None]:
# Work on the sub-frame holding only the selected series.
selected_frame = z1_data[selected_series]

# Start from pandas' describe() table and append three extra rows to it.
stats_df = selected_frame.describe()
stats_df.loc['skew'] = selected_frame.skew()
stats_df.loc['kurtosis'] = selected_frame.kurtosis()
# Coefficient of variation: standard deviation divided by the mean.
stats_df.loc['cv'] = stats_df.loc['std'].div(stats_df.loc['mean'])

print("Statistical Summary of Selected Series:")
print("=" * 50)
# Last expression of the cell: the rounded table is rendered by the notebook.
stats_df.round(2)

## 7. Growth Rate Analysis

In [None]:
# Percentage change against the observation four rows earlier, in percent.
# This is a year-over-year rate on the assumption that the rows are quarterly.
growth_rates = z1_data[selected_series].pct_change(periods=4).mul(100)

# Overlay every selected series on one shared axes.
fig, ax = plt.subplots(figsize=(12, 6))
for series in selected_series:
    growth_rates[series].plot(ax=ax, label=series, alpha=0.7)

ax.set(title='Year-over-Year Growth Rates', ylabel='Growth Rate (%)')
# Anchor the legend outside the plotting area, to the right of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
# Thin zero line to separate growth from contraction.
ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
plt.tight_layout()
plt.show()

## 8. Correlation Analysis

In [None]:
# Pairwise correlations between the selected series.
corr_matrix = z1_data[selected_series].corr()

# Ones strictly above the diagonal mark the cells to hide, so the heatmap
# shows only the diagonal and the lower triangle.
mask = np.triu(np.ones(corr_matrix.shape), k=1)

# Heatmap appearance collected in one place.
heatmap_options = {
    "mask": mask,
    "annot": True,
    "fmt": '.2f',
    "cmap": 'coolwarm',
    "center": 0,
    "square": True,
    "linewidths": 1,
    "cbar_kws": {"shrink": 0.8},
}

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Matrix of Selected Series')
plt.tight_layout()
plt.show()

## 9. Load Additional Fed Data Sources

In [None]:
# Load Z1 plus the extra sources through the same loader, keeping only the
# ones that come back non-None.
print("Loading additional Federal Reserve data sources...")
additional_sources = ['H6', 'H15']

fed_data = {}
for source in ['Z1', *additional_sources]:
    print(f"\nLoading {source}...")
    data = loader.load_single_source(source)
    if data is None:
        # A None result is reported and the source is left out of fed_data.
        print(f"  ✗ Failed to load {source}")
        continue
    fed_data[source] = data
    print(f"  ✓ Loaded {source}: {data.shape}")

print(f"\nSuccessfully loaded {len(fed_data)} data sources")

## 10. Save Processed Data for Next Steps

In [None]:
# Run the Z1 frame through the project's DataProcessor.
processor = DataProcessor()
z1_processed = processor.process_fed_data(z1_data, 'Z1')

# Keep only series with enough non-null observations for analysis.
min_observations = 40  # 10 years of data if the series are quarterly
observation_counts = z1_processed.count()  # non-null values per column
analysis_series = z1_processed.columns[
    observation_counts >= min_observations
].tolist()

print(f"Selected {len(analysis_series)} series with at least {min_observations} observations")
print("\nSaving processed data for next notebooks...")

# Persist the selected series names, the processed frame's index range and
# its row count for use in other notebooks. Only this metadata is written;
# the processed values themselves are not saved here.
# `json` is imported in the notebook's top import cell.
selection_summary = {
    'series': analysis_series,
    'date_range': {
        'start': str(z1_processed.index.min()),
        'end': str(z1_processed.index.max())
    },
    'n_observations': len(z1_processed)
}
with open('../data/selected_series.json', 'w', encoding='utf-8') as f:
    json.dump(selection_summary, f)

print("✓ Data exploration complete!")