# Task 1: Time Series Analysis and Model Understanding
## Brent Oil Price Time Series Analysis

### Objectives:
1. Load and preprocess Brent oil price data
2. Analyze trends, stationarity, and volatility
3. Understand change point models
4. Document expected outputs and limitations

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings raised by the statistical tests
# run further down — confirm that suppressing them is intended.
warnings.filterwarnings('ignore')

# Set style
# Global plotting defaults: a matplotlib style sheet plus seaborn's "husl" palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

## 1. Data Loading and Preparation

In [None]:
def _simulate_brent_prices():
    """Build the reproducible synthetic daily price series used as a fallback."""
    dates = pd.date_range(start='1987-05-20', end='2022-09-30', freq='D')
    n = len(dates)

    # Fixed seed so the placeholder series is identical on every run
    np.random.seed(42)
    trend = np.linspace(20, 100, n)
    seasonal = 10 * np.sin(2 * np.pi * np.arange(n) / 365)
    noise = np.random.normal(0, 5, n)

    # Five shocks, each a 100-day bump that decays exponentially
    shocks = np.zeros(n)
    shock_dates = [500, 1500, 3000, 4500, 6000]
    for sd in shock_dates:
        shocks[sd:sd+100] = np.random.normal(30, 10, 100) * np.exp(-np.arange(100)/50)

    # Absolute value keeps the simulated prices non-negative
    prices = np.abs(trend + seasonal + noise + shocks)

    return pd.DataFrame({
        'Date': dates,
        'Price': prices,
    })


def load_brent_data(path='../data/brent_oil_prices.csv'):
    """Load Brent oil prices, falling back to simulated data if the CSV is absent.

    Previously the CSV was read and then unconditionally replaced by simulated
    data, so a missing file crashed the call and a present file was ignored.
    The file is now used when it exists; otherwise a notice is printed and the
    simulated series is returned.

    Args:
        path: CSV file to read. It must provide ``Date`` and ``Price`` columns.
            NOTE(review): ``Date`` is parsed with ``pd.to_datetime`` defaults —
            confirm the file's date format is one pandas can infer.

    Returns:
        DataFrame with a datetime ``Date`` column and a ``Price`` column. Data
        read from ``path`` is sorted by ``Date`` with a fresh integer index
        and keeps any extra columns the file contains.

    Raises:
        ValueError: If the CSV exists but lacks a ``Date`` or ``Price`` column.
    """
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        # Make the fallback visible so simulated results are not mistaken for real ones
        print(f"Warning: '{path}' not found - using simulated price data.")
        return _simulate_brent_prices()

    missing = [col for col in ('Date', 'Price') if col not in df.columns]
    if missing:
        raise ValueError(f"'{path}' is missing required column(s): {missing}")

    df['Date'] = pd.to_datetime(df['Date'])
    # Later cells use rolling windows and diffs, which need chronological order
    return df.sort_values('Date').reset_index(drop=True)

# Load the price table once; every later cell reads from `df`
df = load_brent_data()

# Short summary of what was loaded
print("Data loaded successfully!")
print(f"Shape: {df.shape}")
print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")

# Show both ends of the table
for _heading, _preview in (("\nFirst 5 rows:", df.head()), ("\nLast 5 rows:", df.tail())):
    print(_heading)
    print(_preview)


## 2. Trend Analysis

In [None]:
# 2x2 overview: full history, ten-year slices, rolling statistics, histogram
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_full, ax_decade), (ax_roll, ax_hist) = axes
title_style = dict(fontsize=12, fontweight='bold')

# --- Panel 1: the complete series ---
ax_full.plot(df['Date'], df['Price'], linewidth=0.5, alpha=0.7)
ax_full.set_title('Brent Oil Prices: Full Timeline (1987-2022)', **title_style)

# --- Panel 2: one line per ten-year slice, each labelled by its first year ---
years = df['Date'].dt.year
for decade_start in (1987, 1997, 2007, 2017):
    decade_data = df[years.between(decade_start, decade_start + 9)]
    if decade_data.empty:
        continue
    ax_decade.plot(decade_data['Date'], decade_data['Price'],
                   label=f'{decade_start}s', linewidth=1)
ax_decade.set_title('Brent Oil Prices by Decade', **title_style)
ax_decade.legend()

# --- Panel 3: centred rolling mean with a +/- one standard deviation band ---
window = 365  # 1-year window
rolling_price = df['Price'].rolling(window=window, center=True)
df['Rolling_Mean'] = rolling_price.mean()
df['Rolling_Std'] = rolling_price.std()
band_low = df['Rolling_Mean'] - df['Rolling_Std']
band_high = df['Rolling_Mean'] + df['Rolling_Std']

ax_roll.plot(df['Date'], df['Price'], linewidth=0.3, alpha=0.5, label='Daily Price')
ax_roll.plot(df['Date'], df['Rolling_Mean'], linewidth=2, color='red', label=f'{window}-day MA')
ax_roll.fill_between(df['Date'], band_low, band_high,
                     alpha=0.2, color='red', label='±1 Std Dev')
ax_roll.set_title('Price with Rolling Statistics', **title_style)
ax_roll.legend()

# --- Panel 4: histogram with mean and median markers ---
mean_price = df['Price'].mean()
median_price = df['Price'].median()
ax_hist.hist(df['Price'].dropna(), bins=50, edgecolor='black', alpha=0.7)
ax_hist.axvline(mean_price, color='red', linestyle='--', linewidth=2, label=f'Mean: ${mean_price:.2f}')
ax_hist.axvline(median_price, color='green', linestyle='--', linewidth=2, label=f'Median: ${median_price:.2f}')
ax_hist.set_title('Price Distribution', **title_style)
ax_hist.set_xlabel('Price (USD/barrel)')
ax_hist.set_ylabel('Frequency')
ax_hist.legend()

# The three time-series panels share the same axis labels
for ax in (ax_full, ax_decade, ax_roll):
    ax.set_xlabel('Date')
    ax.set_ylabel('Price (USD/barrel)')

# Same faint grid on every panel
for ax in axes.flat:
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


### Trend Analysis Insights:

1. **Long-term Trend**: A clear upward trend over the 35-year period
2. **Volatility Changes**: Periods of high and low volatility evident
3. **Structural Breaks**: Visible price jumps/drops suggesting regime changes
4. **Non-Normality**: Price distribution appears right-skewed with fat tails

## 3. Stationarity Testing

In [None]:
def test_stationarity(series, title="Time Series"):
    """Print an ADF and KPSS stationarity report for ``series``.

    Missing values are dropped before either test runs. Each test's verdict
    is decided by comparing its p-value with 0.05.

    Args:
        series: Data to test; it must offer ``dropna()`` (e.g. a pandas Series).
        title: Label shown in the report header.

    Returns:
        Tuple ``(adf_result, kpss_result)`` holding the raw return values of
        ``adfuller`` and ``kpss``. ``kpss_result`` is ``None`` when the KPSS
        call itself raised; the error is printed instead of propagated.
    """
    observations = series.dropna()
    heavy_rule = '=' * 60
    light_rule = "-" * 40

    def _print_critical_values(critical_values):
        # One indented line per significance level
        for level, threshold in critical_values.items():
            print(f"     {level}: {threshold:.6f}")

    print(f"\n{heavy_rule}")
    print(f"Stationarity Tests for: {title}")
    print(heavy_rule)

    # --- ADF: rejecting the null is reported as "stationary" ---
    print("\n1. Augmented Dickey-Fuller (ADF) Test:")
    print(light_rule)
    adf_result = adfuller(observations)
    adf_statistic, adf_pvalue, adf_critical = adf_result[0], adf_result[1], adf_result[4]
    print(f"   Test Statistic: {adf_statistic:.6f}")
    print(f"   p-value: {adf_pvalue:.6f}")
    print("   Critical Values:")
    _print_critical_values(adf_critical)
    print(
        "   Result: REJECT null hypothesis - Series is STATIONARY"
        if adf_pvalue <= 0.05
        else "   Result: FAIL TO REJECT null hypothesis - Series is NON-STATIONARY"
    )

    # --- KPSS: the verdict is reversed, failing to reject is reported as "stationary" ---
    print("\n2. KPSS Test:")
    print(light_rule)
    kpss_result = None  # stays None if the KPSS call raises
    try:
        kpss_result = kpss(observations, regression='c', nlags='auto')
        kpss_statistic, kpss_pvalue, kpss_critical = kpss_result[0], kpss_result[1], kpss_result[3]
        print(f"   Test Statistic: {kpss_statistic:.6f}")
        print(f"   p-value: {kpss_pvalue:.6f}")
        print("   Critical Values:")
        _print_critical_values(kpss_critical)
        print(
            "   Result: FAIL TO REJECT null hypothesis - Series is STATIONARY"
            if kpss_pvalue >= 0.05
            else "   Result: REJECT null hypothesis - Series is NON-STATIONARY"
        )
    except Exception as e:
        # Best effort: report the failure and still return the ADF result
        print(f"   KPSS test failed: {e}")

    return adf_result, kpss_result

# Stationarity of the raw price levels
adf_price, kpss_price = test_stationarity(df['Price'], "Price Levels")

# Percentage log returns: 100 * (ln P_t - ln P_{t-1})
df['Log_Price'] = np.log(df['Price'])
df['Returns'] = 100 * df['Log_Price'].diff()

# Stationarity of the returns (the first value is NaN from the diff, so drop it)
adf_returns, kpss_returns = test_stationarity(df['Returns'].dropna(), "Daily Returns")

# 2x2 figure: the two series on top, their autocorrelation functions below
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_price, ax_returns), (ax_acf_price, ax_acf_returns) = axes
title_style = dict(fontsize=12, fontweight='bold')

# NOTE: the "(Non-Stationary)" / "(Stationary)" suffixes are fixed text,
# not derived from the test results printed above.
ax_price.plot(df['Date'], df['Price'])
ax_price.set_title('Original Price Series (Non-Stationary)', **title_style)
ax_price.set_ylabel('Price')

ax_returns.plot(df['Date'], df['Returns'])
ax_returns.axhline(y=0, color='red', linestyle='--', alpha=0.5)  # zero reference line
ax_returns.set_title('Daily Returns (Stationary)', **title_style)
ax_returns.set_ylabel('Returns (%)')

for series_ax in (ax_price, ax_returns):
    series_ax.set_xlabel('Date')
    series_ax.grid(True, alpha=0.3)

# Autocorrelation up to 50 lags, price levels first and then returns
for column, acf_ax, acf_title in (
    ('Price', ax_acf_price, 'ACF: Price Levels'),
    ('Returns', ax_acf_returns, 'ACF: Daily Returns'),
):
    plot_acf(df[column].dropna(), lags=50, ax=acf_ax, title=acf_title)
    acf_ax.set_xlabel('Lag')
    acf_ax.set_ylabel('Autocorrelation')

plt.tight_layout()
plt.show()


### Stationarity Analysis Insights:

1. **Price Levels**: Non-stationary (confirmed by ADF and KPSS tests)
2. **Daily Returns**: Stationary (confirmed by tests)
3. **Implications**: 
   - Need to difference data or model trends explicitly
   - Returns suitable for many statistical models
   - Change point analysis should consider non-stationarity


## 4. Volatility Patterns Analysis

In [None]:
from scipy import stats

# Rolling volatility: 30-day standard deviation of returns, scaled by sqrt(252) to annualize
window = 30  # 30-day rolling window
annualization = np.sqrt(252)
df['Volatility'] = df['Returns'].rolling(window=window).std() * annualization

# Magnitude-based measures of the returns
df['Abs_Returns'] = df['Returns'].abs()
df['Squared_Returns'] = df['Returns'].pow(2)

# 3x2 figure of volatility diagnostics
fig, axes = plt.subplots(3, 2, figsize=(15, 12))
(ax_vol, ax_vol_hist), (ax_scatter, ax_acf_sq), (ax_yearly, ax_qq) = axes
title_style = dict(fontsize=12, fontweight='bold')

# Rolling volatility through time
ax_vol.plot(df['Date'], df['Volatility'], linewidth=0.5)
ax_vol.set_title(f'{window}-Day Rolling Volatility (Annualized)', **title_style)
ax_vol.set_xlabel('Date')
ax_vol.set_ylabel('Volatility')

# Histogram of the rolling volatility
ax_vol_hist.hist(df['Volatility'].dropna(), bins=50, edgecolor='black', alpha=0.7)
ax_vol_hist.set_title('Volatility Distribution', **title_style)
ax_vol_hist.set_xlabel('Volatility')
ax_vol_hist.set_ylabel('Frequency')

# Each day's return against that day's rolling volatility
ax_scatter.scatter(df['Returns'], df['Volatility'], alpha=0.3, s=1)
ax_scatter.set_title('Returns vs Volatility', **title_style)
ax_scatter.set_xlabel('Daily Returns (%)')
ax_scatter.set_ylabel('Volatility')

# Autocorrelation of squared returns, used here as the volatility-clustering check
plot_acf(df['Squared_Returns'].dropna(), lags=50, ax=ax_acf_sq,
         title='ACF: Squared Returns (Volatility Clustering)')
ax_acf_sq.set_xlabel('Lag')
ax_acf_sq.set_ylabel('Autocorrelation')

# Standard deviation of returns per calendar year, annualized the same way
yearly_vol = df.groupby(df['Date'].dt.year)['Returns'].std() * annualization
ax_yearly.bar(yearly_vol.index, yearly_vol.values)
ax_yearly.set_title('Annual Volatility', **title_style)
ax_yearly.set_xlabel('Year')
ax_yearly.set_ylabel('Annualized Volatility')

# Normal QQ plot; probplot sets its own title, so ours is applied afterwards
stats.probplot(df['Returns'].dropna(), dist="norm", plot=ax_qq)
ax_qq.set_title('QQ Plot: Returns vs Normal Distribution', **title_style)

# Faint grid on every panel except the ACF panel
for grid_ax in (ax_vol, ax_vol_hist, ax_scatter, ax_yearly, ax_qq):
    grid_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### Volatility Pattern Insights:

1. **Volatility Clustering**: High volatility periods cluster together (visible in ACF of squared returns)
2. **Time-Varying Volatility**: Volatility changes significantly over time
3. **Non-Normality**: Returns show fat tails (extreme events more common than normal distribution predicts)
4. **Regime Changes**: Clear periods of high and low volatility


## 5. Change Point Model Explanation

In [None]:
# Create visualization explaining change point models.
# Imported here as well so this cell does not depend on an earlier cell's import.
from scipy import stats

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Simulate data with change points (fixed seed keeps the figure reproducible)
np.random.seed(42)
n = 500
time = np.arange(n)

# Three change points split the series into four regimes,
# each with its own mean and volatility
change_points = [100, 250, 400]
means = [0, 2, -1, 3]
volatilities = [1, 2, 0.5, 1.5]

data = np.zeros(n)
# Pair each regime's start index with its end index: (0,100), (100,250), ...
for i, (cp_start, cp_end) in enumerate(zip([0] + change_points, change_points + [n])):
    data[cp_start:cp_end] = means[i] + volatilities[i] * np.random.randn(cp_end - cp_start)

# Plot 1: Data with true change points
axes[0, 0].plot(time, data, linewidth=0.5, alpha=0.7, label='Observations')
for cp in change_points:
    # Label only the first line so the legend shows a single entry
    axes[0, 0].axvline(cp, color='red', linestyle='--', alpha=0.7, label='True Change Point' if cp == change_points[0] else None)
axes[0, 0].set_title('Time Series with Change Points', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Time')
axes[0, 0].set_ylabel('Value')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Plot 2: Bayesian change point detection concept
x = np.linspace(0, 10, 100)
prior = stats.norm.pdf(x, 5, 2)
likelihood = stats.norm.pdf(x, 7, 1.5)
posterior = prior * likelihood
# Rescale the unnormalised product so it carries the same total mass on this
# grid as the prior, which keeps the three curves on a comparable scale
posterior = posterior / posterior.sum() * prior.sum()

axes[0, 1].plot(x, prior, label='Prior: P(θ)', linewidth=2)
axes[0, 1].plot(x, likelihood, label='Likelihood: P(X|θ)', linewidth=2)
axes[0, 1].plot(x, posterior, label='Posterior: P(θ|X) ∝ P(X|θ)P(θ)', linewidth=3, linestyle='--')
axes[0, 1].set_title('Bayesian Inference for Change Points', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Parameter θ (e.g., mean, volatility)')
axes[0, 1].set_ylabel('Probability Density')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Plot 3: Model comparison
n_changepoints = [0, 1, 2, 3, 4]
log_likelihood = [-1500, -1200, -900, -850, -849]  # Example values

# AIC = 2k - 2*logL, where k is the number of free parameters. A model with m
# change points has m + 1 regimes (a mean and a volatility each) plus m
# change-point locations, so k = 2(m + 1) + m. A constant k would reduce AIC
# to a shifted -2*logL and always favour the model with the most change points.
n_params = [2 * (m + 1) + m for m in n_changepoints]
aic = [2 * k - 2 * ll for k, ll in zip(n_params, log_likelihood)]

axes[1, 0].plot(n_changepoints, log_likelihood, marker='o', linewidth=2)
axes[1, 0].set_title('Model Comparison: Log-Likelihood vs Number of Change Points', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Number of Change Points')
axes[1, 0].set_ylabel('Log-Likelihood')
axes[1, 0].grid(True, alpha=0.3)

axes[1, 1].plot(n_changepoints, aic, marker='s', linewidth=2, color='red')
axes[1, 1].set_title('Model Comparison: AIC vs Number of Change Points', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('Number of Change Points')
axes[1, 1].set_ylabel('AIC (lower is better)')
axes[1, 1].grid(True, alpha=0.3)
# Horizontal guide at the best (lowest) AIC value
axes[1, 1].axhline(y=min(aic), color='green', linestyle='--', alpha=0.5, label=f'Minimum: {min(aic):.1f}')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

### Change Point Model Explanation:

**Purpose in Oil Price Analysis:**
Change point models identify structural breaks where the underlying data-generating process changes. For Brent oil prices, this helps us:

1. **Detect Regime Shifts**: When market dynamics fundamentally change
2. **Identify Event Impacts**: Correlate breaks with geopolitical/economic events
3. **Quantify Changes**: Measure how much parameters (mean, volatility) change
4. **Improve Forecasting**: Different regimes may require different forecasting models

**Bayesian Approach:**
```
P(change points, parameters | data) ∝ P(data | change points, parameters) × P(change points) × P(parameters)
```

Where:
- **Prior**: Our beliefs about change points before seeing data
- **Likelihood**: Probability of observing data given change points
- **Posterior**: Updated beliefs after observing data

**Advantages:**
- Quantifies uncertainty in change point locations
- Incorporates prior knowledge
- Provides full probability distributions
- Handles multiple change points naturally

## 6. Expected Outputs and Limitations

### Expected Outputs from Change Point Analysis

### Primary Outputs:

#### 1. Change Point Locations
- **Posterior distributions** for change point dates/times
- **Probability estimates** for change at each time point
- **Uncertainty intervals** (e.g., 95% credible intervals)

#### 2. Regime Parameters
For each detected regime (between change points):
- **Mean price level** with uncertainty
- **Volatility estimates** (standard deviation)
- **Trend parameters** if modeled
- **Autocorrelation structure**

#### 3. Model Evidence
- **Posterior probabilities** for different numbers of change points
- **Model comparison metrics** (WAIC, LOO, Bayes factors)
- **Convergence diagnostics** (R-hat, effective sample size)

#### 4. Visualizations
- **Timeline plots** with change points highlighted
- **Parameter evolution plots** showing changes over time
- **Posterior predictive checks** comparing model to data
- **Event correlation plots** showing alignment with historical events

### Example Output Structure:
```python
# Pseudo-code output structure
results = {
    'change_points': {
        'dates': ['1990-08-02', '2008-09-15', ...],  # Most probable dates
        'probabilities': [0.95, 0.87, ...],  # Posterior probabilities
        'credible_intervals': [(start1, end1), (start2, end2), ...]
    },
    'regimes': [
        {
            'start': '1987-05-20',
            'end': '1990-08-01',
            'mean': 18.5,
            'mean_ci': (17.8, 19.2),
            'volatility': 2.1,
            'volatility_ci': (1.8, 2.4)
        },
        # ... more regimes
    ],
    'model_metrics': {
        'log_likelihood': -1250.3,
        'waic': 2520.6,
        'loo': 2525.1,
        'rhat_max': 1.02
    }
}
```

### Limitations and Caveats:
1. **Statistical vs. Causal Inference**
    - Correlation: We can identify WHEN prices changed
    - Causation: We suggest WHY based on event correlation, but cannot prove it definitively
    - Confounding: Multiple events may occur simultaneously

2. **Model Dependencies**
    - Prior Sensitivity: Results depend on prior specifications
    - Model Misspecification: Wrong likelihood can lead to wrong conclusions
    - Computational Limits: More change points increase computation time

3. **Data Limitations**
    - Frequency: Daily data may miss intraday changes
    - Completeness: Not all influencing factors captured
    - Quality: Historical data quality varies

4. **Interpretation Challenges**
    - Multiple Solutions: Different models may suggest different change points
    - Uncertainty: Probabilistic outputs require careful interpretation
    - Expert Judgment Needed: Statistical results need domain expertise context

## Summary

### Key Findings from Time Series Analysis:
1. **Data Properties**:
    - Non-stationary price levels, stationary returns
    - Clear upward trend with cyclical patterns
    - Time-varying volatility with clustering
    - Non-normal distribution (fat tails)
2. **Modeling Implications**:
    - Need to handle non-stationarity (differencing or explicit trend modeling)
    - Should account for changing volatility (stochastic volatility or GARCH)
    - Change point models appropriate for detecting structural breaks
    - Bayesian approach provides uncertainty quantification
3. **Expected Value**:
    - Identify significant market regime changes
    - Quantify event impacts on price dynamics
    - Provide actionable insights for stakeholders

**Next steps**: Implement a Bayesian change point model using PyMC.
