### 1. Setup and Data Loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.tsa.stattools import adfuller, acf, pacf, kpss
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries
# used below will not be displayed — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Set plotting style: matplotlib style sheet first, then the seaborn colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Confirmation that the setup cell ran to completion.
print("Libraries loaded successfully!")

Libraries loaded successfully!


In [None]:
# Load the Brent oil price CSV, parse the Date column and order rows chronologically.
# format='mixed' lets pandas infer the format value by value (the file contains
# more than one date style); dayfirst=True reads ambiguous dates as day-first.
raw_csv = '../data/raw/BrentOilPrices.csv'
df = (
    pd.read_csv(raw_csv)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='mixed', dayfirst=True))
    .sort_values('Date')
    .reset_index(drop=True)
)

# Quick orientation: size of the frame and the period it covers.
first_date, last_date = df['Date'].min(), df['Date'].max()
print(f"Data shape: {df.shape}")
print(f"Date range: {first_date} to {last_date}")
print("\nFirst few rows:")
df.head()

In [None]:
# Data quality check: missing-value count per column, then summary statistics
# for the Price column (shown as the cell's rich output).
missing_per_column = df.isna().sum()
print("Missing values:")
print(missing_per_column)
print("\nBasic statistics:")
df['Price'].describe()

### 2. Historical Price Trends

In [None]:
# Plot the full price history, write it to disk, then display it.
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(df['Date'], df['Price'], linewidth=0.8, color='darkblue', alpha=0.8)
ax.set_title('Brent Oil Prices (May 1987 - September 2022)', fontsize=16, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Price (USD per barrel)', fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
# Save via the Figure object BEFORE plt.show(): once the figure has been shown,
# pyplot may no longer treat it as the current figure, so a later plt.savefig()
# would write an empty image instead of this plot.
fig.savefig('../results/figures/fig_brent_prices_full.png', dpi=300, bbox_inches='tight')
plt.show()

# Headline numbers for the whole sample; idxmax/idxmin give the row label of the
# extreme price, which is then used to look up the matching date.
print("\nKey observations:")
print(f"- Maximum price: ${df['Price'].max():.2f} on {df.loc[df['Price'].idxmax(), 'Date'].date()}")
print(f"- Minimum price: ${df['Price'].min():.2f} on {df.loc[df['Price'].idxmin(), 'Date'].date()}")
print(f"- Mean price: ${df['Price'].mean():.2f}")
print(f"- Median price: ${df['Price'].median():.2f}")

#### 2.1 Recent Decade Focus (2012-2022)

In [None]:
# Filter to recent decade. .copy() makes df_recent an independent frame so that
# columns added to it later do not touch df.
df_recent = df[df['Date'] >= '2012-01-01'].copy()

fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(df_recent['Date'], df_recent['Price'], linewidth=1, color='darkgreen')
ax.set_title('Brent Oil Prices (2012-2022): Focus Period', fontsize=16, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Price (USD per barrel)', fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
# Save via the Figure object BEFORE plt.show(): saving after the figure has been
# shown would write an empty image instead of this plot.
fig.savefig('../results/figures/fig_brent_prices_2012_2022.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Recent data shape: {df_recent.shape}")
print(f"Recent price range: ${df_recent['Price'].min():.2f} - ${df_recent['Price'].max():.2f}")

### 3. Log Returns Analysis

In [None]:
# Daily log return r_t = ln(P_t / P_{t-1}), added as a column to both frames.
for frame in (df, df_recent):
    frame['Log_Return'] = np.log(frame['Price'] / frame['Price'].shift(1))

# Drop every row that contains a NaN in any column; this removes the first row
# of each frame, whose return is undefined because it has no previous price.
# NOTE: both names are rebound here, so re-running this cell recomputes the
# returns on the already-trimmed frames and drops one more leading row.
df, df_recent = df.dropna(), df_recent.dropna()

summaries = (
    ("Log returns summary (full data):", df),
    ("\nLog returns summary (2012-2022):", df_recent),
)
for heading, frame in summaries:
    print(heading)
    print(frame['Log_Return'].describe())

In [None]:
# Plot log returns: time series on top, empirical distribution underneath.
fig, axes = plt.subplots(2, 1, figsize=(16, 10))

# Time series of log returns, with a dashed zero line for reference
axes[0].plot(df_recent['Date'], df_recent['Log_Return'], linewidth=0.6, alpha=0.7)
axes[0].axhline(y=0, color='red', linestyle='--', linewidth=1)
axes[0].set_title('Daily Log Returns (2012-2022)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Date', fontsize=11)
axes[0].set_ylabel('Log Return', fontsize=11)
axes[0].grid(True, alpha=0.3)

# Distribution of log returns; density=True normalises the histogram so it is
# on the same scale as the probability density drawn over it below.
axes[1].hist(df_recent['Log_Return'], bins=100, alpha=0.7, edgecolor='black', density=True)
axes[1].set_title('Distribution of Daily Log Returns (2012-2022)', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Log Return', fontsize=11)
axes[1].set_ylabel('Density', fontsize=11)
axes[1].grid(True, alpha=0.3, axis='y')

# Overlay a normal density with the sample mean and standard deviation
mu, std = df_recent['Log_Return'].mean(), df_recent['Log_Return'].std()
x = np.linspace(df_recent['Log_Return'].min(), df_recent['Log_Return'].max(), 100)
axes[1].plot(x, stats.norm.pdf(x, mu, std), 'r-', linewidth=2, label=f'Normal(μ={mu:.4f}, σ={std:.4f})')
axes[1].legend()

fig.tight_layout()
# Save via the Figure object BEFORE plt.show(): saving after the figure has been
# shown would write an empty image instead of this plot.
fig.savefig('../results/figures/fig_log_returns_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Test the 2012-2022 log returns for normality and report their shape statistics.
recent_returns = df_recent['Log_Return']

statistic, p_value = stats.normaltest(recent_returns)
print("Normality test (D'Agostino-Pearson):")
print(f"  Statistic: {statistic:.4f}")
print(f"  P-value: {p_value:.6f}")
# Verdict at the 5% significance level.
verdict = (
    "  Result: Reject normality (p < 0.05) - Returns have fat tails or skewness"
    if p_value < 0.05
    else "  Result: Cannot reject normality (p >= 0.05)"
)
print(verdict)

# Skewness and kurtosis, each printed alongside its value for a normal distribution
skew = stats.skew(recent_returns)
kurt = stats.kurtosis(recent_returns)
print(f"\nSkewness: {skew:.4f} (normal = 0)")
print(f"Kurtosis: {kurt:.4f} (normal = 0, higher = fatter tails)")

### 4. Volatility Analysis

In [None]:
# Rolling volatility (30-day window): rolling standard deviation of the log
# returns over `window` rows, scaled by sqrt(252) to annualize it.
window = 30
df_recent['Rolling_Volatility'] = df_recent['Log_Return'].rolling(window=window).std() * np.sqrt(252)  # Annualized

fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(df_recent['Date'], df_recent['Rolling_Volatility'], linewidth=1.2, color='purple')
ax.set_title(f'{window}-Day Rolling Volatility (Annualized, 2012-2022)', fontsize=14, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Annualized Volatility', fontsize=12)
ax.grid(True, alpha=0.3)
fig.tight_layout()
# Save via the Figure object BEFORE plt.show(): saving after the figure has been
# shown would write an empty image instead of this plot.
fig.savefig('../results/figures/fig_rolling_volatility.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nVolatility clustering observation:")
print("Periods of high volatility tend to cluster together, indicating regime changes.")

### 5. Stationarity Testing

In [None]:
# Augmented Dickey-Fuller test on prices
def adf_test(series, name, alpha=0.05):
    """Run the Augmented Dickey-Fuller unit-root test and print a short report.

    Parameters
    ----------
    series : pandas.Series
        Observations to test; missing values are dropped before testing.
    name : str
        Label printed as the heading of the report.
    alpha : float, default 0.05
        Significance level the p-value is compared against for the verdict.

    Returns
    -------
    tuple
        The unmodified return value of ``adfuller(..., autolag='AIC')``.
        Element 0 is printed as the test statistic, element 1 as the p-value
        and element 4 as the mapping of critical values.
    """
    result = adfuller(series.dropna(), autolag='AIC')
    adf_stat, p_value, critical_values = result[0], result[1], result[4]

    print(f"\n{name}:")
    print(f"  ADF Statistic: {adf_stat:.6f}")
    print(f"  P-value: {p_value:.6f}")
    print(f"  Critical Values:")
    for key, value in critical_values.items():
        print(f"    {key}: {value:.4f}")

    # The ADF null hypothesis is a unit root, so a small p-value is evidence
    # FOR stationarity.
    if p_value < alpha:
        print(f"  Result: Reject null - Series is STATIONARY (p < {alpha})")
    else:
        print(f"  Result: Cannot reject null - Series is NON-STATIONARY (p >= {alpha})")
    return result

# Banner for the ADF section, then the test on the recent price level and on
# the recent log returns. The results are kept for later reference.
banner = "=" * 60
print(
    banner,
    "AUGMENTED DICKEY-FULLER TEST",
    "Null Hypothesis: Series has a unit root (non-stationary)",
    banner,
    sep="\n",
)

adf_price = adf_test(df_recent['Price'], "Price Level")
adf_returns = adf_test(df_recent['Log_Return'], "Log Returns")

In [None]:
# KPSS test
def kpss_test(series, name, alpha=0.05):
    """Run the KPSS stationarity test (constant-only regression) and print a report.

    Parameters
    ----------
    series : pandas.Series
        Observations to test; missing values are dropped before testing.
    name : str
        Label printed as the heading of the report.
    alpha : float, default 0.05
        Significance level the p-value is compared against for the verdict.

    Returns
    -------
    tuple
        The unmodified return value of ``kpss(..., regression='c', nlags='auto')``.
        Element 0 is printed as the test statistic, element 1 as the p-value
        and element 3 as the mapping of critical values.
    """
    result = kpss(series.dropna(), regression='c', nlags='auto')
    kpss_stat, p_value, critical_values = result[0], result[1], result[3]

    print(f"\n{name}:")
    print(f"  KPSS Statistic: {kpss_stat:.6f}")
    print(f"  P-value: {p_value:.6f}")
    print(f"  Critical Values:")
    for key, value in critical_values.items():
        print(f"    {key}: {value:.4f}")

    # statsmodels interpolates the KPSS p-value from a table that only spans
    # 0.01 to 0.1 and returns the nearest bound outside that range. The warning
    # it raises in that case is hidden when warnings are globally ignored, so
    # flag a clipped value explicitly instead of presenting it as exact.
    p_lower_bound, p_upper_bound = 0.01, 0.1
    if p_value <= p_lower_bound:
        print(f"  Note: p-value is at the lower table bound; the actual p-value is smaller than {p_lower_bound}")
    elif p_value >= p_upper_bound:
        print(f"  Note: p-value is at the upper table bound; the actual p-value is greater than {p_upper_bound}")

    # The KPSS null hypothesis is stationarity, so a small p-value is evidence
    # AGAINST stationarity (the opposite reading to the ADF test).
    if p_value < alpha:
        print(f"  Result: Reject null - Series is NON-STATIONARY (p < {alpha})")
    else:
        print(f"  Result: Cannot reject null - Series is STATIONARY (p >= {alpha})")
    return result

# Banner for the KPSS section (preceded by a blank line), then the test on the
# recent price level and on the recent log returns.
rule = "=" * 60
print(f"\n{rule}")
print("KPSS TEST")
print("Null Hypothesis: Series is stationary")
print(rule)

kpss_price = kpss_test(df_recent['Price'], "Price Level")
kpss_returns = kpss_test(df_recent['Log_Return'], "Log Returns")

### 6. Autocorrelation Analysis

In [None]:
# Autocorrelation and partial autocorrelation
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# One row of panels per series: ACF in the left column, PACF in the right.
panels = [('Price', 'Price Level'), ('Log_Return', 'Log Returns')]
for row, (column, label) in enumerate(panels):
    values = df_recent[column].dropna()
    plot_acf(values, lags=40, ax=axes[row, 0])
    axes[row, 0].set_title(f'ACF: {label}', fontsize=12, fontweight='bold')
    plot_pacf(values, lags=40, ax=axes[row, 1])
    axes[row, 1].set_title(f'PACF: {label}', fontsize=12, fontweight='bold')

fig.tight_layout()
# Save via the Figure object BEFORE plt.show(): saving after the figure has been
# shown would write an empty image instead of this plot.
fig.savefig('../results/figures/fig_autocorrelation.png', dpi=300, bbox_inches='tight')
plt.show()


print("\nInterpretation:")
print("- Price levels show strong autocorrelation (non-stationary behavior)")
print("- Log returns show minimal autocorrelation (closer to white noise)")

### 7. Trend Analysis with Moving Averages

In [None]:
# Calculate moving averages of the price over 30, 90 and 180 rows; each is
# stored in its own column (MA_30, MA_90, MA_180).
for span in (30, 90, 180):
    df_recent[f'MA_{span}'] = df_recent['Price'].rolling(window=span).mean()

fig, ax = plt.subplots(figsize=(16, 8))
# Daily price drawn faint in the background so the smoothed lines stand out.
ax.plot(df_recent['Date'], df_recent['Price'], linewidth=1, alpha=0.5, label='Daily Price', color='gray')
ax.plot(df_recent['Date'], df_recent['MA_30'], linewidth=1.5, label='30-Day MA', color='blue')
ax.plot(df_recent['Date'], df_recent['MA_90'], linewidth=1.5, label='90-Day MA', color='orange')
ax.plot(df_recent['Date'], df_recent['MA_180'], linewidth=2, label='180-Day MA', color='red')
ax.set_title('Brent Oil Prices with Moving Averages (2012-2022)', fontsize=14, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Price (USD per barrel)', fontsize=12)
ax.legend(loc='best', fontsize=11)
ax.grid(True, alpha=0.3)
fig.tight_layout()
# Save via the Figure object BEFORE plt.show(): saving after the figure has been
# shown would write an empty image instead of this plot.
fig.savefig('../results/figures/fig_moving_averages.png', dpi=300, bbox_inches='tight')
plt.show()