In [None]:
import os
import sys

import matplotlib.pyplot as plt
import seaborn as sns

# Resolve the directory two levels above the current working directory; it is
# treated as the project root so absolute imports can be resolved from it.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Prepend the root to the import search path only when it is absent, so
# re-running this cell never adds a duplicate entry.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

from src.pipelines import data_pipeline  # noqa: E402

In [None]:
# Build the lazy frame through the project's data pipeline, then materialize it.
lf = data_pipeline.run_data_pipeline(path=r"")
df = lf.collect()


# Keep the "Date" column plus every column whose name begins with "Gold_High".
gold_price_cols = [
    name for name in df.columns if name == "Date" or name.startswith("Gold_High")
]

# Show the last five rows of the selected columns, but only when the
# selection is non-empty.
if len(gold_price_cols) > 0:
    gold_tail = df.select(gold_price_cols).tail(5)
    print(gold_tail)

In [None]:
# Narrow the collected frame to the columns listed in `gold_price_cols`
# ("Date" plus the "Gold_High*" columns).
gold = df.select(gold_price_cols)
# Bare last expression: the notebook renders the frame as this cell's output.
gold

In [None]:
# Columns of `gold` whose name contains either "var" or "std".
var_std_cols = [
    name for name in gold.columns if any(tag in name for tag in ("var", "std"))
]

if not var_std_cols:
    # Nothing to plot: tell the reader instead of producing empty figures.
    print("No columns with 'var' or 'std' found in the dataset.")
else:
    var_std_data = gold.select(["Date", *var_std_cols])
    dates = var_std_data["Date"]
    # One standalone figure per matching column, each plotted against Date.
    for col in var_std_cols:
        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(dates, var_std_data[col], label=col)
        ax.set_title(f"{col} Over Time")
        ax.set_xlabel("Date")
        ax.set_ylabel(col)
        ax.legend()
        plt.show()

In [None]:
# Compute the covariance matrix for all features.
# `gold` is built from a filtered column list, so it is not guaranteed to hold
# every non-feature column named here; polars' `drop` can raise on a missing
# column. Excluding by name works whether or not those columns are present.
non_feature_cols = {"Date", "Gold_Price_was_null"}
feature_cols = [col for col in gold.columns if col not in non_feature_cols]
covariance_matrix = gold.select(feature_cols).to_pandas().cov()

# Plot the covariance matrix as a heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(covariance_matrix, annot=False, cmap="viridis", fmt=".2f")
plt.title("Covariance Matrix Heatmap")
plt.show()

In [None]:
# Compute the correlation matrix for all features.
# `gold` is built from a filtered column list, so it is not guaranteed to hold
# every non-feature column named here; polars' `drop` can raise on a missing
# column. Excluding by name works whether or not those columns are present.
non_feature_cols = {"Date", "Gold_Price_was_null"}
feature_cols = [col for col in gold.columns if col not in non_feature_cols]
correlation_matrix = gold.select(feature_cols).to_pandas().corr()

# Plot the correlation matrix as a heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=False, cmap="viridis", fmt=".2f")
plt.title("Correlation Matrix Heatmap")
plt.show()

In [None]:
# Compute the rolling standard deviation covariance matrix
rolling_std_columns = [col for col in gold.columns if "return_rolling_std" in col]

if rolling_std_columns:
    covariance_matrix = gold.select(rolling_std_columns).to_pandas().cov()

    # Plot the covariance matrix as a heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(covariance_matrix, annot=False, cmap="viridis", fmt=".2f")
    plt.title("Rolling Standard Deviation Covariance Matrix Heatmap")
    plt.show()
else:
    # An empty selection would yield an empty matrix that the heatmap cannot
    # draw, so report the situation instead of failing inside the plot call.
    print("No columns with 'return_rolling_std' found in the dataset.")

In [None]:
# Compute the rolling standard deviation correlation matrix
rolling_std_columns = [col for col in gold.columns if "return_rolling_std" in col]

if rolling_std_columns:
    correlation_matrix = gold.select(rolling_std_columns).to_pandas().corr()

    # Plot the correlation matrix as a heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=False, cmap="viridis", fmt=".2f")
    plt.title("Rolling Standard Deviation Correlation Matrix Heatmap")
    plt.show()
else:
    # An empty selection would yield an empty matrix that the heatmap cannot
    # draw, so report the situation instead of failing inside the plot call.
    print("No columns with 'return_rolling_std' found in the dataset.")