# PCA factors on driver matrix

In [1]:
import os  # no installation needed
import sys  # no installation needed
from pathlib import Path  # no installation needed

# Project root containing the `src` package that later cells import from.
# Set the ECOM_FORECAST_ROOT environment variable to point the notebook at a
# different checkout; when it is unset or empty, the original workstation
# location is used.
# The fallback is a raw string with single backslashes. The previous literal
# combined the r-prefix with escaped backslashes, which put doubled separators
# into the path text (Windows pathlib collapses them, so the resolved Windows
# path is unchanged).
ROOT = Path(
    os.environ.get("ECOM_FORECAST_ROOT")
    or r"C:\Users\quantbase\Desktop\ecom_forecast"
)

# Put the project root at the front of the import search path. The membership
# check keeps repeated runs of this cell from inserting duplicate entries.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))


In [2]:
import pandas as pd  # already in env - no new install
import numpy as np  # already in env - no new install
import matplotlib.pyplot as plt  # already in env - no new install

from src.config import ProjectPaths  # no installation needed
from src.pca_tools import fit_pca, select_pca_features  # no installation needed


In [3]:
# Build the project path object (ProjectPaths is defined in src.config) and
# call its directory setup hook; presumably this creates the standard project
# folders, see src.config to confirm.
paths = ProjectPaths.from_root()
paths.ensure_directories()

# PCA artefacts go to <outputs_dir>/pca; create the folder if it is missing.
outputs_root = paths.outputs_dir
pca_dir = outputs_root / 'pca'
pca_dir.mkdir(parents=True, exist_ok=True)

# Load the driver matrix pickle from <outputs_dir>/drivers.
driver_matrix_file = outputs_root / 'drivers' / 'driver_matrix.pkl'
driver = pd.read_pickle(driver_matrix_file)


In [4]:
# Columns that feed the PCA, as chosen by the project helper.
feature_cols = select_pca_features(driver)

# Narrow to 'Day' plus the PCA features *before* dropping NaNs, so rows are
# only discarded for gaps in columns that are actually used
# (lags imply early drops).
pca_columns = ['Day', *feature_cols]
driver_for_pca = (
    driver[pca_columns]
    .copy()
    .dropna()
    .reset_index(drop=True)
)

# Fit a 5-component PCA and unpack the three result entries that the
# following cell consumes.
pca_result = fit_pca(driver_for_pca, feature_cols=feature_cols, n_components=5)
loadings, scores, explained = (
    pca_result[key]
    for key in ('loadings', 'scores', 'explained_variance_ratio')
)


In [5]:
# --- Explained-variance table -------------------------------------------
# One row per component: its own variance ratio and the running total.
cum_explained = np.cumsum(explained)
n_pcs = len(explained)
component_counts = range(1, n_pcs + 1)
explained_df = pd.DataFrame({
    'PC': [f'PC{rank}' for rank in component_counts],
    'ratio': explained,
    'cumulative': cum_explained,
})

# --- Persist tables -----------------------------------------------------
# loadings keeps its index in the CSV; the other two are written without one.
explained_df.to_csv(pca_dir / 'explained_variance.csv', index=False)
loadings.to_csv(pca_dir / 'loadings.csv')
scores.to_csv(pca_dir / 'pc_scores.csv', index=False)

# --- Figure: cumulative explained variance ------------------------------
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(component_counts, cum_explained, marker='o')
ax.set(
    xlabel='Number of Components',
    ylabel='Cumulative Explained Variance',
    ylim=(0, 1.05),
)
ax.grid(True)
fig.tight_layout()
fig.savefig(pca_dir / 'explained_variance.png', dpi=200)
plt.close(fig)  # saved to disk only; closing keeps it out of the cell output

# --- Figure: PC1 and PC2 scores over time -------------------------------
fig, ax = plt.subplots(figsize=(8, 4))
score_days = pd.to_datetime(scores['Day'])
for component in ('PC1', 'PC2'):
    ax.plot(score_days, scores[component], label=component)
ax.set(xlabel='Day', ylabel='Score')
ax.legend()
fig.tight_layout()
fig.savefig(pca_dir / 'pc_timeseries.png', dpi=200)
plt.close(fig)

# --- Text summary; the final expression renders the first loading rows ---
print('Features used:', feature_cols)
print('Explained variance:', explained_df)
loadings.head()


Features used: ['Sessions', 'Conversion rate', 'aov_proxy', 'return_rate_gross', 'discount_rate_gross', 'Ad_Spend', 'Meta_Spend', 'Google_Spend', 'TikTok_Spend', 'Email_SMS_Cost', 'meta_cpc', 'google_cpc', 'tiktok_cpc', 'return_rate_gross_lag_7', 'return_rate_gross_lag_14', 'return_rate_gross_lag_21', 'returns_abs_lag_7', 'returns_abs_lag_14', 'returns_abs_lag_21']
Explained variance:     PC     ratio  cumulative
0  PC1  0.403229    0.403229
1  PC2  0.204622    0.607851
2  PC3  0.076203    0.684054
3  PC4  0.066029    0.750083
4  PC5  0.059315    0.809398


Unnamed: 0,PC1,PC2,PC3,PC4,PC5
Sessions,0.217184,0.216675,0.02205,0.172073,0.055894
Conversion rate,0.286925,0.066121,0.090158,0.234104,0.112829
aov_proxy,-0.053037,-0.18673,0.017975,-0.396769,-0.660353
return_rate_gross,-0.109374,0.310987,-0.066899,0.159267,0.40685
discount_rate_gross,0.025426,0.053055,-0.342793,-0.203938,0.093034
