In [None]:
# Notebook bootstrap: make ../src importable ahead of anything else on sys.path.
import sys
from os import listdir
from pathlib import Path

src_path = str(Path('..', 'src').resolve())
if sys.path.count(src_path) == 0:
    # Position 0 so modules in the local src directory are found before other packages.
    sys.path.insert(0, src_path)

# Optional visual confirmation of what the src directory contains.
print("src dir files:", listdir(src_path))

# Import the modules you actually want (use the file/module names that exist in src/)
from data_processing import DataLoader
# If your pipeline is defined in feature_engineering_4_PTV.py:
from feature_engineering_4_PTV import feature_engineering_pipeline
# Import proxy and rfm modules (ensure their filenames are proxy_target.py and rfm_labeling.py)
from proxy_target import ProxyTargetEngineer
from rfm_labeling import RFMLabeler

# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Quick sanity check: reload feature_engineering_4_PTV so the kernel runs the current
# contents of the module file, then report which file was loaded.
import importlib, inspect
import feature_engineering_4_PTV as fe4

# `import ... as fe4` imports the module if needed (or reuses the sys.modules entry);
# reload() then re-executes it in place and returns the module object.
fe4 = importlib.reload(fe4)

has_pipeline = hasattr(fe4, "feature_engineering_pipeline")
print("Loaded feature_engineering_4_PTV from:", fe4.__file__)
print("Has pipeline:", has_pipeline)

# Names bound earlier via `from feature_engineering_4_PTV import ...` keep pointing at the
# pre-reload objects, so rebind the pipeline to make later cells call the reloaded version.
if has_pipeline:
    feature_engineering_pipeline = fe4.feature_engineering_pipeline

In [None]:
# 1. Read the raw transactions file into raw_df and show a quick overview
from pathlib import Path

data_path = Path('..', 'data', 'raw', 'data.csv')
if data_path.exists():
    loader = DataLoader(str(data_path))
    raw_df = loader.load_data()
else:
    # Fail early, reporting the absolute path that was looked up.
    raise FileNotFoundError(f"Raw data file not found: {data_path.resolve()}")

print("Raw data shape:", raw_df.shape)
print("Columns:", [*raw_df.columns])
display(raw_df.head())


In [None]:
# 2. Run the feature-engineering pipeline on the raw data.
#    It returns two objects: the feature table and a description table.
#    create_proxy_target=False: the is_high_risk label is attached in step 3 instead.
feat_df, feat_desc = feature_engineering_pipeline(
    raw_df,
    categorical_cols=None,
    create_proxy_target=False,
)
print("Engineered features shape:", feat_df.shape)
display(feat_df.head())

In [None]:

# 3. Fit ProxyTargetEngineer on raw transactions and merge is_high_risk into feat_df
proxy = ProxyTargetEngineer(
    id_col="CustomerId",
    time_col="TransactionStartTime",
    amount_col="Amount",
    n_clusters=3,
    random_state=42,  # fixed seed passed to the engineer
    scale=True
)
proxy.fit(raw_df)

# Re-run safety: this cell overwrites feat_df with its own output, so on a second
# execution feat_df would already contain is_high_risk. Drop any existing copy before
# labeling (errors="ignore" makes this a no-op on the first run) so the cell yields the
# same frame however many times it is executed.
feat_df = proxy.assign_labels(
    feat_df.drop(columns=["is_high_risk"], errors="ignore")
)

print("Engineered features shape (after proxy):", feat_df.shape)
print("is_high_risk value counts:")
# dropna=False so rows that received no label show up in the counts.
display(feat_df["is_high_risk"].value_counts(dropna=False))
display(feat_df.head())


In [None]:

# 4. Write the labeled features and the feature descriptions to CSV
out_dir = Path("..", "data", "processed")
out_dir.mkdir(parents=True, exist_ok=True)  # create the folder tree if it is missing

for frame, filename in (
    (feat_df, "features_with_proxy.csv"),
    (feat_desc, "ptv-features_description.csv"),
):
    frame.to_csv(out_dir / filename, index=False)

print("Saved features and descriptions to ../data/processed/")


In [None]:

# 5. Optional audit: fit an RFMLabeler (default settings) on the raw data and
#    inspect the per-customer RFM table and the cluster profiles it exposes.
rfm_engine = RFMLabeler()
rfm_engine.fit(raw_df)

# reset_index() moves the index of rfm_ into a regular column for display/plotting.
rfm_df = rfm_engine.rfm_.reset_index()

print("RFM sample:")
display(rfm_df.iloc[:10])

print("Cluster profiles (mean Frequency, Monetary, Recency):")
display(rfm_engine.cluster_profiles_)


In [None]:

# 6. Scatter plot of Frequency against Monetary, colored by cluster label.
#    Nothing is drawn when rfm_df has no "cluster" column.
if "cluster" in rfm_df.columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(
        data=rfm_df,
        x="Frequency",
        y="Monetary",
        hue="cluster",
        palette="tab10",
        ax=ax,
    )
    ax.set_title("RFM clusters (Frequency vs Monetary)")
    fig.tight_layout()
    plt.show()


In [None]:

# 7. Spot-check the labeling: keep only the rows where is_high_risk equals 1
is_flagged = feat_df["is_high_risk"].eq(1)
high_risk_customers = feat_df.loc[is_flagged]
print("High-risk sample (first 10):")
display(high_risk_customers.iloc[:10])


In [None]:

# 8. Optional follow-up (not executed here): refit WoE/IV with is_high_risk as the target
#    by fitting a WoEIVCalculator. The lines below are a sketch, left commented out.
# NOTE(review): `raw_df_with_is_high_risk` is not defined in the visible cells; a frame that
#    carries is_high_risk alongside the listed features has to be built before this can run.
# from feature_engineering import WoEIVCalculator
# woe_calc = WoEIVCalculator(features=['ProductCategory','ChannelId'], target_col='is_high_risk', bins=5)
# woe_calc.fit(raw_df_with_is_high_risk)  # you'd need transaction-level target or aggregate appropriately

# Final marker so a full run visibly reaches the end of the notebook.
print("PTV engineering completed.")