In [1]:
import pandas as pd
import os
import sys
import logging
from datetime import datetime

# ---- Step 1: Set up logging ----
# Every run of this cell logs to its own timestamped file (created in the
# current working directory) and mirrors the same records to the console.
log_file = f"preprocessing_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
log_formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s")

logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Drop the handlers a previous run of this cell installed. Without this, each
# re-run stacks one more console handler (every message printed N times) and
# logging.basicConfig would ignore the new log_file because the root logger
# is already configured. Handlers installed by anyone else are left alone.
PIPELINE_HANDLER_NAMES = ("preprocessing_file", "preprocessing_console")
for stale_handler in [h for h in logger.handlers if h.name in PIPELINE_HANDLER_NAMES]:
    logger.removeHandler(stale_handler)
    stale_handler.close()

# The log messages contain emoji, so open the file as UTF-8 explicitly rather
# than relying on the platform default encoding (cp1252 on many Windows
# setups), which cannot encode them.
file_handler = logging.FileHandler(log_file, encoding="utf-8")
file_handler.set_name("preprocessing_file")
file_handler.setFormatter(log_formatter)
logger.addHandler(file_handler)

# Optional: Also log to console
console = logging.StreamHandler()
console.set_name("preprocessing_console")
console.setLevel(logging.INFO)
console.setFormatter(log_formatter)
logger.addHandler(console)

logger.info("🚀 Starting preprocessing pipeline")

# ---- Step 2: Define project root and ensure it's on sys.path ----
# Defaults to the original development checkout. Set the CREDIT_ANALYSIS_ROOT
# environment variable to run the notebook from another location without
# editing this cell.
project_root = os.environ.get(
    "CREDIT_ANALYSIS_ROOT", r"C:\Users\techin\credit_analysis_week5"
)
if project_root not in sys.path:
    # Guarded so that re-running the cell does not add the same entry twice.
    sys.path.append(project_root)
    logger.info(f"✅ Project root added to sys.path: {project_root}")

# ---- Step 3: Import the custom preprocessing function ----
try:
    from src.data_processing import preprocess_data
except ImportError:
    # ImportError is the parent of ModuleNotFoundError, so this also covers the
    # case where the module exists but does not define preprocess_data.
    # logger.exception records the traceback, which carries the actual reason
    # the import failed (the message alone does not).
    logger.exception("❌ Could not import preprocess_data from src.data_processing")
    raise

# ---- Step 4: Define file paths ----
# Input side: the raw CSV lives under <project_root>/data/raw.
RAW_DATA_FILENAME = "data.csv"
RAW_DATA_PATH = os.path.join(project_root, "data", "raw", RAW_DATA_FILENAME)

# Output side: the processed CSV is written under <project_root>/data/processed.
PROCESSED_FILE_NAME = "cleaned_data.csv"
PROCESSED_DATA_DIR = os.path.join(project_root, "data", "processed")
FULL_OUTPUT_PATH = os.path.join(PROCESSED_DATA_DIR, PROCESSED_FILE_NAME)

# Create the output folder (and any missing parents) before anything is
# written to it; exist_ok makes this a no-op when the folder is already there.
os.makedirs(
    PROCESSED_DATA_DIR,
    exist_ok=True,
)
logger.info(f"✅ Ensured processed data directory exists: {PROCESSED_DATA_DIR}")

# ---- Step 5: Load raw data ----
# Read the raw CSV and ask pandas to parse TransactionStartTime as datetimes.
# Only the read itself is guarded; the success message is logged afterwards.
try:
    raw_df = pd.read_csv(RAW_DATA_PATH, parse_dates=['TransactionStartTime'])
except Exception as e:
    # logger.exception logs at ERROR level and appends the traceback, so the
    # log file keeps the full failure context instead of only str(e).
    logger.exception(f"❌ Failed to read raw data: {e}")
    # Bare raise re-raises the active exception without rebinding it.
    raise
logger.info(f"📥 Loaded raw data from {RAW_DATA_PATH} with shape {raw_df.shape}")

# ---- Step 6: Validate raw data columns ----
# Columns that must be present in the raw frame before preprocessing runs.
required_columns = [
    'CustomerId',
    'TransactionStartTime',
    'TransactionId',
    'Amount',
    'Value',
    'CountryCode',
    'CurrencyCode',
    'ChannelId',
]

# Collect the absent columns in the order they are listed above, then fail
# fast with all of them named in the log and in the exception.
missing = [
    column_name
    for column_name in required_columns
    if column_name not in raw_df.columns
]
if missing:
    logger.error(f"❌ Missing required columns: {missing}")
    raise ValueError(f"Missing required columns: {missing}")

# ---- Step 7: Run preprocessing ----
# The latest transaction timestamp in the raw data is used as the reference
# date handed to preprocess_data (the log line below ties it to Recency).
try:
    reference_date = pd.to_datetime(raw_df['TransactionStartTime'].max())
    if pd.isna(reference_date):
        # An empty frame or an all-null timestamp column yields NaT here.
        # Stop now instead of passing an undefined reference date downstream.
        raise ValueError(
            "No valid TransactionStartTime values; cannot determine a reference date"
        )
    logger.info(f"📆 Using reference date for Recency calculation: {reference_date}")

    processed_df = preprocess_data(raw_df, reference_date)
    logger.info(f"✅ Preprocessing completed. Final shape: {processed_df.shape}")
except Exception as e:
    # logger.exception logs at ERROR level and appends the traceback; str(e)
    # alone is often too terse to diagnose (e.g. a KeyError shows only the key).
    logger.exception(f"❌ Preprocessing failed: {e}")
    # Bare raise re-raises the active exception without rebinding it.
    raise

# ---- Step 8: Save processed data ----
# Write the processed frame to FULL_OUTPUT_PATH without the index column.
# Only the write itself is guarded; the success message is logged afterwards.
try:
    processed_df.to_csv(FULL_OUTPUT_PATH, index=False)
except Exception as e:
    # logger.exception logs at ERROR level and appends the traceback, so the
    # log file keeps the full failure context instead of only str(e).
    logger.exception(f"❌ Failed to save processed data: {e}")
    # Bare raise re-raises the active exception without rebinding it.
    raise
logger.info(f"💾 Processed data saved to {FULL_OUTPUT_PATH}")

print("🎉 Preprocessing pipeline completed successfully.")

2025-06-27 12:25:03,040 [INFO] 🚀 Starting preprocessing pipeline
2025-06-27 12:25:03,040 [INFO] ✅ Project root added to sys.path: C:\Users\techin\credit_analysis_week5
2025-06-27 12:25:04,793 [INFO] ✅ Ensured processed data directory exists: C:\Users\techin\credit_analysis_week5\data\processed
2025-06-27 12:25:05,776 [INFO] 📥 Loaded raw data from C:\Users\techin\credit_analysis_week5\data\raw\data.csv with shape (95662, 16)
2025-06-27 12:25:05,776 [INFO] 📆 Using reference date for Recency calculation: 2019-02-13 10:01:28+00:00
2025-06-27 12:25:10,771 [INFO] ✅ Preprocessing completed. Final shape: (3742, 20)
2025-06-27 12:25:10,881 [INFO] 💾 Processed data saved to C:\Users\techin\credit_analysis_week5\data\processed\cleaned_data.csv


🎉 Preprocessing pipeline completed successfully.


In [2]:
# Reload the file the pipeline wrote, using the same FULL_OUTPUT_PATH constant
# it was saved to, so this check cannot drift from the pipeline's output path.
df = pd.read_csv(FULL_OUTPUT_PATH)

In [3]:
# Show (rows, columns) of the reloaded frame, to compare with the
# "Final shape" the pipeline logged after preprocessing.
df.shape

(3742, 20)

In [4]:
# Preview the first rows of the reloaded frame to inspect its columns and values.
df.head()

Unnamed: 0,CustomerId,cat__CountryCode_256,cat__CurrencyCode_UGX,cat__ChannelId_ChannelId_1,cat__ChannelId_ChannelId_2,cat__ChannelId_ChannelId_3,cat__ChannelId_ChannelId_5,num__Amount_sum,num__Amount_mean,num__Amount_std,num__Amount_min,num__Amount_max,num__Amount_count,num__Value_sum,num__Value_mean,num__Frequency,num__Recency,AvgTransactionHour,MostFrequentDayOfWeek,AccountFrequency
0,CustomerId_1,1.0,1.0,0.0,1.0,0.0,0.0,-0.066891,-0.153364,0.0,-0.161532,-0.169081,-0.253459,-0.089524,-0.052297,-0.253459,1.937605,16.0,2,1
1,CustomerId_10,1.0,1.0,0.0,1.0,0.0,0.0,-0.066891,-0.153364,0.0,-0.161532,-0.169081,-0.253459,-0.089524,-0.052297,-0.253459,1.937605,16.0,2,1
2,CustomerId_1001,1.0,1.0,0.0,0.0,1.0,0.0,-0.055849,-0.06987,-0.105976,-0.103274,-0.113497,-0.212186,-0.082011,-0.07571,-0.212186,2.158882,7.8,4,2
3,CustomerId_1002,1.0,1.0,0.0,1.0,0.0,0.0,-0.061655,-0.091435,-0.168036,-0.04589,-0.13712,-0.150278,-0.091448,-0.109431,-0.150278,-0.201408,13.454545,3,2
4,CustomerId_1003,1.0,1.0,0.0,0.0,1.0,0.0,-0.055849,-0.073846,-0.111444,-0.103274,-0.113497,-0.201868,-0.081422,-0.080169,-0.201868,-0.717722,14.333333,4,3
