In [5]:
import pandas as pd
# Assuming your training data is in the 'data' directory
data_path = '../data/raw/training.csv'
df = pd.read_csv(data_path)

# IMPORTANT: Ensure the TransactionStartTime column is a datetime object
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

# If you had missing values in your previous EDA, this is where you'd handle them,
# but since the EDA confirmed NO MISSING VALUES, we can skip that.

# Now, the rest of your original code will execute without the NameError.

In [18]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.base import BaseEstimator, TransformerMixin
# ... (other imports)

# ===============================================================
# FIX: LOAD DATA AND CONVERT DATETIME
# ===============================================================
data_path = '../data/raw/training.csv'
df = pd.read_csv(data_path)
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])
# ===============================================================


# --- 1. Define Reference Date and Calculate RFM ---
# Find the maximum transaction date in the dataset
max_transaction_date = df['TransactionStartTime'].max() # This line will now work!

# Define the reference date as 1 day after the last transaction
REFERENCE_DATE = max_transaction_date + timedelta(days=1)

# Group by CustomerId and aggregate to calculate RFM
# ... (rest of your RFM code)

In [19]:
# Group by CustomerId and aggregate to calculate RFM
df_rfm = df.groupby('CustomerId').agg(
    # Recency: Calculate the difference between the reference date and max transaction date
    Recency=('TransactionStartTime', lambda x: (REFERENCE_DATE - x.max()).days),
    
    # Frequency: Count the number of transactions
    Frequency=('TransactionId', 'count'),
    
    # Monetary: Sum of the chosen monetary feature (using 'Amount' and dropping 'Value' later)
    Monetary=('Amount', 'sum'),
    
    # Add key categorical features for later encoding (use mode for customer-level representation)
    ChannelId=('ChannelId', lambda x: x.mode()[0] if not x.mode().empty else 'Unknown'),
    ProviderId=('ProviderId', lambda x: x.mode()[0] if not x.mode().empty else 'Unknown')
).reset_index()

# Display the head of the aggregated RFM data
print("--- RFM Aggregated Data Sample ---")
print(df_rfm.head())

--- RFM Aggregated Data Sample ---
        CustomerId  Recency  Frequency  Monetary    ChannelId    ProviderId
0     CustomerId_1       84          1  -10000.0  ChannelId_2  ProviderId_4
1    CustomerId_10       84          1  -10000.0  ChannelId_2  ProviderId_4
2  CustomerId_1001       90          5   20000.0  ChannelId_3  ProviderId_4
3  CustomerId_1002       26         11    4225.0  ChannelId_2  ProviderId_4
4  CustomerId_1003       12          6   20000.0  ChannelId_3  ProviderId_6


In [20]:
class CapperTransformer(BaseEstimator, TransformerMixin):
    """
    A transformer that caps outliers above a specified percentile (e.g., 99th).
    Fits the threshold on the training data and uses it for transforming.
    """
    def __init__(self, upper_percentile=99):
        self.upper_percentile = upper_percentile
        self.thresholds = {}

    def fit(self, X, y=None):
        X_df = pd.DataFrame(X)
        for col in X_df.columns:
            self.thresholds[col] = X_df[col].quantile(self.upper_percentile / 100)
        return self

    def transform(self, X):
        X_copy = pd.DataFrame(X, copy=True)
        for col, threshold in self.thresholds.items():
            # Apply capping
            X_copy[col] = np.clip(X_copy[col], a_min=None, a_max=threshold)
        return X_copy.values # Return NumPy array for pipeline compatibility

In [21]:
# --- 3. Define Feature Sets ---
MONETARY_FEATS = ['Monetary']
RF_FEATS = ['Recency', 'Frequency']
CATEGORICAL_FEATS = ['ChannelId', 'ProviderId']

# --- 4. Define Specialized Pipelines ---
# 4a. Monetary Pipeline (Capping, Log Transform, Scaling)
monetary_pipeline = Pipeline(steps=[
    # 1. Custom Capper for Outliers (EDA Finding)
    ('capper', CapperTransformer(upper_percentile=99)), 
    
    # 2. Log Transformation for Skewness (EDA Finding)
    # Using numpy.log1p for log(1 + x)
    ('log_transform', FunctionTransformer(np.log1p, validate=True)),
    
    # 3. Standardization
    ('scaler', StandardScaler())
])

# 4b. Recency and Frequency Pipeline (Scaling Only)
rf_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# 4c. Categorical Pipeline (One-Hot Encoding)
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# --- 5. Create the Column Transformer ---
# This applies different pipelines to different feature subsets
preprocessor = ColumnTransformer(
    transformers=[
        ('monetary_proc', monetary_pipeline, MONETARY_FEATS),
        ('rf_proc', rf_pipeline, RF_FEATS),
        ('cat_proc', categorical_pipeline, CATEGORICAL_FEATS)
    ],
    # Drop the customer ID and any other unlisted columns
    remainder='drop' 
)

# --- 6. Define the Final Feature Engineering Pipeline ---
feat_engineering_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# --- 7. Fit and Transform the RFM Data ---
# Note: Only fit on the training data.
X_processed_array = feat_engineering_pipeline.fit_transform(df_rfm)

# Convert the resulting array back to a DataFrame for easier handling
# The column names will be complex due to OneHotEncoding prefixing
feature_names = (
    MONETARY_FEATS + RF_FEATS + 
    list(feat_engineering_pipeline.named_steps['preprocessor'].transformers_[2][1]['onehot'].get_feature_names_out(CATEGORICAL_FEATS))
)

df_processed = pd.DataFrame(X_processed_array, columns=feature_names)

print("\n--- Final Processed Feature Matrix (Sample) ---")
print(df_processed.head())
print("\nShape of Final Matrix:", df_processed.shape)


--- Final Processed Feature Matrix (Sample) ---
   Monetary   Recency  Frequency  ChannelId_ChannelId_1  \
0       NaN  1.937605  -0.253459                    0.0   
1       NaN  1.937605  -0.253459                    0.0   
2 -0.039434  2.158882  -0.212186                    0.0   
3 -0.817819 -0.201408  -0.150278                    0.0   
4 -0.039434 -0.717722  -0.201868                    0.0   

   ChannelId_ChannelId_2  ChannelId_ChannelId_3  ChannelId_ChannelId_5  \
0                    1.0                    0.0                    0.0   
1                    1.0                    0.0                    0.0   
2                    0.0                    1.0                    0.0   
3                    1.0                    0.0                    0.0   
4                    0.0                    1.0                    0.0   

   ProviderId_ProviderId_1  ProviderId_ProviderId_2  ProviderId_ProviderId_3  \
0                      0.0                      0.0                    

  return func(X, **(kw_args if kw_args else {}))


In [22]:
import os

# Define the path to the directory
ARTIFACTS_DIR = 'model_artifacts'

# Check if the directory exists, and if not, create it
if not os.path.exists(ARTIFACTS_DIR):
    os.makedirs(ARTIFACTS_DIR)
    print(f"Created directory: {ARTIFACTS_DIR}")
else:
    print(f"Directory already exists: {ARTIFACTS_DIR}")

# --- Now run your dump command ---
import joblib
joblib.dump(feat_engineering_pipeline, os.path.join(ARTIFACTS_DIR, 'feat_engineering_pipeline.pkl'))

print("Pipeline successfully saved!")

Directory already exists: model_artifacts
Pipeline successfully saved!
