In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

# You must also redefine the custom class if you haven't yet!
class CapperTransformer(BaseEstimator, TransformerMixin):
    # ... (paste the full CapperTransformer class definition here)
    pass

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd # Needed for the Capper to use quantiles

# =================================================================
# FIX: REDEFINE CUSTOM TRANSFORMER FOR JOBLIB TO FIND IT
# =================================================================
class CapperTransformer(BaseEstimator, TransformerMixin):
    """
    A transformer that caps outliers above a specified percentile (e.g., 99th).
    Must be defined here so joblib.load() can successfully reconstruct the pipeline.
    """
    def __init__(self, upper_percentile=99):
        self.upper_percentile = upper_percentile
        self.thresholds = {}

    def fit(self, X, y=None):
        X_df = pd.DataFrame(X)
        for col in X_df.columns:
            self.thresholds[col] = X_df[col].quantile(self.upper_percentile / 100)
        return self

    def transform(self, X):
        X_copy = pd.DataFrame(X, copy=True)
        for col, threshold in self.thresholds.items():
            # Apply capping
            X_copy[col] = np.clip(X_copy[col], a_min=None, a_max=threshold)
        return X_copy.values # Return NumPy array for pipeline compatibility
# =================================================================

In [3]:
# =================================================================
# FIX: RE-CREATE df_rfm (Required Input for Clustering)
# You need the original raw transaction data (df) to perform this step.
# =================================================================
import pandas as pd
from datetime import timedelta

# Assuming the raw data path is correct from previous steps:
data_path = '../data/raw/training.csv' 
df = pd.read_csv(data_path)
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

# Define Reference Date (Must be consistent with Task 3)
REFERENCE_DATE = df['TransactionStartTime'].max() + timedelta(days=1)

# Re-calculate RFM (Re-running Task 3, Part 1)
df_rfm = df.groupby('CustomerId').agg(
    Recency=('TransactionStartTime', lambda x: (REFERENCE_DATE - x.max()).days),
    Frequency=('TransactionId', 'count'),
    Monetary=('Amount', 'sum'),
    ChannelId=('ChannelId', lambda x: x.mode()[0] if not x.mode().empty else 'Unknown'),
    ProviderId=('ProviderId', lambda x: x.mode()[0] if not x.mode().empty else 'Unknown')
).reset_index()

# Now df_rfm is defined and ready for clustering.
# =================================================================

In [11]:
import pandas as pd
import numpy as np
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from datetime import timedelta

# --- STEP 1: DEFINE CUSTOM CLASS (CRUCIAL for joblib to load the pipeline) ---
class CapperTransformer(BaseEstimator, TransformerMixin):
    # ... (Your CapperTransformer definition must be here)
    def __init__(self, upper_percentile=99):
        self.upper_percentile = upper_percentile
        self.thresholds = {}
    def fit(self, X, y=None):
        X_df = pd.DataFrame(X)
        X_df.columns = [f'col_{i}' for i in range(X_df.shape[1])] # Dummy columns for safety
        for col in X_df.columns:
            self.thresholds[col] = X_df[col].quantile(self.upper_percentile / 100)
        return self
    def transform(self, X):
        X_copy = pd.DataFrame(X, copy=True)
        X_copy.columns = list(self.thresholds.keys())
        for col, threshold in self.thresholds.items():
            X_copy[col] = np.clip(X_copy[col], a_min=None, a_max=threshold)
        return X_copy.values


# --- STEP 2: LOAD DATA (Define df_rfm) ---
# Re-run Task 3 Part 1 aggregation to define df_rfm
data_path = '../data/raw/training.csv' 
df = pd.read_csv(data_path)
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])
REFERENCE_DATE = df['TransactionStartTime'].max() + timedelta(days=1)
df_rfm = df.groupby('CustomerId').agg(
    Recency=('TransactionStartTime', lambda x: (REFERENCE_DATE - x.max()).days),
    Frequency=('TransactionId', 'count'),
    Monetary=('Amount', 'sum'),
    ChannelId=('ChannelId', lambda x: x.mode()[0] if not x.mode().empty else 'Unknown'),
    ProviderId=('ProviderId', lambda x: x.mode()[0] if not x.mode().empty else 'Unknown')
).reset_index()


# --- STEP 3: LOAD PIPELINE (Define feat_engineering_pipeline) ---
try:
    feat_engineering_pipeline = joblib.load('model_artifacts/feat_engineering_pipeline.pkl')
    print("Feature Engineering Pipeline loaded successfully.")
except FileNotFoundError:
    print("CRITICAL ERROR: feat_engineering_pipeline.pkl not found. Cannot proceed.")
    
# --- STEP 4: DEFINE FEATURES ---
RFM_FEATS = ['Recency', 'Frequency', 'Monetary'] 
CATEGORICAL_FEATS = ['ChannelId', 'ProviderId'] 
CLUSTER_FEATURES = RFM_FEATS + CATEGORICAL_FEATS

Feature Engineering Pipeline loaded successfully.


In [12]:
import pandas as pd
import numpy as np

# Assuming feat_engineering_pipeline is loaded and df_rfm is defined in memory.
# Re-define features just to be safe:
RFM_FEATS = ['Recency', 'Frequency', 'Monetary'] 
CATEGORICAL_FEATS = ['ChannelId', 'ProviderId'] 
CLUSTER_FEATURES = RFM_FEATS + CATEGORICAL_FEATS

# --- 1. Define X_raw ---
# Assumes df_rfm (the RFM aggregated data) is loaded/re-created in a previous cell.
X_raw = df_rfm[CLUSTER_FEATURES]

# --- 2. Apply the Loaded Pipeline to Create X_scaled ---
# THIS is the line that defines X_scaled.
X_scaled = feat_engineering_pipeline.transform(X_raw)
print(f"Scaled feature matrix shape: {X_scaled.shape}")

# --- 3. DIAGNOSTIC CHECK ---
X_scaled_df = pd.DataFrame(X_scaled) 

# Check for any NaNs in the scaled data
nan_count = X_scaled_df.isna().sum().sum()
print(f"\nTotal NaN count in X_scaled before clustering: {nan_count}")

# Final confirmation printout
if nan_count > 0:
    print("❌ ERROR: The pipeline fix did not work! NaNs still present.")
    nan_rows = X_scaled_df[X_scaled_df.isna().any(axis=1)]
    print("\nSample rows in X_scaled containing NaNs:")
    print(nan_rows.head())
else:
    print("✅ SUCCESS: NaN check passed! X_scaled is clean and ready for clustering.")

Scaled feature matrix shape: (3742, 13)

Total NaN count in X_scaled before clustering: 192
❌ ERROR: The pipeline fix did not work! NaNs still present.

Sample rows in X_scaled containing NaNs:
    0         1         2    3    4    5    6    7    8    9    10   11   12
0  NaN  1.937605 -0.253459  0.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0
1  NaN  1.937605 -0.253459  0.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0
27 NaN  2.195761 -0.232823  0.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0
69 NaN  1.937605 -0.253459  0.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0
78 NaN  1.384412 -0.232823  0.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0


  return func(X, **(kw_args if kw_args else {}))


In [13]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
import numpy as np

# Assuming CapperTransformer is defined in the environment.

# 1. Monetary Pipeline (Recency, Frequency, Monetary)
# The order is crucial: Capper -> Log1p (creates NaNs) -> IMPUTER (cleans NaNs) -> Scaler
monetary_pipeline = Pipeline(steps=[
    ('capper', CapperTransformer(upper_percentile=99)), 
    ('log_transform', FunctionTransformer(np.log1p, validate=True)),
    # ADDED IMPUTATION HERE: This is what you need to fix the NaNs from log1p
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)), 
    ('scaler', StandardScaler())
])

# 2. Recency and Frequency Pipeline (They have the same pre-processing)
rf_pipeline = Pipeline(steps=[
    ('capper', CapperTransformer(upper_percentile=99)), 
    ('log_transform', FunctionTransformer(np.log1p, validate=True)),
    # ADDED IMPUTATION HERE
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])

# 3. Categorical Pipeline
categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# 4. Column Transformer
RFM_FEATS = ['Recency', 'Frequency', 'Monetary'] 
CATEGORICAL_FEATS = ['ChannelId', 'ProviderId'] 

preprocessor = ColumnTransformer(
    transformers=[
        ('monetary', monetary_pipeline, ['Monetary']),
        ('rf', rf_pipeline, ['Recency', 'Frequency']),
        ('cat', categorical_pipeline, CATEGORICAL_FEATS)
    ],
    remainder='drop'
)

# 5. Final Pipeline (This is the object you need to save!)
feat_engineering_pipeline_FIXED = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # K-Means will be added later for Task 4 completion, but for now we only fit the preprocessor
])
print("Fixed Pipeline defined in memory.")

Fixed Pipeline defined in memory.


In [14]:
import joblib
# Assuming df_rfm is in memory and CLUSTER_FEATURES are defined.

# Define the features needed for fitting the ColumnTransformer
RFM_FEATS = ['Recency', 'Frequency', 'Monetary'] 
CATEGORICAL_FEATS = ['ChannelId', 'ProviderId'] 
CLUSTER_FEATURES = RFM_FEATS + CATEGORICAL_FEATS

# Fit the FIXED pipeline to your data
# Note: We use feat_engineering_pipeline_FIXED, which was defined in the previous cell.
X_fit = df_rfm[CLUSTER_FEATURES]
feat_engineering_pipeline_FIXED.fit(X_fit)
print("Fixed Pipeline successfully fitted to data.")

# CRITICAL STEP: Save the fixed, fitted pipeline
# We are saving it with the original filename to overwrite the old, flawed artifact.
joblib.dump(feat_engineering_pipeline_FIXED, 'model_artifacts/feat_engineering_pipeline.pkl')
print("Fixed Pipeline artifact saved. The old one has been overwritten.")

Fixed Pipeline successfully fitted to data.
Fixed Pipeline artifact saved. The old one has been overwritten.


  return func(X, **(kw_args if kw_args else {}))


In [15]:
import joblib
import pandas as pd
import numpy as np

# Assuming df_rfm and CLUSTER_FEATURES are defined in memory.

# 1. Reload the new, clean pipeline
# We must reload it to ensure the artifact on disk (the clean one) is what we use.
feat_engineering_pipeline = joblib.load('model_artifacts/feat_engineering_pipeline.pkl')
print("New (Clean) Pipeline loaded successfully.")

# 2. Re-run transformation and diagnostic check
X_raw = df_rfm[CLUSTER_FEATURES]
X_scaled = feat_engineering_pipeline.transform(X_raw)
X_scaled_df = pd.DataFrame(X_scaled) 
nan_count = X_scaled_df.isna().sum().sum()

print(f"\nFinal NaN count in X_scaled: {nan_count}")

if nan_count == 0:
    print("✅ SUCCESS: Data is clean and ready for clustering!")
    
    # --- PROCEED TO CLUSTERING ---
    from sklearn.cluster import KMeans

    K = 4  # Assuming K=4 was chosen for your analysis
    RANDOM_STATE = 42

    kmeans = KMeans(n_clusters=K, random_state=RANDOM_STATE, n_init=10)

    # Fit the model to the scaled data and assign cluster IDs
    df_rfm['Cluster_ID'] = kmeans.fit_predict(X_scaled)

    print(f"\nClustering complete. Assigned {K} clusters to {len(df_rfm)} customers.")
    print("Ready to analyze clusters and assign the 'is_high_risk' target.")

else:
    print("❌ CRITICAL FAILURE: NaNs still present. Something is wrong with the Imputer setup or order.")

New (Clean) Pipeline loaded successfully.

Final NaN count in X_scaled: 0
✅ SUCCESS: Data is clean and ready for clustering!


  return func(X, **(kw_args if kw_args else {}))



Clustering complete. Assigned 4 clusters to 3742 customers.
Ready to analyze clusters and assign the 'is_high_risk' target.


In [16]:
import numpy as np
import pandas as pd # Import pandas again for safety

# --- 1. Calculate Cluster Profiles ---
# Group the original df_rfm by the new Cluster_ID
cluster_profiles = df_rfm.groupby('Cluster_ID')[['Recency', 'Frequency', 'Monetary']].mean()

# Add the count of customers in each cluster
cluster_profiles['Customer_Count'] = df_rfm['Cluster_ID'].value_counts()

# Calculate the descriptive statistics for easier reading
cluster_profiles['Recency_Days'] = cluster_profiles['Recency'].round(0).astype(int)
cluster_profiles['Frequency_Mean'] = cluster_profiles['Frequency'].round(1)
cluster_profiles['Monetary_Mean'] = cluster_profiles['Monetary'].round(2)

# Select final columns for display
cluster_profiles = cluster_profiles[['Customer_Count', 'Recency_Days', 'Frequency_Mean', 'Monetary_Mean']]

print("--- Cluster Profiles (Mean RFM Values) ---")
print(cluster_profiles.sort_values(by='Recency_Days', ascending=True))

--- Cluster Profiles (Mean RFM Values) ---
            Customer_Count  Recency_Days  Frequency_Mean  Monetary_Mean
Cluster_ID                                                             
1                     1095             6            66.9      518447.73
2                     1382            38            11.2      115757.39
0                      201            40            24.6     -575898.36
3                     1064            48             1.8       28873.33


In [17]:
# --- 1. Target Assignment ---
# Based on analysis, Cluster 3 has the highest Recency (48 days) and lowest Frequency (1.8),
# making it the highest churn risk proxy.
HIGH_RISK_CLUSTER_ID = 3 

# Create the final target variable
df_rfm['is_high_risk'] = np.where(df_rfm['Cluster_ID'] == HIGH_RISK_CLUSTER_ID, 1, 0)

# Check the distribution of the target variable
target_distribution = df_rfm['is_high_risk'].value_counts(normalize=True).mul(100).round(1)

print("\n--- Target Variable Distribution ---")
print(target_distribution)

# --- 2. Final Save ---
OUTPUT_FILE_PATH = 'data/processed/df_customer_target.csv'
# Ensure you are saving the dataframe df_rfm which now contains 'Cluster_ID' and 'is_high_risk'
df_rfm.to_csv(OUTPUT_FILE_PATH, index=False)
print(f"\nFinal customer dataset with target saved to: {OUTPUT_FILE_PATH}")


--- Target Variable Distribution ---
is_high_risk
0    71.6
1    28.4
Name: proportion, dtype: float64

Final customer dataset with target saved to: data/processed/df_customer_target.csv


In [2]:
import sys
# This command tells the notebook to run a shell command to install the package
# into the Python environment that the notebook is currently using.
!{sys.executable} -m pip install xverse

Collecting xverse
  Using cached xverse-1.0.5-py3-none-any.whl.metadata (19 kB)
Using cached xverse-1.0.5-py3-none-any.whl (21 kB)
Installing collected packages: xverse
Successfully installed xverse-1.0.5




In [6]:
import pandas as pd
from xverse.transformer import WOE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
import joblib

# --- 1. Load Data with Target Variable ---
df_final = pd.read_csv('data/processed/df_customer_target.csv')
print(f"Loaded dataset with target. Shape: {df_final.shape}")

# --- 2. Define Features and Target ---
# Ensure FEATURE_COLS matches the features you need (RFM, Channel, Provider)
FEATURE_COLS = ['Recency', 'Frequency', 'Monetary', 'ChannelId', 'ProviderId'] 
TARGET_COL = 'is_high_risk'

# Explicitly define feature types for the WOE transformer
CONTINUOUS_FEATS = ['Recency', 'Frequency', 'Monetary']
DISCRETE_FEATS = ['ChannelId', 'ProviderId']

X = df_final[FEATURE_COLS]
y = df_final[TARGET_COL]

# --- 3. Train/Test Split (80/20) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train/Test Split complete. Train shape: {X_train.shape}")

# --- 4. Initialize and Fit WOE Transformer (EXPLICIT FIX) ---

woe_transformer = WOE(
    # Specify features explicitly to guide the transformer
    Continuous_Cols=CONTINUOUS_FEATS,
    Discrete_Cols=DISCRETE_FEATS,
    # CRITICAL FIX: Explicitly disable monotonic binning, which is causing the AttributeError
    monotonic_binning=False 
) 

# Fit the transformer on the TRAINING data only
woe_transformer.fit(X_train, y_train)

# Inspect Information Value (IV) Analysis
iv_df = woe_transformer.iv_df.sort_values(by='IV', ascending=False)
print("\n--- Information Value (IV) Analysis ---")
print(iv_df)

# Apply WOE Transformation to both sets
X_train_woe = woe_transformer.transform(X_train)
X_test_woe = woe_transformer.transform(X_test)
print("\nWoE Transformation Complete.")

# --- 5. Model Training (Logistic Regression) ---
model = LogisticRegression(random_state=42, solver='liblinear')
model.fit(X_train_woe, y_train)
print("Logistic Regression Model Trained.")

# --- 6. Model Evaluation ---
y_pred_proba = model.predict_proba(X_test_woe)[:, 1]
auc_roc = roc_auc_score(y_test, y_pred_proba)
y_pred = model.predict(X_test_woe)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n--- Model Evaluation ---")
print(f"AUC-ROC Score: {auc_roc:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# --- 7. Save Model and WOE Transformer ---
joblib.dump(model, 'model_artifacts/logistic_regression_model.pkl')
joblib.dump(woe_transformer, 'model_artifacts/woe_transformer.pkl')
print("\nModel and WOE Transformer saved to artifacts.")

Loaded dataset with target. Shape: (3742, 8)
Train/Test Split complete. Train shape: (2993, 5)


TypeError: WOE.__init__() got an unexpected keyword argument 'Continuous_Cols'

In [7]:
from xverse.transformer import WOE
import inspect

# Print the accepted arguments for the WOE constructor in your environment
print("--- WOE Constructor Arguments in Your Environment ---")
print(inspect.signature(WOE.__init__))

--- WOE Constructor Arguments in Your Environment ---
(self, feature_names='all', exclude_features=None, woe_prefix=None, treat_missing='separate', woe_bins=None, monotonic_binning=True, mono_feature_names='all', mono_max_bins=20, mono_force_bins=3, mono_cardinality_cutoff=5, mono_prefix=None, mono_custom_binning=None)


In [18]:
import sys
!{sys.executable} -m pip install optbinning

^C


In [20]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder
import joblib
import numpy as np

# --- 1. Load Data with Target Variable ---
df_final = pd.read_csv('data/processed/df_customer_target.csv')
print(f"Loaded dataset with target. Shape: {df_final.shape}")

# --- 2. Define Features and Target ---
RFM_FEATS = ['Recency', 'Frequency', 'Monetary']
CATEGORICAL_FEATS = ['ChannelId', 'ProviderId']
FEATURE_COLS = RFM_FEATS + CATEGORICAL_FEATS
TARGET_COL = 'is_high_risk'

X = df_final[FEATURE_COLS]
y = df_final[TARGET_COL]

# --- 3. Train/Test Split (80/20) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train/Test Split complete. Train shape: {X_train.shape}")

# --- 4. Feature Transformation (Using optbinning) ---
woe_mapping = {}
iv_results = []

# A. Loop through RFM features and fit OptimalBinning
print("\nFitting OptimalBinning (WoE Transformation)...")
for feature in RFM_FEATS:
    # Set the binning properties for each continuous RFM feature
    optb = OptimalBinning(name=feature, dtype="numerical", solver="cp")
    
    # Fit the transformer and get the binning table
    optb.fit(X_train[feature], y_train)
    
    # Store the transformation details
    woe_mapping[feature] = optb
    
    # Extract Information Value
    iv_table = optb.binning_table.build()
    iv_value = iv_table['IV'].sum()
    iv_results.append({'Variable_Name': feature, 'IV': iv_value})

# B. Extract Information Value (IV) Analysis
iv_df = pd.DataFrame(iv_results).sort_values(by='IV', ascending=False)
print("\n--- Information Value (IV) Analysis (optbinning) ---")
print(iv_df)

# C. Apply WoE Transformation (using the stored mapping)
def apply_woe(df, mapping):
    df_woe = pd.DataFrame(index=df.index)
    for feature, optb in mapping.items():
        # Get the WoE values directly from the fitted transformer
        df_woe[f'{feature}_woe'] = optb.transform(df[feature], metric="woe")
    return df_woe

X_train_woe_rfm = apply_woe(X_train, woe_mapping)
X_test_woe_rfm = apply_woe(X_test, woe_mapping)
print("\nWoE Transformation Complete using optbinning.")


# D. Initialize and Fit OneHotEncoder (for Categorical features - unchanged)
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(X_train[CATEGORICAL_FEATS])

# Apply OHE Transformation
X_train_ohe_cat = ohe.transform(X_train[CATEGORICAL_FEATS])
X_test_ohe_cat = ohe.transform(X_test[CATEGORICAL_FEATS])

# Convert OHE output to DataFrame
cat_col_names = ohe.get_feature_names_out(CATEGORICAL_FEATS)
X_train_ohe_cat_df = pd.DataFrame(X_train_ohe_cat, columns=cat_col_names, index=X_train.index)
X_test_ohe_cat_df = pd.DataFrame(X_test_ohe_cat, columns=cat_col_names, index=X_test.index)

# E. Concatenate the transformed features
X_train_final = pd.concat([X_train_woe_rfm.reset_index(drop=True), X_train_ohe_cat_df.reset_index(drop=True)], axis=1)
X_test_final = pd.concat([X_test_woe_rfm.reset_index(drop=True), X_test_ohe_cat_df.reset_index(drop=True)], axis=1)
print("Feature Transformation Complete and Combined.")

# --- 5. Model Training (Logistic Regression) ---
model = LogisticRegression(random_state=42, solver='liblinear')
model.fit(X_train_final, y_train)

print("\nLogistic Regression Model Trained.")

# --- 6. Model Evaluation ---
y_pred_proba = model.predict_proba(X_test_final)[:, 1]
auc_roc = roc_auc_score(y_test, y_pred_proba)
y_pred = model.predict(X_test_final)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n--- Model Evaluation ---")
print(f"AUC-ROC Score: {auc_roc:.4f}")
print(f"Accuracy: {accuracy:.4f}")

# --- 7. Save Model and Transformers ---
# Save only the necessary artifacts
joblib.dump(model, 'model_artifacts/logistic_regression_model.pkl')
joblib.dump(woe_mapping, 'model_artifacts/woe_mapping_optbinning.pkl') # Save the new mapping
joblib.dump(ohe, 'model_artifacts/ohe_transformer.pkl') 
print("\nModel, optbinning WoE Mapping, and OHE saved to artifacts.")

Loaded dataset with target. Shape: (3742, 8)
Train/Test Split complete. Train shape: (2993, 5)

Fitting OptimalBinning (WoE Transformation)...


NameError: name 'OptimalBinning' is not defined

Collecting optbinning
  Using cached optbinning-0.21.0-py3-none-any.whl.metadata (2.1 kB)
Collecting ortools<9.12,>=9.4 (from optbinning)
  Using cached ortools-9.11.4210-cp312-cp312-win_amd64.whl.metadata (3.0 kB)
Collecting ropwr>=1.0.0 (from optbinning)
  Using cached ropwr-1.1.0-py3-none-any.whl.metadata (1.4 kB)
Collecting absl-py>=2.0.0 (from ortools<9.12,>=9.4->optbinning)
  Using cached absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting protobuf<5.27,>=5.26.1 (from ortools<9.12,>=9.4->optbinning)
  Using cached protobuf-5.26.1-cp310-abi3-win_amd64.whl.metadata (592 bytes)
Collecting immutabledict>=3.0.0 (from ortools<9.12,>=9.4->optbinning)
  Using cached immutabledict-4.2.2-py3-none-any.whl.metadata (3.5 kB)
Collecting cvxpy>=1.1.14 (from ropwr>=1.0.0->optbinning)
  Using cached cvxpy-1.7.5-cp312-cp312-win_amd64.whl.metadata (9.7 kB)
Collecting osqp>=1.0.0 (from cvxpy>=1.1.14->ropwr>=1.0.0->optbinning)
  Using cached osqp-1.0.5-cp312-cp312-win_amd64.whl.metadata (2.1 



Collecting optbinning
  Downloading optbinning-0.21.0-py3-none-any.whl.metadata (2.1 kB)
Collecting ortools<9.12,>=9.4 (from optbinning)
  Downloading ortools-9.11.4210-cp312-cp312-win_amd64.whl.metadata (3.0 kB)
Collecting ropwr>=1.0.0 (from optbinning)
  Downloading ropwr-1.1.0-py3-none-any.whl.metadata (1.4 kB)
Collecting absl-py>=2.0.0 (from ortools<9.12,>=9.4->optbinning)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting protobuf<5.27,>=5.26.1 (from ortools<9.12,>=9.4->optbinning)
  Downloading protobuf-5.26.1-cp310-abi3-win_amd64.whl.metadata (592 bytes)
Collecting immutabledict>=3.0.0 (from ortools<9.12,>=9.4->optbinning)
  Downloading immutabledict-4.2.2-py3-none-any.whl.metadata (3.5 kB)
Collecting cvxpy>=1.1.14 (from ropwr>=1.0.0->optbinning)
  Downloading cvxpy-1.7.5-cp312-cp312-win_amd64.whl.metadata (9.7 kB)
Collecting osqp>=1.0.0 (from cvxpy>=1.1.14->ropwr>=1.0.0->optbinning)
  Downloading osqp-1.0.5-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Coll



In [None]:
# Assuming you are running this in a dedicated script or notebook AFTER the data split

import mlflow
import json
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# --- Start MLflow Experiment ---
MLFLOW_TRACKING_URI = "sqlite:///mlruns.db"  # Use a local database for tracking
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("Credit_Risk_Model_Selection")


# --- MODEL 1: Logistic Regression (Hyperparameter Tuning) ---
with mlflow.start_run(run_name="LogReg_GridSearch"):
    mlflow.log_param("model_name", "LogisticRegression")
    
    # Define Hyperparameter Search Space
    param_grid_lr = {
        'C': [0.1, 1.0, 10.0],
        'solver': ['liblinear', 'lbfgs']
    }
    
    # Use GridSearchCV for tuning
    grid_search_lr = GridSearchCV(
        estimator=LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
        param_grid=param_grid_lr,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1
    )
    
    # Scale data for Logistic Regression convergence
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    grid_search_lr.fit(X_train_scaled, y_train)
    best_lr = grid_search_lr.best_estimator_

    # Log Best Parameters
    mlflow.log_params(best_lr.get_params())
    
    # Evaluation
    y_pred_proba_lr = best_lr.predict_proba(X_test_scaled)[:, 1]
    y_pred_lr = best_lr.predict(X_test_scaled)
    metrics_lr = evaluate_model(y_test, y_pred_lr, y_pred_proba_lr)
    
    # Log Metrics
    mlflow.log_metrics(metrics_lr)
    
    # Log Model and Scaler Artifacts
    mlflow.sklearn.log_model(best_lr, "logreg_model")
    mlflow.sklearn.log_model(scaler, "scaler_artifact")
    print(f"LogReg AUC: {metrics_lr['roc_auc']:.4f}")


# --- MODEL 2: Decision Tree (Hyperparameter Tuning) ---
with mlflow.start_run(run_name="DecisionTree_GridSearch"):
    mlflow.log_param("model_name", "DecisionTree")
    
    # Define Hyperparameter Search Space
    param_grid_dt = {
        'max_depth': [3, 5, 7, 10],
        'min_samples_split': [2, 5, 10],
        'criterion': ['gini', 'entropy']
    }
    
    # Use GridSearchCV for tuning
    grid_search_dt = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=RANDOM_STATE),
        param_grid=param_grid_dt,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1
    )
    
    grid_search_dt.fit(X_train, y_train)
    best_dt = grid_search_dt.best_estimator_

    # Log Best Parameters
    mlflow.log_params(best_dt.get_params())
    
    # Evaluation
    y_pred_proba_dt = best_dt.predict_proba(X_test)[:, 1]
    y_pred_dt = best_dt.predict(X_test)
    metrics_dt = evaluate_model(y_test, y_pred_dt, y_pred_proba_dt)
    
    # Log Metrics
    mlflow.log_metrics(metrics_dt)
    
    # Log Model Artifact
    mlflow.sklearn.log_model(best_dt, "dt_model")
    print(f"DecTree AUC: {metrics_dt['roc_auc']:.4f}")

In [None]:
# Assuming the Logistic Regression run ID was 'your_best_run_id'
# and the artifact path was 'logreg_model'

# Replace with the actual run ID
BEST_RUN_ID = "YOUR_BEST_LOGREG_RUN_ID" 
MODEL_URI = f"runs:/{BEST_RUN_ID}/logreg_model"
MODEL_NAME = "CreditRisk_RFM_Classifier"

# Register the model
model_version = mlflow.register_model(model_uri=MODEL_URI, name=MODEL_NAME)

print(f"Model registered as: {MODEL_NAME}, Version: {model_version.version}")