# import modules

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")


# import data processing

In [41]:
import importlib.util
import sys
import os
module_path = r'C:\Users\user\Desktop\Project\Bati_Bank\src\data_processing.py'
spec = importlib.util.spec_from_file_location("data_loader", module_path)
data_loader = importlib.util.module_from_spec(spec)
spec.loader.exec_module(data_loader)

# now you can access your function
load_data = data_loader.load_data

# test
print(load_data)

<function load_data at 0x0000026A1ECD3420>


# load data

In [42]:
df = load_data(r"C:\Users\user\Desktop\Project\Bati_Bank\data\raw\loan.csv")

# Aggregate

In [43]:
customer_agg = (
    df.groupby("CustomerId")
    .agg(
        total_transaction_amount=("Amount", "sum"),
        avg_transaction_amount=("Amount", "mean"),
       transaction_count=("Amount", "count"),
        std_transaction_amount=("Amount", "std")
    )
    .reset_index()
)

customer_agg.head()


Unnamed: 0,CustomerId,total_transaction_amount,avg_transaction_amount,transaction_count,std_transaction_amount
0,CustomerId_1,-10000.0,-10000.0,1,
1,CustomerId_10,-10000.0,-10000.0,1,
2,CustomerId_1001,20000.0,4000.0,5,6558.963333
3,CustomerId_1002,4225.0,384.090909,11,560.498966
4,CustomerId_1003,20000.0,3333.333333,6,6030.478146


# Time-Based Feature Extraction

In [44]:
df["TransactionStartTime"] = pd.to_datetime(df["TransactionStartTime"])

df["transaction_hour"] = df["TransactionStartTime"].dt.hour
df["transaction_day"] = df["TransactionStartTime"].dt.day
df["transaction_month"] = df["TransactionStartTime"].dt.month
df["transaction_year"] = df["TransactionStartTime"].dt.year


# Merge Aggregate Features Back

In [45]:
df = df.merge(customer_agg, on="CustomerId", how="left")


# Drop Loan_ID 

In [46]:
if "TransactionId" in df.columns:
    df = df.drop(columns=["TransactionId"])

In [47]:
# Create binary target
df["FraudResult_Binary"] = df["FraudResult"].map({"Y": 0, "N": 1})

# WOE + IV FOR CATEGORICAL VARIABLES

In [48]:
def woe_iv_categorical(df, feature, target="FraudResult"):
    temp = df[[feature, target]].copy()
    temp = temp[temp[feature].notna()]
    temp[feature] = temp[feature].astype(str)

    grouped = temp.groupby(feature)[target].agg(["count", "sum"])
    grouped.columns = ["total", "bad"]
    grouped["good"] = grouped["total"] - grouped["bad"]

    grouped["good_dist"] = grouped["good"] / grouped["good"].sum()
    grouped["bad_dist"] = grouped["bad"] / grouped["bad"].sum()

    # Avoid division by zero
    grouped["good_dist"] = grouped["good_dist"].replace(0, 1e-9)
    grouped["bad_dist"] = grouped["bad_dist"].replace(0, 1e-9)

    grouped["WOE"] = np.log(grouped["good_dist"] / grouped["bad_dist"])
    grouped["IV"] = (grouped["good_dist"] - grouped["bad_dist"]) * grouped["WOE"]

    return grouped.reset_index(), grouped["IV"].sum()

# WOE ENCODING 

In [49]:
ProductCategory_table, ProductCategory_iv = woe_iv_categorical(df, "ProductCategory")
print("\nWOE Table for ProductCategory:")
print(ProductCategory_table)
print("\nIV for ProductCategory:", ProductCategory_iv)

# Map to WOE
ProductCategory_woe_map = dict(zip(ProductCategory_table["ProductCategory"].astype(str), ProductCategory_table["WOE"]))
df["ProductCategory_WOE"] = df["ProductCategory"].astype(str).map(ProductCategory_woe_map)


WOE Table for ProductCategory:
      ProductCategory  total  bad   good  good_dist      bad_dist        WOE  \
0             airtime  45027   18  45009   0.471451  9.326425e-02   1.620379   
1        data_bundles   1613    0   1613   0.016896  1.000000e-09  16.642560   
2  financial_services  45405  161  45244   0.473913  8.341969e-01  -0.565446   
3              movies    175    0    175   0.001833  1.000000e-09  14.421495   
4               other      2    0      2   0.000021  1.000000e-09   9.949856   
5              ticket    216    0    216   0.002263  1.000000e-09  14.631987   
6           transport     25    2     23   0.000241  1.036269e-02  -3.761520   
7                  tv   1279    0   1279   0.013397  1.000000e-09  16.410543   
8        utility_bill   1920   12   1908   0.019986  6.217617e-02  -1.134962   

         IV  
0  0.612807  
1  0.281185  
2  0.203721  
3  0.026435  
4  0.000208  
5  0.033105  
6  0.038073  
7  0.219852  
8  0.047885  

IV for ProductCategory: 1.

# IV FOR NUMERIC VARIABLES (automatic binning)

In [50]:
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
numeric_cols.remove("FraudResult")  # exclude target

def iv_numeric(df, feature, target="FraudResult", bins=5):
    df2 = df[[feature, target]].copy()
    df2["bin"] = pd.qcut(df2[feature], q=bins, duplicates="drop")

    grouped = df2.groupby("bin")[target].agg(["count", "sum"])
    grouped.columns = ["total", "bad"]
    grouped["good"] = grouped["total"] - grouped["bad"]

    grouped["good_dist"] = grouped["good"] / grouped["good"].sum()
    grouped["bad_dist"] = grouped["bad"] / grouped["bad"].sum()

    grouped["good_dist"] = grouped["good_dist"].replace(0, 1e-9)
    grouped["bad_dist"] = grouped["bad_dist"].replace(0, 1e-9)

    grouped["WOE"] = np.log(grouped["good_dist"] / grouped["bad_dist"])
    grouped["IV"] = (grouped["good_dist"] - grouped["bad_dist"]) * grouped["WOE"]

    return grouped.reset_index(), grouped["IV"].sum()


numeric_iv_results = []

for col in numeric_cols:
    table, iv = iv_numeric(df, col)
    numeric_iv_results.append([col, iv])

numeric_iv_df = pd.DataFrame(numeric_iv_results, columns=["Feature", "IV"])

# IV FOR CATEGORICAL VARIABLES

In [51]:
categorical_cols = df.select_dtypes(include="object").columns.tolist()
cat_iv_results = []
for col in categorical_cols:
    table, iv = woe_iv_categorical(df, col)
    cat_iv_results.append([col, iv])

categorical_iv_df = pd.DataFrame(cat_iv_results, columns=["Feature", "IV"])


# COMBINE IV AND SELECT FEATURES

In [52]:
combined_iv = pd.concat([categorical_iv_df, numeric_iv_df], ignore_index=True)
combined_iv = combined_iv.sort_values("IV", ascending=False)

print("\n\n=== Combined IV Ranking ===")
print(combined_iv)
# Keep variables with IV >= 0.02
selected_features = combined_iv[combined_iv["IV"] >= 0.02]["Feature"].tolist()
print("\nSelected Predictive Features (IV >= 0.02):")
print(selected_features)



=== Combined IV Ranking ===
                     Feature         IV
0                    BatchId  24.762054
3                 CustomerId  18.731850
20    std_transaction_amount  16.626277
1                  AccountId  14.754754
2             SubscriptionId  14.544861
11                     Value  10.112496
10                    Amount   9.914119
17  total_transaction_amount   9.354279
18    avg_transaction_amount   6.893800
6                  ProductId   5.017794
5                 ProviderId   3.329529
7            ProductCategory   1.463272
22       ProductCategory_WOE   1.460396
8                  ChannelId   1.350916
19         transaction_count   0.573731
13          transaction_hour   0.101902
14           transaction_day   0.095925
12           PricingStrategy   0.085529
15         transaction_month   0.063391
4               CurrencyCode   0.000000
9                CountryCode   0.000000
16          transaction_year   0.000000
21        FraudResult_Binary   0.000000

Selected 

# WOE TRANSFORM SELECTED FEATURES

In [53]:
df_woe = df.copy()
# Categorical WOE
for f in selected_features:
    if f in categorical_cols:
        table, _ = woe_iv_categorical(df, f)
        woe_map = dict(zip(table[f].astype(str), table["WOE"]))
        df_woe[f + "_WOE"] = df[f].astype(str).map(woe_map)

# Numeric WOE
for f in selected_features:
    if f in numeric_cols:
        bins = pd.qcut(df[f], q=5, duplicates="drop")
        table, _ = iv_numeric(df, f)
        woe_map = dict(zip(table["bin"].astype(str), table["WOE"]))
        df_woe[f + "_bin"] = bins.astype(str)
        df_woe[f + "_WOE"] = df_woe[f + "_bin"].map(woe_map)

# Final list of WOE features
woe_features = [c for c in df_woe.columns if c.endswith("_WOE")]

# IMPUTE MISSING VALUES

In [54]:
X = df_woe[woe_features]
y = df_woe["FraudResult"]

imputer = SimpleImputer(strategy='median')  # works for numeric WOE
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# LOGISTIC REGRESSION WITH WOE FEATURES

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_prob)
print("\nLogistic Regression AUC Score:", auc)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Logistic Regression AUC Score: 0.6902911668707387

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28641
           1       0.00      0.00      0.00        58

    accuracy                           1.00     28699
   macro avg       0.50      0.50      0.50     28699
weighted avg       1.00      1.00      1.00     28699

