In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import FunctionTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
train = pd.read_csv("data/train_merged.csv")
test =  pd.read_csv("data/test_merged.csv")

# Base Line training - just encoded data, fill missing and use all column

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12014 entries, 0 to 12013
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: float64(399), int64(4), object(31)
memory usage: 39.8+ MB


In [38]:
target = 'isFraud'
y = train[target]
X = train.drop(columns=[target])
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

In [39]:
numeic_tf = Pipeline (
    steps = [
        ('imputer', SimpleImputer(strategy='median')),
    ]
)

categorical_tf = Pipeline (
    steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

preprocess = ColumnTransformer (
    transformers = [
        ('num', numeic_tf, num_cols),
        ('cat', categorical_tf, cat_cols),
    ] ,
    remainder = 'drop'
)

## Logistic regression , Decision tree , Random Forest, Xg boost, lightGBM

In [44]:
to_dense = FunctionTransformer(
    lambda x: x.toarray() if hasattr(x, "toarray") else x,
    accept_sparse=True
)

In [45]:
pos = y.sum()
neg = len(y) - pos
scale_pos_weight = float(neg / pos) if pos > 0 else 1.0

In [46]:
models = {
    "logistic": Pipeline([
        ("prep", preprocess),
        ("clf", LogisticRegression(max_iter=2000, class_weight="balanced"))
    ]),
    "decision_tree": Pipeline([
        ("prep", preprocess),
        ("to_dense", to_dense),
        ("clf", DecisionTreeClassifier(class_weight="balanced", random_state=42))
    ]),
    "random_forest": Pipeline([
        ("prep", preprocess),
        ("to_dense", to_dense),
        ("clf", RandomForestClassifier(
            n_estimators=300, random_state=42, n_jobs=-1,
            class_weight="balanced_subsample"
        ))
    ]),
}

try:
    from xgboost import XGBClassifier
    models["xgboost"] = Pipeline([
        ("prep", preprocess),
        ("clf", XGBClassifier(
            n_estimators=500, learning_rate=0.05, max_depth=6,
            subsample=0.8, colsample_bytree=0.8, tree_method="hist",
            eval_metric="logloss", n_jobs=-1, scale_pos_weight=scale_pos_weight
        ))
    ])
except Exception as e:
    print(" XGBoost Error", e)

try:
    from lightgbm import LGBMClassifier
    models["lightgbm"] = Pipeline([
        ("prep", preprocess),
        ("clf", LGBMClassifier(
            n_estimators=500, learning_rate=0.05, num_leaves=64,
            subsample=0.8, colsample_bytree=0.8, n_jobs=-1,
            class_weight="balanced"
        ))
    ])
except Exception as e:
    print("LightGBM Error", e)

In [47]:
X_tr, X_va, y_tr, y_va = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

results = {}
for name, pipe in models.items():
    pipe.fit(X_tr, y_tr)
    if hasattr(pipe.named_steps[list(pipe.named_steps.keys())[-1]], "predict_proba"):
        proba = pipe.predict_proba(X_va)[:, 1]
    else:
        from sklearn.calibration import CalibratedClassifierCV
        cal = CalibratedClassifierCV(pipe, cv=3)
        cal.fit(X_tr, y_tr)
        proba = cal.predict_proba(X_va)[:, 1]
    auc = roc_auc_score(y_va, proba)
    results[name] = auc
    print(f"{name:12s} AUC = {auc:.4f}")

print("\n Sorting AUC decesnding :")
for k, v in sorted(results.items(), key=lambda x: x[1], reverse=True):
    print(f"- {k}: {v:.4f}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


logistic     AUC = 0.6752
decision_tree AUC = 0.6906
random_forest AUC = 0.8533
xgboost      AUC = 0.8718
[LightGBM] [Info] Number of positive: 262, number of negative: 9349
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032465 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14907
[LightGBM] [Info] Number of data points in the train set: 9611, number of used features: 495
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




lightgbm     AUC = 0.8630

 Sorting AUC decesnding :
- xgboost: 0.8718
- lightgbm: 0.8630
- random_forest: 0.8533
- decision_tree: 0.6906
- logistic: 0.6752


# Enhance by : feature engineering , feature selection , data preprocessing deely , scaling 

# Hyper tunning parameters