In [2]:
import pandas as pd

tt = pd.read_csv("../data/raw/ieee-fraud-detection/train_transaction.csv")
ti = pd.read_csv("../data/raw/ieee-fraud-detection/train_identity.csv")

df = tt.merge(ti, on="TransactionID", how="left")

print("tt:", tt.shape)
print("ti:", ti.shape)
print("merged df:", df.shape)
print("fraud rate:", df["isFraud"].mean())


tt: (590540, 394)
ti: (144233, 41)
merged df: (590540, 434)
fraud rate: 0.03499000914417313


In [3]:
df["isFraud"].value_counts()

isFraud
0    569877
1     20663
Name: count, dtype: int64

In [4]:
missing = df.isna().mean().sort_values(ascending=False)
missing.head(10)

id_24    0.991962
id_25    0.991310
id_07    0.991271
id_08    0.991271
id_21    0.991264
id_26    0.991257
id_27    0.991247
id_23    0.991247
id_22    0.991247
dist2    0.936284
dtype: float64

In [5]:
high_missing = missing[missing > 0.9]
print("cols >90% missing:", len(high_missing))
high_missing.head(20)

cols >90% missing: 12


id_24    0.991962
id_25    0.991310
id_07    0.991271
id_08    0.991271
id_21    0.991264
id_26    0.991257
id_27    0.991247
id_23    0.991247
id_22    0.991247
dist2    0.936284
D7       0.934099
id_18    0.923607
dtype: float64

In [6]:
cat_cols = df.select_dtypes(include=["object"]).columns
num_cols = df.select_dtypes(exclude=["object"]).columns

print("categorical:", len(cat_cols))
print("numeric:", len(num_cols))
print("sample categorical:", list(cat_cols[:10]))

categorical: 31
numeric: 403
sample categorical: ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5']


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = df.select_dtypes(include=["object"]).columns


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score

# drop high-missing
drop_cols = list(high_missing.index) + ["TransactionID"]  # keep DT for now, we'll decide after
X = df.drop(columns=["isFraud"] + drop_cols)
y = df["isFraud"]

# identify columns
cat_cols = X.select_dtypes(include=["object"]).columns
num_cols = X.columns.difference(cat_cols)

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_cols),
        ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                          ("oh", OneHotEncoder(handle_unknown="ignore"))]), cat_cols),
    ],
    remainder="drop",
)

model = LogisticRegression(max_iter=200, n_jobs=None)

clf = Pipeline([("prep", preprocess), ("model", model)])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf.fit(X_train, y_train)
proba = clf.predict_proba(X_val)[:, 1]

print("PR-AUC:", average_precision_score(y_val, proba))
print("ROC-AUC:", roc_auc_score(y_val, proba))


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X.select_dtypes(include=["object"]).columns
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=200).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


PR-AUC: 0.07948856190053202
ROC-AUC: 0.6502588069310049
