
DATASETS OVERVIEW AND DESCRIPTION FOR MACHINE LEARNING CLASS  - INDIVIDUAL TASK



1. Main Objective
To design and implement an end-to-end machine learning and deep learning pipeline that can predict the probability of an online transaction being fraudulent

2. Task Overview:
"In this assignment, you will build an end-to-end fraud detection, You will work with both the transaction and identity tables, perform data cleaning and preprocessing, handle missing values and class imbalance, and engineer or select relevant features. You are required to implement machine learning or deep learning models to predict the probability that a transaction is fraudulent (isFraud). The workflow should cover data preprocessing, model training, hyperparameter tuning (at a basic level), evaluation using appropriate metrics"

3. Link Datasets:
https://drive.google.com/drive/folders/1JvI5xhPfN3VmjpWYZk9fCHG41xG697um

4. Link Notebook:
https://colab.research.google.com/drive/1oz46ISmhMqGWVSsHWQcfdzZYy0tR4kVH?usp=sharing

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    roc_auc_score, average_precision_score, classification_report,
    confusion_matrix, precision_recall_curve
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [3]:
import os, re

DATA_DIR = "/content/drive/MyDrive/UAS ML DL/Fraud Transcation (ML)"  # pastikan sama persis

print("DATA_DIR exists?", os.path.exists(DATA_DIR))
print("\nIsi folder:")
files = sorted(os.listdir(DATA_DIR))
for f in files:
    print("-", f)

train_path = next((os.path.join(DATA_DIR, f) for f in files if re.match(r"train_transaction.*\.csv$", f)), None)
test_path  = next((os.path.join(DATA_DIR, f) for f in files if re.match(r"test_transaction.*\.csv$", f)), None)

print("\nDetected train_path:", train_path)
print("Detected test_path :", test_path)


DATA_DIR exists? True

Isi folder:
- Fraud Transaction.ipynb
- submission_fraud.csv
- test_transaction.csv
- train_transaction.csv

Detected train_path: /content/drive/MyDrive/UAS ML DL/Fraud Transcation (ML)/train_transaction.csv
Detected test_path : /content/drive/MyDrive/UAS ML DL/Fraud Transcation (ML)/test_transaction.csv


In [4]:
import pandas as pd

train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)

print("Train shape:", train.shape)
print("Test shape :", test.shape)

print("\nCek kolom wajib:")
print("TransactionID in train?", "TransactionID" in train.columns)
print("isFraud in train?", "isFraud" in train.columns)
print("TransactionID in test?", "TransactionID" in test.columns)
print("isFraud in test?", "isFraud" in test.columns)

print("\nPreview train:")
display(train.head())

print("\nPreview test:")
display(test.head())


Train shape: (590540, 394)
Test shape : (506691, 393)

Cek kolom wajib:
TransactionID in train? True
isFraud in train? True
TransactionID in test? True
isFraud in test? False

Preview train:


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Preview test:


Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,3663549,18403224,31.95,W,10409,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,,,,,,,,,,
3,3663552,18403310,284.95,W,10989,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
4,3663553,18403317,67.95,W,18018,452.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,


In [5]:
import numpy as np

target = "isFraud"

# simpan id untuk submission
test_ids = test["TransactionID"].copy()

# pisahkan target & fitur
y = train[target].astype(int)
X = train.drop(columns=[target])

# buang TransactionID dari fitur (identifier)
X = X.drop(columns=["TransactionID"], errors="ignore")
test = test.drop(columns=["TransactionID"], errors="ignore")

# bersihkan nilai inf menjadi NaN
X = X.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

print("X shape:", X.shape)
print("test shape:", test.shape)
print("Fraud ratio (y.mean):", y.mean())
print("Jumlah missing (X):", int(X.isna().sum().sum()))
print("Jumlah missing (test):", int(test.isna().sum().sum()))


X shape: (590540, 392)
test shape: (506691, 392)
Fraud ratio (y.mean): 0.03499000914417313
Jumlah missing (X): 95566686
Jumlah missing (test): 73490163


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_valid:", X_valid.shape, "y_valid:", y_valid.shape)

print("Fraud ratio train:", y_train.mean())
print("Fraud ratio valid:", y_valid.mean())


X_train: (472432, 392) y_train: (472432,)
X_valid: (118108, 392) y_valid: (118108,)
Fraud ratio train: 0.03498916246147594
Fraud ratio valid: 0.0349933958749619


In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score

# pisahkan kolom numeric & categorical
num_cols = X_train.select_dtypes(include=["int64","float64","int32","float32"]).columns.tolist()
cat_cols = [c for c in X_train.columns if c not in num_cols]

print("Numeric cols:", len(num_cols))
print("Categorical cols:", len(cat_cols))

numeric_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

categorical_tf = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols),
    ],
    remainder="drop"
)

logreg = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", LogisticRegression(max_iter=300, class_weight="balanced"))
])

logreg.fit(X_train, y_train)
p_valid = logreg.predict_proba(X_valid)[:, 1]

roc = roc_auc_score(y_valid, p_valid)
pr  = average_precision_score(y_valid, p_valid)

print("LogReg | ROC-AUC:", roc, "| PR-AUC:", pr)


Numeric cols: 378
Categorical cols: 14


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogReg | ROC-AUC: 0.7453853429193295 | PR-AUC: 0.13730816967359938


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score

rf = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        n_jobs=-1,
        class_weight="balanced_subsample"
    ))
])

rf.fit(X_train, y_train)
p_valid_rf = rf.predict_proba(X_valid)[:, 1]

roc_rf = roc_auc_score(y_valid, p_valid_rf)
pr_rf  = average_precision_score(y_valid, p_valid_rf)

print("RF | ROC-AUC:", roc_rf, "| PR-AUC:", pr_rf)


RF | ROC-AUC: 0.9407063270833512 | PR-AUC: 0.7364622807426647


In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_curve, classification_report, confusion_matrix

# 1) tabel ringkas hasil (buat laporan)
results = pd.DataFrame([
    {"model": "LogisticRegression", "roc_auc": roc, "pr_auc": pr},
    {"model": "RandomForest",       "roc_auc": roc_rf, "pr_auc": pr_rf},
]).sort_values("pr_auc", ascending=False)

print("=== Model Comparison ===")
display(results)

# 2) pilih threshold terbaik berdasarkan F1 di validation
prec, rec, thr = precision_recall_curve(y_valid, p_valid_rf)
f1 = 2 * (prec * rec) / (prec + rec + 1e-12)

best_idx = int(np.argmax(f1))
best_thr = float(thr[best_idx-1]) if best_idx > 0 else 0.5  # aman kalau idx=0

print("\n=== Thresholding (F1-based) ===")
print("Best F1:", float(f1[best_idx]))
print("Best threshold:", best_thr)

y_pred_rf = (p_valid_rf >= best_thr).astype(int)

print("\n=== Classification Report (RF) ===")
print(classification_report(y_valid, y_pred_rf, digits=4))

print("\n=== Confusion Matrix (RF) ===")
print(confusion_matrix(y_valid, y_pred_rf))


=== Model Comparison ===


Unnamed: 0,model,roc_auc,pr_auc
1,RandomForest,0.940706,0.736462
0,LogisticRegression,0.745385,0.137308



=== Thresholding (F1-based) ===
Best F1: 0.6930315361134382
Best threshold: 0.19666666666666666

=== Classification Report (RF) ===
              precision    recall  f1-score   support

           0     0.9878    0.9909    0.9893    113975
           1     0.7247    0.6637    0.6929      4133

    accuracy                         0.9794    118108
   macro avg     0.8563    0.8273    0.8411    118108
weighted avg     0.9786    0.9794    0.9790    118108


=== Confusion Matrix (RF) ===
[[112933   1042]
 [  1390   2743]]


In [10]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
import numpy as np

# 1) ambil subset untuk tuning (misal 120k baris)
X_tune, _, y_tune, _ = train_test_split(
    X_train, y_train,
    train_size=120000,
    random_state=42,
    stratify=y_train
)

print("X_tune:", X_tune.shape, "Fraud ratio:", y_tune.mean())

# 2) pipeline RF khusus tuning (pohon lebih sedikit biar cepat)
rf_tune = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", RandomForestClassifier(
        n_estimators=150,
        random_state=42,
        n_jobs=-1,  # ini n_jobs milik RF internal; aman karena search-nya n_jobs=1
        class_weight="balanced_subsample"
    ))
])

param_dist = {
    "clf__max_depth": [None, 10, 20, 30],
    "clf__min_samples_leaf": [1, 2, 5, 10],
    "clf__max_features": ["sqrt", "log2", None],
}

cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    rf_tune,
    param_distributions=param_dist,
    n_iter=5,
    scoring="average_precision",
    cv=cv,
    random_state=42,
    n_jobs=1,          # PENTING: jangan -1
    verbose=2,
    pre_dispatch=1     # bantu hemat RAM
)

search.fit(X_tune, y_tune)

print("Best params:", search.best_params_)
print("Best CV PR-AUC:", search.best_score_)

best_rf = search.best_estimator_


X_tune: (120000, 392) Fraud ratio: 0.034991666666666664
Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] END clf__max_depth=20, clf__max_features=sqrt, clf__min_samples_leaf=10; total time=  10.1s
[CV] END clf__max_depth=20, clf__max_features=sqrt, clf__min_samples_leaf=10; total time=   9.9s
[CV] END clf__max_depth=30, clf__max_features=log2, clf__min_samples_leaf=1; total time=   8.4s
[CV] END clf__max_depth=30, clf__max_features=log2, clf__min_samples_leaf=1; total time=   8.4s
[CV] END clf__max_depth=20, clf__max_features=sqrt, clf__min_samples_leaf=5; total time=  10.1s
[CV] END clf__max_depth=20, clf__max_features=sqrt, clf__min_samples_leaf=5; total time=   9.8s
[CV] END clf__max_depth=30, clf__max_features=log2, clf__min_samples_leaf=10; total time=   7.7s
[CV] END clf__max_depth=30, clf__max_features=log2, clf__min_samples_leaf=10; total time=   8.0s
[CV] END clf__max_depth=20, clf__max_features=sqrt, clf__min_samples_leaf=1; total time=  10.2s
[CV] END clf__ma

In [11]:
from sklearn.metrics import roc_auc_score, average_precision_score

# evaluasi best_rf hasil tuning pada validation full
p_valid_best = best_rf.predict_proba(X_valid)[:, 1]

roc_best = roc_auc_score(y_valid, p_valid_best)
pr_best  = average_precision_score(y_valid, p_valid_best)

print("Tuned RF (150 trees) | ROC-AUC:", roc_best, "| PR-AUC:", pr_best)


Tuned RF (150 trees) | ROC-AUC: 0.902583904223821 | PR-AUC: 0.5421079847684398


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

final_rf = Pipeline(steps=[
    ("prep", preprocess),
    ("clf", RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        n_jobs=-1,
        class_weight="balanced_subsample",
        max_depth=20,
        max_features="sqrt",
        min_samples_leaf=10
    ))
])

final_rf.fit(X_train, y_train)
p_valid_final = final_rf.predict_proba(X_valid)[:, 1]

roc_final = roc_auc_score(y_valid, p_valid_final)
pr_final  = average_precision_score(y_valid, p_valid_final)

print("Final Tuned RF (300 trees) | ROC-AUC:", roc_final, "| PR-AUC:", pr_final)


Final Tuned RF (300 trees) | ROC-AUC: 0.9277353760654127 | PR-AUC: 0.6260872675114829


In [13]:
import pandas as pd
import os

# Pastikan rf adalah pipeline baseline RF yang PR-AUC-nya 0.736 (dari CELL 7)
rf.fit(X, y)
test_proba = rf.predict_proba(test)[:, 1]

submission = pd.DataFrame({
    "TransactionID": test_ids,
    "isFraud": test_proba
})

out_path = os.path.join(DATA_DIR, "submission_fraud_rf_baseline.csv")
submission.to_csv(out_path, index=False)

print("Saved:", out_path)
submission.head()


Saved: /content/drive/MyDrive/UAS ML DL/Fraud Transcation (ML)/submission_fraud_rf_baseline.csv


Unnamed: 0,TransactionID,isFraud
0,3663549,0.013333
1,3663550,0.01
2,3663551,0.02
3,3663552,0.016667
4,3663553,0.003333


In [14]:
import pandas as pd

out_path = "/content/drive/MyDrive/UAS ML DL/Fraud Transcation (ML)/submission_fraud_rf_baseline.csv"
sub = pd.read_csv(out_path)

print("Shape:", sub.shape)
print("Columns:", sub.columns.tolist())
print("Nulls:", sub.isna().sum().to_dict())
print("isFraud min/max:", sub["isFraud"].min(), sub["isFraud"].max())
print("Unique TransactionID:", sub["TransactionID"].nunique())

sub.head()


Shape: (506691, 2)
Columns: ['TransactionID', 'isFraud']
Nulls: {'TransactionID': 0, 'isFraud': 0}
isFraud min/max: 0.0 0.9966666666666668
Unique TransactionID: 506691


Unnamed: 0,TransactionID,isFraud
0,3663549,0.013333
1,3663550,0.01
2,3663551,0.02
3,3663552,0.016667
4,3663553,0.003333
