### Imports + Downloading Data

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pylab as plt
from cycler import cycler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
import xgboost as xgb

In [2]:
np.random.seed(314159)
train_txn = pd.read_csv('ieee-fraud-detection/train_transaction.csv')
test_txn = pd.read_csv('ieee-fraud-detection/test_transaction.csv')
# train_id = pd.read_csv('ieee-fraud-detection/train_identity.csv')

In [3]:
COLUMNS = ['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4',
               'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain', "V257","V246","V244","V242","V201","V200","V189","V188","V258","V45","V158","V156","V149","V228","V44","V86","V87","V170","V147","V52"]

In [4]:
df = train_txn[COLUMNS]
df.shape

(590540, 37)

In [5]:
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V158,V156,V149,V228,V44,V86,V87,V170,V147,V52
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,1.0,1.0,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,1.0,1.0,1.0,,,0.0
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,1.0,1.0,1.0,,,0.0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,1.0,1.0,1.0,,,0.0
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,1.0,,,,1.0,0.0,


### Reformatting data for the Model

In [19]:
# One-hot encode these columns
ONE_HOT_COLUMNS = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain']
df_dummies = pd.get_dummies(df, columns=ONE_HOT_COLUMNS, prefix=ONE_HOT_COLUMNS, dummy_na=False) #dummy_na can be changed if it performs better w/o
df_dummies.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,R_emaildomain_web.de,R_emaildomain_windstream.net,R_emaildomain_yahoo.co.jp,R_emaildomain_yahoo.co.uk,R_emaildomain_yahoo.com,R_emaildomain_yahoo.com.mx,R_emaildomain_yahoo.de,R_emaildomain_yahoo.es,R_emaildomain_yahoo.fr,R_emaildomain_ymail.com
0,2987000,0,86400,68.5,13926,,150.0,142.0,315.0,87.0,...,0,0,0,0,0,0,0,0,0,0
1,2987001,0,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,...,0,0,0,0,0,0,0,0,0,0
2,2987002,0,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,...,0,0,0,0,0,0,0,0,0,0
3,2987003,0,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,...,0,0,0,0,0,0,0,0,0,0
4,2987004,0,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# List of columns you didn't one-hot encode
columns_to_fill_with_mean = list(set(df.columns) - set(ONE_HOT_COLUMNS))
    # [col for col in df.columns if col not in ONE_HOT_COLUMNS]

# Fill NaN values with column means
df_dummies[columns_to_fill_with_mean] = df_dummies[columns_to_fill_with_mean].fillna(df_dummies[columns_to_fill_with_mean].median())

In [21]:
df_dummies.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,R_emaildomain_web.de,R_emaildomain_windstream.net,R_emaildomain_yahoo.co.jp,R_emaildomain_yahoo.co.uk,R_emaildomain_yahoo.com,R_emaildomain_yahoo.com.mx,R_emaildomain_yahoo.de,R_emaildomain_yahoo.es,R_emaildomain_yahoo.fr,R_emaildomain_ymail.com
0,2987000,0,86400,68.5,13926,361.0,150.0,142.0,315.0,87.0,...,0,0,0,0,0,0,0,0,0,0
1,2987001,0,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,...,0,0,0,0,0,0,0,0,0,0
2,2987002,0,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,...,0,0,0,0,0,0,0,0,0,0
3,2987003,0,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,...,0,0,0,0,0,0,0,0,0,0
4,2987004,0,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,...,0,0,0,0,0,0,0,0,0,0


### Model Building - XGBoost

In [22]:
X = df_dummies.drop('isFraud', axis=1)
y = df_dummies['isFraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [23]:
clf = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    missing=-999,
    random_state=42,
    scale_pos_weight = 30.0,
    eval_metric = 'auc'
)

In [24]:
clf.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=-999, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...)

In [25]:
y_pred = clf.predict_proba(X_test).T[1]

In [26]:
roc_auc_score(y_test, y_pred)

0.9133092814817603

### Model 2 - Keras

In [14]:
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD
from keras.models import Sequential
import tensorflow as tf

In [15]:
model = Sequential()
model.add(Dense(units=16, activation='relu', input_dim=X_test.shape[1]))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='bce', metrics=['accuracy'])

2023-10-25 13:47:58.323591: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
model.fit(X_train, y_train, epochs=20)

2023-10-25 13:47:59.358719: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fb358a171f0>

In [17]:
model.evaluate(X_test, y_test)



[0.15170331299304962, 0.9650065898895264]

In [18]:
model.predict(X_test)

array([[0.03580183],
       [0.03580183],
       [0.03580183],
       ...,
       [0.03580183],
       [0.03580183],
       [0.03580183]], dtype=float32)