In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
train_transaction = pd.read_csv(
    "train_transaction.csv",
    low_memory = False
)

In [3]:
for col in train_transaction.columns:
    if train_transaction[col].dtype == "float64":
        train_transaction[col] = train_transaction[col].astype("float32")
    elif train_transaction[col].dtype == "int64":
        train_transaction[col] = train_transaction[col].astype("int32")

In [4]:
missing_ratio = train_transaction.isna().mean()
drop_cols = missing_ratio[missing_ratio > 0.9].index

train_transaction.drop(drop_cols, axis = 1, inplace = True)

In [5]:
train_identity = pd.read_csv(
    "train_identity.csv",
    low_memory = False)

for col in train_identity.columns:
    if train_identity[col].dtype == "float64" :
        train_identity[col] = train_identity[col].astype("float32")

In [6]:
train_df = train_transaction.merge(
    train_identity,
    on = "TransactionID",
    how = "left")

In [7]:
import gc

In [8]:
del train_transaction, train_identity
gc.collect()

0

In [9]:
train_df

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.500000,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.000000,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.000000,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.000000,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.000000,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.000000,W,6550,,150.0,visa,226.0,...,,,,,,,,,,
590536,3577536,0,15811049,39.500000,W,10444,225.0,150.0,mastercard,224.0,...,,,,,,,,,,
590537,3577537,0,15811079,30.950001,W,12037,595.0,150.0,mastercard,224.0,...,,,,,,,,,,
590538,3577538,0,15811088,117.000000,W,7826,481.0,150.0,mastercard,224.0,...,,,,,,,,,,


In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 432 entries, TransactionID to DeviceInfo
dtypes: float32(397), int32(4), object(31)
memory usage: 1.0+ GB


In [11]:
train_df.describe()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,id_17,id_18,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32
count,590540.0,590540.0,590540.0,590540.0,590540.0,581607.0,588975.0,586281.0,524834.0,524834.0,...,139369.0,45113.0,139318.0,139261.0,5159.0,5169.0,4747.0,5132.0,5163.0,77586.0
mean,3282270.0,0.03499,7372311.0,135.027161,9898.734658,362.555511,153.194946,199.2789,290.733826,86.800652,...,189.45137,14.237337,353.128174,403.882568,368.269806,16.002708,12.800927,329.608917,149.070312,26.508596
std,170474.4,0.183755,4617224.0,239.162689,4901.170153,157.817963,11.343591,41.332325,101.700386,2.77353,...,30.377136,1.561116,141.11203,152.158493,198.849014,6.897755,2.372468,97.462585,32.101933,3.737366
min,2987000.0,0.0,86400.0,0.251,1000.0,100.0,100.0,100.0,100.0,10.0,...,100.0,10.0,100.0,100.0,100.0,10.0,11.0,100.0,100.0,0.0
25%,3134635.0,0.0,3027058.0,43.320999,6019.0,214.0,150.0,166.0,204.0,87.0,...,166.0,13.0,266.0,256.0,252.0,14.0,11.0,321.0,119.0,24.0
50%,3282270.0,0.0,7306528.0,68.769001,9678.0,361.0,150.0,226.0,299.0,87.0,...,166.0,15.0,341.0,472.0,252.0,14.0,11.0,321.0,149.0,24.0
75%,3429904.0,0.0,11246620.0,125.0,14184.0,512.0,150.0,226.0,330.0,87.0,...,225.0,15.0,427.0,533.0,486.5,14.0,15.0,371.0,169.0,32.0
max,3577539.0,1.0,15811130.0,31937.390625,18396.0,600.0,231.0,237.0,540.0,102.0,...,229.0,29.0,671.0,661.0,854.0,44.0,26.0,548.0,216.0,32.0


In [12]:
train_df['isFraud'].value_counts()

isFraud
0    569877
1     20663
Name: count, dtype: int64

In [13]:
train_df = train_df.sort_values("TransactionDT")

split_ratio = 0.8
split_index = int(len(train_df) * split_ratio)

train_data = train_df.iloc[:split_index]
val_data = train_df.iloc[split_index:]

X_train = train_data.drop(columns=["isFraud" , "TransactionID"])
y_train = train_data["isFraud"]

X_val = val_data.drop(columns = ["isFraud" , "TransactionID"])
y_val = val_data["isFraud"]

print("Train shape :" , X_train.shape)
print("Validation shape :", X_val.shape)

print("Fraud rate in trian :" , y_train.mean())
print("Fraud rate in validation :" , y_val.mean())

Train shape : (472432, 430)
Validation shape : (118108, 430)
Fraud rate in trian : 0.03513521522674162
Fraud rate in validation : 0.034409184813899145


In [14]:
cat_cols = X_train.select_dtypes(include = "object").columns
print(len(cat_cols))
print(cat_cols)

31
Index(['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1',
       'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15',
       'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33',
       'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType',
       'DeviceInfo'],
      dtype='object')


In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()

    X_train[col] = le.fit_transform(X_train[col].astype(str))

    X_val[col] = X_val[col].astype(str)

    unseen = set(X_val[col]) - set(le.classes_)
    if unseen:
        le.classes_ = np.append(le.classes_, list(unseen))

    X_val[col] = le.transform(X_val[col])

    label_encoders[col] = le

In [17]:
X_train

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,86400,68.500000,4,13926,,150.0,1,142.0,1,315.0,...,89,,212,4,2,2,2,2,2,1598
1,86401,29.000000,4,2755,404.0,150.0,2,102.0,1,325.0,...,89,,212,4,2,2,2,2,2,1598
2,86469,59.000000,4,4663,490.0,150.0,4,166.0,2,330.0,...,89,,212,4,2,2,2,2,2,1598
3,86499,50.000000,4,18132,567.0,150.0,2,117.0,2,476.0,...,89,,212,4,2,2,2,2,2,1598
4,86506,50.000000,1,4497,514.0,150.0,2,102.0,1,420.0,...,104,32.0,135,3,1,0,1,1,1,870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472427,12192667,43.950001,4,15484,418.0,150.0,4,226.0,2,310.0,...,89,,212,4,2,2,2,2,2,1598
472428,12192736,49.000000,4,17188,321.0,150.0,4,226.0,2,220.0,...,89,,212,4,2,2,2,2,2,1598
472429,12192742,40.000000,1,16659,170.0,150.0,4,226.0,1,272.0,...,52,24.0,62,3,1,0,1,1,0,661
472430,12192743,15.000000,4,7919,194.0,150.0,2,166.0,2,220.0,...,89,,212,4,2,2,2,2,2,1598


In [18]:
X_val

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
472432,12192900,33.261002,0,9300,103.0,185.0,4,138.0,2,,...,52,,212,4,0,0,1,0,0,1598
472433,12192911,52.811001,0,8809,179.0,106.0,4,137.0,2,,...,56,,212,4,0,0,1,0,1,250
472434,12192913,136.955994,0,10819,555.0,185.0,4,226.0,2,,...,52,,212,4,0,0,1,0,0,1467
472435,12193040,136.955994,0,9633,130.0,185.0,4,138.0,2,,...,52,,212,4,0,0,1,0,0,1467
472436,12193199,25.000000,1,17188,321.0,150.0,4,226.0,2,310.0,...,73,24.0,172,3,1,0,1,1,0,1467
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,15811047,49.000000,4,6550,,150.0,4,226.0,2,272.0,...,89,,212,4,2,2,2,2,2,1598
590536,15811049,39.500000,4,10444,225.0,150.0,2,224.0,2,204.0,...,89,,212,4,2,2,2,2,2,1598
590537,15811079,30.950001,4,12037,595.0,150.0,2,224.0,2,231.0,...,89,,212,4,2,2,2,2,2,1598
590538,15811088,117.000000,4,7826,481.0,150.0,2,224.0,2,387.0,...,89,,212,4,2,2,2,2,2,1598


In [19]:
import lightgbm as lgb

In [20]:
del train_df
del train_data
del val_data
del cat_cols
del label_encoders
import gc
gc.collect()

15

In [21]:
from sklearn.metrics import roc_auc_score

In [24]:
lgb_train = lgb.Dataset(
    X_train,
    label = y_train,
    free_raw_data = False
)

lgb_val = lgb.Dataset(
    X_val,
    label = y_val,
    reference = lgb_train,
    free_raw_data = False)


In [25]:
params = {
    "objective" : "binary",
    "metric" : "acu",
    "booting_type" : "gbdt",

    "learning_rate" :0.05,
    "num_leaves" : 64,
    "max_depth" : -1,

    "feture_fraction" : 0.8,
    "bagging_fraction" : 0.8,
    "bagging_freq" : 5,

    "min_data_in_leaf" : 50,
    "verbosity" : -1,
    "seed" :42}

In [28]:
params["metric"] = "auc"

In [29]:
model = lgb.train(
    params,
    lgb_train,
    num_boost_round = 500,
    valid_sets= [lgb_val],
    valid_names = ["valid"],
    callbacks=[
        lgb.early_stopping(stopping_rounds = 50),
        lgb.log_evaluation(50)
    ]
)

Training until validation scores don't improve for 50 rounds
[50]	valid's auc: 0.882334
[100]	valid's auc: 0.90241
[150]	valid's auc: 0.909545
[200]	valid's auc: 0.913895
[250]	valid's auc: 0.914853
[300]	valid's auc: 0.915997
[350]	valid's auc: 0.916698
[400]	valid's auc: 0.917209
[450]	valid's auc: 0.917643
Early stopping, best iteration is:
[431]	valid's auc: 0.917966


In [30]:
model.save_model("lightgbm_fraud_baseline.txt")

<lightgbm.basic.Booster at 0x156270db230>