In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

In [2]:
# Read both training tables and left-join the identity rows onto the
# transactions through their shared TransactionID key.
dfTransaction = pd.read_csv("../Dataset/ML/train_transaction.csv")
dfIdentity = pd.read_csv("../Dataset/ML/train_identity.csv")
dfMerge = pd.merge(dfTransaction, dfIdentity, how = "left", on = "TransactionID")
# Only the merged frame is used from here on.
del dfTransaction, dfIdentity
dfMerge.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [3]:
# Positional 80/20 split without shuffling: the first 80% of the rows are
# used for training and the remaining 20% for validation.
train_size = int(len(dfMerge) * 0.8)

# Column 0 (TransactionID) is left out. "isFraud" stays inside the feature
# frames for now because later cells read it before dropping it.
# .copy() makes both frames independent of dfMerge, so the columns added in
# later cells are written to real frames instead of slices of dfMerge (the
# situation pandas flags with SettingWithCopyWarning).
X_train = dfMerge.iloc[:train_size, 1:].copy()
X_val = dfMerge.iloc[train_size:, 1:].copy()
y_train = dfMerge["isFraud"].iloc[:train_size]
y_val = dfMerge["isFraud"].iloc[train_size:]

In [4]:
# Drop the notebook's reference to the merged frame; only the train/validation
# slices taken from it are used from here on.
dfMerge = None

In [5]:
# Show the last rows of the training slice (the rows just before the split point).
X_train.tail()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
472427,0,12192667,43.95,W,15484,418.0,150.0,visa,226.0,debit,...,,,,,,,,,,
472428,0,12192736,49.0,W,17188,321.0,150.0,visa,226.0,debit,...,,,,,,,,,,
472429,0,12192742,40.0,H,16659,170.0,150.0,visa,226.0,credit,...,chrome 65.0,24.0,1440x900,match_status:2,T,F,T,T,desktop,MacOS
472430,0,12192743,15.0,W,7919,194.0,150.0,mastercard,166.0,debit,...,,,,,,,,,,
472431,0,12192842,49.0,W,13749,321.0,150.0,visa,226.0,credit,...,,,,,,,,,,


In [6]:
# Show the first rows of the validation slice (the rows just after the split point).
X_val.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
472432,1,12192900,33.261,C,9300,103.0,185.0,visa,138.0,debit,...,chrome 65.0,,,,F,F,T,F,desktop,
472433,0,12192911,52.811,C,8809,179.0,106.0,visa,137.0,debit,...,chrome generic for android,,,,F,F,T,F,mobile,F3311
472434,0,12192913,136.956,C,10819,555.0,185.0,visa,226.0,debit,...,chrome 65.0,,,,F,F,T,F,desktop,Windows
472435,0,12193040,136.956,C,9633,130.0,185.0,visa,138.0,debit,...,chrome 65.0,,,,F,F,T,F,desktop,Windows
472436,0,12193199,25.0,H,17188,321.0,150.0,visa,226.0,debit,...,firefox 59.0,24.0,3200x1800,match_status:2,T,F,T,T,desktop,Windows


In [7]:
# Browser families looked for inside the raw `id_31` string, checked in this order.
browsers = ["chrome", "safari", "firefox", "ie 11.0", "edge"]

def parse_id31(x):
    """Collapse a raw `id_31` browser string into a coarse browser family.

    Parameters
    ----------
    x : object
        Raw `id_31` value; may be missing (NaN / None).

    Returns
    -------
    str or float
        The first entry of `browsers` that occurs as a substring of `str(x)`,
        "Other" when no entry matches, or NaN when `x` is missing.
    """
    if pd.isna(x):
        # np.nan is the canonical name; the np.NaN alias was removed in NumPy 2.0.
        return np.nan

    text = str(x)
    for browser in browsers:
        if browser in text:
            return browser

    return "Other"

In [8]:
# Derive the coarse browser family for every training row, then show how
# many rows fall into each family.
X_train["browser"] = X_train["id_31"].apply(parse_id31)
X_train["browser"].value_counts()

chrome     63376
safari     31985
ie 11.0     8411
firefox     6054
edge        4614
Other       3023
Name: browser, dtype: int64

In [9]:
# Categorical columns that are expanded into one indicator column per value.
oh_columns = ["ProductCD", "card4", "card6", "browser", "DeviceType"]

# Boolean indicators; values not seen during fit are encoded as all-False
# for that feature when the encoder is reused with transform().
ohe_kwargs = {"dtype": bool, "handle_unknown": "ignore"}
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the old `sparse` keyword was removed in 1.4.
    ohe = OneHotEncoder(sparse_output = False, **ohe_kwargs)
except TypeError:
    # Older scikit-learn releases only accept `sparse`.
    ohe = OneHotEncoder(sparse = False, **ohe_kwargs)

# Fit on the training rows only and append the indicator columns to X_train.
ohe_matrix = ohe.fit_transform(X_train[oh_columns])
X_train[list(ohe.get_feature_names_out(oh_columns))] = ohe_matrix
# These two card6 indicator columns are discarded right after encoding.
X_train.drop(["card6_debit or credit", "card6_charge card"], axis = 1, inplace = True)

In [10]:
# Number of training labels.
len(y_train)

472432

In [11]:
# (rows, indicator columns) of the one-hot matrix produced for the training rows.
ohe_matrix.shape

(472432, 25)

In [12]:
# The indicator columns already live in X_train, so the standalone matrix can go.
del ohe_matrix

In [13]:
# List the indicator column names generated by the fitted encoder.
ohe.get_feature_names_out(oh_columns)

array(['ProductCD_C', 'ProductCD_H', 'ProductCD_R', 'ProductCD_S',
       'ProductCD_W', 'card4_american express', 'card4_discover',
       'card4_mastercard', 'card4_visa', 'card4_nan', 'card6_charge card',
       'card6_credit', 'card6_debit', 'card6_debit or credit',
       'card6_nan', 'browser_Other', 'browser_chrome', 'browser_edge',
       'browser_firefox', 'browser_ie 11.0', 'browser_safari',
       'browser_nan', 'DeviceType_desktop', 'DeviceType_mobile',
       'DeviceType_nan'], dtype=object)

In [14]:
# Inspect the training frame after the one-hot columns were appended.
X_train.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,browser_Other,browser_chrome,browser_edge,browser_firefox,browser_ie 11.0,browser_safari,browser_nan,DeviceType_desktop,DeviceType_mobile,DeviceType_nan
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
def parse_resolucion(x):
    """Convert a "WIDTHxHEIGHT" string into its total pixel count.

    Parameters
    ----------
    x : str or missing
        Screen resolution text such as "1920x1080"; may be NaN / None.

    Returns
    -------
    int or float
        The product of the first two "x"-separated integers, or NaN when
        `x` is missing.

    Raises
    ------
    ValueError, IndexError
        If `x` is present but is not two integers separated by "x".
    """
    if pd.isna(x):
        # np.nan is the canonical name; the np.NaN alias was removed in NumPy 2.0.
        return np.nan

    pixeles = x.split("x")
    return int(pixeles[0]) * int(pixeles[1])

# Turn the raw "WIDTHxHEIGHT" text into a pixel count per training row and
# show the 15 most frequent values.
X_train["resolucion"] = X_train["id_33"].apply(parse_resolucion)
X_train["resolucion"].value_counts().head(15)

2073600.0    14460
1049088.0     6669
1000500.0     5602
2742336.0     4166
1296000.0     3792
3145728.0     3110
1440000.0     3098
1024000.0     1882
4096000.0     1818
3686400.0     1589
5184000.0     1525
727040.0      1517
1310720.0     1509
1764000.0     1470
2740500.0     1148
Name: resolucion, dtype: int64

In [16]:
# Inspect the training frame after the `resolucion` column was added.
X_train.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,browser_chrome,browser_edge,browser_firefox,browser_ie 11.0,browser_safari,browser_nan,DeviceType_desktop,DeviceType_mobile,DeviceType_nan,resolucion
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2397600.0


In [17]:
# Remove the columns that were one-hot encoded plus the two raw text columns
# that were already parsed into `browser` and `resolucion`.
columnas_a_descartar = oh_columns + ["id_31", "id_33"]
X_train.drop(columns = columnas_a_descartar, inplace = True)

In [17]:
# Columns whose values are replaced by the mean of "isFraud" observed for
# that value in the training frame (target mean encoding).
columnas_categoricas = [
    "card1", "card2", "card3", "card5",
    "addr1", "addr2",
    "P_emaildomain", "R_emaildomain",
    "M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8", "M9",
    "DeviceInfo",
    "id_12", "id_13", "id_14", "id_15", "id_16", "id_17", "id_18",
    "id_19", "id_20", "id_21", "id_22", "id_23", "id_24", "id_25",
    "id_26", "id_27", "id_28", "id_29", "id_30", "id_32",
    "id_34", "id_35", "id_36", "id_37", "id_38",
]

# means[i] is the {value: mean isFraud} mapping learned for
# columnas_categoricas[i]; it is kept so the same mapping can be reused on
# other frames.
# NOTE(review): each row's own label takes part in its encoding, which can
# leak the target into the features — consider out-of-fold encoding.
means = []
for columna in columnas_categoricas:
    tasa_por_valor = X_train.groupby(columna)["isFraud"].mean()
    mean_encoded = tasa_por_valor.to_dict()
    means.append(mean_encoded)
    X_train[columna] = X_train[columna].map(mean_encoded)

In [18]:
# Impute missing values column by column: object (text) columns get the
# literal string "NaN", every other column gets its own mean.
# The filled Series is assigned back rather than calling
# fillna(inplace = True) on X_train[columna]: that form is chained assignment
# on a temporary Series, which pandas warns about and which does not update
# X_train at all under copy-on-write.
for columna in X_train.columns:
    if X_train[columna].dtype == object:
        X_train[columna] = X_train[columna].fillna("NaN")
    else:
        X_train[columna] = X_train[columna].fillna(X_train[columna].mean())

In [19]:
# The label must not be fed to the model as a feature; it is still
# available separately as y_train.
X_train.drop(columns = "isFraud", inplace = True)

In [22]:
# Candidate values sampled by the randomized hyper-parameter search.
# The two sampling fractions are written out explicitly:
# np.arange(0.5, 1.1, 0.1) produces seven values, the last one being 1.1
# (floating-point rounding of the length computation), and a fraction above
# 1.0 is not a valid value for `subsample` / `colsample_bytree`.
param_grid = {
    'max_depth': range(3, 11),
    'min_child_weight': [0.3, 0.5, 1.0, 1.5, 3.0],
    'gamma': [0, 0.25, 0.5, 1.0],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    'reg_alpha': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2]
}

In [23]:
# Base learner: 100 boosted trees, single-threaded.
clf = XGBClassifier(
    booster = "gbtree",
    n_estimators = 100,
    n_jobs = 1,
    use_label_encoder = False,
)
# Randomized search: 4 sampled configurations, each scored by ROC AUC with
# 3-fold cross-validation; random_state fixes which configurations are drawn.
rs_clf = RandomizedSearchCV(
    estimator = clf,
    param_distributions = param_grid,
    n_iter = 4,
    scoring = 'roc_auc',
    cv = 3,
    n_jobs = 1,
    verbose = 100,
    random_state = 1,
)

In [24]:
# Run the randomized search on the training data. `.values` hands the model a
# plain NumPy array, so the DataFrame's column names are not passed along.
rs_clf.fit(X_train.values, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3; 1/4] START colsample_bytree=0.5, gamma=0.5, learning_rate=0.1, max_depth=8, min_child_weight=1.5, reg_alpha=0.1, reg_lambda=10.0, subsample=0.5



KeyboardInterrupt



In [23]:
# Report the best cross-validated score found by the search and the
# hyper-parameters that produced it, in alphabetical order.
best_score = rs_clf.best_score_
best_params = rs_clf.best_params_
print(f"Best score: {best_score}")
print("Best params: ")
for nombre in sorted(best_params):
    print(f"{nombre}: {best_params[nombre]!r}")

Best score: 0.7528959431377634
Best params: 
colsample_bylevel: 0.7
colsample_bytree: 0.6
gamma: 0.5
learning_rate: 3
max_depth: 6
min_child_weight: 5.0
n_estimators: 100
reg_lambda: 100.0
silent: False
subsample: 0.6


In [25]:
# Same browser-family feature as on the training rows, now for validation,
# followed by the per-family row counts.
X_val["browser"] = X_val["id_31"].apply(parse_id31)
X_val["browser"].value_counts()

chrome     12683
safari      5296
edge        1787
ie 11.0     1266
firefox      958
Other        829
Name: browser, dtype: int64

In [26]:
# Encode the validation rows with the encoder that was fitted on the
# training rows (transform only, no refit) and append the indicator columns.
ohe_matrix = ohe.transform(X_val[oh_columns])
nombres_ohe = list(ohe.get_feature_names_out(oh_columns))
X_val[nombres_ohe] = ohe_matrix
# Discard the same two card6 indicators that were removed from X_train.
X_val.drop(columns = ["card6_debit or credit", "card6_charge card"], inplace = True)

In [28]:
# Pixel-count feature for the validation rows, then its 15 most frequent values.
X_val["resolucion"] = X_val["id_33"].apply(parse_resolucion)
X_val["resolucion"].value_counts().head(15)

2073600.0    2414
1049088.0    1936
1000500.0     845
2742336.0     734
1296000.0     592
1440000.0     412
3145728.0     372
2740500.0     336
1047722.0     309
3686400.0     276
4096000.0     275
1024000.0     267
1764000.0     257
1310720.0     234
5184000.0     231
Name: resolucion, dtype: int64

In [30]:
# Remove the one-hot source columns and the two raw text columns, mirroring
# what was done to X_train.
columnas_a_descartar = oh_columns + ["id_31", "id_33"]
X_val.drop(columns = columnas_a_descartar, inplace = True)

In [31]:
# Apply the mappings learned on the training rows: `means` is ordered like
# `columnas_categoricas`, so the two are walked in parallel. Values with no
# entry in the mapping become NaN.
for columna, mean_encoded in zip(columnas_categoricas, means):
    X_val[columna] = X_val[columna].map(mean_encoded)

In [32]:
# Impute missing values column by column: object (text) columns get the
# literal string "NaN", every other column gets its own mean.
# The filled Series is assigned back rather than calling
# fillna(inplace = True) on X_val[columna]: that form is chained assignment
# on a temporary Series, which pandas warns about and which does not update
# X_val at all under copy-on-write.
# NOTE(review): the fill value is the validation column's own mean, not the
# mean used for the training frame — confirm this is intended.
for columna in X_val.columns:
    if X_val[columna].dtype == object:
        X_val[columna] = X_val[columna].fillna("NaN")
    else:
        X_val[columna] = X_val[columna].fillna(X_val[columna].mean())

In [34]:
# Remove the label from the validation features; it is kept separately as y_val.
X_val.drop(columns = "isFraud", inplace = True)

In [35]:
# Class-probability predictions for the validation rows, requested from the
# fitted search object; the features are passed as a plain NumPy array.
predictions = rs_clf.predict_proba(X_val.values)

In [36]:
# ROC AUC on the validation rows, using the second probability column
# (index 1) as the score.
roc_auc_score(y_val, predictions[:, 1])

0.6938999216302618

In [50]:
# Read both test tables and left-join the identity rows onto the
# transactions through their shared TransactionID key.
dfTestTransaction = pd.read_csv("../Dataset/ML/test_transaction.csv")
dfTestIdentity = pd.read_csv("../Dataset/ML/test_identity.csv")
dfTest = pd.merge(dfTestTransaction, dfTestIdentity, how = "left", on = "TransactionID")
# Only the merged frame is used from here on.
del dfTestTransaction, dfTestIdentity

In [53]:
# Rewrite "id-" to "id_" in the test column names so they line up with the
# "id_XX" names used in the rest of the notebook.
dfTest.columns = [nombre.replace("id-", "id_") for nombre in dfTest.columns]
# Browser-family feature for the test rows, followed by the per-family counts.
dfTest["browser"] = dfTest["id_31"].apply(parse_id31)
dfTest["browser"].value_counts()

chrome     79443
safari     32585
firefox     7376
ie 11.0     5856
Other       5781
edge        5584
Name: browser, dtype: int64

In [54]:
# Preview the one-hot encoding of the test rows using the encoder fitted on
# the training rows; the result is only displayed here, not stored.
ohe.transform(dfTest[oh_columns])

array([[False, False, False, ..., False, False,  True],
       [False, False, False, ..., False, False,  True],
       [False, False, False, ..., False, False,  True],
       ...,
       [False, False, False, ..., False, False,  True],
       [False, False, False, ..., False, False,  True],
       [ True, False, False, ..., False,  True, False]])