In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

from xgboost import XGBClassifier

In [2]:
transaction_ori = pd.read_csv("ieee-fraud-detection/train_transaction.csv")
identity_ori = pd.read_csv("ieee-fraud-detection/train_identity.csv")

In [3]:
merge = transaction_ori.merge(identity_ori, on = "TransactionID", how = "left")
transaction_ori = None
identity_ori = None
merge

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.50,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.00,W,6550,,150.0,visa,226.0,...,,,,,,,,,,
590536,3577536,0,15811049,39.50,W,10444,225.0,150.0,mastercard,224.0,...,,,,,,,,,,
590537,3577537,0,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,...,,,,,,,,,,
590538,3577538,0,15811088,117.00,W,7826,481.0,150.0,mastercard,224.0,...,,,,,,,,,,


## CountVectorizer

In [4]:
merge.id_30 = merge.id_30.fillna("Other")
list(merge.id_30)[0:20]

['Other',
 'Other',
 'Other',
 'Other',
 'Android 7.0',
 'Other',
 'Other',
 'Other',
 'iOS 11.1.2',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Mac OS X 10_11_6',
 'Windows 10',
 'Other',
 'Other']

In [5]:
vectorizer = CountVectorizer(max_features=15)

In [6]:
matrix = vectorizer.fit_transform(list(merge.id_30))

In [7]:
vectorizer.vocabulary_

{'other': 13,
 'android': 8,
 'ios': 9,
 '11': 7,
 'mac': 11,
 'os': 12,
 '10_11_6': 2,
 'windows': 14,
 '10': 0,
 'linux': 10,
 '10_12_6': 3,
 '10_13_1': 4,
 '10_10_5': 1,
 '10_13_2': 5,
 '10_13_3': 6}

In [8]:
merge["android_so"] = list(np.asarray(matrix.todense())[:,8])
merge["ios_so"] = list(np.asarray(matrix.todense())[:,9])
merge["mac_so"] = list(np.asarray(matrix.todense())[:,11])
merge["windows_so"] = list(np.asarray(matrix.todense())[:,14])
merge["linux_so"] = list(np.asarray(matrix.todense())[:,10])

In [9]:
merge.loc[0:20,["id_30","android_so","ios_so", "mac_so","windows_so","linux_so"]]

Unnamed: 0,id_30,android_so,ios_so,mac_so,windows_so,linux_so
0,Other,0,0,0,0,0
1,Other,0,0,0,0,0
2,Other,0,0,0,0,0
3,Other,0,0,0,0,0
4,Android 7.0,1,0,0,0,0
5,Other,0,0,0,0,0
6,Other,0,0,0,0,0
7,Other,0,0,0,0,0
8,iOS 11.1.2,0,1,0,0,0
9,Other,0,0,0,0,0


In [10]:
merge = merge.drop("id_30", axis= 1)

## Columnas categoricas

In [11]:
categorical_columns = ['ProductCD','card1', 'card2', 'card3', 'card4', 'card5', 'card6','addr1', 'addr2', 'P_emaildomain', 'R_emaildomain'\
                      ,'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18',\
                        'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_31',\
                        'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']

In [12]:
columns_OHE = []
columns_ME = []
for column in categorical_columns:
    if len(merge[column].value_counts()) > 10:
        columns_ME.append(column)
    else:
        columns_OHE.append(column)

In [13]:
for c in columns_ME:
    merge[c] = merge[c].fillna(0)

## Split train-test

In [14]:
train = merge.iloc[:350000,1:]
test = merge.iloc[350000:,1:]
merge = None

## MeanEncoding

In [15]:
list_dicts = []
for column in columns_ME:
    mean_encoded_col_train = train.groupby(column)['isFraud'].mean().to_dict()  
    train[column] =  train[column].map(mean_encoded_col_train) 
    test[column] =  test[column].map(mean_encoded_col_train)    
    list_dicts.append(mean_encoded_col_train)
for c in columns_ME:
    test[c] = test[c].fillna(train[c].mean())

## OneHotEncoding

In [16]:
oneHotEncoder = OneHotEncoder(sparse = False, handle_unknown = "ignore")
matrix = oneHotEncoder.fit_transform(train[columns_OHE])
matrix_b = oneHotEncoder.transform(test[columns_OHE])
train[list(oneHotEncoder.get_feature_names_out(columns_OHE))] = matrix
test[list(oneHotEncoder.get_feature_names_out(columns_OHE))] = matrix_b
train.drop(inplace = True, axis = 1, columns = columns_OHE)
test.drop(inplace = True, axis = 1, columns = columns_OHE)

In [17]:
matrix = None
matrix_b = None

## RandomizedSearch para XGBClassifier

In [18]:
selected_features = ['V200', 'V244', 'V258', 'V189', 'V243', 'V172', 'V187', 'ProductCD_C', 'V246', 'V156', 'V154', 'V32', 'V196', 'V294', 'M5_nan',\
 'card1', 'V103', 'V323', 'V15', 'C4', 'V268', 'V223', 'V295', 'V62', 'V207', 'V70', 'V58', 'V256', 'V44', 'id_25', 'C5', 'C14', 'V34', 'V283', 'V45',\
 'V317', 'C12', 'C7', 'V205', 'V266', 'C10', 'ProductCD_H', 'V72', 'V198', 'V73', 'V90', 'C8', 'V259', 'card2', 'V169', 'V298', 'id_19', 'id_17',\
 'V13', 'R_emaildomain', 'V22', 'C13', 'M4_nan', 'id_35_F', 'V158', 'V163', 'M5_T', 'V93', 'V281', 'C1', 'card6_credit', 'V296', 'id_32_24.0',\
 'V239', 'V152', 'V149', 'V87', 'V318', 'V54', 'V275', 'V67', 'V327', 'V146', 'V255', 'V53', 'card6_debit',\
 "id_31", "id_33", "DeviceInfo", 'DeviceType_desktop', 'DeviceType_mobile', 'DeviceType_nan', "android_so","ios_so", "mac_so","windows_so","linux_so"]
 #'V30', 'V74', 'V324', 'id_01', 'card3', 'V79', 'V201', 'D2', 'V199', 'C2', 'V308', 'V61', 'id_20', 'M4_M0', 'V220', 'V312', 'V192', 'V82', 'V133',\ 

In [19]:
x_train = train.drop(["isFraud"], axis=1).loc[:,selected_features]
y_train = train["isFraud"]
x_test = test.drop(["isFraud"], axis=1).loc[:,selected_features]
y_test = test["isFraud"]
test = None
train = None

In [20]:
xgbclassifier = XGBClassifier()

In [21]:
params = {'learning_rate': [0, 0.2, 0.4, 0.5],#antes llegaba hasta 1
             'max_depth': [1, 2, 3, 4, 5], 
             'n_estimators': [150, 175, 200, 225, 250, 300],             
             'min_child_weight': [1, 2, 3],
             'gamma': [0, 0.3, 0.5, 0.7, 1],
             'colsample_bytree': [0, 0.2, 0.4, 0.5]} #aca probaba hasta 1
#lograba 0.91 el mejor y 0.83 en validacion, tardaba 1 hora
#con menos features 80 y con pocos parametros 0.93 y 0.81 en 10 min
#con 100 features y varios parametros 93.8 80 en 20 minutos

In [22]:
clf = RandomizedSearchCV(estimator = xgbclassifier, param_distributions = params, cv = 3, \
                         scoring= "roc_auc", n_iter = 20, n_jobs = 1, random_state = 1, verbose = 100)

In [23]:
best_model = clf.fit(x_train.values, y_train.values)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 1/3; 1/20] START colsample_bytree=0.4, gamma=1, learning_rate=0.2, max_depth=2, min_child_weight=2, n_estimators=225




[CV 1/3; 1/20] END colsample_bytree=0.4, gamma=1, learning_rate=0.2, max_depth=2, min_child_weight=2, n_estimators=225;, score=0.921 total time=   5.5s
[CV 2/3; 1/20] START colsample_bytree=0.4, gamma=1, learning_rate=0.2, max_depth=2, min_child_weight=2, n_estimators=225




[CV 2/3; 1/20] END colsample_bytree=0.4, gamma=1, learning_rate=0.2, max_depth=2, min_child_weight=2, n_estimators=225;, score=0.935 total time=   5.6s
[CV 3/3; 1/20] START colsample_bytree=0.4, gamma=1, learning_rate=0.2, max_depth=2, min_child_weight=2, n_estimators=225




[CV 3/3; 1/20] END colsample_bytree=0.4, gamma=1, learning_rate=0.2, max_depth=2, min_child_weight=2, n_estimators=225;, score=0.923 total time=   5.7s
[CV 1/3; 2/20] START colsample_bytree=0, gamma=0, learning_rate=0.4, max_depth=4, min_child_weight=1, n_estimators=175




[CV 1/3; 2/20] END colsample_bytree=0, gamma=0, learning_rate=0.4, max_depth=4, min_child_weight=1, n_estimators=175;, score=0.889 total time=   2.8s
[CV 2/3; 2/20] START colsample_bytree=0, gamma=0, learning_rate=0.4, max_depth=4, min_child_weight=1, n_estimators=175




[CV 2/3; 2/20] END colsample_bytree=0, gamma=0, learning_rate=0.4, max_depth=4, min_child_weight=1, n_estimators=175;, score=0.902 total time=   2.9s
[CV 3/3; 2/20] START colsample_bytree=0, gamma=0, learning_rate=0.4, max_depth=4, min_child_weight=1, n_estimators=175




[CV 3/3; 2/20] END colsample_bytree=0, gamma=0, learning_rate=0.4, max_depth=4, min_child_weight=1, n_estimators=175;, score=0.884 total time=   2.9s
[CV 1/3; 3/20] START colsample_bytree=0.4, gamma=0.3, learning_rate=0, max_depth=2, min_child_weight=1, n_estimators=200




[CV 1/3; 3/20] END colsample_bytree=0.4, gamma=0.3, learning_rate=0, max_depth=2, min_child_weight=1, n_estimators=200;, score=0.500 total time=   5.6s
[CV 2/3; 3/20] START colsample_bytree=0.4, gamma=0.3, learning_rate=0, max_depth=2, min_child_weight=1, n_estimators=200




[CV 2/3; 3/20] END colsample_bytree=0.4, gamma=0.3, learning_rate=0, max_depth=2, min_child_weight=1, n_estimators=200;, score=0.500 total time=   5.6s
[CV 3/3; 3/20] START colsample_bytree=0.4, gamma=0.3, learning_rate=0, max_depth=2, min_child_weight=1, n_estimators=200




[CV 3/3; 3/20] END colsample_bytree=0.4, gamma=0.3, learning_rate=0, max_depth=2, min_child_weight=1, n_estimators=200;, score=0.500 total time=   5.5s
[CV 1/3; 4/20] START colsample_bytree=0.4, gamma=1, learning_rate=0.2, max_depth=4, min_child_weight=2, n_estimators=200




[CV 1/3; 4/20] END colsample_bytree=0.4, gamma=1, learning_rate=0.2, max_depth=4, min_child_weight=2, n_estimators=200;, score=0.929 total time=   8.0s
[CV 2/3; 4/20] START colsample_bytree=0.4, gamma=1, learning_rate=0.2, max_depth=4, min_child_weight=2, n_estimators=200




[CV 2/3; 4/20] END colsample_bytree=0.4, gamma=1, learning_rate=0.2, max_depth=4, min_child_weight=2, n_estimators=200;, score=0.941 total time=   9.5s
[CV 3/3; 4/20] START colsample_bytree=0.4, gamma=1, learning_rate=0.2, max_depth=4, min_child_weight=2, n_estimators=200




[CV 3/3; 4/20] END colsample_bytree=0.4, gamma=1, learning_rate=0.2, max_depth=4, min_child_weight=2, n_estimators=200;, score=0.926 total time=  14.8s
[CV 1/3; 5/20] START colsample_bytree=0, gamma=0.5, learning_rate=0.4, max_depth=1, min_child_weight=1, n_estimators=300




[CV 1/3; 5/20] END colsample_bytree=0, gamma=0.5, learning_rate=0.4, max_depth=1, min_child_weight=1, n_estimators=300;, score=0.886 total time=   7.7s
[CV 2/3; 5/20] START colsample_bytree=0, gamma=0.5, learning_rate=0.4, max_depth=1, min_child_weight=1, n_estimators=300




[CV 2/3; 5/20] END colsample_bytree=0, gamma=0.5, learning_rate=0.4, max_depth=1, min_child_weight=1, n_estimators=300;, score=0.903 total time=   8.3s
[CV 3/3; 5/20] START colsample_bytree=0, gamma=0.5, learning_rate=0.4, max_depth=1, min_child_weight=1, n_estimators=300




[CV 3/3; 5/20] END colsample_bytree=0, gamma=0.5, learning_rate=0.4, max_depth=1, min_child_weight=1, n_estimators=300;, score=0.885 total time=   4.3s
[CV 1/3; 6/20] START colsample_bytree=0.2, gamma=0.5, learning_rate=0.4, max_depth=4, min_child_weight=2, n_estimators=225




[CV 1/3; 6/20] END colsample_bytree=0.2, gamma=0.5, learning_rate=0.4, max_depth=4, min_child_weight=2, n_estimators=225;, score=0.930 total time=  13.8s
[CV 2/3; 6/20] START colsample_bytree=0.2, gamma=0.5, learning_rate=0.4, max_depth=4, min_child_weight=2, n_estimators=225




[CV 2/3; 6/20] END colsample_bytree=0.2, gamma=0.5, learning_rate=0.4, max_depth=4, min_child_weight=2, n_estimators=225;, score=0.942 total time=  12.8s
[CV 3/3; 6/20] START colsample_bytree=0.2, gamma=0.5, learning_rate=0.4, max_depth=4, min_child_weight=2, n_estimators=225




[CV 3/3; 6/20] END colsample_bytree=0.2, gamma=0.5, learning_rate=0.4, max_depth=4, min_child_weight=2, n_estimators=225;, score=0.922 total time=   7.4s
[CV 1/3; 7/20] START colsample_bytree=0.2, gamma=0.7, learning_rate=0, max_depth=1, min_child_weight=3, n_estimators=225




[CV 1/3; 7/20] END colsample_bytree=0.2, gamma=0.7, learning_rate=0, max_depth=1, min_child_weight=3, n_estimators=225;, score=0.500 total time=   5.6s
[CV 2/3; 7/20] START colsample_bytree=0.2, gamma=0.7, learning_rate=0, max_depth=1, min_child_weight=3, n_estimators=225




[CV 2/3; 7/20] END colsample_bytree=0.2, gamma=0.7, learning_rate=0, max_depth=1, min_child_weight=3, n_estimators=225;, score=0.500 total time=   8.0s
[CV 3/3; 7/20] START colsample_bytree=0.2, gamma=0.7, learning_rate=0, max_depth=1, min_child_weight=3, n_estimators=225




[CV 3/3; 7/20] END colsample_bytree=0.2, gamma=0.7, learning_rate=0, max_depth=1, min_child_weight=3, n_estimators=225;, score=0.500 total time=   6.3s
[CV 1/3; 8/20] START colsample_bytree=0.4, gamma=1, learning_rate=0, max_depth=1, min_child_weight=3, n_estimators=250




[CV 1/3; 8/20] END colsample_bytree=0.4, gamma=1, learning_rate=0, max_depth=1, min_child_weight=3, n_estimators=250;, score=0.500 total time=   5.3s
[CV 2/3; 8/20] START colsample_bytree=0.4, gamma=1, learning_rate=0, max_depth=1, min_child_weight=3, n_estimators=250




[CV 2/3; 8/20] END colsample_bytree=0.4, gamma=1, learning_rate=0, max_depth=1, min_child_weight=3, n_estimators=250;, score=0.500 total time=   4.3s
[CV 3/3; 8/20] START colsample_bytree=0.4, gamma=1, learning_rate=0, max_depth=1, min_child_weight=3, n_estimators=250




[CV 3/3; 8/20] END colsample_bytree=0.4, gamma=1, learning_rate=0, max_depth=1, min_child_weight=3, n_estimators=250;, score=0.500 total time=   6.2s
[CV 1/3; 9/20] START colsample_bytree=0, gamma=0, learning_rate=0.2, max_depth=4, min_child_weight=1, n_estimators=150




[CV 1/3; 9/20] END colsample_bytree=0, gamma=0, learning_rate=0.2, max_depth=4, min_child_weight=1, n_estimators=150;, score=0.865 total time=   5.8s
[CV 2/3; 9/20] START colsample_bytree=0, gamma=0, learning_rate=0.2, max_depth=4, min_child_weight=1, n_estimators=150




[CV 2/3; 9/20] END colsample_bytree=0, gamma=0, learning_rate=0.2, max_depth=4, min_child_weight=1, n_estimators=150;, score=0.893 total time=   4.0s
[CV 3/3; 9/20] START colsample_bytree=0, gamma=0, learning_rate=0.2, max_depth=4, min_child_weight=1, n_estimators=150




[CV 3/3; 9/20] END colsample_bytree=0, gamma=0, learning_rate=0.2, max_depth=4, min_child_weight=1, n_estimators=150;, score=0.876 total time=   8.0s
[CV 1/3; 10/20] START colsample_bytree=0.4, gamma=0.3, learning_rate=0.4, max_depth=5, min_child_weight=3, n_estimators=175




[CV 1/3; 10/20] END colsample_bytree=0.4, gamma=0.3, learning_rate=0.4, max_depth=5, min_child_weight=3, n_estimators=175;, score=0.926 total time=  18.7s
[CV 2/3; 10/20] START colsample_bytree=0.4, gamma=0.3, learning_rate=0.4, max_depth=5, min_child_weight=3, n_estimators=175




[CV 2/3; 10/20] END colsample_bytree=0.4, gamma=0.3, learning_rate=0.4, max_depth=5, min_child_weight=3, n_estimators=175;, score=0.945 total time=  18.2s
[CV 3/3; 10/20] START colsample_bytree=0.4, gamma=0.3, learning_rate=0.4, max_depth=5, min_child_weight=3, n_estimators=175




[CV 3/3; 10/20] END colsample_bytree=0.4, gamma=0.3, learning_rate=0.4, max_depth=5, min_child_weight=3, n_estimators=175;, score=0.923 total time=  18.6s
[CV 1/3; 11/20] START colsample_bytree=0.2, gamma=0.5, learning_rate=0.5, max_depth=1, min_child_weight=2, n_estimators=175




[CV 1/3; 11/20] END colsample_bytree=0.2, gamma=0.5, learning_rate=0.5, max_depth=1, min_child_weight=2, n_estimators=175;, score=0.916 total time=   5.2s
[CV 2/3; 11/20] START colsample_bytree=0.2, gamma=0.5, learning_rate=0.5, max_depth=1, min_child_weight=2, n_estimators=175




[CV 2/3; 11/20] END colsample_bytree=0.2, gamma=0.5, learning_rate=0.5, max_depth=1, min_child_weight=2, n_estimators=175;, score=0.931 total time=   5.4s
[CV 3/3; 11/20] START colsample_bytree=0.2, gamma=0.5, learning_rate=0.5, max_depth=1, min_child_weight=2, n_estimators=175




[CV 3/3; 11/20] END colsample_bytree=0.2, gamma=0.5, learning_rate=0.5, max_depth=1, min_child_weight=2, n_estimators=175;, score=0.917 total time=   5.6s
[CV 1/3; 12/20] START colsample_bytree=0.5, gamma=0.7, learning_rate=0.2, max_depth=5, min_child_weight=2, n_estimators=250




[CV 1/3; 12/20] END colsample_bytree=0.5, gamma=0.7, learning_rate=0.2, max_depth=5, min_child_weight=2, n_estimators=250;, score=0.929 total time=  27.5s
[CV 2/3; 12/20] START colsample_bytree=0.5, gamma=0.7, learning_rate=0.2, max_depth=5, min_child_weight=2, n_estimators=250




[CV 2/3; 12/20] END colsample_bytree=0.5, gamma=0.7, learning_rate=0.2, max_depth=5, min_child_weight=2, n_estimators=250;, score=0.945 total time=  28.8s
[CV 3/3; 12/20] START colsample_bytree=0.5, gamma=0.7, learning_rate=0.2, max_depth=5, min_child_weight=2, n_estimators=250




[CV 3/3; 12/20] END colsample_bytree=0.5, gamma=0.7, learning_rate=0.2, max_depth=5, min_child_weight=2, n_estimators=250;, score=0.927 total time=  27.8s
[CV 1/3; 13/20] START colsample_bytree=0.2, gamma=1, learning_rate=0.4, max_depth=3, min_child_weight=2, n_estimators=150




[CV 1/3; 13/20] END colsample_bytree=0.2, gamma=1, learning_rate=0.4, max_depth=3, min_child_weight=2, n_estimators=150;, score=0.926 total time=   7.2s
[CV 2/3; 13/20] START colsample_bytree=0.2, gamma=1, learning_rate=0.4, max_depth=3, min_child_weight=2, n_estimators=150




[CV 2/3; 13/20] END colsample_bytree=0.2, gamma=1, learning_rate=0.4, max_depth=3, min_child_weight=2, n_estimators=150;, score=0.938 total time=   7.8s
[CV 3/3; 13/20] START colsample_bytree=0.2, gamma=1, learning_rate=0.4, max_depth=3, min_child_weight=2, n_estimators=150




[CV 3/3; 13/20] END colsample_bytree=0.2, gamma=1, learning_rate=0.4, max_depth=3, min_child_weight=2, n_estimators=150;, score=0.923 total time=   8.0s
[CV 1/3; 14/20] START colsample_bytree=0, gamma=0.7, learning_rate=0.2, max_depth=2, min_child_weight=3, n_estimators=200




[CV 1/3; 14/20] END colsample_bytree=0, gamma=0.7, learning_rate=0.2, max_depth=2, min_child_weight=3, n_estimators=200;, score=0.864 total time=   5.5s
[CV 2/3; 14/20] START colsample_bytree=0, gamma=0.7, learning_rate=0.2, max_depth=2, min_child_weight=3, n_estimators=200




[CV 2/3; 14/20] END colsample_bytree=0, gamma=0.7, learning_rate=0.2, max_depth=2, min_child_weight=3, n_estimators=200;, score=0.891 total time=   5.8s
[CV 3/3; 14/20] START colsample_bytree=0, gamma=0.7, learning_rate=0.2, max_depth=2, min_child_weight=3, n_estimators=200




[CV 3/3; 14/20] END colsample_bytree=0, gamma=0.7, learning_rate=0.2, max_depth=2, min_child_weight=3, n_estimators=200;, score=0.870 total time=   5.9s
[CV 1/3; 15/20] START colsample_bytree=0.4, gamma=1, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200




[CV 1/3; 15/20] END colsample_bytree=0.4, gamma=1, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200;, score=0.922 total time=  15.6s
[CV 2/3; 15/20] START colsample_bytree=0.4, gamma=1, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200




[CV 2/3; 15/20] END colsample_bytree=0.4, gamma=1, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200;, score=0.943 total time=  20.5s
[CV 3/3; 15/20] START colsample_bytree=0.4, gamma=1, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200




[CV 3/3; 15/20] END colsample_bytree=0.4, gamma=1, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200;, score=0.919 total time=  24.5s
[CV 1/3; 16/20] START colsample_bytree=0.4, gamma=1, learning_rate=0.5, max_depth=4, min_child_weight=2, n_estimators=250




[CV 1/3; 16/20] END colsample_bytree=0.4, gamma=1, learning_rate=0.5, max_depth=4, min_child_weight=2, n_estimators=250;, score=0.924 total time=  23.2s
[CV 2/3; 16/20] START colsample_bytree=0.4, gamma=1, learning_rate=0.5, max_depth=4, min_child_weight=2, n_estimators=250




[CV 2/3; 16/20] END colsample_bytree=0.4, gamma=1, learning_rate=0.5, max_depth=4, min_child_weight=2, n_estimators=250;, score=0.942 total time=  24.7s
[CV 3/3; 16/20] START colsample_bytree=0.4, gamma=1, learning_rate=0.5, max_depth=4, min_child_weight=2, n_estimators=250




[CV 3/3; 16/20] END colsample_bytree=0.4, gamma=1, learning_rate=0.5, max_depth=4, min_child_weight=2, n_estimators=250;, score=0.922 total time=  22.3s
[CV 1/3; 17/20] START colsample_bytree=0.4, gamma=0.5, learning_rate=0.2, max_depth=3, min_child_weight=2, n_estimators=175




[CV 1/3; 17/20] END colsample_bytree=0.4, gamma=0.5, learning_rate=0.2, max_depth=3, min_child_weight=2, n_estimators=175;, score=0.925 total time=  11.8s
[CV 2/3; 17/20] START colsample_bytree=0.4, gamma=0.5, learning_rate=0.2, max_depth=3, min_child_weight=2, n_estimators=175




[CV 2/3; 17/20] END colsample_bytree=0.4, gamma=0.5, learning_rate=0.2, max_depth=3, min_child_weight=2, n_estimators=175;, score=0.938 total time=  11.1s
[CV 3/3; 17/20] START colsample_bytree=0.4, gamma=0.5, learning_rate=0.2, max_depth=3, min_child_weight=2, n_estimators=175




[CV 3/3; 17/20] END colsample_bytree=0.4, gamma=0.5, learning_rate=0.2, max_depth=3, min_child_weight=2, n_estimators=175;, score=0.925 total time=  13.5s
[CV 1/3; 18/20] START colsample_bytree=0.2, gamma=0.7, learning_rate=0, max_depth=5, min_child_weight=2, n_estimators=250




[CV 1/3; 18/20] END colsample_bytree=0.2, gamma=0.7, learning_rate=0, max_depth=5, min_child_weight=2, n_estimators=250;, score=0.500 total time=  13.0s
[CV 2/3; 18/20] START colsample_bytree=0.2, gamma=0.7, learning_rate=0, max_depth=5, min_child_weight=2, n_estimators=250




[CV 2/3; 18/20] END colsample_bytree=0.2, gamma=0.7, learning_rate=0, max_depth=5, min_child_weight=2, n_estimators=250;, score=0.500 total time=  15.0s
[CV 3/3; 18/20] START colsample_bytree=0.2, gamma=0.7, learning_rate=0, max_depth=5, min_child_weight=2, n_estimators=250




[CV 3/3; 18/20] END colsample_bytree=0.2, gamma=0.7, learning_rate=0, max_depth=5, min_child_weight=2, n_estimators=250;, score=0.500 total time=  13.1s
[CV 1/3; 19/20] START colsample_bytree=0.2, gamma=0.3, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200




[CV 1/3; 19/20] END colsample_bytree=0.2, gamma=0.3, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200;, score=0.927 total time=   8.0s
[CV 2/3; 19/20] START colsample_bytree=0.2, gamma=0.3, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200




[CV 2/3; 19/20] END colsample_bytree=0.2, gamma=0.3, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200;, score=0.943 total time=   9.4s
[CV 3/3; 19/20] START colsample_bytree=0.2, gamma=0.3, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200




[CV 3/3; 19/20] END colsample_bytree=0.2, gamma=0.3, learning_rate=0.5, max_depth=5, min_child_weight=3, n_estimators=200;, score=0.921 total time=   9.5s
[CV 1/3; 20/20] START colsample_bytree=0.5, gamma=1, learning_rate=0.4, max_depth=2, min_child_weight=3, n_estimators=175




[CV 1/3; 20/20] END colsample_bytree=0.5, gamma=1, learning_rate=0.4, max_depth=2, min_child_weight=3, n_estimators=175;, score=0.923 total time=   5.2s
[CV 2/3; 20/20] START colsample_bytree=0.5, gamma=1, learning_rate=0.4, max_depth=2, min_child_weight=3, n_estimators=175




[CV 2/3; 20/20] END colsample_bytree=0.5, gamma=1, learning_rate=0.4, max_depth=2, min_child_weight=3, n_estimators=175;, score=0.936 total time=   6.9s
[CV 3/3; 20/20] START colsample_bytree=0.5, gamma=1, learning_rate=0.4, max_depth=2, min_child_weight=3, n_estimators=175




[CV 3/3; 20/20] END colsample_bytree=0.5, gamma=1, learning_rate=0.4, max_depth=2, min_child_weight=3, n_estimators=175;, score=0.923 total time=   6.0s






In [24]:
best_model.best_score_

0.9340325769065821

In [25]:
best_model.best_params_

{'n_estimators': 250,
 'min_child_weight': 2,
 'max_depth': 5,
 'learning_rate': 0.2,
 'gamma': 0.7,
 'colsample_bytree': 0.5}

## Prediccion

In [26]:
pred = best_model.predict_proba(x_test.values)

In [27]:
score = roc_auc_score(y_test, pred[:, 1])
score

0.8117380538247155

In [28]:
best_model.best_estimator_.feature_importances_

array([0.03782318, 0.02444159, 0.00661112, 0.08411793, 0.02746297,
       0.0262827 , 0.00284054, 0.        , 0.04132858, 0.03359295,
       0.00920839, 0.00557097, 0.00138659, 0.01607534, 0.02109884,
       0.03612246, 0.0279269 , 0.00440102, 0.01758355, 0.01453747,
       0.00342374, 0.00797717, 0.00292844, 0.00834748, 0.00264142,
       0.01446243, 0.01169377, 0.00237987, 0.00709523, 0.00483035,
       0.00594692, 0.01306702, 0.00371458, 0.0102802 , 0.01028129,
       0.0089781 , 0.00582778, 0.00409933, 0.00921014, 0.00291451,
       0.00586916, 0.00717879, 0.00852472, 0.02031792, 0.00634628,
       0.0223514 , 0.0096232 , 0.00414471, 0.0056055 , 0.00451604,
       0.00605622, 0.00659406, 0.00165775, 0.0098799 , 0.0092474 ,
       0.00132877, 0.01089192, 0.02108604, 0.        , 0.00246105,
       0.00573437, 0.01422872, 0.0050949 , 0.01126924, 0.0204084 ,
       0.00437584, 0.00843055, 0.00194466, 0.00493183, 0.00747637,
       0.031768  , 0.00824047, 0.00564648, 0.01213313, 0.00760

In [29]:
data = {"column": x_train.columns,
    "importance": best_model.best_estimator_.feature_importances_}

df = pd.DataFrame(data)
df.importance = abs(df.importance)
#df = df.loc[(df.column != "id_33") & (df.column != "DeviceInfo") & (df.column != "id_31")\
 #           & (df.column != 'DeviceType_desktop') & (df.column != 'DeviceType_mobile') & (df.column != 'DeviceType_nan'),] 
#df = df.nlargest(100, "importance")
df

Unnamed: 0,column,importance
0,V200,0.037823
1,V244,0.024442
2,V258,0.006611
3,V189,0.084118
4,V243,0.027463
...,...,...
87,android_so,0.000645
88,ios_so,0.001755
89,mac_so,0.003340
90,windows_so,0.004213


## Submission

In [30]:
test_transaction = pd.read_csv("ieee-fraud-detection/test_transaction.csv")
test_identity = pd.read_csv("ieee-fraud-detection/test_identity.csv")

In [31]:
test = test_transaction.merge(test_identity, on = "TransactionID", how = "left")
test_transaction = None
test_identity = None

In [32]:
test_ids = test.TransactionID

In [33]:
test.columns = test.columns.str.replace('id-','id_')
test = test.iloc[:,1:]
test

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,18403224,31.950,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,,,,,,,,,,
1,18403263,49.000,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,,,,,,,,,,
2,18403310,171.000,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,,,,,,,,,,
3,18403310,284.950,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,,,,,,,,,,
4,18403317,67.950,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506686,34214279,94.679,C,13832,375.0,185.0,mastercard,224.0,debit,284.0,...,,,,,,,,,,
506687,34214287,12.173,C,3154,408.0,185.0,mastercard,224.0,debit,,...,chrome 43.0 for android,,,,F,F,T,F,mobile,ALE-L23 Build/HuaweiALE-L23
506688,34214326,49.000,W,16661,490.0,150.0,visa,226.0,debit,327.0,...,,,,,,,,,,
506689,34214337,202.000,W,16621,516.0,150.0,mastercard,224.0,debit,177.0,...,,,,,,,,,,


In [34]:
test.id_30 = test.id_30.fillna("Other")

In [35]:
vectorizer = CountVectorizer(max_features=15)

In [36]:
matrix = vectorizer.fit_transform(list(test.id_30))

In [37]:
vectorizer.vocabulary_

{'other': 13,
 'android': 8,
 'ios': 9,
 '11': 6,
 'windows': 14,
 '10': 0,
 'mac': 11,
 'os': 12,
 'linux': 10,
 '10_12_6': 3,
 '10_11_6': 2,
 '10_10_5': 1,
 '12': 7,
 '10_13_6': 4,
 '10_14_1': 5}

In [38]:
test["android_so"] = list(np.asarray(matrix.todense())[:,8])
test["ios_so"] = list(np.asarray(matrix.todense())[:,9])
test["mac_so"] = list(np.asarray(matrix.todense())[:,11])
test["windows_so"] = list(np.asarray(matrix.todense())[:,14])
test["linux_so"] = list(np.asarray(matrix.todense())[:,10])

In [39]:
test = test.drop("id_30", axis= 1)

In [40]:
matrix_test = oneHotEncoder.transform(test[columns_OHE])
test[list(oneHotEncoder.get_feature_names_out(columns_OHE))] = matrix_test
test.drop(inplace = True, axis = 1, columns = columns_OHE)
matrix_test = None

In [41]:
for c in columns_ME:
    test[c] = test[c].fillna(0)

In [42]:
for i in range(len(list_dicts)): 
    test[columns_ME[i]] =  test[columns_ME[i]].map(list_dicts[i])

In [43]:
test = test.loc[:,selected_features]

In [54]:
for c in selected_features:
    test[c] = test[c].fillna(x_train[selected_features].mean())

In [55]:
test

Unnamed: 0,V200,V244,V258,V189,V243,V172,V187,ProductCD_C,V246,V156,...,id_33,DeviceInfo,DeviceType_desktop,DeviceType_mobile,DeviceType_nan,android_so,ios_so,mac_so,windows_so,linux_so
0,,,,,,,,0.0,,,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0
1,,,,,,,,0.0,,,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0
2,,,,,,,,0.0,,,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0
3,,,,,,,,0.0,,,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0
4,,,,,,,,0.0,,,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506686,,,,,,,,1.0,,,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0
506687,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,,...,0.032531,0.093264,0.0,1.0,0.0,0,0,0,0,0
506688,,,,,,,,0.0,,,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0
506689,,,,,,,,0.0,,,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0


In [56]:
pred = best_model.predict_proba(test.values)

In [57]:
data = {"TransactionID": test_ids,
    "isFraud": pred[:, 1]}

submission = pd.DataFrame(data)
submission

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000001
1,3663550,0.008713
2,3663551,0.000011
3,3663552,0.003197
4,3663553,0.002220
...,...,...
506686,4170235,0.023842
506687,4170236,0.012982
506688,4170237,0.006866
506689,4170238,0.000017


In [58]:
#submission.to_csv("xgboost_submission.csv", index = False)
#score en competencia da 0.747660