In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the two raw training tables (transactions and identity) from the local
# "ieee-fraud-detection" folder; they are joined on TransactionID in the next cell.
transaction_ori = pd.read_csv("ieee-fraud-detection/train_transaction.csv")
identity_ori = pd.read_csv("ieee-fraud-detection/train_identity.csv")

In [3]:
# Left join: every transaction row is kept; transactions without a matching
# identity row get NaN in the columns that come from the identity table.
merge = transaction_ori.merge(identity_ori, how="left", on="TransactionID")

# Release the references to the raw frames so their memory can be reclaimed.
transaction_ori = identity_ori = None

merge

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.50,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.00,W,6550,,150.0,visa,226.0,...,,,,,,,,,,
590536,3577536,0,15811049,39.50,W,10444,225.0,150.0,mastercard,224.0,...,,,,,,,,,,
590537,3577537,0,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,...,,,,,,,,,,
590538,3577538,0,15811088,117.00,W,7826,481.0,150.0,mastercard,224.0,...,,,,,,,,,,


## CountVectorizer

In [4]:
# id_30 holds free-text operating-system strings (see the output below);
# missing values become the literal token "Other" so the vectorizer can consume them.
merge["id_30"] = merge["id_30"].fillna("Other")

# Show the first 20 values as a plain Python list.
list(merge["id_30"])[:20]

['Other',
 'Other',
 'Other',
 'Other',
 'Android 7.0',
 'Other',
 'Other',
 'Other',
 'iOS 11.1.2',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Other',
 'Mac OS X 10_11_6',
 'Windows 10',
 'Other',
 'Other']

In [5]:
# Bag-of-words vectorizer whose vocabulary is capped at 15 tokens
# (max_features keeps the most frequent terms of the fitted corpus).
vectorizer = CountVectorizer(max_features=15)

In [6]:
# Learn the vocabulary from id_30 and build the per-row token-count matrix in one step.
matrix = vectorizer.fit_transform(list(merge.id_30))

In [7]:
# Inspect the learned token -> column-index mapping of the count matrix.
vectorizer.vocabulary_

{'other': 13,
 'android': 8,
 'ios': 9,
 '11': 7,
 'mac': 11,
 'os': 12,
 '10_11_6': 2,
 'windows': 14,
 '10': 0,
 'linux': 10,
 '10_12_6': 3,
 '10_13_1': 4,
 '10_10_5': 1,
 '10_13_2': 5,
 '10_13_3': 6}

In [8]:
# Turn the token counts for the main operating-system names into "<os>_so" columns.
#
# The sparse matrix is densified once (the previous version densified it again for
# every column), and each token's column is looked up in the fitted vocabulary
# instead of being hard-coded: the index assigned to a token depends on the corpus
# the vectorizer was fitted on, so literal indices can silently point at the wrong
# token. A token missing from the vocabulary now raises KeyError.
os_token_counts = np.asarray(matrix.todense())
for os_token in ("android", "ios", "mac", "windows", "linux"):
    merge[f"{os_token}_so"] = os_token_counts[:, vectorizer.vocabulary_[os_token]]

In [9]:
# Sanity check: compare the raw id_30 text with the derived OS count columns
# for the rows labelled 0 through 20 (.loc slicing includes both ends).
merge.loc[0:20,["id_30","android_so","ios_so", "mac_so","windows_so","linux_so"]]

Unnamed: 0,id_30,android_so,ios_so,mac_so,windows_so,linux_so
0,Other,0,0,0,0,0
1,Other,0,0,0,0,0
2,Other,0,0,0,0,0
3,Other,0,0,0,0,0
4,Android 7.0,1,0,0,0,0
5,Other,0,0,0,0,0
6,Other,0,0,0,0,0
7,Other,0,0,0,0,0
8,iOS 11.1.2,0,1,0,0,0
9,Other,0,0,0,0,0


In [10]:
# The raw text column is no longer needed once the "<os>_so" columns exist.
merge = merge.drop(columns="id_30")

## Columnas categóricas

In [11]:
# Columns treated as categorical, in a fixed order:
# ProductCD, card1-card6, addr1/addr2, the two e-mail domains, M1-M9,
# id_12-id_38 (id_30 is excluded: it was vectorized and dropped earlier),
# and the two device columns.
categorical_columns = (
    ["ProductCD"]
    + [f"card{i}" for i in range(1, 7)]
    + ["addr1", "addr2", "P_emaildomain", "R_emaildomain"]
    + [f"M{i}" for i in range(1, 10)]
    + [f"id_{i}" for i in range(12, 39) if i != 30]
    + ["DeviceType", "DeviceInfo"]
)

In [12]:
# Partition the categorical columns by cardinality:
#   more than 10 distinct values -> columns_ME  (mean-encoded later)
#   10 or fewer                  -> columns_OHE (one-hot encoded later)
# value_counts() ignores NaN by default, so missing values do not count as a level.
level_counts = {name: len(merge[name].value_counts()) for name in categorical_columns}
columns_ME = [name for name in categorical_columns if level_counts[name] > 10]
columns_OHE = [name for name in categorical_columns if level_counts[name] <= 10]

In [13]:
# Missing values in the high-cardinality columns are replaced by 0, so "missing"
# becomes a regular category for the group-by based mean encoding below.
merge[columns_ME] = merge[columns_ME].fillna(0)

## Split train-test

In [14]:
# Positional hold-out split: the first TRAIN_ROWS rows are used for fitting, the
# remaining rows for validation. The first column (TransactionID in the merged
# frame displayed above) is dropped from both parts.
# NOTE(review): the displayed rows look ordered by TransactionDT, which would make
# this a time-based split - confirm.
TRAIN_ROWS = 350000

# Explicit copies: both frames are modified column-by-column in the next cells, and
# writing into an iloc slice of `merge` triggers SettingWithCopyWarning and leaves
# it ambiguous whether a view or a copy is being changed.
train = merge.iloc[:TRAIN_ROWS, 1:].copy()
test = merge.iloc[TRAIN_ROWS:, 1:].copy()

# Release the merged frame.
merge = None

## MeanEncoding

In [15]:
# Mean (target) encoding of the high-cardinality columns.
# list_dicts keeps one {category: fraud rate} mapping per column, in the same
# order as columns_ME, so the encoding can be re-applied to the submission data.
list_dicts = []
for me_col in columns_ME:
    # Fraud rate per category, computed on the training rows only.
    fraud_rate_by_level = train.groupby(me_col)["isFraud"].mean().to_dict()
    list_dicts.append(fraud_rate_by_level)

    train[me_col] = train[me_col].map(fraud_rate_by_level)

    # Categories absent from the training rows map to NaN in the hold-out rows;
    # they fall back to the mean of the encoded training column.
    test[me_col] = test[me_col].map(fraud_rate_by_level).fillna(train[me_col].mean())

## OneHotEncoding

In [16]:
# One-hot encode the low-cardinality columns. The encoder is fitted on the training
# rows only; handle_unknown="ignore" encodes categories unseen at fit time as all zeros.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` -
# confirm against the installed version.
oneHotEncoder = OneHotEncoder(sparse = False, handle_unknown = "ignore")
matrix = oneHotEncoder.fit_transform(train[columns_OHE])
matrix_b = oneHotEncoder.transform(test[columns_OHE])

# Attach the dense indicator columns under the encoder's generated names ...
ohe_feature_names = list(oneHotEncoder.get_feature_names_out(columns_OHE))
train[ohe_feature_names] = matrix
test[ohe_feature_names] = matrix_b

# ... and remove the original categorical columns they replace.
train.drop(columns = columns_OHE, inplace = True)
test.drop(columns = columns_OHE, inplace = True)

In [17]:
# Drop the references to the dense one-hot arrays; their values now live in train/test.
matrix = matrix_b = None

## RandomizedSearch para RandomForestClassifier

In [18]:
# Model input columns, in the order they are fed to the classifier.
# Names such as 'ProductCD_C' or 'M5_nan' are one-hot columns produced by the
# encoder; the "*_so" names are the OS count columns derived from id_30.
selected_features = [
    'V200', 'V244', 'V258', 'V189', 'V243', 'V172', 'V187', 'ProductCD_C', 'V246', 'V156',
    'V154', 'V32', 'V196', 'V294', 'M5_nan',
    'card1', 'V103', 'V323', 'V15', 'C4', 'V268', 'V223', 'V295', 'V62', 'V207',
    'V70', 'V58', 'V256', 'V44', 'id_25', 'C5', 'C14', 'V34', 'V283', 'V45',
    'V317', 'C12', 'C7', 'V205', 'V266', 'C10', 'ProductCD_H', 'V72', 'V198', 'V73',
    'V90', 'C8', 'V259', 'card2', 'V169', 'V298', 'id_19', 'id_17',
    'V13', 'R_emaildomain', 'V22', 'C13', 'M4_nan', 'id_35_F', 'V158', 'V163', 'M5_T', 'V93',
    'V281', 'C1', 'card6_credit', 'V296', 'id_32_24.0',
    'V239', 'V152', 'V149', 'V87', 'V318', 'V54', 'V275', 'V67', 'V327', 'V146',
    'V255', 'V53', 'card6_debit',
    "id_31", "id_33", "DeviceInfo",
    'DeviceType_desktop', 'DeviceType_mobile', 'DeviceType_nan',
    "android_so", "ios_so", "mac_so", "windows_so", "linux_so",
]
# Candidate features currently left out:
#'V30', 'V74', 'V324', 'id_01', 'card3', 'V79', 'V201', 'D2', 'V199', 'C2', 'V308', 'V61', 'id_20', 'M4_M0', 'V220', 'V312', 'V192', 'V82', 'V133',

In [19]:
# Separate the target from the inputs for both partitions.
y_train = train["isFraud"]
y_test = test["isFraud"]

# Keep only the selected model inputs, in the order given by selected_features.
x_train = train.drop(columns="isFraud")[selected_features]
x_test = test.drop(columns="isFraud")[selected_features]

# The full frames are no longer needed.
train = test = None

In [20]:
# Replace every remaining missing value in the model inputs with 0.
# x_train and x_test carry the same columns (both were selected with selected_features).
x_train = x_train.fillna(0)
x_test = x_test.fillna(0)

In [21]:
# Base estimator for the hyper-parameter search.
# A fixed random_state makes the bootstrap samples and feature sub-sampling of the
# forest repeatable; without it the search below is seeded (random_state = 1) but
# the fitted forests, CV scores and predictions still change on every run.
# Scores recorded in earlier outputs of this notebook came from an unseeded run.
rfclassifier = RandomForestClassifier(random_state = 1)

In [22]:
# Search space for the random forest: each key is a RandomForestClassifier
# constructor argument, each list the candidate values to sample from.
params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[1, 3, 7, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10],
)

In [23]:
# Randomized search: 3 sampled parameter combinations, each scored with 3-fold
# cross-validation on ROC AUC, run in a single process with verbose progress logs.
clf = RandomizedSearchCV(
    estimator=rfclassifier,
    param_distributions=params,
    n_iter=3,
    cv=3,
    scoring="roc_auc",
    n_jobs=1,
    random_state=1,
    verbose=100,
)

In [24]:
# Run the search on plain NumPy arrays (.values drops the column names).
# fit() returns the fitted search object itself, so best_model and clf are the same object.
best_model = clf.fit(x_train.values, y_train.values)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV 1/3; 1/3] START max_depth=10, min_samples_leaf=10, min_samples_split=2, n_estimators=200
[CV 1/3; 1/3] END max_depth=10, min_samples_leaf=10, min_samples_split=2, n_estimators=200;, score=0.867 total time=  44.3s
[CV 2/3; 1/3] START max_depth=10, min_samples_leaf=10, min_samples_split=2, n_estimators=200
[CV 2/3; 1/3] END max_depth=10, min_samples_leaf=10, min_samples_split=2, n_estimators=200;, score=0.900 total time=  45.7s
[CV 3/3; 1/3] START max_depth=10, min_samples_leaf=10, min_samples_split=2, n_estimators=200
[CV 3/3; 1/3] END max_depth=10, min_samples_leaf=10, min_samples_split=2, n_estimators=200;, score=0.888 total time=  46.2s
[CV 1/3; 2/3] START max_depth=10, min_samples_leaf=5, min_samples_split=2, n_estimators=100
[CV 1/3; 2/3] END max_depth=10, min_samples_leaf=5, min_samples_split=2, n_estimators=100;, score=0.874 total time=  22.3s
[CV 2/3; 2/3] START max_depth=10, min_samples_leaf=5, min_samples_split=2, 

In [25]:
# Mean cross-validated score (ROC AUC, per the `scoring` argument) of the best candidate.
best_model.best_score_

0.8889335934124226

In [26]:
# Parameter combination that achieved the best cross-validated score.
best_model.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 5,
 'max_depth': 10}

## Predicción

In [27]:
# Class probabilities for the hold-out rows, predicted by the best estimator found by the search.
pred = best_model.predict_proba(x_test.values)

In [28]:
# ROC AUC on the hold-out rows, computed from the second predict_proba column
# (the class listed second in classes_, i.e. isFraud == 1 for 0/1 labels).
score = roc_auc_score(y_test, pred[:, 1])
score

0.86653575865216

In [29]:
# Raw feature importances of the refitted best forest, in x_train column order.
best_model.best_estimator_.feature_importances_

array([3.48504353e-02, 2.58726934e-02, 6.67217259e-02, 2.61824165e-02,
       2.89569064e-02, 2.53640628e-03, 2.15835030e-02, 1.15498870e-03,
       2.10560582e-02, 1.34094234e-02, 1.21749056e-02, 1.37595618e-03,
       3.06576587e-03, 1.00225462e-02, 1.65787022e-03, 1.47694466e-01,
       4.30324675e-03, 2.66422044e-03, 2.90117292e-03, 2.50700003e-02,
       5.38102282e-03, 5.98655518e-03, 7.64095607e-03, 8.29127903e-03,
       4.80017480e-03, 9.24126569e-04, 2.39093086e-03, 4.18451094e-03,
       1.37654484e-02, 1.51498143e-03, 8.68491929e-04, 2.34713457e-02,
       2.15152251e-03, 1.01688014e-02, 3.17970197e-02, 1.29432274e-02,
       1.68271043e-02, 1.75830306e-02, 4.01461525e-03, 5.56891762e-03,
       1.03174912e-02, 1.18709956e-03, 1.94214378e-03, 6.68142911e-03,
       1.93886036e-03, 1.18775220e-03, 1.73854772e-02, 1.67647480e-02,
       2.38893137e-02, 1.45353070e-03, 1.19715313e-03, 1.56972994e-02,
       3.45002937e-03, 3.00769069e-03, 9.14883124e-03, 1.25222596e-03,
      

In [30]:
# Tabulate each model input column next to its importance in the best forest.
data = {
    "column": x_train.columns,
    "importance": best_model.best_estimator_.feature_importances_,
}
df = pd.DataFrame(data)

# Store importances as absolute values.
df["importance"] = abs(df["importance"])

# Optional filters kept for reference (exclude some columns / keep the top 100):
#df = df.loc[(df.column != "id_33") & (df.column != "DeviceInfo") & (df.column != "id_31")\
 #           & (df.column != 'DeviceType_desktop') & (df.column != 'DeviceType_mobile') & (df.column != 'DeviceType_nan'),]
#df = df.nlargest(100, "importance")
df

Unnamed: 0,column,importance
0,V200,0.034850
1,V244,0.025873
2,V258,0.066722
3,V189,0.026182
4,V243,0.028957
...,...,...
87,android_so,0.000429
88,ios_so,0.000465
89,mac_so,0.000131
90,windows_so,0.000364


## Submission

In [31]:
# Load the two raw tables of the submission (competition test) set from the local data folder.
test_transaction = pd.read_csv("ieee-fraud-detection/test_transaction.csv")
test_identity = pd.read_csv("ieee-fraud-detection/test_identity.csv")

In [32]:
# Same left join as for the training data: keep every transaction, attach identity
# columns where a matching TransactionID exists.
test = test_transaction.merge(test_identity, how="left", on="TransactionID")

# Release the references to the raw frames.
test_transaction = test_identity = None

In [33]:
# Keep the transaction ids aside: the column is removed from `test` in the next
# cell but is needed again to build the submission file.
test_ids = test.TransactionID

In [34]:
# Rename columns containing "id-" to use "id_", matching the names used for the
# training data. regex=False makes the literal replacement explicit (and
# independent of the pandas version's default for `regex`).
test.columns = test.columns.str.replace('id-', 'id_', regex=False)

# Remove the id column by name rather than by position: this does not depend on
# the column order and returns a new frame, so the column assignments in the
# following cells do not write into a slice of another frame.
test = test.drop(columns="TransactionID")
test

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,18403224,31.950,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,,,,,,,,,,
1,18403263,49.000,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,,,,,,,,,,
2,18403310,171.000,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,,,,,,,,,,
3,18403310,284.950,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,,,,,,,,,,
4,18403317,67.950,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506686,34214279,94.679,C,13832,375.0,185.0,mastercard,224.0,debit,284.0,...,,,,,,,,,,
506687,34214287,12.173,C,3154,408.0,185.0,mastercard,224.0,debit,,...,chrome 43.0 for android,,,,F,F,T,F,mobile,ALE-L23 Build/HuaweiALE-L23
506688,34214326,49.000,W,16661,490.0,150.0,visa,226.0,debit,327.0,...,,,,,,,,,,
506689,34214337,202.000,W,16621,516.0,150.0,mastercard,224.0,debit,177.0,...,,,,,,,,,,


In [35]:
# Same treatment as for the training data: missing OS strings become the token "Other".
test["id_30"] = test["id_30"].fillna("Other")

In [36]:
# NOTE(review): this replaces the vectorizer that was fitted on the training data
# with a new, unfitted one (again capped at 15 tokens).
vectorizer = CountVectorizer(max_features=15)

In [37]:
# NOTE(review): fit_transform learns a new vocabulary from the submission data, so
# token -> column indices are not guaranteed to match those learned on the
# training data (compare the two vocabulary_ outputs).
matrix = vectorizer.fit_transform(list(test.id_30))

In [38]:
# Inspect the token -> column-index mapping learned from the submission data.
vectorizer.vocabulary_

{'other': 13,
 'android': 8,
 'ios': 9,
 '11': 6,
 'windows': 14,
 '10': 0,
 'mac': 11,
 'os': 12,
 'linux': 10,
 '10_12_6': 3,
 '10_11_6': 2,
 '10_10_5': 1,
 '12': 7,
 '10_13_6': 4,
 '10_14_1': 5}

In [39]:
# Build the "<os>_so" count columns for the submission data.
#
# The vectorizer in scope here was fitted on a different corpus than the one used
# for training, and token indices depend on the fitted corpus. Each token's column
# is therefore looked up in vectorizer.vocabulary_ instead of using literal indices
# that could silently select another token. A token missing from the vocabulary
# raises KeyError. The sparse matrix is densified once instead of once per column.
os_token_counts = np.asarray(matrix.todense())
for os_token in ("android", "ios", "mac", "windows", "linux"):
    test[f"{os_token}_so"] = os_token_counts[:, vectorizer.vocabulary_[os_token]]

In [40]:
# The raw OS text is no longer needed once the "<os>_so" columns exist.
test = test.drop(columns="id_30")

In [41]:
# Re-use the encoder fitted on the training rows (transform only, no re-fit) so the
# submission data gets exactly the same indicator columns.
ohe_feature_names = list(oneHotEncoder.get_feature_names_out(columns_OHE))
matrix_test = oneHotEncoder.transform(test[columns_OHE])
test[ohe_feature_names] = matrix_test

# Remove the original categorical columns and release the dense array.
test.drop(columns = columns_OHE, inplace = True)
matrix_test = None

In [42]:
# As for the training data: missing values in the mean-encoded columns become the
# category 0 before the stored encodings are applied.
test[columns_ME] = test[columns_ME].fillna(0)

In [43]:
# Apply the mean encodings learned on the training rows; list_dicts is aligned
# with columns_ME (one mapping per column, same order).
#
# Categories that never occurred in the training rows map to NaN. For the hold-out
# rows these were filled with the mean of the encoded training column; here they
# used to stay NaN and were later replaced by 0, i.e. treated as "never fraud".
# The mean of a mean-encoded training column equals the overall training fraud
# rate (every training row maps to its own category mean), so y_train.mean() is
# used as the same fallback.
unseen_level_fallback = y_train.mean()
for me_col, fraud_rate_by_level in zip(columns_ME, list_dicts):
    test[me_col] = test[me_col].map(fraud_rate_by_level).fillna(unseen_level_fallback)

In [44]:
# Keep only the model inputs, in the same column order used for fitting.
test = test[selected_features]

In [45]:
# Replace every remaining missing value with 0, as was done for x_train / x_test.
test = test.fillna(0)

In [46]:
# Display the prepared submission features (pandas truncates the rendering).
test

Unnamed: 0,V200,V244,V258,V189,V243,V172,V187,ProductCD_C,V246,V156,...,id_33,DeviceInfo,DeviceType_desktop,DeviceType_mobile,DeviceType_nan,android_so,ios_so,mac_so,windows_so,linux_so
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0
506687,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.032531,0.093264,0.0,1.0,0.0,0,0,0,0,0
506688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0
506689,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.032531,0.025390,0.0,0.0,1.0,0,0,0,0,0


In [47]:
# Class probabilities for the submission rows; this overwrites the hold-out `pred`.
pred = best_model.predict_proba(test.values)

In [48]:
# Submission table: one row per transaction id with the probability from the
# second predict_proba column as the isFraud score.
fraud_probability = pred[:, 1]
data = dict(TransactionID=test_ids, isFraud=fraud_probability)

submission = pd.DataFrame(data)
submission

Unnamed: 0,TransactionID,isFraud
0,3663549,0.009412
1,3663550,0.013018
2,3663551,0.012182
3,3663552,0.008098
4,3663553,0.008865
...,...,...
506686,4170235,0.021055
506687,4170236,0.027711
506688,4170237,0.012397
506689,4170238,0.011884


In [51]:
# Write the submission file without the DataFrame index column.
submission.to_csv("randomforest_submission.csv", index = False)
# Score obtained in the competition: 0.885640