# Modelo 2

## Random Forest

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [2]:
dfTransaction = pd.read_csv("../Dataset/ML/train_transaction.csv")
dfIdentity = pd.read_csv("../Dataset/ML/train_identity.csv")

# Uso left ya que pueden haber transacciones sin identidades pero no puede suceder lo contrario
dfMerge = dfTransaction.merge(dfIdentity, on = "TransactionID", how = "left")

del dfTransaction
del dfIdentity
dfMerge.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


### Split del dataset

In [3]:
train_size = int(len(dfMerge) * 0.75)

X_train = dfMerge.iloc[:train_size, 1:] # Saco el Transaction ID
X_val = dfMerge.iloc[train_size:, 1:] # Saco el Transaction ID
y_train = dfMerge["isFraud"].iloc[:train_size]
y_val = dfMerge["isFraud"].iloc[train_size:]

In [4]:
del dfMerge

In [5]:
X_train.tail()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
442900,0,11246397,107.95,W,7508,321.0,150.0,visa,226.0,debit,...,,,,,,,,,,
442901,0,11246476,35.658,C,15885,545.0,185.0,visa,138.0,debit,...,chrome 65.0,,,,F,F,T,F,desktop,Windows
442902,0,11246502,30.95,W,9992,455.0,150.0,mastercard,126.0,debit,...,,,,,,,,,,
442903,0,11246585,57.95,W,9485,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
442904,0,11246605,57.95,W,1919,321.0,150.0,visa,226.0,debit,...,,,,,,,,,,


In [6]:
X_val.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
442905,0,11246665,30.95,W,8528,215.0,150.0,visa,226.0,debit,...,,,,,,,,,,
442906,0,11246704,53.95,W,7919,194.0,150.0,mastercard,166.0,debit,...,,,,,,,,,,
442907,0,11246761,117.0,W,15497,490.0,150.0,visa,226.0,debit,...,,,,,,,,,,
442908,0,11246761,29.0,W,7826,481.0,150.0,mastercard,224.0,debit,...,,,,,,,,,,
442909,0,11247072,200.0,R,2528,399.0,150.0,american express,137.0,credit,...,mobile safari generic,32.0,2732x2048,match_status:1,T,F,F,F,mobile,iOS Device


### Encoding

#### CountVectorizer

In [7]:
columnas_vectorizer = []

In [8]:
# Utilizo CountVectorizer para el campo id_31
X_train["id_31"].fillna("NaN", inplace = True)
X_val["id_31"].fillna("NaN", inplace = True)
vectorizer_browsers = CountVectorizer(max_features = 15, dtype = bool)

browsers_matrix = vectorizer_browsers.fit_transform(X_train["id_31"].values).todense()
browsers_matrix_val = vectorizer_browsers.transform(X_val["id_31"].values).todense()

vectorizer_browsers.vocabulary_

{'nan': 13,
 'mobile': 12,
 'safari': 14,
 '11': 0,
 'chrome': 6,
 '62': 1,
 'for': 9,
 'android': 5,
 'generic': 10,
 'firefox': 8,
 'ie': 11,
 'desktop': 7,
 '63': 2,
 '64': 3,
 '65': 4}

In [9]:
# Me quedo con los nombres de los navegadores
browsers = ["chrome", "safari", "firefox", "ie", "nan"]
for browser in browsers:
    columnas_vectorizer += ["browser_{}".format(browser)]
    indice = vectorizer_browsers.vocabulary_[browser]
    
    X_train["browser_{}".format(browser)] = list(np.asarray(browsers_matrix)[:,indice])
    X_val["browser_{}".format(browser)] = list(np.asarray(browsers_matrix_val)[:,indice])
X_train.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,id_36,id_37,id_38,DeviceType,DeviceInfo,browser_chrome,browser_safari,browser_firefox,browser_ie,browser_nan
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,,,,,,False,False,False,False,True
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,False,False,False,False,True
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,False,False,False,False,True
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,False,False,False,False,True
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,False,False,False,False,False


In [10]:
del browsers_matrix
del browsers_matrix_val

In [11]:
# Utilizo CountVectorizer para el campo id_30
X_train["id_30"].fillna("NaN", inplace = True)
X_val["id_30"].fillna("NaN", inplace = True)
vectorizer_sos = CountVectorizer(max_features = 15, dtype = bool)

sos_matrix = vectorizer_sos.fit_transform(X_train["id_30"].values).todense()
sos_matrix_val = vectorizer_sos.transform(X_val["id_30"].values).todense()

vectorizer_sos.vocabulary_

{'nan': 12,
 'android': 8,
 'ios': 9,
 '11': 7,
 'mac': 11,
 'os': 13,
 '10_11_6': 2,
 'windows': 14,
 '10': 0,
 'linux': 10,
 '10_12_6': 3,
 '10_13_1': 4,
 '10_10_5': 1,
 '10_13_2': 5,
 '10_13_3': 6}

In [12]:
# Me quedo con los nombres de los SOs
sos = ["android", "ios", "mac", "linux", "windows", "nan"]
for so in sos:
    columnas_vectorizer += ["SO_{}".format(so)]
    indice = vectorizer_sos.vocabulary_[so]
    
    X_train["SO_{}".format(so)] = list(np.asarray(sos_matrix)[:,indice])
    X_val["SO_{}".format(so)] = list(np.asarray(sos_matrix_val)[:,indice])
X_train.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,browser_safari,browser_firefox,browser_ie,browser_nan,SO_android,SO_ios,SO_mac,SO_linux,SO_windows,SO_nan
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,False,False,False,True,False,False,False,False,False,True
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,False,False,False,True,False,False,False,False,False,True
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,False,False,False,True,False,False,False,False,False,True
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,False,False,False,True,False,False,False,False,False,True
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,False,False,False,False,True,False,False,False,False,False


In [13]:
del sos_matrix
del sos_matrix_val

#### One Hot Encoding

In [14]:
# Hago OHE de las siguiente columnas
oh_columns = ["ProductCD", "card4", "card6", "DeviceType"]
ohe = OneHotEncoder(sparse = False, dtype = bool, handle_unknown = "ignore")

ohe_matrix = ohe.fit_transform(X_train[oh_columns])
ohe_matrix_val = ohe.transform(X_val[oh_columns])

# Elimino estas 2 categorias que solo aparecen 50 veces en todo el dataset
X_train[list(ohe.get_feature_names_out(oh_columns))] = ohe_matrix
X_train.drop(["card6_debit or credit", "card6_charge card"], axis = 1, inplace = True)

X_val[list(ohe.get_feature_names_out(oh_columns))] = ohe_matrix_val
X_val.drop(["card6_debit or credit", "card6_charge card"], axis = 1, inplace = True)

In [15]:
del ohe_matrix
del ohe_matrix_val

In [16]:
columnas_ohe = list(ohe.get_feature_names_out(oh_columns))
columnas_ohe.remove("card6_debit or credit")
columnas_ohe.remove("card6_charge card")

In [17]:
X_train.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,card4_discover,card4_mastercard,card4_visa,card4_nan,card6_credit,card6_debit,card6_nan,DeviceType_desktop,DeviceType_mobile,DeviceType_nan
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,True,False,False,False,True,False,False,False,False,True
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,False,True,False,False,True,False,False,False,False,True
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,False,False,True,False,False,True,False,False,False,True
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,False,True,False,False,False,True,False,False,False,True
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,False,True,False,False,True,False,False,False,True,False


In [18]:
# Obtengo la cantidad de pixeles de la columna id_33
def parse_resolucion(x):
    if pd.isna(x):
        return np.NaN
    pixeles = x.split("x")
    return int(pixeles[0]) * int(pixeles[1])

X_train["resolucion"] = X_train["id_33"].apply(lambda x: parse_resolucion(x))
X_val["resolucion"] = X_val["id_33"].apply(lambda x: parse_resolucion(x))

In [19]:
X_train.head()

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,card4_mastercard,card4_visa,card4_nan,card6_credit,card6_debit,card6_nan,DeviceType_desktop,DeviceType_mobile,DeviceType_nan,resolucion
0,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,...,False,False,False,True,False,False,False,False,True,
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,True,False,False,True,False,False,False,False,True,
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,False,True,False,False,True,False,False,False,True,
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,True,False,False,False,True,False,False,False,True,
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,True,False,False,True,False,False,False,True,False,2397600.0


In [20]:
X_train.drop(oh_columns + ["id_30", "id_31", "id_33"], axis = 1, inplace = True)
X_val.drop(oh_columns + ["id_30", "id_31", "id_33"], axis = 1, inplace = True)

#### Mean Encoding

In [21]:
columnas_mean = ["card1", "card2", "card3", "card5", "addr1", "addr2",
                "P_emaildomain", "R_emaildomain", "M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8", "M9",
                "DeviceInfo", 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18',
                'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28',
                'id_29', 'id_32', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38']
means = []
for columna in columnas_mean:
    mean_encoded = X_train.groupby(columna)['isFraud'].mean().to_dict()
    means += [mean_encoded]
    
    X_train[columna] = X_train[columna].map(mean_encoded)
    X_val[columna] = X_val[columna].map(mean_encoded)

In [22]:
# Tratamiento de nulos
for columna in X_train.columns:
    X_train[columna].fillna(X_train[columna].mean(), inplace = True)
    X_val[columna].fillna(X_val[columna].mean(), inplace = True)

In [23]:
X_train.drop("isFraud", axis = 1, inplace = True)
X_val.drop("isFraud", axis = 1, inplace = True)

In [24]:
columnas_modelo = columnas_ohe + columnas_mean + columnas_vectorizer + ["resolucion",
                                                                        "TransactionDT", "TransactionAmt", "D1", "D2", "D3", "D4",
                                                                        "D5", "D6", "D7", "D8", "D9", "D10", "D11", "D12", "D13",
                                                                        "D14", "dist1", "dist2"]

### Random Forest con RandomSearch

In [25]:
param_grid = {
    'max_depth': range(3, 11),
    'criterion': ["gini", "entropy"],
    'min_samples_split': range(2, 50),
    'min_samples_leaf': range(1, 20)
}

In [27]:
randomforest = RandomForestClassifier(n_estimators = 100, n_jobs = 1)
classifier = RandomizedSearchCV(randomforest, param_grid, n_iter = 20,
                            n_jobs = 1, verbose = 100, cv = 3,
                            scoring = 'roc_auc', random_state = 1, error_score = "raise")

##### Si hay poroblemas de memoria, se puede guardar la información del Notebook, reiniciarlo y volver a cargarlo. Esto ayuda a liberar memoria _leakeada_ por Jupyter

In [28]:
import dill
dill.dump_session('dump_randomforest.db')

> Reiniciar kernel

In [1]:
import dill
dill.load_session('dump_randomforest.db')

In [2]:
classifier.fit(X_train.values, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 1/3; 1/20] START criterion=entropy, max_depth=9, min_samples_leaf=13, min_samples_split=7
[CV 1/3; 1/20] END criterion=entropy, max_depth=9, min_samples_leaf=13, min_samples_split=7;, score=0.876 total time= 1.4min
[CV 2/3; 1/20] START criterion=entropy, max_depth=9, min_samples_leaf=13, min_samples_split=7
[CV 2/3; 1/20] END criterion=entropy, max_depth=9, min_samples_leaf=13, min_samples_split=7;, score=0.906 total time= 1.5min
[CV 3/3; 1/20] START criterion=entropy, max_depth=9, min_samples_leaf=13, min_samples_split=7
[CV 3/3; 1/20] END criterion=entropy, max_depth=9, min_samples_leaf=13, min_samples_split=7;, score=0.762 total time= 1.4min
[CV 1/3; 2/20] START criterion=gini, max_depth=3, min_samples_leaf=5, min_samples_split=45
[CV 1/3; 2/20] END criterion=gini, max_depth=3, min_samples_leaf=5, min_samples_split=45;, score=0.803 total time=  37.8s
[CV 2/3; 2/20] START criterion=gini, max_depth=3, min_samples_leaf=5,

RandomizedSearchCV(cv=3, error_score='raise',
                   estimator=RandomForestClassifier(n_jobs=1), n_iter=20,
                   n_jobs=1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': range(3, 11),
                                        'min_samples_leaf': range(1, 20),
                                        'min_samples_split': range(2, 50)},
                   random_state=1, scoring='roc_auc', verbose=100)

In [3]:
predictions = classifier.predict_proba(X_val.values)
print("Validation score: %r" % roc_auc_score(y_val, [pred[1] for pred in predictions]))

Validation score: 0.8320090036365336


#### Predicción del set de Test para la competencia

In [4]:
del X_train
del y_train
del X_val
del y_val

In [19]:
dfTestTransaction = pd.read_csv("../Dataset/ML/test_transaction.csv")
dfTestIdentity = pd.read_csv("../Dataset/ML/test_identity.csv")
dfTest = dfTestTransaction.merge(dfTestIdentity, on = "TransactionID", how = "left")

del dfTestTransaction
del dfTestIdentity

In [20]:
dfTest.columns = [columna.replace("id-", "id_") for columna in dfTest.columns]

In [21]:
dfTest["id_31"].fillna("NaN", inplace = True)
browsers_matrix = vectorizer_browsers.transform(dfTest["id_31"].values).todense()

for browser in browsers:
    indice = vectorizer_browsers.vocabulary_[browser]
    dfTest["browser_{}".format(browser)] = list(np.asarray(browsers_matrix)[:,indice])

In [22]:
del browsers_matrix

In [23]:
dfTest["id_30"].fillna("NaN", inplace = True)
sos_matrix = vectorizer_sos.transform(dfTest["id_30"].values).todense()

for so in sos:
    indice = vectorizer_sos.vocabulary_[so]
    dfTest["SO_{}".format(so)] = list(np.asarray(sos_matrix)[:,indice])

In [24]:
del sos_matrix

In [25]:
ohe_matrix = ohe.transform(dfTest[oh_columns])

dfTest[list(ohe.get_feature_names_out(oh_columns))] = ohe_matrix
dfTest.drop(["card6_debit or credit", "card6_charge card"], axis = 1, inplace = True)

In [26]:
del ohe_matrix

In [27]:
dfTest["resolucion"] = dfTest["id_33"].apply(lambda x: parse_resolucion(x))

In [28]:
dfTest.drop(oh_columns + ["id_30", "id_31", "id_33"], axis = 1, inplace = True)

In [29]:
for i, columna in enumerate(columnas_mean):
    mean_encoded = means[i]
    dfTest[columna] =  dfTest[columna].map(mean_encoded)

In [30]:
for columna in dfTest.columns:
    dfTest[columna].fillna(dfTest[columna].mean(), inplace = True)

In [17]:
# Me deshago de las columnas que no necesito para ahorrar memoria
# Me quedo con TransactionID para el submission, pero no lo uso para predecir
dfTest = dfTest[["TransactionID"] + columnas_modelo]

In [31]:
predictions = classifier.predict_proba(dfTest.iloc[:,1:].values)
dfTest["isFraud"] = [pred[1] for pred in predictions]
dfTest.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,card4_visa,card4_nan,card6_credit,card6_debit,card6_nan,DeviceType_desktop,DeviceType_mobile,DeviceType_nan,resolucion,isFraud
0,3663549,18403224,31.95,0.0,0.021859,0.02486,0.02955,0.04008,0.024207,1.0,...,True,False,False,True,False,False,False,True,2268999.0,0.081722
1,3663550,18403263,49.0,0.028136,0.021859,0.02486,0.02955,0.022031,0.024207,4.0,...,True,False,False,True,False,False,False,True,2268999.0,0.058942
2,3663551,18403310,171.0,0.0,0.0,0.02486,0.02955,0.025105,0.024207,2635.0,...,True,False,False,True,False,False,False,True,2268999.0,0.058042
3,3663552,18403310,284.95,0.003663,0.010315,0.02486,0.010512,0.012753,0.024207,17.0,...,True,False,False,True,False,False,False,True,2268999.0,0.068176
4,3663553,18403317,67.95,0.013263,0.012275,0.02486,0.015206,0.017881,0.024207,6.0,...,False,False,False,True,False,False,False,True,2268999.0,0.062511


In [32]:
dfTest[["TransactionID", "isFraud"]].to_csv("randomforest_submission.csv", index = False)

El score obtenido en Kaggle es de `0.854148`. El csv con las predicciones se encuentra en https://github.com/ManuelBilbao/75.06-OrgaDeDatos-TPs/tree/main/ML/randomforest_submission.csv