## EDA

In [1]:
import pandas as pd
import numpy as np

#Leemos los datasets disponibles
ruta = '/Users/jorge/Desktop/TFM/ieee-fraud-detection/'

#Submission
submission = pd.read_csv(f'{ruta}sample_submission.csv')

#Train
train_identity = pd.read_csv(f'{ruta}train_identity.csv')
train_transaction = pd.read_csv(f'{ruta}train_transaction.csv')

#Test
test_identity = pd.read_csv(f'{ruta}test_identity.csv')
test_transaction = pd.read_csv(f'{ruta}test_transaction.csv')

In [2]:
#Unimos los conjuntos de train y test respectivamente
train = pd.merge(train_identity,train_transaction, on='TransactionID', how='left')
test = pd.merge(test_identity,test_transaction, on='TransactionID', how='left')

In [3]:
train.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987004,0.0,70787.0,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,,,,,,,,,,
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,,,,,,,,,,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
test.head()

Unnamed: 0,TransactionID,id-01,id-02,id-03,id-04,id-05,id-06,id-07,id-08,id-09,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,3663586,-45.0,280290.0,,,0.0,0.0,,,,...,,,,,,,,,,
1,3663588,0.0,3579.0,0.0,0.0,0.0,0.0,,,0.0,...,0.0,0.0,310.0,90.0,0.0,310.0,90.0,0.0,0.0,0.0
2,3663597,-5.0,185210.0,,,1.0,0.0,,,,...,,,,,,,,,,
3,3663601,-45.0,252944.0,0.0,0.0,0.0,0.0,,,0.0,...,,,,,,,,,,
4,3663602,-95.0,328680.0,,,7.0,-33.0,,,,...,,,,,,,,,,


In [5]:
#Como hemos visto, los nombres de las columnas id no son iguales en ambos DF, por lo que vamos a cambiarlos

test = test.rename(columns = lambda x: x.replace("id-","id_"))

In [6]:
#Los datos del CSV submissions es el ID de la transacción junto a su probabilidad de ser Fraude o no. 
#Es un ejemplo del resultado final y es por ello que se va a utilizar únicamente para realizar las predicciones
# y almacenarlas
submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.5
1,3663550,0.5
2,3663551,0.5
3,3663552,0.5
4,3663553,0.5


In [7]:
#Una vez nos hemos hecho una idea del contenido de los datasets, utilizaremos la función que proporciona SweetViz
#para realizar EDA sobre ambos DF. Las conclusiones de este análisis se adjuntan en la memoria del TFM.
#Es importante poner pairwise_anaylisis en off por tema de rendimiento.
import sweetviz as sviz

eda = sviz.compare(train,test, pairwise_analysis='off')
eda.show_html('EDA.html', layout = 'vertical')

                                             |          | [  0%]   00:00 -> (? left)

Report EDA.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [10]:
#Hacemos un análisis de correlaciones

eda2 = sviz.analyze(train, pairwise_analysis='on')
eda2.show_html('corr.html', layout = 'vertical')

                                             |          | [  0%]   00:00 -> (? left)

Report corr.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


Se puede comprobar en el reporte que existen numerosas variables de las que no tenemos constancia de su origen. 
Por ello, este análisis y posterior modelado en Spark no realizará Feature Engineering, al desconocer las variables potenciales
que podemos generar.
Además, el grafo de correlaciones no muestra información, por lo que obtendremos un dataframe auxiliar y realizaremos el análisis allí sobre las variables que tenemos descripción.

In [12]:
corr = train[['TransactionID','TransactionAmt','ProductCD','card4','card6','P_emaildomain','R_emaildomain','DeviceType',
              'DeviceInfo','isFraud']]

corr2 = sviz.analyze(corr, pairwise_analysis='on')
corr2.show_html('correlaciones.html', layout = 'vertical')


                                             |          | [  0%]   00:00 -> (? left)

Report correlaciones.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


## Modelo

In [111]:
#En el reporte vemos que existe una gran cantidad de columnas con valores nulos, por lo que se van a eliminar
#Las columnas que se van a borrar son las siguientes (más de un 60% de nan en train)
col_dropped = []
for columns in train:
    nan = (train[columns].isnull().sum()*100 + train[columns].isna().sum()*100)/len(train[columns])
    if (nan > 60):
        col_dropped.append(columns)
col_dropped

#axis = 1 hace que se borren columnas y thresh indica el valor mínimo (en %) 
#de NAs que debe tener la columna para borrarse
#como test no tiene los mismos NAs, se eliminan las mismas columnas que en train (por eso se calculan previamente)

train = train.dropna(axis=1,thresh=len(train)*0.6)
test = test.drop(axis=1, columns = col_dropped)

In [112]:
train

Unnamed: 0,TransactionID,id_01,id_02,id_05,id_06,id_11,id_12,id_13,id_15,id_16,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,2987004,0.0,70787.0,,,100.0,NotFound,,New,NotFound,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2987008,-5.0,98945.0,0.0,-5.0,100.0,NotFound,49.0,New,NotFound,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2987010,-5.0,191631.0,0.0,0.0,100.0,NotFound,52.0,Found,Found,...,90.327904,90.327904,90.327904,90.327904,0.000000,0.000000,0.000000,75.887497,75.887497,75.887497
3,2987011,-5.0,221832.0,0.0,-6.0,100.0,NotFound,52.0,New,NotFound,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,2987016,0.0,7460.0,1.0,0.0,100.0,NotFound,,Found,Found,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144228,3577521,-15.0,145955.0,0.0,0.0,100.0,NotFound,27.0,Found,Found,...,60.066002,60.066002,60.066002,60.066002,488.765991,488.765991,488.765991,0.000000,0.000000,0.000000
144229,3577526,-5.0,172059.0,1.0,-5.0,100.0,NotFound,27.0,New,NotFound,...,0.000000,,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
144230,3577529,-20.0,632381.0,-1.0,-36.0,100.0,NotFound,27.0,New,NotFound,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
144231,3577531,-5.0,55528.0,0.0,-7.0,100.0,NotFound,27.0,Found,Found,...,0.000000,,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [113]:
test

Unnamed: 0,TransactionID,id_01,id_02,id_05,id_06,id_11,id_12,id_13,id_15,id_16,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,3663586,-45.0,280290.0,0.0,0.0,100.0,NotFound,27.0,New,NotFound,...,0.0000,,,,0.0,0.0,0.0,0.000,0.000,0.000
1,3663588,0.0,3579.0,0.0,0.0,100.0,Found,,Found,Found,...,90.0000,60.0000,310.0000,130.0000,0.0,0.0,0.0,0.000,0.000,0.000
2,3663597,-5.0,185210.0,1.0,0.0,100.0,NotFound,52.0,New,NotFound,...,0.0000,,,,0.0,0.0,0.0,0.000,0.000,0.000
3,3663601,-45.0,252944.0,0.0,0.0,100.0,NotFound,27.0,Found,Found,...,0.0000,,,,0.0,0.0,0.0,6.271,6.271,6.271
4,3663602,-95.0,328680.0,7.0,-33.0,100.0,NotFound,27.0,New,NotFound,...,0.0000,,,,0.0,0.0,0.0,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141902,4170230,-20.0,473365.0,0.0,0.0,100.0,NotFound,27.0,New,NotFound,...,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.000,0.000,0.000
141903,4170233,-5.0,489917.0,-4.0,-32.0,100.0,NotFound,27.0,Found,Found,...,13.4026,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.000,0.000,0.000
141904,4170234,-5.0,110081.0,22.0,-31.0,100.0,NotFound,27.0,New,NotFound,...,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.000,0.000,0.000
141905,4170236,-45.0,266704.0,-3.0,-10.0,100.0,NotFound,27.0,New,NotFound,...,0.0000,31.7237,31.7237,31.7237,0.0,0.0,0.0,0.000,0.000,0.000


In [114]:
col_dropped

['id_03',
 'id_04',
 'id_07',
 'id_08',
 'id_09',
 'id_10',
 'id_14',
 'id_18',
 'id_21',
 'id_22',
 'id_23',
 'id_24',
 'id_25',
 'id_26',
 'id_27',
 'id_30',
 'id_32',
 'id_33',
 'id_34',
 'addr1',
 'addr2',
 'dist1',
 'dist2',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70',
 'V71',
 'V72',
 'V73',
 'V74',
 'V75',
 'V76',
 'V7

In [115]:
#Columnas de train categóricas
train_cat = train.select_dtypes(include=['object','category','boolean']).columns
train_cat

Index(['id_12', 'id_15', 'id_16', 'id_28', 'id_29', 'id_31', 'id_35', 'id_36',
       'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4',
       'card6', 'P_emaildomain', 'R_emaildomain'],
      dtype='object')

In [116]:
#Columnas de test categóricas
test_cat = test.select_dtypes(include=['object','category','boolean']).columns
test_cat

Index(['id_12', 'id_15', 'id_16', 'id_28', 'id_29', 'id_31', 'id_35', 'id_36',
       'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4',
       'card6', 'P_emaildomain', 'R_emaildomain'],
      dtype='object')

In [117]:
from sklearn import preprocessing

#Utilizamos el labelEncoder de sklearn para transformarlas a numéricas

for col in train_cat:
    labelEnc = preprocessing.LabelEncoder()
    labelEnc.fit(train[col].astype(str).values)
    train[col] = labelEnc.transform(train[col].astype(str).values)
    
for col in test_cat:
    labelEnc = preprocessing.LabelEncoder()
    labelEnc.fit(test[col].astype(str).values)
    test[col] = labelEnc.transform(test[col].astype(str).values)

In [118]:
train

Unnamed: 0,TransactionID,id_01,id_02,id_05,id_06,id_11,id_12,id_13,id_15,id_16,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,2987004,0.0,70787.0,,,100.0,1,,1,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2987008,-5.0,98945.0,0.0,-5.0,100.0,1,49.0,1,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2987010,-5.0,191631.0,0.0,0.0,100.0,1,52.0,0,0,...,90.327904,90.327904,90.327904,90.327904,0.000000,0.000000,0.000000,75.887497,75.887497,75.887497
3,2987011,-5.0,221832.0,0.0,-6.0,100.0,1,52.0,1,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,2987016,0.0,7460.0,1.0,0.0,100.0,1,,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144228,3577521,-15.0,145955.0,0.0,0.0,100.0,1,27.0,0,0,...,60.066002,60.066002,60.066002,60.066002,488.765991,488.765991,488.765991,0.000000,0.000000,0.000000
144229,3577526,-5.0,172059.0,1.0,-5.0,100.0,1,27.0,1,1,...,0.000000,,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
144230,3577529,-20.0,632381.0,-1.0,-36.0,100.0,1,27.0,1,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
144231,3577531,-5.0,55528.0,0.0,-7.0,100.0,1,27.0,0,0,...,0.000000,,,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [119]:
test

Unnamed: 0,TransactionID,id_01,id_02,id_05,id_06,id_11,id_12,id_13,id_15,id_16,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,3663586,-45.0,280290.0,0.0,0.0,100.0,1,27.0,1,1,...,0.0000,,,,0.0,0.0,0.0,0.000,0.000,0.000
1,3663588,0.0,3579.0,0.0,0.0,100.0,0,,0,0,...,90.0000,60.0000,310.0000,130.0000,0.0,0.0,0.0,0.000,0.000,0.000
2,3663597,-5.0,185210.0,1.0,0.0,100.0,1,52.0,1,1,...,0.0000,,,,0.0,0.0,0.0,0.000,0.000,0.000
3,3663601,-45.0,252944.0,0.0,0.0,100.0,1,27.0,0,0,...,0.0000,,,,0.0,0.0,0.0,6.271,6.271,6.271
4,3663602,-95.0,328680.0,7.0,-33.0,100.0,1,27.0,1,1,...,0.0000,,,,0.0,0.0,0.0,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141902,4170230,-20.0,473365.0,0.0,0.0,100.0,1,27.0,1,1,...,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.000,0.000,0.000
141903,4170233,-5.0,489917.0,-4.0,-32.0,100.0,1,27.0,0,0,...,13.4026,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.000,0.000,0.000
141904,4170234,-5.0,110081.0,22.0,-31.0,100.0,1,27.0,1,1,...,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.000,0.000,0.000
141905,4170236,-45.0,266704.0,-3.0,-10.0,100.0,1,27.0,1,1,...,0.0000,31.7237,31.7237,31.7237,0.0,0.0,0.0,0.000,0.000,0.000


In [120]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

model = lgb.LGBMClassifier(boosting_type='gdbt', #Se utiliza el algoritmo tradicional GBDT
                          objective='binary', #El tipo de clasificación es binaria
                          n_jobs = 2, #Número de cores de la máquina actual
                          max_depth = -1, #no hay límite
                          max_bin = 256, #A menor número de elementos en el nodo del árbol, mayor probabilidad de overfitting
                          subsample_for_bin = 10000, #datos de muestra por cada nodo del árbol
                          subsample = 1 #ratio de muestra para el entrenamiento
                          ) 

In [121]:
X = np.array(train.drop(['isFraud'], axis=1))
y = train['isFraud'].values

In [122]:
gridParams = {
    'learning_rate': [0.01, 0.02, 0.05, 0.1],
    'n_estimators': [8,16,24,32],
    'num_leaves': [2,8,16,32], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['gbdt', 'rf'], # for better accuracy -> try dart
    'objective' : ['binary'],
    'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [500],
    'colsample_bytree' : [0.65],
    'subsample' : [0.7,0.75],
    }

grid = GridSearchCV(model, gridParams, verbose=1, cv=4, n_jobs=-1)
# Run the grid
grid.fit(X, y)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

Fitting 4 folds for each of 512 candidates, totalling 2048 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   41.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 19.0min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 24.2min
[Parallel(n_jobs=-1)]: Done 2048 out of 2048 | elapsed: 26.7min finished


{'boosting_type': 'gbdt', 'colsample_bytree': 0.65, 'learning_rate': 0.05, 'max_bin': 255, 'n_estimators': 24, 'num_leaves': 8, 'objective': 'binary', 'random_state': 500, 'subsample': 0.7}
0.9387797322634741


In [123]:
from sklearn.model_selection import train_test_split

params = {
    'boosting_type': 'gbdt', 
    'colsample_bytree': 0.65, 
    'learning_rate': 0.05, 
    'max_bin': 255, 
    'n_estimators': 24, 
    'num_leaves': 8, 
    'objective': 'binary', 
    'random_state': 500, 
    'subsample': 0.7
}

X_test = np.array(test.drop(['TransactionID'], axis=1))
ids = test['TransactionID'].values

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=12)

df_train = lgb.Dataset(X_train, label = y_train)
df_valid = lgb.Dataset(X_valid, label = y_valid)

best_model = lgb.train(params, train_set=df_train, num_boost_round = 1000, valid_sets=[df_train,df_valid],
                      early_stopping_rounds = 50, verbose_eval = 4)

p_test = best_model.predict(X_test, predict_disable_shape_check=True)

result = pd.DataFrame()
result['TransactionID'] = ids
result['isFraud'] = p_test



[LightGBM] [Info] Number of positive: 10197, number of negative: 119612
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 29816
[LightGBM] [Info] Number of data points in the train set: 129809, number of used features: 243
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.078554 -> initscore=-2.462160
[LightGBM] [Info] Start training from score -2.462160
Training until validation scores don't improve for 50 rounds
[4]	training's binary_logloss: 0.228831	valid_1's binary_logloss: 0.226614
[8]	training's binary_logloss: 0.20806	valid_1's binary_logloss: 0.205744
[12]	training's binary_logloss: 0.193864	valid_1's binary_logloss: 0.191826
[16]	training's binary_logloss: 0.184457	valid_1's binary_logloss: 0.18236
[20]	training's binary_logloss: 0.176276	valid_1's binary_logloss: 0.173764
[24]	training's binary_logloss: 0.170069	valid_1's binary_logloss: 0.167335
Did not meet early stopping. 

In [124]:
result

Unnamed: 0,TransactionID,isFraud
0,3663586,0.105416
1,3663588,0.142750
2,3663597,0.077761
3,3663601,0.378679
4,3663602,0.452101
...,...,...
141902,4170230,0.060211
141903,4170233,0.038309
141904,4170234,0.060211
141905,4170236,0.071816


In [127]:
best_model.params

{'boosting_type': 'gbdt',
 'colsample_bytree': 0.65,
 'learning_rate': 0.05,
 'max_bin': 255,
 'num_leaves': 8,
 'objective': 'binary',
 'random_state': 500,
 'subsample': 0.7,
 'num_iterations': 24,
 'early_stopping_round': 50}