In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

In [2]:
transaction_ori = pd.read_csv("ieee-fraud-detection/train_transaction.csv")
identity_ori = pd.read_csv("ieee-fraud-detection/train_identity.csv")

In [3]:
merge = transaction_ori.merge(identity_ori, on = "TransactionID", how = "left")
transaction_ori = None
identity_ori = None
merge

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.50,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.00,W,6550,,150.0,visa,226.0,...,,,,,,,,,,
590536,3577536,0,15811049,39.50,W,10444,225.0,150.0,mastercard,224.0,...,,,,,,,,,,
590537,3577537,0,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,...,,,,,,,,,,
590538,3577538,0,15811088,117.00,W,7826,481.0,150.0,mastercard,224.0,...,,,,,,,,,,


In [4]:
merge.fillna(0, inplace = True)

In [5]:
categorical_columns = ['ProductCD','card1', 'card2', 'card3', 'card4', 'card5', 'card6','addr1', 'addr2', 'P_emaildomain', 'R_emaildomain'\
                      ,'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18',\
                        'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31',\
                        'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']

In [6]:
merge_encoded = merge
merge = None
for column in categorical_columns:
    mean_encoded_col = merge_encoded.groupby(column)['isFraud'].mean().to_dict()  
    merge_encoded[column] =  merge_encoded[column].map(mean_encoded_col)
merge_encoded

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.50,0.020399,0.232558,0.047353,0.024568,0.077282,0.043321,...,0.021081,0.033560,0.033446,0.033503,0.021001,0.021001,0.021001,0.021001,0.021017,0.025549
1,2987001,0,86401,29.00,0.020399,0.046852,0.065118,0.024568,0.034331,0.062945,...,0.021081,0.033560,0.033446,0.033503,0.021001,0.021001,0.021001,0.021001,0.021017,0.025549
2,2987002,0,86469,59.00,0.020399,0.008123,0.024014,0.024568,0.034756,0.011026,...,0.021081,0.033560,0.033446,0.033503,0.021001,0.021001,0.021001,0.021001,0.021017,0.025549
3,2987003,0,86499,50.00,0.020399,0.014018,0.016295,0.024568,0.034331,0.013646,...,0.021081,0.033560,0.033446,0.033503,0.021001,0.021001,0.021001,0.021001,0.021017,0.025549
4,2987004,0,86506,50.00,0.047662,0.055556,0.046214,0.024568,0.034331,0.062945,...,0.070688,0.066727,0.053309,0.039293,0.044683,0.081885,0.083312,0.059541,0.101662,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.00,0.020399,0.004227,0.047353,0.024568,0.034756,0.029516,...,0.021081,0.033560,0.033446,0.033503,0.021001,0.021001,0.021001,0.021001,0.021017,0.025549
590536,3577536,0,15811049,39.50,0.020399,0.000000,0.027401,0.024568,0.034331,0.038669,...,0.021081,0.033560,0.033446,0.033503,0.021001,0.021001,0.021001,0.021001,0.021017,0.025549
590537,3577537,0,15811079,30.95,0.020399,0.018841,0.017711,0.024568,0.034331,0.038669,...,0.021081,0.033560,0.033446,0.033503,0.021001,0.021001,0.021001,0.021001,0.021017,0.025549
590538,3577538,0,15811088,117.00,0.020399,0.025615,0.029356,0.024568,0.034331,0.038669,...,0.021081,0.033560,0.033446,0.033503,0.021001,0.021001,0.021001,0.021001,0.021017,0.025549


In [7]:
logRegressor = LogisticRegression(max_iter = 200)

In [8]:
x_train = merge_encoded.drop(["isFraud"], axis=1).iloc[:350000,]
y_train = merge_encoded["isFraud"].iloc[:350000,]
x_test = merge_encoded.drop(["isFraud"], axis=1).iloc[350000:,]
y_test = merge_encoded["isFraud"].iloc[350000:,]
merge_encoded = None

In [10]:
params = {
    "penalty": ["l2", "none"]    
}

In [11]:
clf = RandomizedSearchCV(estimator = logRegressor, param_distributions = params, cv = 3, \
                         scoring= "roc_auc", n_iter = 2, n_jobs = 1, random_state = 1)

In [None]:
clf.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### ¿Cuál es el mejor score de validación obtenido? (¿Cómo conviene obtener el dataset para validar?)

### Al predecir con este modelo para la competencia, ¿Cúal es el score obtenido? (guardar el csv con predicciones para entregarlo después)

### ¿Qué features son los más importantes para predecir con el mejor modelo? Graficar.