In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import uniform, truncnorm, randint
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
dfTransaction = pd.read_csv("../Dataset/ML/train_transaction.csv")
dfIdentity = pd.read_csv("../Dataset/ML/train_identity.csv")
dfMerge = dfTransaction.merge(dfIdentity, on = "TransactionID", how = "left")
del dfTransaction
del dfIdentity
dfMerge.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [3]:
model_params = {
    "penalty": ["none", "l2"],
    "tol": uniform(0.5e-4, 1.5e-4),
    "C": uniform(0.1, 1),
    "fit_intercept": [True, False],
}

In [4]:
regression = LogisticRegression(max_iter = 250)
classifier = RandomizedSearchCV(regression, model_params, n_iter=20, cv=3, random_state=1, n_jobs = 1, scoring = "roc_auc", verbose = 100)

In [5]:
X_train, X_val, y_train, y_val = train_test_split(dfMerge.iloc[:,1:], dfMerge["isFraud"].values, train_size = 0.8, shuffle = False)

In [6]:
del dfMerge

In [7]:
columnas_categoricas = ["ProductCD", "card1", "card2", "card3", "card4", "card5", "card6", "addr1", "addr2",
                        "P_emaildomain", "R_emaildomain", "M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8", "M9",
                        "DeviceType", "DeviceInfo", 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18',
                        'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28',
                        'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38']
means = []
for columna in columnas_categoricas:
    mean_encoded = X_train.groupby(columna)['isFraud'].mean().to_dict()
    means += [mean_encoded]
    X_train[columna] =  X_train[columna].map(mean_encoded)

In [8]:
for columna in X_train.columns:
    if X_train[columna].dtype == object:
        X_train[columna].fillna("NaN", inplace = True)
    else:
        X_train[columna].fillna(X_train[columna].mean(), inplace = True)

In [9]:
model = classifier.fit(X_train.iloc[:,1:].values, y_train)
from pprint import pprint
pprint(model.best_estimator_.get_params())

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV 1/3; 1/20] START C=0.517022004702574, fit_intercept=True, penalty=none, tol=5.0017156222601735e-05




[CV 1/3; 1/20] END C=0.517022004702574, fit_intercept=True, penalty=none, tol=5.0017156222601735e-05;, score=0.423 total time=   7.0s
[CV 2/3; 1/20] START C=0.517022004702574, fit_intercept=True, penalty=none, tol=5.0017156222601735e-05


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3; 1/20] END C=0.517022004702574, fit_intercept=True, penalty=none, tol=5.0017156222601735e-05;, score=0.681 total time=  32.6s
[CV 3/3; 1/20] START C=0.517022004702574, fit_intercept=True, penalty=none, tol=5.0017156222601735e-05


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3; 1/20] END C=0.517022004702574, fit_intercept=True, penalty=none, tol=5.0017156222601735e-05;, score=0.679 total time=  31.4s
[CV 1/3; 2/20] START C=0.40233257263183975, fit_intercept=False, penalty=none, tol=6.385078921531968e-05




[CV 1/3; 2/20] END C=0.40233257263183975, fit_intercept=False, penalty=none, tol=6.385078921531968e-05;, score=0.423 total time=   8.2s
[CV 2/3; 2/20] START C=0.40233257263183975, fit_intercept=False, penalty=none, tol=6.385078921531968e-05




KeyboardInterrupt: 

In [None]:
model.best_score_

In [None]:
for i, columna in enumerate(columnas_categoricas):
    mean_encoded = means[i]
    X_val[columna] =  X_val[columna].map(mean_encoded)

In [None]:
for columna in X_val.columns:
    X_val[columna].fillna(X_val[columna].mean(), inplace = True)

In [None]:
predictions = model.predict_proba(X_val.iloc[:,1:].values)
predictions

In [None]:
roc_auc_score(y_val, [proba[1] for proba in predictions])

In [None]:
suma = 0
count = 0
for proba in predictions:
  count += 1
  if proba[1] >= 0.5:
    suma += 1
print("Suma:", suma, "Cuenta:", count)

In [None]:
del X_train
del X_val
del y_train
del y_val

In [None]:
dfTestTransaction = pd.read_csv("/content/drive/MyDrive/Materias/Datos/2021C2/test_transaction.csv")
dfTestIdentity = pd.read_csv("/content/drive/MyDrive/Materias/Datos/2021C2/test_identity.csv")
dfTestMerge = dfTestTransaction.merge(dfTestIdentity, on = "TransactionID", how = "left")
del dfTestTransaction
del dfTestIdentity

In [None]:
columnas = []
for columna in dfTestMerge.columns:
  columnas += [columna.replace("id-", "id_")]
dfTestMerge.columns = columnas

In [None]:
for i, columna in enumerate(columnas_categoricas):
    mean_encoded = means[i]
    dfTestMerge[columna] =  dfTestMerge[columna].map(mean_encoded)

In [None]:
for columna in dfTestMerge.columns:
    dfTestMerge[columna].fillna(dfTestMerge[columna].mean(), inplace = True)

In [None]:
predictions = model.predict_proba(dfTestMerge.iloc[:,1:].values)
predictions

#### ¿Cuál es el mejor score de validación obtenido? (¿Cómo conviene obtener el dataset para validar?)

El mejor score fue XXXX. Para obtener el dataset de validación hay que tener en cuenta que hay campos temporales (TransactionDT), con lo cual no se pueden obtener al azar. Por esto, como el dataset esta ordenado por ese campo, hago un _split_ del primer 80% para el set de entrenamiento y el 20% siguiente lo dejo para la validación. Todo esto hay que realizarlo antes de hacer el _encoding_ ya que de otra forma estaríamos filtrando datos del set de validación al entrenamiento.

#### Al predecir con este modelo para la competencia, ¿Cúal es el score obtenido? (guardar el csv con predicciones para entregarlo después)

El score obtenido en Kaggle es de XXXX. El csv con las predicciones se encuentra en https://github.com/ManuelBilbao/75.06-OrgaDeDatos-TPs/tree/main/ML/test_predictions.csv

#### ¿Qué features son los más importantes para predecir con el mejor modelo? Graficar.

Las X _features_ más importantes según el modelo obtenido fueron: 