In [112]:
#Manipulacion de datos
import pandas as pd
import numpy as np

#Visualizacion
import missingno as miss

#Funciones creadas
from funciones import *

#Evitar Warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [113]:
#Accuracy y RMSLE
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


#Modelos utilizados
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


Carga de CSV

In [114]:
csv = pd.read_csv("fraude_tc.csv",encoding='latin-1')

Analizamos las columnas

In [115]:
csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   distance_from_home              1000000 non-null  float64
 1   distance_from_last_transaction  1000000 non-null  float64
 2   ratio_to_median_purchase_price  1000000 non-null  float64
 3   repeat_retailer                 1000000 non-null  float64
 4   used_chip                       1000000 non-null  float64
 5   used_pin_number                 1000000 non-null  float64
 6   online_order                    1000000 non-null  float64
 7   fraud                           1000000 non-null  float64
dtypes: float64(8)
memory usage: 61.0 MB


Vemos las correlaciones de la variable a predecir

In [116]:
csv.corr()['fraud']

distance_from_home                0.187571
distance_from_last_transaction    0.091917
ratio_to_median_purchase_price    0.462305
repeat_retailer                  -0.001357
used_chip                        -0.060975
used_pin_number                  -0.100293
online_order                      0.191973
fraud                             1.000000
Name: fraud, dtype: float64

Elegimos las 4 columnas con mayor correlacion (se usaron 4 porque se obtiene la menor cantidad de Falsos Negativos, se busca esto por ser un caso de fraudes)

In [117]:
X = csv[csv.corr()['fraud'].sort_values(ascending=False).index[1:5]]

y = csv['fraud']
    

Se divide la informacion en train y test

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=0.8)


Utilizamos 6 modelos distintos para ver los resultados

In [119]:
modelos_class = {
    "Logit": LogisticRegression(),
    "DecTree": DecisionTreeClassifier(),
    "RFC": RandomForestClassifier(max_depth=5),
    "AdaBoostC": AdaBoostClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
}

In [120]:
for name, model in modelos_class.items():
    print("Modelo",name)
    print("\n")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    confusion = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix\n')
    print(confusion)
    print("\n")
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
    print("Metricas cross_validation", scores)
    print("Media de cross_validation", scores.mean())
    print(f"RMSLE :",mean_squared_log_error(y_test, y_pred,squared=False))
    accuracy=accuracy_score(y_test, y_pred)
    print("Accuracy de Test :",accuracy)
    print("\n")

Modelo Logit


Confusion Matrix

[[180147   2081]
 [  9471   8301]]


Metricas cross_validation [0.94385    0.94363125 0.944275   0.9435125  0.9436    ]
Media de cross_validation 0.9437737499999999
RMSLE : 0.16658621216629937
Accuracy de Test : 0.94224


Modelo DecTree


Confusion Matrix

[[179065   3163]
 [  3362  14410]]


Metricas cross_validation [0.96731875 0.96756875 0.96865    0.96780625 0.96813125]
Media de cross_validation 0.9678949999999998
RMSLE : 0.12519895997603703
Accuracy de Test : 0.967375


Modelo RFC


Confusion Matrix

[[177847   4381]
 [   481  17291]]


Metricas cross_validation [0.97534375 0.97539375 0.97626875 0.97526875 0.9751125 ]
Media de cross_validation 0.9754775
RMSLE : 0.10807318246610245
Accuracy de Test : 0.97569


Modelo AdaBoostC


Confusion Matrix

[[180804   1424]
 [  4611  13161]]


Metricas cross_validation [0.96924375 0.96998125 0.970475   0.9696875  0.96981875]
Media de cross_validation 0.96984125
RMSLE : 0.12040626933420755
Accuracy de Test : 0.

Elegimos el modelo con mejor resultado

In [121]:
model = LGBMClassifier(task="train",application="binary")
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix\n')
print(confusion)
print("\n")
scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
print("Metricas cross_validation", scores)
print("Media de cross_validation", scores.mean())
print(f"RMSLE :",mean_squared_log_error(y_test, y_pred,squared=False))
accuracy=accuracy_score(y_test, y_pred)
print("Accuracy de Test :",accuracy)

Confusion Matrix

[[177739   4489]
 [   489  17283]]


Metricas cross_validation [0.97559375 0.97584375 0.97563125 0.97459375 0.9745375 ]
Media de cross_validation 0.97524
RMSLE : 0.10935481478391353
Accuracy de Test : 0.97511
