In [1]:
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
base_credito = pd.read_csv('creditcard.csv')
base_credito.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
nlin, ncol = base_credito.shape
print(f'existem {nlin} linhas e {ncol} colunas')

existem 284807 linhas e 31 colunas


In [4]:
base_credito.dtypes

Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
dtype: object

In [5]:
base_credito.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [6]:
#Verificando os dados fraudulentos
df_fraude = base_credito.Amount[base_credito.Class == 1]
df_fraude

541         0.00
623       529.00
4920      239.93
6108       59.00
6329        1.00
           ...  
279863    390.00
280143      0.76
280149     77.89
281144    245.00
281674     42.53
Name: Amount, Length: 492, dtype: float64

In [7]:
#Verificando os dados nao fraudulentos
df_nfraude = base_credito.Amount[base_credito.Class == 0]
df_nfraude

0         149.62
1           2.69
2         378.66
3         123.50
4          69.99
           ...  
284802      0.77
284803     24.79
284804     67.88
284805     10.00
284806    217.00
Name: Amount, Length: 284315, dtype: float64

In [8]:
#Checando a quantidade de cada classe
base_credito.Class.value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [9]:
x = base_credito.drop(['Class'], axis=1)
y = base_credito['Class']

x_treino, x_teste, y_treino, y_teste = train_test_split(x,y, test_size = 0.3)

In [10]:
y_treino.value_counts()

Class
0    199026
1       338
Name: count, dtype: int64

In [11]:
#Normalizando
scaler = StandardScaler()
x_treino = scaler.fit_transform(x_treino)
x_teste = scaler.transform(x_teste)

In [12]:
#Equilibrando a quantidade de dado fraudulentos
smote = SMOTE()
x_res, y_res = smote.fit_resample(x_treino, y_treino)

In [13]:
y_res.value_counts()

Class
0    199026
1    199026
Name: count, dtype: int64

In [14]:
x_res.shape

(398052, 30)

In [15]:
modelo = keras.Sequential([keras.layers.InputLayer(input_shape=[30,], name='Entrada'),
    keras.layers.Dense(32, activation='relu', name='Oculta_1'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(16, activation='relu', name='Oculta_2'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(1, activation='sigmoid', name='Saida')
])



In [16]:
modelo.summary()

In [19]:
modelo.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', keras.metrics.Recall(name='recall')]
)

In [21]:
historico = modelo.fit(x_res, y_res, epochs=10, validation_split=0.3, class_weight={0:1, 1:50})

Epoch 1/10
[1m8708/8708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 3ms/step - accuracy: 0.9864 - loss: 0.0754 - recall: 0.9999 - val_accuracy: 1.0000 - val_loss: 3.5451e-04 - val_recall: 1.0000
Epoch 2/10
[1m8708/8708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - accuracy: 0.9871 - loss: 0.0843 - recall: 0.9998 - val_accuracy: 1.0000 - val_loss: 3.1776e-04 - val_recall: 1.0000
Epoch 3/10
[1m8708/8708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 3ms/step - accuracy: 0.9883 - loss: 0.0646 - recall: 0.9999 - val_accuracy: 1.0000 - val_loss: 2.7196e-04 - val_recall: 1.0000
Epoch 4/10
[1m8708/8708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - accuracy: 0.9901 - loss: 0.0564 - recall: 0.9999 - val_accuracy: 1.0000 - val_loss: 4.2890e-04 - val_recall: 1.0000
Epoch 5/10
[1m8708/8708[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 3ms/step - accuracy: 0.9896 - loss: 0.0590 - recall: 0.9999 - val_accuracy: 1.0000 - val_los

In [None]:
y_pred_prob = modelo.predict(x_teste)  # Probabilidades
y_pred = (y_pred_prob > 0.2).astype(int)

print("Acurácia:", accuracy_score(y_teste, y_pred))
print(classification_report(y_teste, y_pred))
print(confusion_matrix(y_teste, y_pred))


[1m2671/2671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 743us/step
Acurácia: 0.9953887386912913
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85284
           1       0.26      0.83      0.40       159

    accuracy                           1.00     85443
   macro avg       0.63      0.91      0.70     85443
weighted avg       1.00      1.00      1.00     85443

[[84917   367]
 [   27   132]]


O resultado foi que conseguimos identificar 83% das fraudes