# 相关设置

In [1]:
from IPython.core.interactiveshell import InteractiveShell

import calendar
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline

InteractiveShell.ast_node_interactivity = "all"  # 一个cell显示多输出

pd.set_option('display.max_rows', 100)  # 设置最大显示100行
pd.set_option('display.max_columns', 100)  # 设置最大显示100列

In [2]:
data = pd.read_csv('data/data.csv')
data.head()
data.info()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-0.694242,-0.044075,1.672773,0.973366,-0.245117,0.347068,0.193679,0.082637,0.331128,0.083386,-0.540407,-0.618296,-0.996099,-0.32461,1.604014,-0.536833,0.244863,0.03077,0.496282,0.326118,-0.024923,0.382854,-0.176911,0.110507,0.246585,-0.39217,0.330892,-0.063781,149.62,0
1,0.608496,0.161176,0.109797,0.316523,0.043483,-0.06182,-0.0637,0.071253,-0.232494,-0.15335,1.580003,1.066089,0.491418,-0.149982,0.69436,0.529434,-0.13517,-0.218763,-0.179086,-0.089611,-0.307377,-0.880077,0.162201,-0.561131,0.320694,0.261069,-0.022256,0.044608,2.69,0
2,-0.6935,-0.811578,1.169468,0.268231,-0.364572,1.351454,0.639776,0.207373,-1.378675,0.1907,0.61183,0.066137,0.7207,-0.173114,2.562906,-3.298235,1.306868,-0.14479,-2.778561,0.680975,0.337632,1.063358,1.45632,-1.138092,-0.628537,-0.288447,-0.137137,-0.181021,378.66,0
3,-0.493325,-0.112169,1.182516,-0.609727,-0.007469,0.93615,0.192071,0.316018,-1.262503,-0.050468,-0.221892,0.178371,0.510169,-0.30036,-0.689837,-1.209296,-0.805445,2.345305,-1.514205,-0.269855,-0.147443,0.007267,-0.304777,-1.941027,1.241904,-0.460217,0.155396,0.186189,123.5,0
4,-0.59133,0.531541,1.021412,0.284655,-0.295015,0.071999,0.479302,-0.22651,0.744326,0.691625,-0.806147,0.538627,1.352244,-1.168034,0.191323,-0.515205,-0.279081,-0.045569,0.987037,0.529939,-0.012839,1.100011,-0.220123,0.23325,-0.395202,1.041611,0.54362,0.651816,69.99,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 30 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   V1      284807 non-null  float64
 1   V2      284807 non-null  float64
 2   V3      284807 non-null  float64
 3   V4      284807 non-null  float64
 4   V5      284807 non-null  float64
 5   V6      284807 non-null  float64
 6   V7      284807 non-null  float64
 7   V8      284807 non-null  float64
 8   V9      284807 non-null  float64
 9   V10     284807 non-null  float64
 10  V11     284807 non-null  float64
 11  V12     284807 non-null  float64
 12  V13     284807 non-null  float64
 13  V14     284807 non-null  float64
 14  V15     284807 non-null  float64
 15  V16     284807 non-null  float64
 16  V17     284807 non-null  float64
 17  V18     284807 non-null  float64
 18  V19     284807 non-null  float64
 19  V20     284807 non-null  float64
 20  V21     284807 non-null  float64
 21  V22     28

# 样本不均衡

In [3]:
data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [4]:
unfraud_data = data[data['Class'] == 0].values
fraud_data = data[data['Class'] == 1].values

# SMOTE 算法

In [5]:
import random
from sklearn.neighbors import NearestNeighbors

class Smote:
    def __init__(self, samples, N=10, k=5):
        self.n_samples, self.n_attrs = samples.shape
        self.N = N
        self.k = k
        self.samples = samples
        self.new_index = 0

    def over_sampling(self):
        N_ = int(self.N / 100)
        self.synthetic = np.zeros((self.n_samples * N_, self.n_attrs))
        neighbors = NearestNeighbors(n_neighbors=self.k).fit(self.samples)
        # print('neighbors', neighbors)
        for i in range(len(self.samples)):
            # print('samples', self.samples[i])
            # Finds the K-neighbors of a point.
            nnarray = neighbors.kneighbors(self.samples[i].reshape((1, -1)),
                                           return_distance=False)[0]
            # print('nna', nnarray)
            self._populate(N_, i, nnarray)
        return self.synthetic

    # for each minority class sample i ,choose N of the k nearest neighbors and generate N synthetic samples.
    def _populate(self, N, i, nnarray):
        for j in range(N):
            # print('j', j)
            nn = random.randint(0, self.k - 1)  # 包括end
            dif = self.samples[nnarray[nn]] - self.samples[i]
            gap = random.random()
            self.synthetic[self.new_index] = self.samples[i] + gap * dif
            self.new_index += 1
            # print(self.new_index)

In [6]:
smote = Smote(fraud_data, N=1000)
smote_fraud_data = smote.over_sampling()

In [7]:
new_fraud_data = np.vstack((fraud_data, smote_fraud_data))

In [8]:
len(new_fraud_data)

5412

In [9]:
all_data = np.vstack((new_fraud_data, unfraud_data))

In [10]:
all_data.shape

(289727, 30)

In [11]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(all_data, test_size=0.3, random_state=412)

print(train_set.shape)
print(test_set.shape)

(202808, 30)
(86919, 30)


In [12]:
X = train_set[:, 0:29]
y = train_set[:, -1]
X_test = test_set[:, 0:29]
y_test = test_set[:, -1]

In [16]:
import tensorflow
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [17]:
n_inputs = 29

In [18]:
# define model
model = Sequential()
# define first hidden layer and visible layer
model.add(Dense(50, input_dim=n_inputs, activation='relu', kernel_initializer='he_uniform'))
# define output layer
model.add(Dense(1, activation='sigmoid'))
# define loss and optimizer
model.compile(loss='binary_crossentropy', optimizer='adam')
model.fit(X, y, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1e74f0a9fc8>

In [19]:
from sklearn.metrics import roc_auc_score

y_pred = model.predict(X_test)

print(roc_auc_score(y_test, y_pred))

0.9903614033525587
