In [None]:
import os
import numpy as np
import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this presumably also hides library warnings (e.g. from
# scikit-learn about input shapes or convergence) -- confirm this is intended.
warnings.filterwarnings("ignore")


In [None]:
# NOTE: the second import rebinds the name `pd`, so every later `pd.` call in
# this notebook resolves to modin.pandas, not plain pandas.
import pandas as pd
import modin.pandas as pd

import modin.config as cfg
# Select the 'hdk' storage format for modin.
# NOTE(review): presumably this has to run before the first modin DataFrame
# is created -- confirm against the installed modin version.
cfg.StorageFormat.put('hdk')


In [None]:
# Apply the sklearnex patch to scikit-learn.
# NOTE(review): presumably this must run before the sklearn imports below so
# that they pick up the patched estimators -- confirm against sklearnex docs.
from sklearnex import patch_sklearn
patch_sklearn()

from sklearn import config_context
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import sklearn.linear_model as lm

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
 
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


In [None]:

import time

# Load the whole CSV and report how long the read took.
# Original author's note (translated): training on this dataset needs about
# 30 GB of memory; if enough memory is available, use this line to read all
# of the data.
dt_start = time.time()
df = pd.read_csv('creditcard.csv')
read_csv_seconds = time.time() - dt_start

print("read_csv time: ", read_csv_seconds)

In [None]:
from sklearn.preprocessing import StandardScaler

# StandardScaler removes the mean and scales to unit variance. It works per
# feature column (not per sample), so 'Amount' is passed as a one-column
# 2-D array.
amount_column = df['Amount'].values.reshape(-1, 1)
df['normAmount'] = StandardScaler().fit_transform(amount_column)

# Drop the 'Time' and 'Amount' columns.
df = df.drop(columns=['Time', 'Amount'])
df.head()


In [None]:
# Build a balanced dataset by random under-sampling of the majority class:
# keep every fraud row (Class == 1) and draw the same number of normal rows
# (Class == 0) without replacement.

# Features / label of the full (imbalanced) dataset.
X = df.loc[:, df.columns != 'Class']
Y = df.loc[:, df.columns == 'Class']

# Index labels of the fraud rows (Class == 1) and how many there are.
fraud_indices = np.array(df[df.Class == 1].index)
number_records_fraud = len(fraud_indices)

# Index labels of the normal rows (Class == 0).
normal_indices = df[df.Class == 0].index

# Draw as many normal rows as there are fraud rows. A seeded generator keeps
# the sample reproducible across runs, in line with random_state=0 used for
# the train/test split. The result is already a numpy array.
rng = np.random.default_rng(0)
random_normal_indices = rng.choice(normal_indices, size=number_records_fraud, replace=False)

# Concatenate the index labels of both classes.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# The under-sampled dataset. The values above are index *labels* (taken from
# .index), so select with .loc; .iloc would only be correct while the index
# happens to be the default 0..n-1 range.
under_sample_data = df.loc[under_sample_indices, :]
# Features of the under-sampled dataset.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
# Label of the under-sampled dataset.
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Share of normal (Class == 0) rows in the under-sampled data.
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
# Share of fraud (Class == 1) rows in the under-sampled data.
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
# Total number of rows in the under-sampled data.
print("Total number of transactions in resampled data: ", len(under_sample_data))


In [None]:
from sklearn.model_selection import train_test_split

# Split the under-sampled dataset into 70% train / 30% test with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0
)
print("")
print("Number transactions train dataset: ", len(X_train))
# Report the size of the test split (previously printed len(X), the size of
# the full, un-sampled dataset).
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))

In [None]:
# MODELING
# Fit six classifiers on the under-sampled training split and predict on the
# test split. The *_yhat arrays are consumed by the scoring cells below.
dt_start = time.time()

# 1. Decision Tree (seeded so repeated runs build the same tree)
tree_model = DecisionTreeClassifier(max_depth=4, criterion='entropy', random_state=0)
tree_model.fit(X_train, y_train)
tree_yhat = tree_model.predict(X_test)

# 2. K-Nearest Neighbors
n = 5
knn = KNeighborsClassifier(n_neighbors=n)
knn.fit(X_train, y_train)
knn_yhat = knn.predict(X_test)

# 3. Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_yhat = lr.predict(X_test)

# 4. SVM
svm = SVC()
svm.fit(X_train, y_train)
svm_yhat = svm.predict(X_test)

# 5. Random Forest (seeded: bootstrap and feature sampling are random)
rf = RandomForestClassifier(max_depth=4, random_state=0)
rf.fit(X_train, y_train)
rf_yhat = rf.predict(X_test)

# 6. XGBoost
xgb = XGBClassifier(max_depth=4)
xgb.fit(X_train, y_train)
xgb_yhat = xgb.predict(X_test)

# Total wall-clock time for fitting and predicting with all six models
# (the previous label wrongly said "Ridge Regression").
print("Model training & inference time: ", time.time() - dt_start)

In [None]:
# 1. Accuracy score
# Print the test-set accuracy of every fitted model, then the time spent
# computing the scores.
dt_start = time.time()
print('ACCURACY SCORE')

# (display name, test-set predictions), in reporting order.
accuracy_report = (
    ('Decision Tree', tree_yhat),
    ('KNN', knn_yhat),
    ('Logistic Regression', lr_yhat),
    ('SVM', svm_yhat),
    ('Random Forest Tree', rf_yhat),
    ('XGBoost', xgb_yhat),
)
for model_label, predictions in accuracy_report:
    model_accuracy = accuracy_score(y_test, predictions)
    print('Accuracy score of the {} model is {}'.format(model_label, model_accuracy))

print("Accuracy score time: ", time.time() - dt_start)

In [None]:
# 2. F1 score
# Print the test-set F1 score of every fitted model, then the time spent
# computing the scores.
dt_start = time.time()
print('F1 SCORE')

# (display name, test-set predictions), in reporting order.
f1_report = (
    ('Decision Tree', tree_yhat),
    ('KNN', knn_yhat),
    ('Logistic Regression', lr_yhat),
    ('SVM', svm_yhat),
    ('Random Forest Tree', rf_yhat),
    ('XGBoost', xgb_yhat),
)
for model_label, predictions in f1_report:
    model_f1 = f1_score(y_test, predictions)
    print('F1 score of the {} model is {}'.format(model_label, model_f1))

print("F1 score time: ", time.time() - dt_start)