In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn import svm
from sklearn.svm import OneClassSVM
import time
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,roc_auc_score, precision_score, roc_curve, recall_score,\
                             classification_report, f1_score, precision_recall_fscore_support)
outlier_fraction = 0.001

import warnings
warnings.simplefilter('ignore')

from sklearn.covariance import EllipticEnvelope

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data = pd.read_csv('/content/drive/MyDrive/ВКР/creditcard.csv')


In [None]:
x_data = data.loc[:, 'V1' : 'V28']
y_data = data.loc[:, 'Class']

In [None]:
start_time_kmeans = time.time()

from sklearn.cluster import KMeans
kmeans=KMeans(n_clusters=2, random_state=0, algorithm="elkan", max_iter=100, n_jobs=-1)

kmeans.fit(x_data)
y_pred_kmeans = pd.Series(kmeans.predict(x_data))
y_pred_kmeans.replace(1,1,inplace = True)
y_pred_kmeans.replace(-1,0,inplace = True)
cross_table = pd.crosstab(data['Class'], columns = y_pred_kmeans)
print(cross_table)
print('Time used:', time.time() - start_time_kmeans)

col_0       0       1
Class                
0      149747  134568
1         358     134
Time used: 6.661710977554321


In [None]:
start_time_iso = time.time()

from sklearn.ensemble import IsolationForest
iso=IsolationForest(max_samples = len(x_data), n_estimators = 100,
                    contamination = outlier_fraction, random_state = 1)
iso.fit(x_data)
y_pred_iso = pd.Series(iso.predict(x_data))
y_pred_iso.replace(-1,0,inplace = True)
y_pred_iso.replace(1,1,inplace = True)
cross_table = pd.crosstab(data['Class'], columns = y_pred_iso)
print(cross_table)
print('Time used:', time.time() - start_time_iso)

col_0    0       1
Class             
0      171  284144
1      114     378
Time used: 41.68232750892639


In [None]:
from sklearn import svm
from sklearn.svm import OneClassSVM

start_time_ocsvm = time.time()

ocsvm = svm.OneClassSVM(kernel='sigmoid', gamma='auto', nu = outlier_fraction)
ocsvm.fit(x_data)
y_pred_ocsvm = pd.Series(ocsvm.predict(x_data))
y_pred_ocsvm.replace(-1,0,inplace = True)
y_pred_ocsvm.replace(1,1,inplace = True)
cross_table = pd.crosstab(data['Class'], columns = y_pred_ocsvm)
print(cross_table)
print('Time used:', time.time() - start_time_ocsvm)

col_0    0       1
Class             
0      259  284056
1       27     465
Time used: 32.245752573013306


In [None]:
import time
import warnings
warnings.simplefilter('ignore')

from sklearn.covariance import EllipticEnvelope
start_time_ee = time.time()

ellipenv = EllipticEnvelope(contamination = outlier_fraction, 
                          random_state=1)
y_pred_el = pd.Series(ellipenv.fit_predict(x_data))
y_pred_el.replace(-1,0,inplace = True)
y_pred_el.replace(1,1,inplace = True)
cross_table = pd.crosstab(data['Class'], columns = y_pred_el)
print(cross_table)
print('Time used:', time.time() - start_time_ee)

col_0    0       1
Class             
0      275  284040
1       10     482
Time used: 185.8741672039032


In [None]:
from sklearn.neighbors import LocalOutlierFactor
start_time_lof = time.time()

lof = LocalOutlierFactor(n_neighbors=2, metric='minkowski', contamination = outlier_fraction)

y_pred_lof = pd.Series(lof.fit_predict(x_data))
y_pred_lof.replace(-1,0,inplace = True)
y_pred_lof.replace(1,1,inplace = True)
cross_table = pd.crosstab(data['Class'], columns = y_pred_lof)
print(cross_table)
print('Time used:', time.time() - start_time_lof)


col_0    0       1
Class             
0      284  284031
1        1     491
Time used: 553.5959331989288


In [None]:
two_score = pd.DataFrame([y_pred_kmeans, y_pred_iso, y_pred_ocsvm, y_pred_el,y_pred_lof]).T
data['Anomaly'] = two_score.apply(lambda x: x.mode(), axis = 1)


In [None]:
label = {
    (0,0):0,
    (1,1):1,
    (0,1):1,
    (1,0):0
}
data['Class_Anomaly'] = data[['Class','Anomaly']].apply(lambda x:label[(x[1],x[0])], axis = 1)

data = data.drop(['Class','Anomaly'], axis = 1)

data.to_csv('/content/drive/MyDrive/ВКР/DatasetAnomaly.csv')

anomalycredits = pd.read_csv('/content/drive/MyDrive/ВКР/DatasetAnomaly.csv')
look_to_anomaly = anomalycredits.loc[:, 'V27' : 'Class_Anomaly']

fraudulent = anomalycredits[anomalycredits['Class_Anomaly']==1]
print(f'мошеннические{fraudulent.shape}')
look_to_anomaly.head()

мошеннические(492, 32)


Unnamed: 0,V27,V28,Amount,Class_Anomaly
0,0.133558,-0.021053,149.62,0
1,-0.008983,0.014724,2.69,0
2,-0.055353,-0.059752,378.66,0
3,0.062723,0.061458,123.5,0
4,0.219422,0.215153,69.99,0


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTETomek
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [None]:

X = np.array(anomalycredits.loc[:,:'V28'])
y = np.array(anomalycredits['Class_Anomaly'])
sess = StratifiedShuffleSplit(n_splits = 5,test_size=0.4,random_state=0)
for train_index,test_index in sess.split(X,y):
    X_train,X_test = X[train_index], X[test_index]
    y_train,y_test = y[train_index], y[test_index]
print('train_size: %s' %len(y_train),
     'test_size: %s' %len(y_test))

train_size: 170884 test_size: 113923


In [None]:
ros = RandomOverSampler(random_state = 0)
sos = SMOTE(random_state=0)
kos = SMOTETomek(random_state=0)

x_ros, y_ros = ros.fit_sample(X_train, y_train)
x_sos, y_sos = sos.fit_sample(X_train, y_train)
x_kos, y_kos = kos.fit_sample(X_train, y_train)
print('ros: {}, sos: {}, kos:{}'.format(len(y_ros),len(y_sos),len(y_kos)))

ros: 341178, sos: 341178, kos:340834


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, log_loss

clf = DecisionTreeClassifier(criterion = 'gini', random_state=1234)
abc =AdaBoostClassifier(n_estimators=50, base_estimator=clf,learning_rate=1)

data_samp = [[X_train, y_train],
                 [x_ros, y_ros],
                 [x_sos, y_sos],
                 [x_kos, y_kos]]

for features, labels in data_samp:
    start_time = time.time()
    abc.fit(features, labels)
    predict_test = abc.predict(X_test)
    
    print('auc:{:.3f}'.format(roc_auc_score(y_test, predict_test)),
          'accuracy:{:.3f}'.format(accuracy_score(y_test, predict_test)),
          'recall:{:.3f}'.format(recall_score(y_test, predict_test)),
          'precision:{:.3f}'.format(precision_score(y_test, predict_test)),
          'Logloss:{:.3f}'.format(log_loss(y_test, predict_test)),
          'Time:{:.3f}'.format(time.time() - start_time))

auc:0.868 accuracy:0.999 recall:0.736 precision:0.747 Logloss:0.031 Time:17.436
auc:0.883 accuracy:0.999 recall:0.766 precision:0.729 Logloss:0.031 Time:15.023
auc:0.900 accuracy:0.998 recall:0.802 precision:0.416 Logloss:0.079 Time:44.241
auc:0.900 accuracy:0.998 recall:0.802 precision:0.397 Logloss:0.085 Time:43.614
