## Pobranie danych z pliku

In [12]:
import numpy as np

file_name = "creditcard.csv"
file = open(file_name)
file.readline()  # skip the header
data = np.loadtxt(file, delimiter=',')

In [15]:
print(data)

[[ 0.00000000e+00 -1.35980713e+00 -7.27811733e-02 ... -2.10530535e-02
   1.49620000e+02  0.00000000e+00]
 [ 0.00000000e+00  1.19185711e+00  2.66150712e-01 ...  1.47241692e-02
   2.69000000e+00  0.00000000e+00]
 [ 1.00000000e+00 -1.35835406e+00 -1.34016307e+00 ... -5.97518406e-02
   3.78660000e+02  0.00000000e+00]
 ...
 [ 1.72788000e+05  1.91956501e+00 -3.01253846e-01 ... -2.65608286e-02
   6.78800000e+01  0.00000000e+00]
 [ 1.72788000e+05 -2.40440050e-01  5.30482513e-01 ...  1.04532821e-01
   1.00000000e+01  0.00000000e+00]
 [ 1.72792000e+05 -5.33412522e-01 -1.89733337e-01 ...  1.36489143e-02
   2.17000000e+02  0.00000000e+00]]


## Wydzielenie zbiorów

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(data[:, :30], data[:, 30:], random_state=0)
Y_train = Y_train.astype(int)
Y_test = Y_test.astype(int)

## Resampling klas

### Oversampling pozytywnych przypadków

#### Metoda SMOTE

In [14]:
from imblearn.over_sampling import SMOTE
X_train, Y_train = SMOTE().fit_resample(X_train, Y_train)

  y = column_or_1d(y, warn=True)


#### Metoda ADASYN

In [None]:
from imblearn.over_sampling import ADASYN
X_train, Y_train = ADASYN().fit_resample(X_train, Y_train)

## Wybór modelu

### Sieć neuronowa z warstwą ukryta

In [20]:
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

pipeline = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(200,)))
pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('mlpclassifier',
                 MLPClassifier(activation='relu', alpha=0.0001,
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=False, epsilon=1e-08,
                               hidden_layer_sizes=(200,),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
                               max_iter=200, momentum=0.9, n_iter_no_change=10,
                               nesterovs_momentum=True, power_t=0.5,
                               random_state=None, shuffle=True, solver='adam',
                               tol=0.0001, validation_fraction=0.1,
                               verbose=False, warm_start=False))],
         verbose=False)

### Las losowy

In [19]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(n_jobs=-1))
pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=-1,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=Fal

## Wyniki

In [21]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

print("score: {0}".format(accuracy_score(pipeline.predict(X_test), Y_test)))
cm = confusion_matrix(Y_test, pipeline.predict(X_test))
print("False positive:{0}/{1}".format(cm[0, 1], cm[0, 0] + cm[0, 1]))
print("False negative:{0}/{1}".format(cm[1, 0], cm[1, 1] + cm[1, 0]))

score: 0.9994101289289626
False positive:18/71082
False negative:24/120


{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False))], 'verbose': False, 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestclassifier': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max