## Pobranie danych z pliku

In [9]:
import numpy as np

file_name = "creditcard.csv"
file = open(file_name)
file.readline()  # skip the header
data = np.loadtxt(file, delimiter=',')

In [10]:
print(data)

[[ 0.00000000e+00 -1.35980713e+00 -7.27811733e-02 ... -2.10530535e-02
   1.49620000e+02  0.00000000e+00]
 [ 0.00000000e+00  1.19185711e+00  2.66150712e-01 ...  1.47241692e-02
   2.69000000e+00  0.00000000e+00]
 [ 1.00000000e+00 -1.35835406e+00 -1.34016307e+00 ... -5.97518406e-02
   3.78660000e+02  0.00000000e+00]
 ...
 [ 1.72788000e+05  1.91956501e+00 -3.01253846e-01 ... -2.65608286e-02
   6.78800000e+01  0.00000000e+00]
 [ 1.72788000e+05 -2.40440050e-01  5.30482513e-01 ...  1.04532821e-01
   1.00000000e+01  0.00000000e+00]
 [ 1.72792000e+05 -5.33412522e-01 -1.89733337e-01 ...  1.36489143e-02
   2.17000000e+02  0.00000000e+00]]


## Wydzielenie zbiorów

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(data[:, :30], data[:, 30:], random_state=0)
Y_train = Y_train.astype(int)
Y_test = Y_test.astype(int)

## Resampling klas

### Oversampling pozytywnych przypadków

#### Metoda SMOTE

In [14]:
from imblearn.over_sampling import SMOTE
X_train, Y_train = SMOTE().fit_resample(X_train, Y_train)

#### Metoda ADASYN

In [15]:
from imblearn.over_sampling import ADASYN
X_train, Y_train = ADASYN().fit_resample(X_train, Y_train)

### Ustawienie HyperDash

In [16]:
from hyperdash import Experiment
experiment = Experiment("Fraud Transaction Classifier")

score: 0.9991994606893064
False positive: 34/120
False negative: 23/71082
| accuracy:   0.999199 |
This run of Fraud Transaction Classifier ran for 0:01:55 and logs are available locally at: C:\Users\Mao Zedong\.hyperdash\logs\fraud-transaction-classifier\fraud-transaction-classifier_2020-01-13t00-35-39-568723.log


## Wybór modelu

### Sieć neuronowa z warstwą ukryta

In [17]:
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

pipeline = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(200,)))
pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('mlpclassifier',
                 MLPClassifier(activation='relu', alpha=0.0001,
                               batch_size='auto', beta_1=0.9, beta_2=0.999,
                               early_stopping=False, epsilon=1e-08,
                               hidden_layer_sizes=(200,),
                               learning_rate='constant',
                               learning_rate_init=0.001, max_fun=15000,
                               max_iter=200, momentum=0.9, n_iter_no_change=10,
                               nesterovs_momentum=True, power_t=0.5,
                               random_state=None, shuffle=True, solver='adam',
                               tol=0.0001, validation_fraction=0.1,
                               verbose=False, warm_start=False))],
         verbose=False)

### Las losowy

In [12]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(n_jobs=-1))
pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=-1,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=Fal

### Naiwny klasyfikator Bayesa

In [14]:
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

pipeline = make_pipeline(StandardScaler(), GaussianNB())
pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('gaussiannb', GaussianNB(priors=None, var_smoothing=1e-09))],
         verbose=False)

### Kwadratowa analiza dyskryminacyjna

In [16]:
from sklearn.pipeline import make_pipeline
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler

pipeline = make_pipeline(StandardScaler(), QuadraticDiscriminantAnalysis())
pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('quadraticdiscriminantanalysis',
                 QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
                                               store_covariance=False,
                                               tol=0.0001))],
         verbose=False)

### Regresja logistyczna

In [26]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

pipeline = make_pipeline(StandardScaler(), LogisticRegression())
pipeline.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

### K najbliższych sąsiadów

In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier(n_jobs=-1))
pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('kneighborsclassifier',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=-1, n_neighbors=5, p=2,
                                      weights='uniform'))],
         verbose=False)

### Support vector classifier

In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

pipeline = make_pipeline(StandardScaler(), SVC())
pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

## Wyniki

In [19]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np

accuracy = accuracy_score(pipeline.predict(X_test), Y_test)
print("score: {0}".format(accuracy))
cm = confusion_matrix(Y_test, pipeline.predict(X_test))
print("False positive: {0}/{1}".format(cm[0, 1], len(Y_test) - np.sum(Y_test)))
print("False negative: {0}/{1}".format(cm[1, 0], np.sum(Y_test)))
experiment.metric("accuracy", accuracy_score(pipeline.predict(X_test), Y_test))
experiment.end()

score: 0.9991994606893064
False positive: 34/120
False negative: 23/71082
Cannot send metric accuracy, experiment ended. Please start a new experiment.
