# Preprocessing

## Check package versions

In [1]:
import sys 
import numpy 
import pandas
import matplotlib
import seaborn
import scipy
import sklearn

# Report the interpreter and library versions, one line each, in a fixed order.
for template, version in (
    ('Python: {}', sys.version),
    ('Numpy: {}', numpy.__version__),
    ('Pandas: {}', pandas.__version__),
    ('Matplotlib: {}', matplotlib.__version__),
    ('Seaborn: {}', seaborn.__version__),
    ('Scipy: {}', scipy.__version__),
    ('Sklearn: {}', sklearn.__version__),
):
    print(template.format(version))

Python: 3.7.3 (default, Mar 27 2019, 17:13:21) [MSC v.1915 64 bit (AMD64)]
Numpy: 1.16.4
Pandas: 0.24.2
Matplotlib: 3.0.3
Seaborn: 0.9.0
Scipy: 1.2.1
Sklearn: 0.20.3


## Import packages for preprocessing

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read 'creditcard.csv' (path is relative to the current working directory)
# into a pandas DataFrame; every later cell works on this `data` frame.
data = pd.read_csv('creditcard.csv')

## Exploratory Data Analysis

In [None]:
# Show the column labels of the loaded frame
print(data.columns)

In [None]:
# Show the frame dimensions as (rows, columns)
print(data.shape)

In [None]:
# Show pandas' per-column summary statistics for the frame
print(data.describe())

In [None]:
# Work on a reproducible 10% random sample to keep compute cost manageable;
# a larger fraction gives the models more data at the price of longer run times.
# NOTE: this overwrites `data`, so re-running the cell samples the sample again.
data = data.sample(frac=0.1, random_state=1)
print(data.shape)

In [None]:
# Draw a histogram for each column of the sampled frame on one 20x20 figure
data.hist(figsize=(20, 20))
plt.show()

In [None]:
# Split the sample by label: rows with Class == 1 are treated as fraud,
# rows with Class == 0 as valid.
fraud = data[data['Class'] == 1]
valid = data[data['Class'] == 0]

# Share of fraud rows among all labelled rows. This value is later passed as
# `contamination` to the outlier detectors, which expect the proportion of
# outliers in the data set (fraud / total), not the fraud-to-valid ratio.
outlier_fraction = len(fraud) / float(len(fraud) + len(valid))
print(outlier_fraction)

print('Fraud Cases: {}'.format(len(fraud)))
print('Valid Cases: {}'.format(len(valid)))

In [None]:
# Pairwise column correlations of the sampled frame, rendered as a heatmap
corrmat = data.corr()
fig = plt.figure(figsize=(12, 9))
# Colour scale is capped at 0.8; cells are drawn square
sns.heatmap(corrmat, vmax=0.8, square=True)
plt.show()

In [None]:
# Name of the label column we are predicting
target = 'Class'

# Every column except the label is used as a feature
columns = [name for name in data.columns if name != target]

# Feature matrix and label vector
X = data[columns]
y = data[target]

# Report the dimensions of both
print(X.shape)
print(y.shape)

# Prediction

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# Seed shared by the estimators that accept one, for reproducible runs
state = 1

# Outlier detectors, keyed by the display name used when fitting and reporting.
classifiers = {
    'Isolation Forest': IsolationForest(max_samples=len(X),
                                        contamination=outlier_fraction,
                                        random_state=state,
                                        behaviour='new'),

    # The key must read exactly 'Local Outlier Factor': the fitting loop
    # compares clf_name against that string to pick the fit_predict path.
    # novelty is left at its default (False) because the model is scored on
    # the same rows it is fitted on; that is the outlier-detection mode, which
    # provides fit_predict and negative_outlier_factor_. novelty=True is meant
    # for predicting on unseen data only and disables fit_predict.
    'Local Outlier Factor': LocalOutlierFactor(n_neighbors=20,
                                               contamination=outlier_fraction),
}

In [None]:
# Fit every detector on X and score its predictions against the known labels
n_outliers = len(fraud)

for clf_name, clf in classifiers.items():
    # Choose the fitting path from the estimator itself rather than from its
    # display name, so a misspelled dictionary key cannot silently send
    # LocalOutlierFactor down the wrong branch. fit_predict only exists when
    # LocalOutlierFactor runs in outlier-detection mode (novelty is False).
    uses_fit_predict = (isinstance(clf, LocalOutlierFactor)
                        and not getattr(clf, 'novelty', False))

    if uses_fit_predict:
        y_pred = clf.fit_predict(X)
        scores_pred = clf.negative_outlier_factor_
    else:
        clf.fit(X)
        scores_pred = clf.decision_function(X)
        y_pred = clf.predict(X)

    # The detectors label inliers as 1 and outliers as -1; remap to the
    # dataset's convention of 0 for valid and 1 for fraud. The order of the
    # two assignments matters: 1 -> 0 must happen before -1 -> 1.
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1

    # Number of rows where the prediction disagrees with the true label
    n_errors = (y_pred != y).sum()

    # Run classification metrics
    print('{}: {}:'.format(clf_name, n_errors))
    print(accuracy_score(y, y_pred))
    print(classification_report(y, y_pred))