# Credit Card Fraud Detection


### A logistic regression model for fraud detection. 
#### This demo uses the Credit Card Fraud Detection dataset, originally taken from: https://www.kaggle.com/mlg-ulb/creditcardfraud


In [None]:
import os
import warnings
warnings.filterwarnings("ignore")
##### For reproducibility
seed_value= 1
os.environ['PYTHONHASHSEED']=str(seed_value)
import random
random.seed(seed_value)
#####
import h5py
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sklearn_json as skljson
from sklearn.linear_model import LogisticRegression
#####
# import utils
import sys
path_to_utils='..'
sys.path.append(path_to_utils)
import utils

# Output directory used by this notebook for the saved data set and model.
PATH = os.path.join('..', 'data', 'lr_fraud')
# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(PATH, exist_ok=True)

print("misc. init complete")

### Read Dataset

In [None]:
# Load the credit-card transactions CSV from the directory reported by
# utils.get_data_sets_dir().
csv_path = os.path.join(utils.get_data_sets_dir(path_to_utils), 'net_fraud', 'creditcard.csv')
df = pd.read_csv(csv_path)

print(f'Reading {df.shape[0]} samples')

# Features are taken by position: columns 1..29 (column 0 is skipped).
# The label is the 'Class' column.
X = df.iloc[:, 1:30].to_numpy()
Y = df['Class'].to_numpy()
print(f'number of features: {X.shape[1]}')

# Stratified 67/33 split with a fixed random_state so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=0, stratify=Y
)

# Turn the 1-D label vectors into (n, 1) column arrays.
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

print("Training data ready")

### Save dataset 

In [None]:
def save_data_set(x, y, data_type, s=''):
    """Write samples and labels to two HDF5 files under ``PATH``.

    Creates ``x_{data_type}{s}.h5`` containing a dataset named
    ``x_{data_type}`` and ``y_{data_type}{s}.h5`` containing a dataset named
    ``y_{data_type}``. Files are opened in ``'w'`` mode, so existing files
    with the same names are replaced.

    Args:
        x: Array of samples; its ``.shape`` is printed before saving.
        y: Array of labels.
        data_type: Tag used in both the file names and the dataset names
            (e.g. ``'test'``).
        s: Optional suffix appended to the file names only (not to the
            dataset names).
    """
    print("Saving x_{} of shape {}".format(data_type, x.shape))
    # Context managers guarantee the HDF5 handles are closed even when
    # create_dataset() raises.
    with h5py.File(os.path.join(PATH, f'x_{data_type}{s}.h5'), 'w') as xf:
        xf.create_dataset(f'x_{data_type}', data=x)

    with h5py.File(os.path.join(PATH, f'y_{data_type}{s}.h5'), 'w') as yf:
        yf.create_dataset(f'y_{data_type}', data=y)
    
# Persist only the held-out test split (x_test.h5 / y_test.h5 under PATH).
save_data_set(x_test, y_test, data_type='test')

### Logistic Regression Train

In [None]:
# C is the inverse regularisation strength in scikit-learn: smaller values
# regularise more strongly.
lr = LogisticRegression(C=0.1)
# y_train is an (n, 1) column array, but fit() expects 1-D labels.
# Flatten it explicitly instead of relying on scikit-learn's implicit
# conversion, which emits a DataConversionWarning.
lr.fit(x_train, y_train.ravel())

print('LR model ready')

### Confusion Matrix - TEST

In [None]:
# Evaluate on the first `batch_size` samples of the test split only.
batch_size = 8192
batch_x_test = x_test[0:batch_size, :]
batch_y_test = y_test[0:batch_size, :]

# Hard 0/1 predictions drive the classification report and confusion matrix.
batch_y_pred = lr.predict(batch_x_test)
# ROC/AUC needs continuous scores. Feeding hard predictions to roc_curve
# collapses the curve to a single operating point and misreports the AUC,
# so use the predicted probability of the positive class (column 1).
batch_y_score = lr.predict_proba(batch_x_test)[:, 1]
# f = false-positive rates, t = true-positive rates at each threshold.
f, t, thresholds = metrics.roc_curve(batch_y_test, batch_y_score)
cm = metrics.confusion_matrix(batch_y_test, batch_y_pred)
print(f"AUC Score: {metrics.auc(f, t):.3f}")
print("Classification report:")
print(metrics.classification_report(batch_y_test, batch_y_pred))
print("Confusion Matrix:")
print(cm)

### Serialize model and weights

In [None]:
# Export the fitted model as JSON into the output directory.
model_json_path = os.path.join(PATH, 'model.json')
skljson.to_json(lr, model_json_path)
print("Saved model to ", PATH)