### Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from smote import smote_sampling
from train_and_pred import train_and_predict
from accuracy_metrics import calculate_metrics
from sklearn.linear_model import LogisticRegression


  from pandas.core import (


### Data Import and Cleaning

In [2]:
# read in data
data = pd.read_csv('dataset.csv')

# check is for nulls
print(data.isnull().values.any())
print('^ No nulls')

# obtain X and y
X = data[["Time", "Amount", "V1", "V2"]]
y = data['Class']

# find fraud proportion
# fraudCount = len(data[data['Class'] == 1])
fraud_count = (y == 1).sum()
fraud_percentage = (fraud_count/len(X))*100 
fraud_percentage_rounded = round(fraud_percentage, 4)

# print fraud distribution
print('There are ' + str(fraud_count) + ' fraudulent transactions- about ' + str(fraud_percentage_rounded) + ' percent of the dataset.')

False
^ No nulls
There are 492 fraudulent transactions- about 0.1727 percent of the dataset.


### Apply SMOTE sampling

In [3]:
# Split data into training set(80%) and test set(20%) using stratified splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=7)

# apply SMOTE to training data- have new minority class be about 30 percent of the majority class
X_train_sampled, y_train_sampled = smote_sampling(X_train, y_train, 0.3)

# find new fraud proportion
fraud_count_new = (y_train_sampled == 1).sum()
fraud_percentage_new = (fraud_count_new/len(X_train_sampled))*100 
fraud_percentage_rounded_new = round(fraud_percentage_new, 4)

# print new fraud distribution
print('There are ' + str(fraud_count_new) + ' fraudulent transactions in the training data- about ' + str(fraud_percentage_rounded_new) + ' percent of the dataset.')

There are 68235 fraudulent transactions in the training data- about 23.0768 percent of the dataset.


### Logistic Regression

In [4]:
# BASE MODEL
# Scale time and amount features(PC's were already scaled)in training and test data
scaler = StandardScaler()
X_train_scaled = X_train.copy() 
X_train_scaled[['Time', 'Amount']] = scaler.fit_transform(X_train[['Time', 'Amount']])

X_test_scaled = X_test.copy() 
X_test_scaled[['Time', 'Amount']] = scaler.fit_transform(X_test[['Time', 'Amount']])

# Initialize model
lr = LogisticRegression(random_state=22)

# Fit model and get predictions
y_pred_lr = train_and_predict(lr, X_train_scaled, y_train, X_test_scaled)

# Get accuracy metrics
lr_metrics = calculate_metrics(y_test, y_pred_lr)
print(lr_metrics)


{'accuracy': 0.9984375548611355, 'precision(y=1)': 0.8461538461538461, 'recall': 0.9984375548611355, 'f1-score': 0.9978399048095916}


### Random Forest

### Gradient Boosting