### Imports

In [42]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE

### Data Import and Cleaning

In [43]:
# read in data
data = pd.read_csv('dataset.csv')

# check for nulls; the follow-up message reflects what the check actually
# found instead of unconditionally claiming the data is clean
hasNulls = data.isnull().values.any()
print(hasNulls)
if hasNulls:
    # name the affected columns so the problem can be located quickly
    nullCounts = data.isnull().sum()
    nullColumns = [str(col) for col in nullCounts[nullCounts > 0].index]
    print('^ Nulls found in columns: ' + ', '.join(nullColumns))
else:
    print('^ No nulls')

# obtain X (four selected columns) and y (the 'Class' label column)
X = data[["Time", "Amount", "V1", "V2"]]
y = data['Class']

# find fraud proportion (rows with Class == 1 are reported as fraudulent)
fraudCount = (y == 1).sum()
fraudPercentage = (fraudCount/len(X))*100
fraudPercentageRounded = round(fraudPercentage, 4)

# print fraud distribution
print(f'There are {fraudCount} fraudulent transactions- about {fraudPercentageRounded} percent of the dataset.')

False
^ No nulls
There are 492 fraudulent transactions- about 0.1727 percent of the dataset.


### Apply SMOTE sampling

In [46]:
# apply SMOTE. A float sampling_strategy is the desired minority:majority
# ratio after resampling, NOT the minority's share of the whole dataset.
# A ratio of 0.3 therefore yields 0.3 / 1.3, i.e. about 23 percent fraud rows;
# a true 30 percent share would need a ratio of 0.3 / 0.7 (about 0.4286).
# NOTE(review): the full X, y are resampled here. If a train/test split
# follows, confirm that synthetic rows cannot leak into the evaluation data.
smote = SMOTE(random_state=16, sampling_strategy=0.3)
X_sampled, y_sampled = smote.fit_resample(X, y)

# find new fraud proportion (same calculation as for the original data)
fraudCountNew = (y_sampled == 1).sum()
fraudPercentageNew = (fraudCountNew/len(X_sampled))*100
fraudPercentageRoundedNew = round(fraudPercentageNew, 4)

# print new fraud distribution
print(f'There are {fraudCountNew} fraudulent transactions- about {fraudPercentageRoundedNew} percent of the dataset.')

There are 85294 fraudulent transactions- about 23.0768 percent of the dataset.
