In [1]:
# Our aim here is to detect the fraudulent transactions among the customers' credit card transactions

In [2]:
# importing all the necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import gridspec

In [None]:
# Load the dataset into a DataFrame. The path is relative to the working
# directory, so on Colab the CSV has to be uploaded to the session first.
data = pd.read_csv(filepath_or_buffer="creditcard.csv")

In [None]:
# Peek at the first five rows to get a feel for the columns and their values.
data.head(n=5)

In [None]:
# Dataset dimensions as a (rows, columns) tuple.
print(data.shape)
# Per-column summary statistics. Left as the cell's last expression so the
# notebook renders it as a rich table rather than a plain-text print() dump.
data.describe()

In [None]:
# Split the rows by label: Class == 1 marks a fraud case, Class == 0 a valid one.
fraud_transac = data[data['Class'] == 1]
valid_transac = data[data['Class'] == 0]

# Ratio of fraud cases to valid cases. The original line was missing the
# division operator between the two counts, which made the cell a syntax error.
outerfrac = len(fraud_transac) / float(len(valid_transac))
print(outerfrac)

# Report the size of each class, reusing the frames filtered above instead of
# filtering `data` a second time (the counts are identical).
print('The fraud cases are as follows: {}'.format(len(fraud_transac)))
print('The valid transaction cases are as follows: {}'.format(len(valid_transac)))

In [None]:
# Summary statistics of the Amount column, restricted to the fraud rows.
print("Amount details of the transactions which are fraud")
fraud_transac["Amount"].describe()

# The two classes are highly unbalanced.
# Plan: fit the model on the data as-is first; if it turns out not to be
# accurate enough, balance the data and fit again.

In [None]:
# Summary statistics of the Amount column, restricted to the valid (non-fraud)
# rows, for comparison with the fraud summary.

# The label previously read "not valid", which contradicted the frame being
# described (valid_transac holds the Class == 0 rows).
print("Amount details of the transactions which are valid")
valid_transac.Amount.describe()

In [None]:
# Author's observation from the two Amount summaries above: the average
# transaction amount is higher for fraud than for normal transactions.

# Pairwise correlation between all columns of the dataset, drawn as a heatmap.
corrmat = data.corr()
fig, corr_ax = plt.subplots(figsize=(12, 9))
# vmax caps the colour scale at 0.8; square=True keeps every cell square.
sns.heatmap(corrmat, vmax=0.8, square=True, ax=corr_ax)
plt.show()

In [None]:

# Separate the target label (Y) from the feature columns (X).
Y = data["Class"]
X = data.drop(columns=["Class"])
print(X.shape)
print(Y.shape)
# Pull out the underlying NumPy arrays (no column labels) for scikit-learn.
xData, yData = X.values, Y.values

In [None]:
# scikit-learn helper to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, with a fixed seed so the split is
# reproducible. The classes are highly unbalanced, so stratify on the label:
# this keeps the fraud/valid proportion the same in the training and test sets
# instead of leaving the number of fraud cases in the test set to chance.
xTrain, xTest, yTrain, yTest = train_test_split(
    xData, yData, test_size=0.2, random_state=42, stratify=yData)


In [None]:
# Set up the random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap samples, random feature subsets).
# Fix the seed so a Restart & Run All reproduces the same model and metrics;
# 42 matches the seed already used for the train/test split.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(xTrain, yTrain)

# Predicted class labels for the held-out test set
yPred = rfc.predict(xTest)


In [None]:
# Evaluate the classifier on the held-out test set and print each score.
# All imports are kept here because other cells may rely on them
# (confusion_matrix is used by the confusion-matrix plot).
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix

# Number of fraud rows in the whole dataset (not only the test split).
n_outliers = len(fraud_transac)
# Number of test samples whose predicted label differs from the true label.
# Neither count is printed in this cell; both are kept in case other cells use them.
n_errors = (yPred != yTest).sum()
print("The model is Random Forest classifier")

# Fraction of test samples classified correctly.
acc = accuracy_score(yTest, yPred)
print("The accuracy is {}".format(acc))

prec = precision_score(yTest, yPred)
print("The precision is {}".format(prec))

rec = recall_score(yTest, yPred)
print("The recall is {}".format(rec))

f1 = f1_score(yTest, yPred)
print("The F1-Score is {}".format(f1))

# The original format string had no space before the placeholder, so the value
# was glued to the word "is".
MCC = matthews_corrcoef(yTest, yPred)
print("The Matthews correlation coefficient is {}".format(MCC))


In [None]:

# Visualise the test-set predictions as a confusion matrix heatmap.
LABELS = ['Normal', 'Fraud']
conf_matrix = confusion_matrix(yTest, yPred)

cm_fig, cm_ax = plt.subplots(figsize=(12, 12))
# annot=True writes the count into each cell; fmt="d" formats it as an integer.
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    xticklabels=LABELS,
    yticklabels=LABELS,
    ax=cm_ax,
)
cm_ax.set_title("Confusion matrix")
cm_ax.set_ylabel('True class')
cm_ax.set_xlabel('Predicted class')
plt.show()
