#### Problem statement:
#### 1.Detecting fraud transactions is of great importance for any credit card company.
#### 2.We are tasked by a well-known company to detect potential frauds so that customers are not charged for items that they did not purchase.

#### 3.So the goal is to build a classifier that tells if a transaction is a fraud or not.####

#### Import the library #### 

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.metrics import confusion_matrix,accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE

#### Data processing ####

In [53]:
dataset = pd.read_csv("D:\\Kaggle Datasets\\creditcard.csv")
dataset.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [54]:
dataset.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.16598e-15,3.416908e-16,-1.37315e-15,2.086869e-15,9.604066e-16,1.490107e-15,-5.556467e-16,1.177556e-16,-2.406455e-15,...,1.656562e-16,-3.44485e-16,2.578648e-16,4.471968e-15,5.340915e-16,1.687098e-15,-3.666453e-16,-1.220404e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


#### We can drop the ['Amount','Time'] columns as they would not help in model building ####

In [55]:
df = dataset.drop(columns = ['Amount','Time'], axis = 1)
X = df.iloc[: ,:-1].values
y = df.iloc[: , -1].values

#### Split the data into train and test datasets ####

In [56]:
from sklearn.model_selection import train_test_split
X_train , X_test, y_train, y_test = train_test_split(X , y, test_size = 0.3, random_state = 42)

### DNN Model building###


#### We will build a neural network using Sequential model in keras ####

In [57]:
ann_model = tf.keras.models.Sequential()
ann_model.add(tf.keras.layers.Dense(units = 6, activation = 'relu'))
ann_model.add(tf.keras.layers.Dense(units = 6, activation = 'relu'))
ann_model.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))

### DNN Model Evaluation ###

#### Let's compile and train the model ####

In [58]:
ann_model.compile(optimizer = 'adam', loss = 'binary_crossentropy',metrics = ['accuracy'])
ann_model.fit(X_train, y_train, batch_size = 32, epochs = 10)

Train on 199364 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1ee044a7d88>

#### Let's evaluate the model ####

In [59]:
score = ann_model.evaluate(X_test, y_test)
print(score)

[0.0026729599015297, 0.9994148]


#### The model is found to have 99.94% accuracy ####

#### If 95% of the dataset is negative(Non-frauds) the network will cleverly predict all to be negative leading to 95% accuracy.However, for fraud detection, detecting positive is very important.Therefore, we need better metrics. ####

#### Predicting the Test set Results ####

In [60]:
y_pred = ann_model.predict(X_test)
y_pred = (y_pred > 0.5)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 1]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


#### Making the confusion matrix #### 

In [61]:
cm = confusion_matrix(y_test,y_pred)
print(cm)

[[85283    24]
 [   26   110]]


In [62]:
print(precision_score(y_test, y_pred.round()))
print(recall_score(y_test, y_pred.round()))
print(f1_score(y_test, y_pred.round()))

0.8208955223880597
0.8088235294117647
0.8148148148148148


#### The confusion matrix shows a Precision of 82%, Recall of 80.8% and F1 Score of 81.4 %. About 20% of frauds are misclassified as non-frauds , leading to extra payments for customers, though the accuracy is 99.94%. So there is enough space to improve the DNN model.####

#### Let's build a decision tree ####

In [63]:
from sklearn.tree import DecisionTreeClassifier
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)
y_pred = decision_tree_model.predict(X_test)

In [64]:
cm = confusion_matrix(y_test,y_pred)
print(cm)

[[85267    40]
 [   29   107]]


In [65]:
print(precision_score(y_test, y_pred.round()))
print(recall_score(y_test, y_pred.round()))
print(f1_score(y_test, y_pred.round()))

0.7278911564625851
0.7867647058823529
0.7561837455830389


#### The Decision Tree gives a Precision of 72.7 %, Recall of 78.6 %, F1 Score of  75.6% worse than DNN model.####

#### Let's build a Random Forest ####

In [66]:
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(n_estimators = 100)
random_forest_model.fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)

In [67]:
def conf_mat(model):
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

In [68]:
conf_mat(random_forest_model)

[[85297    10]
 [   26   110]]


In [69]:
print(precision_score(y_test, y_pred.round()))
print(recall_score(y_test, y_pred.round()))
print(f1_score(y_test, y_pred.round()))

0.9166666666666666
0.8088235294117647
0.859375


#### The Random Forest gives a Precision of 91.6%, Recall of 80.8% and F1 Score of  85.9% better than DNN and Decision Tree ####

#### We have a class imbalance issue out here.The model is more sensitive to detect majority class than minority class.####

### Undersampling ###

In [70]:
fraud_ind = np.array(df[df.Class == 1].index)
num_frauds = len(fraud_ind)
print(num_frauds)
normal_ind = np.array(df[df.Class == 0].index)
num_normal = len(normal_ind)
print(num_normal)

492
284315


In [71]:
normal_ind = df[df.Class == 0].index
random_normal_ind = np.random.choice(normal_ind, num_frauds, replace = False)
random_normal_ind = np.array(random_normal_ind)

In [72]:
under_sample_ind = np.concatenate( [fraud_ind, random_normal_ind])

In [73]:
print(len(under_sample_ind))

984


In [74]:
under_sample_data = df.iloc[under_sample_ind, :]

In [75]:
X_undersample = under_sample_data.iloc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.iloc[:, under_sample_data.columns == 'Class']

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X_undersample, y_undersample, test_size = 0.3)

In [77]:
ann_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
ann_model.fit(X_train,y_train, batch_size = 32, epochs = 10)

Train on 688 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1ee047a3948>

In [78]:
y_pred = ann_model.predict(X_test)

In [79]:
cm = confusion_matrix(y_test, y_pred.round())
print(cm)

[[141   2]
 [ 16 137]]


In [80]:
print(accuracy_score(y_test, y_pred.round()))
print(precision_score(y_test, y_pred.round()))
print(recall_score(y_test, y_pred.round()))
print(f1_score(y_test, y_pred.round()))

0.9391891891891891
0.9856115107913669
0.8954248366013072
0.9383561643835616


#### Undersampling gives a Precision of 98.5%, Recall of 89.5% and F1 Score of 93.8% .Much better than DNN without undersampling. ####

### SMOTE ### 

In [81]:
X_resample, y_resample = SMOTE().fit_resample(X , y)

In [82]:
X_resample.shape

(568630, 28)

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X_resample, y_resample, test_size = 0.3)

In [84]:
ann_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
ann_model.fit(X_train, y_train, batch_size = 32, epochs = 10)

Train on 398041 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1ee049c8148>

In [85]:
y_pred = ann_model.predict(X_test)

In [86]:
cm = confusion_matrix(y_test, y_pred.round())
print(cm)

[[84317   914]
 [   98 85260]]


In [87]:
print(accuracy_score(y_test, y_pred.round()))
print(precision_score(y_test, y_pred.round()))
print(recall_score(y_test, y_pred.round()))
print(f1_score(y_test, y_pred.round()))

0.9940676128003564
0.9893935525796644
0.9988518943742825
0.9941002261968613


### We created 5 models, DNN, Decision Tree, Random Forest, DNN with undersampling and DNN with SMOTE.DNN with SMOTE performs the best.###