In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#https://www.kaggle.com/mlg-ulb/creditcardfraud
#Credit card fraud

#learned from https://www.kaggle.com/abdelhai/the-power-of-eda-90-accuracy-90-recall

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
import sklearn
%matplotlib inline


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and echo every file found there.
# Only files with a .csv extension are candidates for loading: parsing an
# arbitrary file with pd.read_csv either fails or yields a junk frame.
csv_paths = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        print(path)
        if filename.lower().endswith('.csv'):
            csv_paths.append(path)

# Fail with a clear message instead of a NameError on `data` further down.
if not csv_paths:
    raise FileNotFoundError("No CSV file found under /kaggle/input")

# As before, the last CSV encountered is the one kept in `data`; it is now
# the only file that is actually parsed.
data = pd.read_csv(csv_paths[-1])

data.head()






In [None]:
# Correlation heatmap: pairwise correlation between the columns of `data`,
# with each cell annotated by its coefficient on a blue colour map.
cor = data.corr()
plt.figure(figsize=(30, 20))
sns.heatmap(data=cor, annot=True, cmap="Blues")
plt.show()


A value of 1 indicates a very strong positive relation between the two variables,
while a value of -1 indicates a very strong negative relation, and a value of 0 indicates no linear relation.

We look into the relation between the independent variables and the dependent variable Class,
to see which variables are most highly correlated with our target variable, Class.

From the visualization, the top 3 independent variables are {V12,V14,V17}.


**Scatterplot**

Plotting each of the interesting variables against a very boring variable, V13, and coloring each data point according to its label (Fraud or Normal).

In [None]:
# V17 against the uninformative V13, one colour per Class value
sns.scatterplot(data=data, x='V13', y='V17', hue='Class')


In [None]:
# V14 against the uninformative V13, one colour per Class value
sns.scatterplot(data=data, x='V13', y='V14', hue='Class')


In [None]:
# V12 against the uninformative V13, one colour per Class value
sns.scatterplot(data=data, x='V13', y='V12', hue='Class')


From all three plots, we can observe a horizontal line that classifies almost all data points into their correct Class value. In other words, if we look at the y-axis that corresponds to one of the interesting variables {V17, V14, V12}, then we can see that most fraudulent data points are located below the value of -5 and normal data points are located above.

In contrast, if we look at the x-axis where the boring variable is, then both fraudulent and normal data points are almost equally distributed between -4 and 4, which means that we can't draw a vertical line to separate the two groups.

Is that good enough?

No. Although the scatter plots gave us a general idea of what is going on, they are not precise. So how can we get a better view?

**Kernel density estimation (KDE) plots**

Take each one of the interesting variables and approximate the underlying probability density function for each Class value (Frauds Vs. Normal) using kernel density estimation. This should give us a clear idea of how fraudulent and normal datapoints (credit card transactions) are distributed along each variable.

In [None]:
# Overlay the estimated density of V17 for each class on the same axes so
# the two distributions can be compared directly.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.

# V17 values that belong to Class 0 (Normal)
sns.kdeplot(data=data[data['Class'] == 0]['V17'], label="Class 0", fill=True)
# V17 values that belong to Class 1 (Fraud)
sns.kdeplot(data=data[data['Class'] == 1]['V17'], label="Class 1", fill=True)
plt.legend()  # the legend is not shown unless it is requested explicitly


In [None]:
# Overlay the estimated density of V14 for each class on the same axes so
# the two distributions can be compared directly.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.

# V14 values that belong to Class 0 (Normal)
sns.kdeplot(data=data[data['Class'] == 0]['V14'], label="Class 0", fill=True)
# V14 values that belong to Class 1 (Fraud)
sns.kdeplot(data=data[data['Class'] == 1]['V14'], label="Class 1", fill=True)
plt.legend()  # the legend is not shown unless it is requested explicitly


In [None]:
# Overlay the estimated density of V12 for each class on the same axes so
# the two distributions can be compared directly.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.

# V12 values that belong to Class 0 (Normal)
sns.kdeplot(data=data[data['Class'] == 0]['V12'], label="Class 0", fill=True)
# V12 values that belong to Class 1 (Fraud)
sns.kdeplot(data=data[data['Class'] == 1]['V12'], label="Class 1", fill=True)
plt.legend()  # the legend is not shown unless it is requested explicitly


Let's summarize our findings in the following points:

- In every interesting variable, the distribution of the normal transactions takes a shape that is very close to the standard normal distribution.
- In every interesting variable, the distribution of the fraudulent transactions takes a shape that is very close to a normal distribution with a high standard deviation (highly spread).
- In the boring variable, both fraudulent and normal transactions have the same distribution, close to the standard normal distribution.

Now let's display some statistical measurements to support our findings.



In [None]:
# Summary statistics of V14 restricted to Class 0 (Normal) rows
data.loc[data['Class'] == 0, 'V14'].describe()

In [None]:
# Summary statistics of V14 restricted to Class 1 (Fraud) rows
data.loc[data['Class'] == 1, 'V14'].describe()

In [None]:
# Ground-truth labels (0 = normal, 1 = fraud)
y = data['Class']

# Threshold classifier on V14 alone: predict normal (0) when the value lies
# strictly inside (-4, 4), and fraud (1) for everything else.
high_accuracy_y = [0 if -4 < v14 < 4 else 1 for v14 in data['V14']]

# Share of transactions whose predicted label matches the true one
accuracy_score(y_true=y, y_pred=high_accuracy_y)

Thus, based on these two statistical summaries, by using only the variable V14 and only one line of code, I can achieve an accuracy of 99% in predicting whether a credit card transaction is fraudulent or normal.

**However, one thing to consider**

This data set is highly imbalanced, and the positive class (frauds) accounts for only 0.172% of all transactions.

What does that mean?

Well, the accuracy metric is not a fair measurement of the performance of my classifier, because if I classify all negative (normal) transactions correctly, then I will achieve a very high accuracy since almost all data points belong to this class, and it does not matter how many data points from the other class I classify correctly. That is exactly what I did: I figured out from the KDE that most normal transactions have a V14 value between -4 and 4.

Solution?

We should consider another metric that measures how many data points from the positive class we classified correctly; that metric is called Recall, defined as TP / (TP + FN). So let's display the confusion matrix and calculate the Recall.

In [None]:
# Confusion matrix for the high-accuracy threshold classifier, drawn as an
# annotated heatmap with true labels on the y-axis and predictions on the x-axis.
sns.set(font_scale=3)  # larger fonts for the big figure
confusion_matrix = sklearn.metrics.confusion_matrix(y, high_accuracy_y)

plt.figure(figsize=(16, 14))
sns.heatmap(confusion_matrix, annot=True, fmt="d", annot_kws={"size": 20})
plt.title("Confusion matrix", fontsize=30)
plt.ylabel('True label', fontsize=25)
# These predictions come from a threshold rule on V14, not from a clustering
# model, so the axis is labelled accordingly.
plt.xlabel('Predicted label', fontsize=25)
plt.show()

In [None]:
# Flatten the 2x2 confusion matrix row by row into
# true negatives, false positives, false negatives, true positives.
TN, FP, FN, TP = confusion_matrix.ravel()
# Recall: fraction of the actual positives that were predicted positive
TP / (FN + TP)

We got a Recall of 76%, which is not that bad, but I claim that we can do better. But how can we classify more of the positive transactions correctly?

The idea is to narrow down the interval for the negative transactions. By doing that, we are going to misclassify some of the negative transactions, which will decrease the accuracy. But in return, we will increase the number of correctly classified positive transactions, which increases the recall. It's a tradeoff! If you look at the KDE of V14, this will make much more sense.

In [None]:
# High-recall threshold classifier on V14: predict normal (0) only when the
# value lies strictly inside (-1.05, 3), and fraud (1) for everything else.
high_recall_y = [0 if -1.05 < v14 < 3 else 1 for v14 in data['V14']]

# Share of transactions whose predicted label matches the true one
accuracy_score(y_true=y, y_pred=high_recall_y)


In [None]:
# Confusion matrix for the high-recall threshold classifier, drawn as an
# annotated heatmap with true labels on the y-axis and predictions on the x-axis.
sns.set(font_scale=3)  # larger fonts for the big figure
confusion_matrix = sklearn.metrics.confusion_matrix(y, high_recall_y)

plt.figure(figsize=(16, 14))
sns.heatmap(confusion_matrix, annot=True, fmt="d", annot_kws={"size": 20})
plt.title("Confusion matrix", fontsize=30)
plt.ylabel('True label', fontsize=25)
# These predictions come from a threshold rule on V14, not from a clustering
# model, so the axis is labelled accordingly.
plt.xlabel('Predicted label', fontsize=25)
plt.show()

In [None]:
# Flatten the 2x2 confusion matrix row by row into
# true negatives, false positives, false negatives, true positives.
TN, FP, FN, TP = confusion_matrix.ravel()
# Recall: fraction of the actual positives that were predicted positive
TP / (FN + TP)

Now we have a fair trade-off, with about 90% for both accuracy and recall. I could continue trying to find a better interval.

In [None]:
# Cluster the transactions into two groups using the three variables singled
# out earlier as most correlated with Class.
X = data[['V17', 'V14', 'V12']]

# n_init: number of times k-means is run with different centroid seeds; the
#         final result is the best of those runs in terms of inertia.
# max_iter: maximum number of iterations of the algorithm for a single run.
# random_state: fixes the centroid seeding so the cell gives the same result
#               on every run.
kmeans = KMeans(n_clusters=2, max_iter=3000, n_init=20, random_state=42)

# Fit and then store predictions in y_pred_kmeans
y_pred_kmeans = kmeans.fit_predict(X)

# K-means cluster ids are arbitrary: the fraud-like cluster may come out as
# 0 or as 1. Fraud is the rare class, so relabel when necessary so that the
# smaller cluster is 1; otherwise accuracy and recall can come out inverted.
if y_pred_kmeans.sum() > len(y_pred_kmeans) / 2:
    y_pred_kmeans = 1 - y_pred_kmeans

# Calculate accuracy
accuracy_score(y, y_pred_kmeans)

In [None]:
# Confusion matrix of the K-means cluster assignments against the true
# labels, drawn as an annotated heatmap.
sns.set(font_scale=3)
confusion_matrix = sklearn.metrics.confusion_matrix(y, y_pred_kmeans)

plt.figure(figsize=(16, 14))
sns.heatmap(
    confusion_matrix,
    annot=True,
    fmt="d",
    annot_kws={"size": 20},
)
plt.title("Confusion matrix", fontsize=30)
plt.xlabel('Clustering label', fontsize=25)
plt.ylabel('True label', fontsize=25)
plt.show()

In [None]:
# Flatten the 2x2 confusion matrix row by row into
# true negatives, false positives, false negatives, true positives.
TN, FP, FN, TP = confusion_matrix.ravel()
# Recall: fraction of the actual positives that were predicted positive
TP / (FN + TP)

6. Conclusion
By a careful exploration of the dataset, we successfully identified the most important independent variables. We also understood their relationship with the dependent variable, and that led us to a simple solution for this dataset using only one variable and one line of code. Moreover, we showed that our elegant solution outperformed a KMeans model built using the top three independent variables.