[![alt text](https://avatars1.githubusercontent.com/u/59831504?s=400&v=4 "MtheEPIC User Icon")](https://github.com/MtheEPIC)

# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

---
# Setup Data

## Load & Display Data
[Kaggle Dataset](https://www.kaggle.com/ntnu-testimon/paysim1 "Synthetic Financial Datasets For Fraud Detection")

In [21]:
df = pd.read_csv('data/card.csv')
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


## Check Value Type

In [None]:
df.info()

## Define Each Feature

### We have 11 initial features:

* **step:** Maps a unit of time in the real world. In this case 1 step is 1 hour of time, 743 is the end of the month
* **type:** CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER
* **amount:** amount of the transaction in local currency _(may be bigger then the account balance)_
* **nameOrig:** customer who started the transaction _(start with C for customer)_
* **oldbalanceOrg:** initial balance before the transaction _(at least 0)_
* **newbalanceOrig:** customer's balance after the transaction. _(at least 0)_
* **nameDest:** recipient ID of the transaction. _(start with C for customer)_
* **oldbalanceDest:** initial recipient balance before the transaction. _(at least 0)_
* **newbalanceDest:** recipient's balance after the transaction. _(at least 0)_
* **isFraud:** identifies a fraudulent transaction (1) and non fraudulent (0)
* **isFlaggedFraud:** flags illegal attempts to transfer more than 200.000 in a single transaction.

---
# Clear Data

## Check for Null

### since the data is synthetic there are no null values
#### (as shown bellow)

In [None]:
df.isnull().sum()

## Understand The Data & Trends

### Transaction Types

In [None]:
df['type'].value_counts().plot.bar()
plt.show()

In [None]:
precent = df['type'].value_counts(normalize=True)['DEBIT']*100
lines = df[df['type'] == 'DEBIT']['type'].value_counts()['DEBIT']
print("The precent of DEBIT in the Data Set is {:.2f}% ({} lines)".format(precent, lines))

### Fraud happens aroud 0.13%

In [None]:
"{:.2f}%".format(df['isFraud'].value_counts(normalize=True)[1]*100)

### Fraud is only in: TRANSFER, CASH OUT
#### (at about the same amount)

In [None]:
fraud = df[df['isFraud'] == 1]
fraud['type'].value_counts(normalize=True)*100

### Precentage of Transfer Fraud is about 0.77%

In [None]:
df[df['type'] == 'TRANSFER']['isFraud'].value_counts(normalize=True)[1]*100

### Precentage of Cash Out Fraud is about 0.18%

In [None]:
df[df['type'] == 'CASH_OUT']['isFraud'].value_counts(normalize=True)[1]*100

### If The Transaction Amount Was The Same As The Balance: Its Fraud

In [None]:
df[df['amount'] == df['oldbalanceOrg']]['isFraud'].unique()

### If the Existing Fraud Detection Flags the Transaction as Fraud, It's Fraud

In [None]:
df[df['isFlaggedFraud'] == 1]['isFraud'].unique()

### Clients who Commited Fraud also Have Legit Transactions

In [None]:
fraudClients = df[df['isFraud']==1]['nameDest'].unique()
df[df['nameDest']==fraudClients[0]].head()

### Though in general there are more Initiates then Recipients, In Fraudulent Transactions it's the other way

In [None]:
print("In Total:")
print("The Number Of Initiates is: {}".format(len(df['nameOrig'].unique())))
print("The Number Of Recipients is: {}".format(len(df['nameDest'].unique())))
print("In Fraudulent Transactions:")
print("The Number Of Initiates is: {}".format(len(df[df['isFraud']==1]['nameOrig'].unique())))
print("The Number Of Recipients is: {}".format(len(df[df['isFraud']==1]['nameDest'].unique())))

### Almost all of the fraudulent transactions where when the amount in the transaction was the same as the account balance

In [None]:
dfFraud = df[df['isFraud']==1][['amount', 'oldbalanceOrg']]
precent = dfFraud[dfFraud['amount']!=dfFraud['oldbalanceOrg']].shape[0]/dfFraud.shape[0]*100
print("The precentage of fraudulent transaction that didn't take the hull balace of the customer is {}%".format(precent))

### The destination account doesn't seem to update according to the transferred amount, probebly to simulate the bank verifying the transaction

In [None]:
amount = df.loc[10]['amount']
delta = df.loc[10]['newbalanceDest'] - df.loc[10]['oldbalanceDest']
print('the amount transferred ({}) doesn\'t match the change in the destination account ({})'.format(amount, delta))

---
# Predict Fraud

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
# from sklearn.tree import DecisionTreeClassifier

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
step              int64
type              object
amount            float64
nameOrig          object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest          object
oldbalanceDest    float64
newbalanceDest    float64
isFraud           int64
isFlaggedFraud    int64
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


## Convert Strings to float/int

In [22]:
#tmp
df2 = df.copy()

In [23]:
df = df2

In [61]:
data = df.copy()

In [62]:
tmp = pd.get_dummies(data['type'])
data = data.join(tmp)
data = data.drop('type', axis=1)
data.head()

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,0,0,0,1,0
1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,0,0,0,1,0
2,1,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,0,0,0,0,1
3,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,0,1,0,0,0
4,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,0,0,0,1,0


In [63]:
data = data.drop(['nameOrig', 'nameDest', 'step'], axis=1)
data.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,0,0,1,0
1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,0,0,1,0
2,181.0,181.0,0.0,0.0,0.0,1,0,0,0,0,0,1
3,181.0,181.0,0.0,21182.0,0.0,1,0,0,1,0,0,0
4,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,0,0,1,0


In [64]:
def normalize(x):
    data[x] = data[x]/data[x].max()
    
[normalize(x) for x in data]
data.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,0.000106,0.002855,0.003233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2e-05,0.000357,0.000391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2e-06,3e-06,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2e-06,3e-06,0.0,5.9e-05,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.000126,0.000697,0.000603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [49]:
# from sklearn.preprocessing import StandardScaler
# # data = [[0, 0], [0, 0], [1, 1], [1, 1]]
# scaler = StandardScaler()
# print(scaler.fit(data))
# StandardScaler()
# print(scaler.mean_)
# print(scaler.transform(data))
# scale = scaler.transform(data)
# print(scale[0])
# data.head()
# print(scaler.transform([[2, 2]]))

StandardScaler(copy=True, with_mean=True, with_std=True)
[2.43397246e+02 1.79861904e+05 8.33883104e+05 8.55113669e+05
 1.10070167e+06 1.22499640e+06 1.29082045e-03 2.51468735e-06
 2.19922610e-01 3.51663309e-01 6.51178288e-03 3.38146078e-01
 8.37562199e-02]
[[-1.70304158e+00 -2.81559923e-01 -2.29810037e-01 ... -8.09596451e-02
   1.39903602e+00 -3.02345156e-01]
 [-1.70304158e+00 -2.94767262e-01 -2.81359380e-01 ... -8.09596451e-02
   1.39903602e+00 -3.02345156e-01]
 [-1.70304158e+00 -2.97554804e-01 -2.88653782e-01 ... -8.09596451e-02
  -7.14777880e-01  3.30747816e+00]
 ...
 [ 3.51012348e+00  1.01539526e+01  1.89649113e+00 ... -8.09596451e-02
  -7.14777880e-01 -3.02345156e-01]
 [ 3.51012348e+00  1.10976490e+00  5.58104668e-03 ... -8.09596451e-02
  -7.14777880e-01  3.30747816e+00]
 [ 3.51012348e+00  1.10976490e+00  5.58104668e-03 ... -8.09596451e-02
  -7.14777880e-01 -3.02345156e-01]]
[-1.70304158e+00 -2.81559923e-01 -2.29810037e-01 -2.37621696e-01
 -3.23813895e-01 -3.33411405e-01 -3.595120

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,0,0,0,1,0
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,0,0,0,1,0
2,1,181.0,181.0,0.0,0.0,0.0,1,0,0,0,0,0,1
3,1,181.0,181.0,0.0,21182.0,0.0,1,0,0,1,0,0,0
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,0,0,0,0,1,0


In [65]:
X=data.drop('isFraud', axis=1)
y=data['isFraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Train

In [66]:
def evaluate(test, prediction):
    print(confusion_matrix(test, prediction))
    print(metrics.accuracy_score(test, prediction))

### Logistic Regression

In [None]:
# Logistic Regression (kills cpu)
lr=LogisticRegression(solver='lbfgs')
lr.fit(X_train,y_train)
#we used the train model to get the prediction for our test data set (X_test)
predictions=lr.predict(X_test)
evaluate(y_test, predictions)

### Linear SVC

In [79]:
clf = LinearSVC(random_state=0, tol=1e30)
clf.fit(X_train, y_train)
predictions=clf.predict(X_test)
evaluate(y_test, predictions)

[[1755389  150962]
 [    973    1462]]
0.920402287108141


In [None]:
clf = LinearSVC(random_state=0, tol=1e-2)
clf.fit(X_train, y_train)
predictions=clf.predict(X_test)
evaluate(y_test, predictions)

In [75]:
clf = LinearSVC(random_state=0, tol=2*1e1)
clf.fit(X_train, y_train)
predictions=clf.predict(X_test)
evaluate(y_test, predictions)

[[1755389  150962]
 [    973    1462]]
0.920402287108141


In [76]:
clf = LinearSVC(random_state=0, tol=1e0)
clf.fit(X_train, y_train)
predictions=clf.predict(X_test)
evaluate(y_test, predictions)

[[1906346       5]
 [   1988     447]]
0.9989558808583047


In [77]:
clf = LinearSVC(random_state=0, tol=9*1e-1)
clf.fit(X_train, y_train)
predictions=clf.predict(X_test)
evaluate(y_test, predictions)

[[1906346       5]
 [   1988     447]]
0.9989558808583047


In [78]:
clf = LinearSVC(random_state=0, tol=5*1e-2)
clf.fit(X_train, y_train)
predictions=clf.predict(X_test)
evaluate(y_test, predictions)

[[1906343       8]
 [   1963     472]]
0.9989674065086396


### KNN

In [None]:
#now lets find the optimum K 
err_rate = []
for i in range(1,30):
    k = KNeighborsClassifier(n_neighbors=i)
    k.fit(X_train, y_train)
    pred_i=k.predict(X_test)
    
    err_rate.append(np.mean(pred_i !=y_test))