In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

https://drive.google.com/uc?export=download&confirm=6gh6&id=1VNpyNkGxHdskfdTNRSjjyNa5qC9u0JyV

In [None]:
# Read the transactions CSV from the working directory into the notebook-wide frame.
df = pd.read_csv(filepath_or_buffer="Fraud.csv")

## Data Analysis & Cleaning

In [None]:
# True when at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

In [None]:
# Number of rows whose isFlaggedFraud value is 1.
df.loc[df["isFlaggedFraud"] == 1, "isFlaggedFraud"].count()

In [None]:
# Distinct transaction types among the flagged rows.
df.loc[df["isFlaggedFraud"] == 1, "type"].unique()

In [None]:
# Summary statistics restricted to the flagged rows.
df.loc[df["isFlaggedFraud"] == 1].describe()

###### Flagged Fraud is both 0 & 1 for amount > 200000 so it seems to have no effect on Amount

In [None]:
# How many TRANSFER rows have an amount above 200000 (regardless of the flag value).
df.loc[
    (df["type"] == "TRANSFER") & (df["amount"] > 200000),
    "isFlaggedFraud",
].count()

In [None]:
# How many flagged rows have an amount above 200000.
df.loc[
    (df["amount"] > 200000) & (df["isFlaggedFraud"] == 1),
    "isFlaggedFraud",
].count()

###### Flagged Fraud takes both values (0 & 1) when both the old & new balance of the destination are 0, so we can assume it doesn't depend on these columns

In [None]:
# Flagged rows whose destination balance is 0 both before and after the transaction;
# .all() on the flag column of that subset.
df.loc[
    (df["isFlaggedFraud"] == 1)
    & (df["oldbalanceDest"] == 0)
    & (df["newbalanceDest"] == 0),
    "isFlaggedFraud",
].all()

In [None]:
# Is there at least one unflagged TRANSFER whose destination balance is 0 before and after?
df.loc[
    (df["isFlaggedFraud"] == 0)
    & (df["oldbalanceDest"] == 0)
    & (df["newbalanceDest"] == 0)
    & (df["type"] == "TRANSFER"),
    "isFlaggedFraud",
].eq(0).any()

###### No merchant is involved in any Flagged Fraud transaction

In [None]:
# Flag values seen on rows whose destination name contains 'M'.
df.loc[df["nameDest"].str.contains('M'), "isFlaggedFraud"].unique()

###### The difference between the old & new balance of the originator is 0 for both values of Flagged Fraud, so these columns are also not affected by Flagged Fraud

In [None]:
# Total (old - new) originator balance change over the flagged rows;
# the subset is selected once and reused for both columns.
(
    df.loc[df["isFlaggedFraud"] == 1]
    .pipe(lambda flagged: flagged["oldbalanceOrg"] - flagged["newbalanceOrig"])
    .sum()
)

In [None]:
# For unflagged rows: which truth values occur for "originator balance did not change"?
(
    df.loc[df["isFlaggedFraud"] == 0]
    .pipe(lambda unflagged: unflagged["oldbalanceOrg"] - unflagged["newbalanceOrig"])
    .eq(0)
    .unique()
)

###### As there are only 16 Flagged Fraud rows among millions of rows & they have little effect on our dataset, we can drop this column

In [None]:
# Remove the isFlaggedFraud column.
# Rebinding `df` (instead of inplace=True) together with errors="ignore" keeps the
# cell re-runnable: a second execution no longer raises KeyError for the
# already-removed column.
df = df.drop(columns="isFlaggedFraud", errors="ignore")

In [None]:
# Show the first two rows.
df.head(n=2)

###### Types of Transaction which are marked as fraud

In [None]:
# Every transaction type present in the data.
df["type"].unique()

In [None]:
# Transaction types that occur among rows labelled as fraud.
df.loc[df["isFraud"] == 1, "type"].unique()

In [None]:
# Transaction types on rows whose destination name contains 'M'.
df.loc[df["nameDest"].str.contains('M'), "type"].unique()

In [None]:
# Does any originator name contain 'M'?
df["nameOrig"].str.contains('M').any()

###### We can easily see that fraudulent transactions occurred mainly in TRANSFER & CASH_OUT, so we can drop the other types

In [None]:
# Keep only TRANSFER and CASH_OUT rows.
# .copy() makes the result an independent frame: later cells mutate `df` in place
# (insert, inplace drop, column assignment), which on a filtered slice triggers
# pandas' SettingWithCopyWarning.
df = df.loc[df["type"].isin(["TRANSFER", "CASH_OUT"])].copy()

###### Create 2 new columns holding the difference between the old & new balance of the origin & destination accounts

In [None]:
# Add the originator's balance change (old - new) at column position 6.
# DataFrame.insert raises ValueError if the column already exists, so the guard
# keeps this cell safe to re-run.
if "DifferenceOrig" not in df.columns:
    df.insert(6, "DifferenceOrig", df["oldbalanceOrg"] - df["newbalanceOrig"])

In [None]:
# Add the destination's balance change (new - old) at column position 10.
# DataFrame.insert raises ValueError if the column already exists, so the guard
# keeps this cell safe to re-run.
if "DifferenceDest" not in df.columns:
    df.insert(10, "DifferenceDest", df["newbalanceDest"] - df["oldbalanceDest"])

In [None]:
# Show the first five rows.
df.head(n=5)

###### According to the data dictionary, a fraudulent transaction is one that first transfers the amount to another account & then cashes it out

In [None]:
# Fraud rows whose destination account also appears somewhere as an originator.
df.loc[df["nameDest"].isin(df["nameOrig"]) & (df["isFraud"] == 1)]

In [None]:
# Is there any fraud row whose originator account also appears as a destination?
# The row mask is tested directly: calling .any().any() on the filtered frame asks
# whether any *cell value* is truthy, which only coincidentally answers the
# "does a matching row exist" question.
((df["isFraud"] == 1) & df["nameOrig"].isin(df["nameDest"])).any()

###### Transfer Destination accounts which are present in origin accounts & have cash out in second step are not fraud 

In [None]:
# Distinct destination accounts of TRANSFER rows that also occur as an originator.
l1 = df.loc[
    (df["type"] == "TRANSFER") & df["nameDest"].isin(df["nameOrig"]),
    "nameDest",
].unique()

In [None]:
# Fraud labels seen on CASH_OUT rows whose originator is one of the accounts in l1.
df.loc[
    (df["type"] == "CASH_OUT") & df["nameOrig"].isin(l1),
    "isFraud",
].unique()

###### Originator & destination account names do not seem to have much impact on predicting whether a transaction is fraud or not, so we can drop them

In [None]:
# Remove the two account-name columns.
# Rebinding `df` with errors="ignore" (instead of inplace=True) keeps the cell
# re-runnable and avoids mutating a frame that may be a filtered slice.
df = df.drop(columns=["nameDest", "nameOrig"], errors="ignore")

In [None]:
# Show the first two rows.
df.head(n=2)

In [None]:
# Encode `type` as an integer: TRANSFER -> 0, anything else -> 1.
# The dtype guard makes the cell idempotent: without it, a second run compares
# the already-encoded 0/1 values against "TRANSFER" and silently turns every
# row into 1.
if df["type"].dtype.kind not in "iu":
    df["type"] = (df["type"] != "TRANSFER").astype("int64")

In [None]:
# Show the first two rows.
df.head(n=2)

###### Almost 50% of the fraud rows have both the old & new balance of the destination account equal to 0, so this seems like an important parameter

In [None]:
# Number of fraud rows whose destination balance is 0 both before and after.
L = df.loc[
    (df["isFraud"] == 1)
    & (df["oldbalanceDest"] == 0)
    & (df["newbalanceDest"] == 0)
].shape[0]
L

In [None]:
# Share (a fraction between 0 and 1) of fraud rows counted in L.
percentage_fraud_dest = L / df.loc[df["isFraud"] == 1].shape[0]
percentage_fraud_dest

In [None]:
# Share of fraud rows whose originator balance is 0 both before and after.
percentage_fraud_orig = (
    df.loc[
        (df["isFraud"] == 1)
        & (df["oldbalanceOrg"] == 0)
        & (df["newbalanceOrig"] == 0)
    ].shape[0]
    / df.loc[df["isFraud"] == 1].shape[0]
)
percentage_fraud_orig

In [None]:
# Share of non-fraud rows whose originator balance is 0 both before and after.
percentage_not_fraud_orig = (
    df.loc[
        (df["isFraud"] == 0)
        & (df["oldbalanceOrg"] == 0)
        & (df["newbalanceOrig"] == 0)
    ].shape[0]
    / df.loc[df["isFraud"] == 0].shape[0]
)
percentage_not_fraud_orig

In [None]:
# Summary statistics restricted to the fraud rows.
df.loc[df["isFraud"] == 1].describe()

## Data Visualization

In [None]:
# Two side-by-side scatter plots of the fraud label against each balance
# difference (x values expressed in millions).
fig, (ax_orig, ax_dest) = plt.subplots(1, 2)

ax_orig.set_xlabel("DifferenceOrig")
ax_orig.set_ylabel("isFraud")
ax_orig.scatter(df["DifferenceOrig"] / 1000000, df["isFraud"])

ax_dest.set_xlabel("DifferenceDest")
ax_dest.scatter(df["DifferenceDest"] / 1000000, df["isFraud"], c="g")

In [None]:
# Fraud label against the time step.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional parameter is `data`).
sns.scatterplot(x=df["step"], y=df["isFraud"])

###### Fraud is very unlikely for transaction amounts > 10M, so we can drop the rows with amount >= 20M without losing any valuable information

In [None]:
# Fraud label against the transaction amount (in millions).
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional parameter is `data`).
sns.scatterplot(x=df["amount"] / 1000000, y=df["isFraud"])

In [None]:
# Keep only rows whose amount is strictly below 20 million.
df = df.loc[df["amount"] < 20_000_000]

In [None]:
# Largest remaining transaction amount.
df["amount"].max()

In [None]:
# Renumber the rows 0..n-1 after filtering.
# drop=True discards the old index directly instead of first materialising it
# as an "index" column and then deleting that column in place.
df = df.reset_index(drop=True)

In [None]:
# Show the first five rows.
df.head(n=5)

#### Feature Scaling

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column except the integer-coded `type` and the target.
# fit_transform returns a plain array, so column labels are lost at this point.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# scaling ranges — consider fitting on the training split only.
my_pipeline = Pipeline(steps=[("std_scaler", MinMaxScaler())])
df2 = my_pipeline.fit_transform(df.drop(columns=["type", "isFraud"]))

In [None]:
# Turn the scaled array back into a labelled frame and re-attach the two
# unscaled columns (`type`, `isFraud`).
# Everything is matched by column *name* and by df's own index, so the result no
# longer depends on `type` sitting at column position 1, on `isFraud` being
# last, or on df having a default RangeIndex. The cell is also safe to re-run.
scaled_columns = df.columns.drop(["type", "isFraud"])
df2 = pd.DataFrame(df2, columns=scaled_columns, index=df.index)
df2["type"] = df["type"]
df2["isFraud"] = df["isFraud"]
df2 = df2[df.columns]  # restore df's original column order
df2.head()

In [None]:
# Split the scaled frame into the feature matrix and the target vector.
y = df2["isFraud"]
X = df2.drop(columns="isFraud")

## ML Model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing.
# random_state makes the split reproducible across kernel restarts (101 matches
# the seed used for the models below); stratify=y keeps the fraud / non-fraud
# ratio the same in both splits, which matters because fraud rows are rare.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=101, stratify=y
)

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

# Three candidate classifiers; the two that accept a seed share random_state=101.
svc = SVC(kernel='rbf')
rfc = RandomForestClassifier(
    n_estimators=50,
    n_jobs=-1,
    random_state=101,
)
sgd = SGDClassifier(
    loss='modified_huber',
    shuffle=True,
    random_state=101,
)

In [None]:
# Fit the RBF-kernel SVM on the training split.
svc.fit(X=x_train, y=y_train)

In [None]:
# Fit the random forest on the training split.
rfc.fit(X=x_train, y=y_train)

In [None]:
# Fit the SGD linear classifier on the training split.
sgd.fit(X=x_train, y=y_train)

## Cross Validation Score

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of each classifier on the training split.
# Scored with F1 rather than RMSE: mean squared error is a regression metric,
# and on 0/1 labels it only restates the error rate, which is uninformative when
# one class is rare. F1 also matches the metric reported on the test split.
for model in [svc, rfc, sgd]:
    scores = cross_val_score(model, x_train, y_train, scoring="f1", cv=10)
    print(model, ": ", scores.mean())

## Prediction

In [None]:
# Collect [fitted model, test-set predictions] pairs, one per classifier.
y_pred = []
for model in (svc, rfc, sgd):
    pred = model.predict(x_test)
    y_pred += [[model, pred]]

## Calculating Confusion Matrix, Precision & Recall, F1-Score

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

In [None]:
# Print the confusion matrix, precision, recall and F1 on the test split
# for each [model, predictions] pair.
for fitted_model, predictions in y_pred:
    matrix = confusion_matrix(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    print(fitted_model, "\n")
    print("Confusion Matrix: ", "\n", matrix)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1, "\n")