In [1]:
#import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [2]:
# Read a reproducible random sample of the transactions file.

data = "Fraud.csv"  # CSV path, relative to the notebook's working directory

# Count the data rows (header excluded) inside a context manager so the file
# handle is closed instead of being left to the garbage collector.
with open(data) as csv_file:
    n = sum(1 for _ in csv_file) - 1

# Desired sample size, capped at the number of available rows so that
# random.sample is never asked for a negative number of rows to skip.
s = min(100000, n)

# A dedicated, seeded generator makes Restart & Run All draw the same rows each time.
sample_rng = random.Random(100)

# Row 0 is the header and is never in the skip list; rows 1..n are data rows.
skip = sorted(sample_rng.sample(range(1, n + 1), n - s))
df = pd.read_csv(data, skiprows=skip)

In [3]:
# Work on an independent copy of the raw sample: pd.DataFrame(df) can share the
# underlying data with `df`, so later modifications of df_fraud could silently
# alter the raw frame as well.
df_fraud = df.copy()
df_fraud

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,CASH_OUT,28404.60,C2091072548,0.00,0.00,C1282788025,51744.00,0.00,0,0
1,1,DEBIT,864.68,C1368862151,69836.00,68971.32,C20671747,12040.00,43691.09,0,0
2,1,CASH_OUT,53631.83,C1233595751,0.00,0.00,C757108857,83244.00,0.00,0,0
3,1,CASH_OUT,3146.16,C923083575,0.00,0.00,C1504109395,9471.00,593737.38,0,0
4,1,CASH_OUT,68912.23,C1620529408,0.00,0.00,C1688019098,237326.80,97263.78,0,0
...,...,...,...,...,...,...,...,...,...,...,...
99995,718,DEBIT,425.65,C1835928822,4046.00,3620.35,C701975669,3009282.73,3009708.38,0,0
99996,736,CASH_OUT,274799.58,C1263652542,274799.58,0.00,C1406817938,0.00,274799.58,1,0
99997,737,TRANSFER,946705.53,C1524194171,946705.53,0.00,C357888969,0.00,0.00,1,0
99998,739,TRANSFER,194521.28,C1696012302,194521.28,0.00,C411128068,0.00,0.00,1,0


In [4]:
# (rows, columns) of the sampled frame.
df_fraud.shape

(100000, 11)

step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

type - CASH_IN, CASH_OUT, DEBIT, PAYMENT and TRANSFER.

amount - amount of the transaction in local currency.

nameOrig - customer who started the transaction

oldbalanceOrg - initial balance before the transaction

newbalanceOrig - new balance after the transaction

nameDest - customer who is the recipient of the transaction

oldbalanceDest - initial balance of the recipient before the transaction. Note that there is no information for customers whose names start with M (Merchants).

newbalanceDest - new balance of the recipient after the transaction. Note that there is no information for customers whose names start with M (Merchants).

isFraud - These are the transactions made by the fraudulent agents inside the simulation. In this specific dataset the fraudulent behavior of the agents aims to profit by taking control of customers' accounts and trying to empty the funds by transferring them to another account and then cashing out of the system.

isFlaggedFraud - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction.

In [5]:
# Column dtypes, non-null counts and memory usage of the sampled frame.
df_fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   step            100000 non-null  int64  
 1   type            100000 non-null  object 
 2   amount          100000 non-null  float64
 3   nameOrig        100000 non-null  object 
 4   oldbalanceOrg   100000 non-null  float64
 5   newbalanceOrig  100000 non-null  float64
 6   nameDest        100000 non-null  object 
 7   oldbalanceDest  100000 non-null  float64
 8   newbalanceDest  100000 non-null  float64
 9   isFraud         100000 non-null  int64  
 10  isFlaggedFraud  100000 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 8.4+ MB


In [6]:
# Summary statistics for every column; include='all' also reports
# unique/top/freq for the non-numeric columns.
df_fraud.describe(include='all')

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,100000.0,100000,100000.0,100000,100000.0,100000.0,100000,100000.0,100000.0,100000.0,100000.0
unique,,5,,99995,,,92996,,,,
top,,CASH_OUT,,C375450542,,,C1627055761,,,,
freq,,35046,,2,,,5,,,,
mean,243.54454,,179563.6,,840517.5,861877.2,,1097035.0,1218403.0,0.00146,0.0
std,142.952664,,632293.0,,2904906.0,2941013.0,,3277820.0,3575415.0,0.038182,0.0
min,1.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0
25%,155.0,,13436.74,,0.0,0.0,,0.0,0.0,0.0,0.0
50%,239.0,,75777.58,,14487.0,0.0,,135965.5,219460.2,0.0,0.0
75%,335.0,,208888.0,,107527.5,144534.2,,941324.7,1113352.0,0.0,0.0


In [7]:
# Missing values per column: all zeros means there is nothing to impute or drop.
df_fraud.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [None]:
# Distribution of the originator's balance before the transaction.
# Outliers are left untreated on the premise that they play no major role in this
# binary (0/1) classification task. NOTE(review): the premise is the author's — confirm.
sns.displot(df_fraud['oldbalanceOrg'])

In [None]:
# Histograms of the frame's columns on a 15x15-inch canvas.
df_fraud.hist(figsize=(15,15))
plt.show()

In [None]:
# Box plot of the originator's starting balance across all sampled transactions.
df_fraud.boxplot(column='oldbalanceOrg')
plt.show()

In [None]:
# Share of each transaction type.
# Wedge sizes and labels must come from the same Series: value_counts() is sorted
# by descending count, whereas unique() follows order of first appearance, so
# pairing the two attaches the wrong label to the wrong wedge.
type_counts = df_fraud["type"].value_counts()
plt.pie(type_counts, labels=type_counts.index, autopct='%.0f%%')
plt.show()

In [None]:
# Transaction amount grouped by transaction type, one box per type.
df_fraud.boxplot(column='amount',by='type')
plt.tight_layout()  # keep titles and tick labels from overlapping
plt.show()

In [None]:
# Transaction amount against the fraud label, one panel per class.
# Plot from df_fraud like every other exploration cell: `df` is rebound further
# down to the model feature matrix (which loses isFraud), so reading from it here
# makes this cell fail whenever it is re-run after that point.
g = sns.relplot(x="amount",y="isFraud",col='isFraud',data=df_fraud,kind="scatter")
plt.show()

In [None]:
# Distinct transaction types present in the sample.
df_fraud['type'].unique()

In [None]:
# Encode the transaction type as integer codes.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on a column selection: that chained form acts on a
# temporary object and leaves df_fraud unchanged under pandas Copy-on-Write.
# Using a dict also makes the cell safe to re-run (already-encoded values match
# no key and pass through untouched).
TYPE_CODES = {'PAYMENT': 0, 'TRANSFER': 1, 'CASH_OUT': 2, 'DEBIT': 3, 'CASH_IN': 4}
df_fraud['type'] = df_fraud['type'].replace(TYPE_CODES)
df_fraud.head()

In [None]:
# Relative frequency of each transaction type (fractions of all rows).
df_Fraudtype = df_fraud["type"].value_counts(normalize=True)
df_Fraudtype

# The same shares drawn as a bar chart.
df_Fraudtype.plot.bar(title="type", width=0.2)

In [None]:
# Compare the amount and the originator's starting balance between benign and
# fraudulent transactions.
# figsize is passed to boxplot itself: with `by=` pandas opens its own figure, so
# a preceding plt.figure(figsize=...) only produces an extra empty canvas and
# leaves the actual plot at the default size.
for column in ('amount', 'oldbalanceOrg'):
    df_fraud.boxplot(column=column, by='isFraud', figsize=(15, 15))
    plt.show()

In [None]:
# Number of originator accounts that appear in more than three transactions.
# value_counts() is indexed by account name, so it cannot serve as a row mask for
# the nameOrig column (the indexes do not align); count directly on the
# per-account tallies instead.
(df_fraud["nameOrig"].value_counts() > 3).sum()

In [None]:
# Pairwise Pearson correlations between the numeric columns.
# The account-name columns are strings; pandas >= 2.0 raises when corr() meets
# them, so restrict the frame to numeric dtypes explicitly.
corr = df_fraud.select_dtypes(include='number').corr()
corr

In [None]:
# Multicollinearity check: annotated heatmap of the pairwise correlations,
# centred on 0 with the colour scale anchored at 0.3.
fig, ax = plt.subplots(figsize=(13, 13))
sns.heatmap(
    corr,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
    ax=ax,
)

In [None]:
# The lowest effect is visible in newbalanceOrig, newbalanceDest, oldbalanceDest and type.

In [None]:
# Build the modelling frame: drop the two account-name columns together with the
# columns singled out after the correlation heatmap.
dropped_columns = [
    'nameOrig',
    'nameDest',
    'newbalanceOrig',
    'newbalanceDest',
    'oldbalanceDest',
    'type',
]
df = df_fraud.drop(columns=dropped_columns)
df.info()

In [None]:
from sklearn.preprocessing import LabelBinarizer

In [None]:
# Binarize the target. LabelBinarizer orders the classes, so label 0 (benign)
# becomes the negative class and label 1 (fraud) the positive one.
lb = LabelBinarizer()

# Read the label from df_fraud rather than from `df`: the cell then stays
# re-runnable after isFraud has been removed from the feature frame.
targets = pd.Series(
    lb.fit_transform(df_fraud['isFraud'].values).ravel(),
    index=df_fraud.index,
    name='isFraud',
)

# Non-mutating drop; errors='ignore' turns a second run into a no-op instead of a KeyError.
df = df.drop(columns=['isFraud'], errors='ignore')
df

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

# Stratified 70/30 split so both partitions keep the same fraud ratio;
# the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    targets,
    test_size=0.3,
    stratify=targets,
    random_state=100,
)

In [None]:
# Check that stratification kept the class proportions equal in both partitions.
for heading, labels in (
    ('y_train class distribution', y_train),
    ('y_test class distribution', y_test),
):
    print(heading)
    print(labels.value_counts(normalize=True))

In [None]:
# first classifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# An RBF SVM works on distances between rows, so features on very different
# scales (hourly step counts next to balances in the millions) must be
# standardized first; otherwise the kernel is dominated by the largest columns.
# The pipeline exposes the same fit/predict/score API as a bare SVC.
svm = make_pipeline(StandardScaler(), SVC(gamma='auto'))

In [None]:
# Train the SVM on the training split.
svm.fit(X_train, y_train)

In [None]:
# Mean accuracy on the held-out split.
# NOTE(review): only about 0.15% of the sampled rows are fraud, so accuracy sits
# close to the always-benign baseline — read it together with precision/recall.
svm.score(X_test, y_test)

In [None]:
#second classifier
from sklearn.ensemble import RandomForestClassifier

# Single-tree forest used as a minimal baseline.
# random_state pins the bootstrap and feature sampling so results are repeatable.
rf = RandomForestClassifier(n_estimators=1, random_state=100)

In [None]:
# Fit the single-tree baseline forest on the training split.
rf.fit(X_train, y_train)

In [None]:
# second classifier, second model: a 50-tree random forest.
# random_state pins the bootstrap and feature sampling so the reported score is repeatable.
rf2 = RandomForestClassifier(n_estimators=50, random_state=100)
rf2.fit(X_train, y_train)
rf2.score(X_test, y_test)

In [None]:
# Class predictions of the 50-tree forest for the held-out split.
preds_rf2 = rf2.predict(X_test)
preds_rf2

In [None]:
## Evaluation (for the second classifier's second model, rf2)

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix,classification_report,plot_confusion_matrix

In [None]:
# Per-class precision/recall/F1 and the raw confusion matrix for rf2's predictions.
print(classification_report(y_test,preds_rf2))
print(confusion_matrix(y_test,preds_rf2))

# Plot the matrix for rf2 — the model evaluated in this cell — rather than the
# single-tree `rf`, whose matrix would not match the numbers printed above.
# NOTE(review): plot_confusion_matrix is absent from recent scikit-learn releases;
# ConfusionMatrixDisplay is the successor when the environment is upgraded.
plot_confusion_matrix(rf2,X_test,y_test)

# Importance of each feature in the 50-tree forest.
rf2.feature_importances_

In [None]:
# Column names of the feature frame.
df.columns

In [None]:
# Label rf2's feature importances with their column names, least important first.
importance_table = pd.DataFrame(
    data=rf2.feature_importances_,
    index=df.columns,
    columns=['Feature Importance'],
)
importance_table.sort_values('Feature Importance')

In [None]:
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 5-fold cross-validated score of the SVM.
# Standardization lives inside the pipeline so it is re-fitted on each training
# fold only; an RBF SVM on the raw, differently-scaled columns is dominated by
# the largest-magnitude features.
scores=cross_val_score(make_pipeline(StandardScaler(), SVC(gamma='auto')), X_train,y_train,cv=5)
scores

In [None]:
# Mean of the cross-validation fold scores.
np.average(scores)

In [None]:
# 5-fold cross-validation of a 10-tree random forest.
forest_10 = RandomForestClassifier(n_estimators=10)
scores = cross_val_score(forest_10, X_train, y_train, cv=5)
scores

In [None]:
# Mean of the cross-validation fold scores.
np.average(scores)

In [None]:
# 5-fold cross-validation of a 40-tree random forest, reduced to its mean score.
forest_40 = RandomForestClassifier(n_estimators=40)
scores = cross_val_score(forest_40, X_train, y_train, cv=5)
np.average(scores)

##GRID SEARCH

In [None]:

# Hyper-parameter grid for the random-forest search.
# The feature frame keeps only 4 columns once the identifier, balance, type and
# target columns are dropped, so max_features must stay within 1..4: larger
# integers are rejected by older scikit-learn releases and are at best redundant
# duplicates of "use every feature", wasting most of the search budget.
param_grid = {
    'min_samples_split': [3, 5, 10],
    'n_estimators': [100, 300],
    'max_depth': [3, 5, 15, 25],
    'max_features': [1, 2, 3, 4],
}


In [None]:
# Metrics tracked for every grid-search candidate, keyed by the name that can be
# passed as the refit criterion.
scorers = dict(
    precision_score=make_scorer(precision_score),
    recall_score=make_scorer(recall_score),
    accuracy_score=make_scorer(accuracy_score),
)

In [None]:
def grid_search_wrapper(refit_score='precision_score'):
    """
    Run a 10-fold stratified grid search over ``param_grid`` with ``rf`` as the
    base estimator and refit the best candidate according to ``refit_score``.

    Prints the winning parameters and the confusion matrix obtained on the
    test split, then returns the fitted GridSearchCV object.

    refit_score: key of ``scorers`` that selects the metric to optimize.
    """
    folds = StratifiedKFold(n_splits=10)
    search = GridSearchCV(
        rf,
        param_grid,
        scoring=scorers,
        refit=refit_score,
        cv=folds,
        return_train_score=True,
        n_jobs=-1,
    )
    search.fit(X_train.values, y_train.values)

    # Predictions of the refitted best estimator on the held-out split.
    test_predictions = search.predict(X_test.values)

    print('Best params for {}'.format(refit_score))
    print(search.best_params_)

    # Confusion matrix on the test data, labelled for readability.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    labelled_matrix = pd.DataFrame(
        confusion_matrix(y_test, test_predictions),
        columns=['pred_neg', 'pred_pos'],
        index=['neg', 'pos'],
    )
    print(labelled_matrix)
    return search


# Search optimized for precision.
grid_search_clf = grid_search_wrapper(refit_score='precision_score')

y_train class distribution


NameError: name 'y_train' is not defined