<a href="https://colab.research.google.com/github/Janebhop/fraud-detection-course/blob/main/notebook/chapter3/Chapter3_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Import Data

In [3]:
dataPath = '/content/drive/My Drive/fraud/PS_20174392719_1491204439457_log.csv'

In [4]:
df_test = pd.read_csv(dataPath, nrows=10)

float_cols = [c for c in df_test if df_test[c].dtype == "float64"]
float32_cols = {c: np.float32 for c in float_cols}

In [5]:
float_cols

['amount',
 'oldbalanceOrg',
 'newbalanceOrig',
 'oldbalanceDest',
 'newbalanceDest']

In [6]:
data = pd.read_csv(dataPath, engine='c', dtype=float32_cols)
data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9.839640e+03,C1231006815,170136.000,160296.359375,M1979787155,0.000000e+00,0.000,0,0
1,1,PAYMENT,1.864280e+03,C1666544295,21249.000,19384.720703,M2044282225,0.000000e+00,0.000,0,0
2,1,TRANSFER,1.810000e+02,C1305486145,181.000,0.000000,C553264065,0.000000e+00,0.000,1,0
3,1,CASH_OUT,1.810000e+02,C840083671,181.000,0.000000,C38997010,2.118200e+04,0.000,1,0
4,1,PAYMENT,1.166814e+04,C2048537720,41554.000,29885.859375,M1230701703,0.000000e+00,0.000,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,3.396821e+05,C786484425,339682.125,0.000000,C776919290,0.000000e+00,339682.125,1,0
6362616,743,TRANSFER,6.311410e+06,C1529008245,6311409.500,0.000000,C1881841831,0.000000e+00,0.000,1,0
6362617,743,CASH_OUT,6.311410e+06,C1162922333,6311409.500,0.000000,C1365125890,6.848884e+04,6379898.000,1,0
6362618,743,TRANSFER,8.500025e+05,C1685995037,850002.500,0.000000,C2080388513,0.000000e+00,0.000,1,0


### Preprocessing

In [7]:
# cleaning data 
# checking null values

data.isnull().values.any()

False

In [8]:
# feature selection
data = data.drop(['nameOrig','nameDest'],axis = 1)

In [10]:
# feature engineering

dummy_type = pd.get_dummies(data['type'])
dummy_type

Unnamed: 0,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,0,1
3,0,1,0,0,0
4,0,0,0,1,0
...,...,...,...,...,...
6362615,0,1,0,0,0
6362616,0,0,0,0,1
6362617,0,1,0,0,0
6362618,0,0,0,0,1


In [11]:
data = (pd.concat([data,dummy_type], axis = 1).reset_index(drop = True)).drop(['type'],axis = 1)[['step','CASH_IN','CASH_OUT','DEBIT','PAYMENT','TRANSFER','amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest','isFlaggedFraud','isFraud']]
data

Unnamed: 0,step,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud,isFraud
0,1,0,0,0,1,0,9.839640e+03,170136.000,160296.359375,0.000000e+00,0.000,0,0
1,1,0,0,0,1,0,1.864280e+03,21249.000,19384.720703,0.000000e+00,0.000,0,0
2,1,0,0,0,0,1,1.810000e+02,181.000,0.000000,0.000000e+00,0.000,0,1
3,1,0,1,0,0,0,1.810000e+02,181.000,0.000000,2.118200e+04,0.000,0,1
4,1,0,0,0,1,0,1.166814e+04,41554.000,29885.859375,0.000000e+00,0.000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,0,1,0,0,0,3.396821e+05,339682.125,0.000000,0.000000e+00,339682.125,0,1
6362616,743,0,0,0,0,1,6.311410e+06,6311409.500,0.000000,0.000000e+00,0.000,0,1
6362617,743,0,1,0,0,0,6.311410e+06,6311409.500,0.000000,6.848884e+04,6379898.000,0,1
6362618,743,0,0,0,0,1,8.500025e+05,850002.500,0.000000,0.000000e+00,0.000,0,1


In [12]:
data['errorBalanceOrig'] = data['newbalanceOrig'] + data['amount'] - data['oldbalanceOrg']
data['errorBalanceDest'] = data['oldbalanceDest'] + data['amount'] - data['newbalanceDest']
data

Unnamed: 0,step,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud,isFraud,errorBalanceOrig,errorBalanceDest
0,1,0,0,0,1,0,9.839640e+03,170136.000,160296.359375,0.000000e+00,0.000,0,0,0.0,9.839640e+03
1,1,0,0,0,1,0,1.864280e+03,21249.000,19384.720703,0.000000e+00,0.000,0,0,0.0,1.864280e+03
2,1,0,0,0,0,1,1.810000e+02,181.000,0.000000,0.000000e+00,0.000,0,1,0.0,1.810000e+02
3,1,0,1,0,0,0,1.810000e+02,181.000,0.000000,2.118200e+04,0.000,0,1,0.0,2.136300e+04
4,1,0,0,0,1,0,1.166814e+04,41554.000,29885.859375,0.000000e+00,0.000,0,0,0.0,1.166814e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,0,1,0,0,0,3.396821e+05,339682.125,0.000000,0.000000e+00,339682.125,0,1,0.0,0.000000e+00
6362616,743,0,0,0,0,1,6.311410e+06,6311409.500,0.000000,0.000000e+00,0.000,0,1,0.0,6.311410e+06
6362617,743,0,1,0,0,0,6.311410e+06,6311409.500,0.000000,6.848884e+04,6379898.000,0,1,0.0,5.000000e-01
6362618,743,0,0,0,0,1,8.500025e+05,850002.500,0.000000,0.000000e+00,0.000,0,1,0.0,8.500025e+05


In [14]:
# splitting data

# fraud and no fraud transactions are separated by 'isFraud' column
dataF = data[data['isFraud'] == 1]
dataNF = data[data['isFraud'] == 0]

# split fraud transactions into 80 percent and 20 percent sessions
mskF = np.random.rand(len(dataF)) < 0.8
trainF = dataF[mskF]
testF = dataF[~mskF]

# split no fraud transactions into 80 percent and 20 percent sessions
mskNF = np.random.rand(len(dataNF)) < 0.8
trainNF = dataNF[mskNF]
testNF = dataNF[~mskNF]

# create training data and testing data
train_data = pd.concat([trainF,trainNF]).reset_index(drop = True)
test_data = pd.concat([testF,testNF]).reset_index(drop = True)

#create input data attributes by dropped irrevalent and target column 
train_feature = train_data.drop(['isFraud'],axis = 1)
test_feature = test_data.drop(['isFraud'],axis = 1)

train_label = train_data['isFraud']
test_label = test_data['isFraud']

In [15]:
# feature scaling

scaler = preprocessing.StandardScaler()
train_feature_scaled =scaler.fit_transform(train_feature)

test_feature_scaled = scaler.transform(test_feature)

In [16]:
train_feature_scaled[1]

array([-1.70317324e+00, -5.30919959e-01,  1.35774400e+00, -8.06908639e-02,
       -7.14899389e-01, -3.02298348e-01, -2.92792798e-01, -2.87697209e-01,
       -2.92408454e-01, -3.16049626e-01, -3.33367178e-01, -1.65875706e-03,
       -3.30986466e-01, -5.97883086e-02])

In [17]:
col = ['step', 'CASH_IN', 'CASH_OUT', 'DEBIT', 'PAYMENT', 'TRANSFER', 'amount',
       'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest',
       'isFlaggedFraud', 'errorBalanceOrig', 'errorBalanceDest']

In [18]:
data_feature_train = pd.DataFrame(data=train_feature_scaled, columns=col)
data_feature_train

Unnamed: 0,step,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud,errorBalanceOrig,errorBalanceDest
0,-1.703173,-0.53092,1.357744,-0.080691,-0.714899,-0.302298,-0.297135,-0.288607,-0.292408,-0.317527,-0.333367,-0.001659,-0.330986,-0.077041
1,-1.703173,-0.53092,1.357744,-0.080691,-0.714899,-0.302298,-0.292793,-0.287697,-0.292408,-0.316050,-0.333367,-0.001659,-0.330986,-0.059788
2,-1.703173,-0.53092,1.357744,-0.080691,-0.714899,-0.302298,0.390692,-0.288669,-0.292408,-0.323728,2.195693,-0.001659,0.353753,-20.154680
3,-1.703173,-0.53092,-0.736516,-0.080691,-0.714899,3.307990,1.815259,0.153800,-0.292408,-0.323758,-0.333367,-0.001659,-0.330986,2.757042
4,-1.703173,-0.53092,-0.736516,-0.080691,-0.714899,3.307990,-0.239434,-0.276522,-0.292408,-0.323758,-0.333367,-0.001659,-0.330986,-0.046123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5088189,3.334637,-0.53092,1.357744,-0.080691,-0.714899,-0.302298,0.227223,-0.288610,-0.292408,-0.222246,-0.153119,-0.001659,0.190809,-0.125251
5088190,3.334637,-0.53092,-0.736516,-0.080691,1.398798,-0.302298,-0.283152,-0.108939,-0.117837,-0.323758,-0.333367,-0.001659,-0.330986,-0.105766
5088191,3.334637,-0.53092,1.357744,-0.080691,-0.714899,-0.302298,-0.034114,-0.287332,-0.292408,-0.323758,-0.290038,-0.001659,-0.075314,-0.125251
5088192,3.334637,-0.53092,-0.736516,-0.080691,-0.714899,3.307990,-0.161635,-0.283995,-0.292408,-0.323758,-0.311022,-0.001659,-0.218063,-0.125251


In [19]:
train_label.to_frame()

Unnamed: 0,isFraud
0,1
1,1
2,1
3,1
4,1
...,...
5088189,0
5088190,0
5088191,0
5088192,0


In [20]:
data_feature_test = pd.DataFrame(data=test_feature_scaled, columns=col)
data_feature_test

Unnamed: 0,step,CASH_IN,CASH_OUT,DEBIT,PAYMENT,TRANSFER,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud,errorBalanceOrig,errorBalanceDest
0,-1.703173,-0.53092,-0.736516,-0.080691,-0.714899,3.307990,-0.297135,-0.288607,-0.292408,-0.323758,-0.333367,-0.001659,-0.330986,-0.124842
1,-1.703173,-0.53092,-0.736516,-0.080691,-0.714899,3.307990,-0.292793,-0.287697,-0.292408,-0.323758,-0.333367,-0.001659,-0.330986,-0.118919
2,-1.703173,-0.53092,-0.736516,-0.080691,-0.714899,3.307990,-0.264140,-0.281696,-0.292408,-0.323758,-0.333367,-0.001659,-0.330986,-0.079828
3,-1.703173,-0.53092,1.357744,-0.080691,-0.714899,-0.302298,-0.264140,-0.281696,-0.292408,-0.321914,-0.330061,-0.001659,-0.330986,-0.093092
4,-1.703173,-0.53092,1.357744,-0.080691,-0.714899,-0.302298,1.815259,0.153800,-0.292408,-0.323758,0.332126,-0.001659,-0.330986,-2.760570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1274421,3.334637,-0.53092,-0.736516,-0.080691,1.398798,-0.302298,-0.289774,-0.252315,-0.258085,-0.323758,-0.333367,-0.001659,-0.330986,-0.114800
1274422,3.334637,-0.53092,1.357744,-0.080691,-0.714899,-0.302298,-0.017402,-0.279344,-0.292408,0.466058,0.443426,-0.001659,-0.096641,-0.125251
1274423,3.334637,-0.53092,-0.736516,-0.080691,1.398798,-0.302298,-0.286501,-0.278576,-0.284700,-0.323758,-0.333367,-0.001659,-0.330986,-0.110335
1274424,3.334637,-0.53092,-0.736516,-0.080691,1.398798,-0.302298,-0.290637,-0.286757,-0.291925,-0.323758,-0.333367,-0.001659,-0.330986,-0.115977


In [21]:
test_label.to_frame()

Unnamed: 0,isFraud
0,1
1,1
2,1
3,1
4,1
...,...
1274421,0
1274422,0
1274423,0
1274424,0


## Logistic Regression

Logistic regression is used to describe data and to explain the relationship between one dependent binary variable and one or more nominal, ordinal, interval or ratio-level independent variables
    
    class sklearn.linear_model.LogisticRegression (penalty='l2', *, dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)
    
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [22]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0)
clf.fit(train_feature_scaled,train_label)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
# predict train data
lr_predY_train = clf.predict(train_feature_scaled)

In [24]:
# predict test data
lr_predY = clf.predict(test_feature_scaled)
lr_predY[0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])

## XGBoost 

XGBoost stands for Extreme Gradient Boosting; it is a specific implementation of the Gradient Boosting method which uses more accurate approximations to find the best tree model. It employs a number of nifty tricks that make it exceptionally successful, particularly with structured data.

XGBoost's hyperparameters

    -learning_rate: step size shrinkage used to prevent overfitting. Range is [0,1]
    -max_depth: determines how deeply each tree is allowed to grow during any boosting round.
    -subsample: percentage of samples used per tree. Low value can lead to underfitting.
    -colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
    -n_estimators: number of trees you want to build.
    -objective: determines the loss function to be used like reg:linear for regression problems, reg:logistic for classification problems with only decision, binary:logistic for classification problems with probability.

XGBoost also supports regularization parameters to penalize models as they become more complex and reduce them to simple (parsimonious) models.

    -gamma: controls whether a given node will split based on the expected reduction in loss after the split. A higher value leads to fewer splits. Supported only for tree-based learners.
    -alpha: L1 regularization on leaf weights. A large value leads to more regularization.
    -lambda: L2 regularization on leaf weights and is smoother than L1 regularization.

In [None]:
from xgboost.sklearn import XGBClassifier

In [None]:
%%time
from xgboost.sklearn import XGBClassifier
xgbc = XGBClassifier(objective='binary:logistic',max_depth=3)


xgbc.fit(train_feature_scaled, train_label)

In [None]:
 # predict train data
xgb_predY_train = xgbc.predict(train_feature_scaled)

In [None]:
# predict test data
xgb_predY = xgbc.predict(test_feature_scaled)
xgb_predY[0:10]

## Model Evaluation

### Confusion Matrix

confusion matrix, also known as an error matrix, is a specific table layout that allows visualization of the performance of an algorithm, typically a supervised learning one (in unsupervised learning it is usually called a matching matrix). Each row of the matrix represents the instances in a predicted class while each column represents the instances in an actual class (or vice versa)

![image.png](attachment:image.png)


https://en.wikipedia.org/wiki/Confusion_matrix

--------------------------------------------
Accuracy: Overall, how often is the classifier correct?
    
    Accuracy = (TP+TN)/total = (TP+TN)/(TP+TN+FN+FP)

Precision: When it predicts yes, how often is it correct?

    Precision = TP/predicted yes = TP/(TP+FP)
    
Recall: When it's actually yes, how often does it predict yes? (True Positive Rate)

    Recall = TP/actual yes = TP/(TP+FN)
    
F1 Score: This is a weighted average of the recall and precision

    F1 Score = (2*(Precision*Recall))/(Precision+Recall)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score

def evaluation_metrics(types,label_train,predict_train,label_test,predict_test):
  print("Accuracy score in",types, ": Train =" ,accuracy_score(label_train,predict_train), " , Test = ",accuracy_score(label_test,predict_test))
  print("Precision score in ",types, ": Train =" ,precision_score(label_train,predict_train), " , Test = ",precision_score(label_test,predict_test))
  print("Recall score in ",types, ": Train =",recall_score(label_train,predict_train), " , Test = ",recall_score(label_test,predict_test))
  print("F1 score score in ",types, ": Train =" ,f1_score(label_train,predict_train), " , Test = ",f1_score(label_test,predict_test))

In [None]:
evaluation_metrics("Logistic Regression",train_label,lr_predY_train,test_label,lr_predY)

In [None]:
cm_lr = confusion_matrix(lr_predY,test_label,labels=[1,0])
cm_lr

In [None]:
evaluation_metrics("XGBoost",train_label,xgb_predY_train,test_label,xgb_predY)

In [None]:
cm_xgb = confusion_matrix(xgb_predY,test_label,labels=[1, 0])
cm_xgb

## Random OverSampling 

Random oversampling involves randomly duplicating examples from the minority class and adding them to the training dataset.

This technique can be effective for those machine learning algorithms that are affected by a skewed distribution and where multiple duplicate examples for a given class can influence the fit of the model

In [None]:
from imblearn.over_sampling import RandomOverSampler 
ros = RandomOverSampler()
train_feature_resampled, train_label_resampled = ros.fit_sample(train_feature, train_label)

In [None]:
scaler = preprocessing.StandardScaler()
train_feature_scaled =scaler.fit_transform(train_feature_resampled)
test_feature_scaled = scaler.transform(test_feature)

In [None]:
clf_ros = LogisticRegression(random_state=0)
clf_ros.fit(train_feature_scaled,train_label_resampled)

In [None]:
lr_predY_ros_train = clf_ros.predict(train_feature_scaled)

In [None]:
lr_predY_ros = clf_ros.predict(test_feature_scaled)


In [None]:
evaluation_metrics("Logistic Regession after RandomOverSampling",train_label_resampled,lr_predY_ros_train,test_label,lr_predY_ros)

In [None]:
cm_lr_ros = confusion_matrix(lr_predY_ros,test_label,labels=[1, 0])
cm_lr_ros

## Random Undersampling

Random undersampling involves randomly selecting examples from the majority class to delete from the training dataset.

This has the effect of reducing the number of examples in the majority class in the transformed version of the training dataset. This process can be repeated until the desired class distribution is achieved, such as an equal number of examples for each class

A limitation of undersampling is that examples from the majority class are deleted that may be useful, important, or perhaps critical to fitting a robust decision boundary. Given that examples are deleted randomly, there is no way to detect or preserve “good” or more information-rich examples from the majority class.

https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/

In [None]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler()
train_feature_resampled, train_label_resampled = rus.fit_sample(train_feature,train_label)

In [None]:
scaler = preprocessing.StandardScaler()
train_feature_scaled =scaler.fit_transform(train_feature_resampled)
test_feature_scaled = scaler.transform(test_feature)

In [None]:
clf_rus = LogisticRegression(random_state=0)
clf_rus.fit(train_feature_scaled,train_label_resampled)

In [None]:
lr_predY_rus_train = clf_rus.predict(train_feature_scaled)

In [None]:
lr_predY_rus = clf_rus.predict(test_feature_scaled)

In [None]:
evaluation_metrics("Logistic Regression after RandomUnderSampling",train_label_resampled,lr_predY_rus_train,test_label,lr_predY_rus)

In [None]:
cm_lr_rus = confusion_matrix(lr_predY_rus,test_label,labels=[1, 0])
cm_lr_rus