# Credit Card Fraud Detection
## Capstone Project in Adavnce Data Scientist Course by IBM

# ETL Phase

In [1]:
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

import os
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
path=r'C:\Users\kobi_z\Documents\Data Science\Advance Data Science\Capstone Project'
df=pd.read_csv(os.path.join(path,'training.csv'))

In [3]:
df.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult'],
      dtype='object')

# Data Cleaning Phase
I found that no data cleaning is required, since all the data that was extracted from Kaggle was clean and ready for feature eng. phase

In [4]:
for col in df.columns:
    print('len df is',len(df),'unique values of {col}'.format(col=col), len(df[col].unique()))
    if df[col].isnull().unique()==False:
        print('no missing values')
    else:
        print('null values were found')

len df is 95662 unique values of TransactionId 95662
no missing values
len df is 95662 unique values of BatchId 94809
no missing values
len df is 95662 unique values of AccountId 3633
no missing values
len df is 95662 unique values of SubscriptionId 3627
no missing values
len df is 95662 unique values of CustomerId 3742
no missing values
len df is 95662 unique values of CurrencyCode 1
no missing values
len df is 95662 unique values of CountryCode 1
no missing values
len df is 95662 unique values of ProviderId 6
no missing values
len df is 95662 unique values of ProductId 23
no missing values
len df is 95662 unique values of ProductCategory 9
no missing values
len df is 95662 unique values of ChannelId 4
no missing values
len df is 95662 unique values of Amount 1676
no missing values
len df is 95662 unique values of Value 1517
no missing values
len df is 95662 unique values of TransactionStartTime 94556
no missing values
len df is 95662 unique values of PricingStrategy 4
no missing valu

# Feature Eng. Phase

### Getting day, month, year and hour from time stamp column

In [5]:
df['TransactionStartTime']=pd.to_datetime(df['TransactionStartTime'],errors='coerce')
df['trans day']=df['TransactionStartTime'].dt.day
df['trans month']=df['TransactionStartTime'].dt.month
df['trans year']=df['TransactionStartTime'].dt.year
df['trans hour']=df['TransactionStartTime'].dt.hour

In [6]:
df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,trans day,trans month,trans year,trans hour
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0,15,11,2018,2
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0,15,11,2018,2
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0,15,11,2018,2
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0,15,11,2018,3
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0,15,11,2018,3


### Choose the relevant columns that later on will be defined as features

In [7]:
df_features=df[['AccountId','SubscriptionId','CustomerId','ProviderId','ProductId','ProductCategory','ChannelId','PricingStrategy','trans day','trans month','trans year','trans hour','Amount','Value']]
df_features.head()

Unnamed: 0,AccountId,SubscriptionId,CustomerId,ProviderId,ProductId,ProductCategory,ChannelId,PricingStrategy,trans day,trans month,trans year,trans hour,Amount,Value
0,AccountId_3957,SubscriptionId_887,CustomerId_4406,ProviderId_6,ProductId_10,airtime,ChannelId_3,2,15,11,2018,2,1000.0,1000
1,AccountId_4841,SubscriptionId_3829,CustomerId_4406,ProviderId_4,ProductId_6,financial_services,ChannelId_2,2,15,11,2018,2,-20.0,20
2,AccountId_4229,SubscriptionId_222,CustomerId_4683,ProviderId_6,ProductId_1,airtime,ChannelId_3,2,15,11,2018,2,500.0,500
3,AccountId_648,SubscriptionId_2185,CustomerId_988,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,2,15,11,2018,3,20000.0,21800
4,AccountId_4841,SubscriptionId_3829,CustomerId_988,ProviderId_4,ProductId_6,financial_services,ChannelId_2,2,15,11,2018,3,-644.0,644


### Define columns that reuired for OneHotEncoded, MaxMinScaler and PCA for all data set

In [8]:
col_trans=ColumnTransformer(transformers=[('cat', OneHotEncoder(), ['AccountId','SubscriptionId','CustomerId','ProviderId','ProductId','ProductCategory','ChannelId','PricingStrategy','trans day','trans month','trans year','trans hour']),('scale',MinMaxScaler(),['Amount','Value'])])
pipeline_features=Pipeline(steps=[('col_t',col_trans)])
X_trans=pipeline_features.fit_transform(df_features).toarray()

In [9]:
pca = PCA(n_components=3)
X_trans_PCA=pca.fit_transform(X_trans)

### Choose the column with the true classes

In [16]:
df_class=df['FraudResult']
CLASS=df_class.to_numpy()

# Model Selection Phase

### Perdorming pipeline for gridsearch in order to determine what are the best hyperparameters setting

In [17]:
pipeline_model=Pipeline(steps=[('svm',SVC(gamma='auto'))])
grid_param={'svm__C':[5,10,15,20,25,30], 'svm__kernel': ['rbf','linear','poly','sigmoid'],'svm__degree' : [1,2,3,5,8],'svm__max_iter': [3,5,7,100,150]}

gd_sr = GridSearchCV(estimator=pipeline_model,
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)

In [18]:
gd_sr.fit(X_trans_PCA,CLASS)



GridSearchCV(cv=5, estimator=Pipeline(steps=[('svm', SVC(gamma='auto'))]),
             n_jobs=-1,
             param_grid={'svm__C': [5, 10, 15, 20, 25, 30],
                         'svm__degree': [1, 2, 3, 5, 8],
                         'svm__kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
                         'svm__max_iter': [3, 5, 7, 100, 150]},
             scoring='accuracy')

In [33]:
print(gd_sr.best_params_)
print('The best score found is: ',round(gd_sr.best_score_,3))

{'svm__C': 30, 'svm__degree': 1, 'svm__kernel': 'rbf', 'svm__max_iter': 5}
The best score found is:  0.995


# Model Evaluation Phase

In [21]:
model=SVC(C=30,degree=1,kernel='rbf',max_iter=5,gamma='auto')

In [23]:
model.fit(X_trans_PCA,CLASS)



SVC(C=30, degree=1, gamma='auto', max_iter=5)

In [26]:
model.score(X_trans_PCA,CLASS)

0.97409629738036

In [27]:
from sklearn.metrics import confusion_matrix

In [28]:
y_true=CLASS
y_pred=model.predict(X_trans_PCA)

In [30]:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
tn, fp, fn, tp

(93180, 2289, 189, 4)

1. True Negative - 93,180 - we can see the vast majority of the data is transaction with no any fraud detected, therefore     the model in basic will give good results.
2. Flase Positive - 2,289 cases the model raised a flag on fraud but by mistake. that's false alaram or type I error
3. False Negative - 189 fraud cases were missed... that's not good results at all...type II error
4. True Positive - 4 frauds were detected