# Credit Card Fraud Detection Using Python & Machine Learning

In [10]:
#importing related to general operating system & warnings
import os 
import warnings
warnings.filterwarnings('ignore')
# Imports for data loading, manipulation, exploratory data analysis and data understanding
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from termcolor import colored as cl # text customization
# Imports for data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Global plot defaults: font size, grid lines, figure size and grayscale colormap.
# Everything is set through rcParams so that it applies to every later figure.
# Calling plt.figure(figsize=...) / plt.gray() here would instead open an empty
# figure in this cell (rendered as "<Figure ... with 0 Axes>") and leave the
# size of later figures unchanged.
plt.rcParams['font.size'] = 14
plt.rcParams['axes.grid'] = True
plt.rcParams['figure.figsize'] = (6, 3)
plt.rcParams['image.cmap'] = 'gray'
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.preprocessing import  PolynomialFeatures, KBinsDiscretizer, FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, OrdinalEncoder
import statsmodels.formula.api as smf
import statsmodels.tsa as tsa
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz, export_text
from sklearn.ensemble import BaggingClassifier, BaggingRegressor,RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor 
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

ModuleNotFoundError: No module named 'statsmodels'

<Figure size 432x216 with 0 Axes>

In [None]:
# Load the credit-card transactions into a DataFrame.
# The path points at a zipped CSV, which pandas decompresses based on the
# file extension.
data = pd.read_csv(
    filepath_or_buffer="creditcard.csv.zip",
)

In [None]:
# Class balance of the dataset: Class == 0 is counted as a normal transaction,
# Class == 1 as a fraudulent one.
Total_transactions = len(data)
normal = len(data[data.Class == 0])
fraudulent = len(data[data.Class == 1])
# Percentage of fraud among ALL transactions: the denominator is the total
# number of rows, not the number of normal rows. An empty frame yields 0.0
# instead of a ZeroDivisionError.
fraud_percentage = (
    round(fraudulent / Total_transactions * 100, 2) if Total_transactions else 0.0
)
print(cl('Total number of Trnsactions are {}'.format(Total_transactions), attrs = ['bold']))
print(cl('Number of Normal Transactions are {}'.format(normal), attrs = ['bold']))
print(cl('Number of fraudulent Transactions are {}'.format(fraudulent), attrs = ['bold']))
print(cl('Percentage of fraud Transactions is {}'.format(fraud_percentage), attrs = ['bold']))

In [None]:
# Data processing & understanding: print the summary produced by
# DataFrame.info() so the column names, dtypes and non-null counts can be
# checked before any modelling.
data.info()

In [None]:
# Standardise the 'Amount' column (zero mean, unit variance) and drop 'Time'.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed further down, so test-set statistics influence the scaling.
# Consider fitting it on the training rows only.
sc = StandardScaler()
amount = data['Amount'].values
# StandardScaler expects a 2-D input, hence the reshape to a single column.
data['Amount'] = sc.fit_transform(amount.reshape(-1, 1))
# errors='ignore' keeps the cell re-runnable: once 'Time' is gone, a second
# execution is a no-op instead of raising KeyError.
data = data.drop(columns=['Time'], errors='ignore')

In [None]:
# De-duplicate the dataset: rows that repeat an earlier row in every column
# are removed, and the first occurrence is kept.
data = data.drop_duplicates()

In [None]:
# Define the model inputs and target, then split them into train and test sets.
# X holds the independent variables (every column except 'Class');
# y is the dependent variable (the 'Class' label to be predicted).
y = data['Class'].values
X = data.drop(columns='Class').values
# 25% of the rows are held out for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

# Model Building

# Decision Tree 

In [None]:
# Fit a depth-4 decision tree (entropy criterion) on the training split and
# predict the test split.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features when searching for splits, so ties can otherwise be broken
# differently from run to run.
DT = DecisionTreeClassifier(max_depth = 4, criterion = 'entropy', random_state = 1)
DT.fit(X_train, y_train)
tree_yhat = DT.predict(X_test)

In [None]:
# Accuracy of the decision tree on the held-out test split.
tree_accuracy = accuracy_score(y_test, tree_yhat)
print('Accuracy score of the Decision Tree model is {}'.format(tree_accuracy))

In [None]:
# F1 score of the decision tree on the held-out test split.
tree_f1 = f1_score(y_test, tree_yhat)
print('F1 score of the Decision Tree model is {}'.format(tree_f1))

In [None]:
# Confusion matrix of the decision tree, with classes ordered as [0, 1].
# Rows are the actual classes and columns are the predicted classes.
tree_confusion = confusion_matrix(y_test, tree_yhat, labels=[0, 1])
tree_confusion

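In the matrix returned by scikit-learn, the rows are the actual classes and the columns are the predicted classes, in the order given by `labels` (0 = normal, 1 = fraudulent). 
In the run described here, 68782 transactions were correctly predicted as normal and 18 were wrongly predicted as normal. 
That says, out of the 68782+18=68800 transactions predicted as normal, 68782 really were normal and 18 were actually fraudulent — missed frauds, i.e. false negatives when fraud is treated as the positive class. 
Let’s now try different models and check their performance.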

# K-Nearest Neighbors

In [None]:
# K-nearest-neighbours classifier using the n closest training points.
n = 7
# fit() returns the estimator itself, so construction and training are chained.
KNN = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
knn_yhat = KNN.predict(X_test)

In [None]:
# Accuracy of the K-Nearest Neighbors model on the held-out test split.
knn_accuracy = accuracy_score(y_test, knn_yhat)
print('Accuracy score of the K-Nearest Neighbors model is {}'.format(knn_accuracy))

In [None]:
# F1 score of the K-Nearest Neighbors model on the held-out test split.
# The print statement must start at column 0: a leading space on a top-level
# statement raises "IndentationError: unexpected indent".
print('F1 score of the K-Nearest Neighbors model is {}'.format(f1_score(y_test, knn_yhat)))

# Logistic Regression 

In [None]:
# Logistic regression with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and training are chained.
lr = LogisticRegression().fit(X_train, y_train)
lr_yhat = lr.predict(X_test)

In [None]:
# Accuracy of the Logistic Regression model on the held-out test split.
lr_accuracy = accuracy_score(y_test, lr_yhat)
print('Accuracy score of the Logistic Regression model is {}'.format(lr_accuracy))

In [None]:
# Checking the F1 score of the Logistic Regression model.
lr_f1 = f1_score(y_test, lr_yhat)
print('F1 score of the Logistic Regression model is {}'.format(lr_f1))

# Support Vector Machines 

In [None]:
# Support vector classifier with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and training are chained.
svm = SVC().fit(X_train, y_train)
svm_yhat = svm.predict(X_test)

In [None]:
# Accuracy of the Support Vector Machines (SVM) model on the held-out test split.
svm_accuracy = accuracy_score(y_test, svm_yhat)
print('Accuracy score of the Support Vector Machines model is {}'.format(svm_accuracy))

In [None]:
# F1 score of the Support Vector Machines model on the held-out test split.
svm_f1 = f1_score(y_test, svm_yhat)
print('F1 score of the Support Vector Machines model is {}'.format(svm_f1))

# Random Forest

In [None]:
# Random forest of depth-4 trees, trained on the training split and used to
# predict the test split.
# random_state is fixed (same seed as the train/test split) because the forest
# bootstraps rows and samples features at random; without a seed the reported
# scores change on every run.
rf = RandomForestClassifier(max_depth = 4, random_state = 1)
rf.fit(X_train, y_train)
rf_yhat = rf.predict(X_test)

In [None]:
# Accuracy of the Random Forest model on the held-out test split.
rf_accuracy = accuracy_score(y_test, rf_yhat)
print('Accuracy score of the Random Forest model is {}'.format(rf_accuracy))

In [None]:
# F1 score of the Random Forest model on the held-out test split.
rf_f1 = f1_score(y_test, rf_yhat)
print('F1 score of the Random Forest model is {}'.format(rf_f1))

# Xgboost 

In [None]:
# Gradient-boosted trees (XGBoost) with a maximum tree depth of 4.
# fit() returns the estimator itself, so construction and training are chained.
xgb = XGBClassifier(max_depth=4).fit(X_train, y_train)
xgb_yhat = xgb.predict(X_test)

In [None]:
# Accuracy of the XGBoost model on the held-out test split.
xgb_accuracy = accuracy_score(y_test, xgb_yhat)
print('Accuracy score of the XGBoost model is {}'.format(xgb_accuracy))

In [None]:
# F1 score of the XGBoost model on the held-out test split.
xgb_f1 = f1_score(y_test, xgb_yhat)
print('F1 score of the XGBoost model is {}'.format(xgb_f1))

# CONCLUSION 

We achieved 99.95% accuracy in our credit card fraud detection system. 
This number should not be surprising, because our data is heavily imbalanced towards one class: normal transactions vastly outnumber fraudulent ones, so even a model that labels every transaction as normal would reach a very high accuracy. For that reason the F1 score reported for each model is the more informative metric. 
The good thing we noticed from the confusion matrix is that our model does not appear to be overfitted. 
Finally, based on our accuracy score, XGBoost is the best-performing model for our case. 
The only catch here is the data that we received for model training. 
The features are PCA-transformed versions of the original variables, not the original variables themselves. 
If the actual features follow a similar pattern, then we are on the right track.
