## Import packages

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [2]:
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [3]:
from xgboost import XGBClassifier

## Loading the Data

In [4]:
# Raise the column display limit first so the preview below is not truncated.
pd.set_option('display.max_columns', 500)

# Read the feature CSV (path is relative to the notebook's working directory).
df = pd.read_csv('fraud_data_features.csv')
df.head()

FileNotFoundError: [Errno 2] File fraud_data_features.csv does not exist: 'fraud_data_features.csv'

In [None]:
df.info()

## Exploratory Data Analysis

In [None]:
# Bar plot of is_fraud for each transaction category.
fig, ax = plt.subplots(figsize=(19, 8))
sns.barplot(data=df, x='category', y='is_fraud', ax=ax)

In [None]:
# Bar plot of is_fraud for each PartofDay value.
fig, ax = plt.subplots(figsize=(19, 8))
sns.barplot(data=df, x='PartofDay', y='is_fraud', ax=ax)

In [None]:
# Horizontal bar plot of is_fraud per city (cities on the y axis, hence the tall figure).
fig, ax = plt.subplots(figsize=(19, 19))
sns.barplot(data=df, x='is_fraud', y='city', ax=ax)

## Feature engineering and selection

In [None]:
# Columns left out of the modelling frame.
df2 = df.drop(
    columns=[
        'ssn', 'cc_num', 'first', 'last', 'zip', 'street',
        'state', 'city_pop', 'job', 'dob', 'acct_num',
    ]
)

In [None]:
# One-hot encoding of the categorical columns.
enc = OneHotEncoder(handle_unknown='ignore')


def _one_hot_frame(column):
    """One-hot encode a single column of df2.

    Refits the shared encoder `enc` on df2[[column]] and returns a dense
    DataFrame whose columns are named '<column>_<value>' and whose index is
    df2's index, so the result lines up row-for-row in a pd.concat(axis=1).
    """
    dense = enc.fit_transform(df2[[column]]).toarray()
    # get_feature_names() was removed from scikit-learn's encoders;
    # get_feature_names_out() is the supported replacement.
    return pd.DataFrame(dense, columns=enc.get_feature_names_out([column]), index=df2.index)


onehotcategory = _one_hot_frame('category')
onehotcity = _one_hot_frame('city')
onehotlatenight = _one_hot_frame('PartofDay')
onehotday = _one_hot_frame('DayName')

# Encoding of 'merchant' is currently disabled.
#onehotmerchant = _one_hot_frame('merchant')

In [None]:
# Numeric inputs plus the is_fraud target.
# An earlier variant of this selection (now disabled) also included 'ra_LocDist_Fr'.
df_numeric = df2.loc[:, [
    'lat', 'long', 'unix_time', 'amt', 'merch_lat', 'merch_long',
    'LocDist', 'ra_Tx', 'ra_LocDist',
    'ra_TxAmt_Wknd', 'ra_TxAmt_Night', 'ra_TxCount_Night', 'ra_TxCount_Daily',
    'is_fraud',
]]

In [None]:
# Assemble the modelling frame column-wise from the numeric columns and the
# category / city / day one-hot frames.
# Note: onehotlatenight (the PartofDay encoding) is not part of this frame.
xgdf = pd.concat([df_numeric, onehotcategory, onehotcity, onehotday], axis=1)

In [None]:
xgdf.shape

In [None]:
# fillna returns a new frame, so the result has to be assigned back;
# the bare call only displayed a filled copy and left the NaNs in xgdf.
xgdf = xgdf.fillna(0)
xgdf.head()

In [None]:
# Work on a 20% random subsample. The fixed seed keeps the sample, and every
# result derived from it, identical across kernel restarts.
xgdf_sample = xgdf.sample(frac=0.2, random_state=0)

## Train-Test Split

In [None]:
# Separate features from the target on the sampled frame itself
# (the previous mask was built from xgdf.columns, a different frame).
X = xgdf_sample.drop(columns='is_fraud')
y = xgdf_sample.is_fraud

# Stratify on the target so the train and test partitions keep the same
# fraud / non-fraud ratio; an unstratified split can skew a rare class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0, stratify=y
)

In [None]:
X_test

In [None]:
# The bare `xgboost` module import is needed by the plot_importance cells further down.
import xgboost
from xgboost import XGBClassifier 

# Wrap the train / test partitions, with their labels, as DMatrix objects.
# NOTE(review): no visible cell uses d_train or d_test; the classifier below
# is fit on X_train / y_train directly. Confirm whether these are still needed.
d_train = xgboost.DMatrix(X_train, label=y_train)
d_test = xgboost.DMatrix(X_test, label=y_test)

In [None]:
# Hyperparameter dictionary.
# NOTE(review): `params` is never passed to the classifier below, nor used in
# any other visible cell, so none of these settings take effect. Confirm
# whether they were meant to be applied.
params = {
    "eta": 0.01,
    "objective": "binary:logistic",
    "subsample": 0.5,
    "base_score": 0.5,
    "eval_metric": "error"
}
# Classifier created with only the objective set explicitly.
model = XGBClassifier(objective = "binary:logistic")

In [None]:
train_model = model.fit(X_train, y_train)

In [None]:
print(model)

## Classification Report

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score

# Predict on the held-out partition and report overall accuracy.
y_pred_rf = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_rf)
print('Accuracy score: ', test_accuracy)

# Confusion matrix of true vs. predicted labels, rendered as a plot.
cm = confusion_matrix(y_test, y_pred_rf)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_rf))

In [None]:
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [None]:
pd.DataFrame(xgdf.iloc[291388,:]).T

In [None]:
y_test

In [None]:
y_test[y_test == y_pred]

In [None]:
y_test.value_counts()

In [None]:
# Keep the test labels the model got wrong. The previous filter (y_test == 1)
# selected every actual fraud row, whether or not it was predicted correctly,
# which does not match the variable's name.
y_test_incorrect = y_test[y_test != y_pred]

In [None]:
y_test_incorrect.head()
pd.DataFrame(y_test_incorrect)

In [None]:
import matplotlib.pylab as pl

# rcParams only apply to figures created afterwards, so the size has to be
# set before plot_importance creates its figure (it was set after, too late).
pl.rcParams["figure.figsize"] = (30,10)
xgboost.plot_importance(model)
pl.title("xgboost.plot_importance(model)")
pl.show()

In [None]:
# Set the figure size before plotting so this cell does not rely on an
# earlier cell having already changed rcParams.
pl.rcParams["figure.figsize"] = (30,10)
xgboost.plot_importance(model, importance_type="cover")
pl.title('xgboost.plot_importance(model, importance_type="cover")')
pl.show()

In [None]:
xgboost.plot_importance(model, importance_type="gain")
pl.title('xgboost.plot_importance(model, importance_type="gain")')
pl.show()

In [None]:
import shap

# Build a tree explainer for the fitted model and compute SHAP values for
# every row of X (the sampled feature frame).
# NOTE(review): the earlier remark about "over 30 thousand samples" and "over
# a thousand trees" is not backed by anything in the visible cells (the
# classifier is created with only `objective` set), so actual runtime may differ.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

In [None]:
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:])

In [None]:
i = 168785

In [None]:
shap.initjs()
# Pair row i's SHAP values with row i's features; the previous call mixed
# row 0's attributions with row i's feature values.
# i is positional within X (the sampled frame), so it must be < len(X).
shap.force_plot(explainer.expected_value, shap_values[i,:], X.iloc[i,:])

In [None]:
shap.initjs()
# shap_values was computed on X, so show the matching rows of X. The raw
# frame `df` has different columns and is not the sampled row set.
shap.force_plot(explainer.expected_value, shap_values[:1000,:], X.iloc[:1000,:])

In [None]:
shap.summary_plot(shap_values, X, plot_type="bar")

In [None]:
shap.summary_plot(shap_values, X)

## Save to joblib

In [None]:
import joblib
joblib.dump(model, 'model.joblib')

In [None]:
xgdf.columns.tolist()

In [None]:
# Take the feature names straight from the training frame so the list cannot
# drift from the columns the model was actually fit on. The previous
# hand-copied list had to be kept in sync with the data manually.
feature_names = [str(column) for column in X_train.columns]

In [None]:
model

In [None]:
from lime.lime_tabular import LimeTabularExplainer
# LIME tabular explainer built from the training feature matrix.
# NOTE(review): this rebinds `explainer`, which until here held the SHAP
# TreeExplainer, and the following cell rebinds it again with different
# arguments (no class_names / discretize_continuous). Confirm which
# configuration is intended.
explainer = LimeTabularExplainer(X_train.to_numpy(), feature_names=feature_names, class_names=['Not Fraud', 'Fraud'], discretize_continuous=False)

In [None]:
from lime.lime_tabular import LimeTabularExplainer
# training_labels has to line up row-for-row with the training data:
# xgdf['is_fraud'] covers the whole un-sampled frame, y_train matches X_train.
explainer = LimeTabularExplainer(X_train.to_numpy(), feature_names = feature_names, 
                                 mode = 'classification', training_labels=y_train)

In [None]:
model = joblib.load('model.joblib')

In [None]:
model

In [None]:
i = 534

In [None]:
shap.initjs()
# `explainer` was rebound to the LIME explainer in the cells above, and that
# object has no expected_value, so build a SHAP tree explainer for the base value.
shap_explainer = shap.TreeExplainer(model)
# Pair row i's SHAP values with row i's features (previously row 0's values were used).
shap.force_plot(shap_explainer.expected_value, shap_values[i,:], X.iloc[i,:])

In [None]:
def prob(xgdf):
    """Prediction function handed to LIME's explain_instance.

    Parameters
    ----------
    xgdf : array-like
        Feature rows, expected to be 2-D with columns in the same order as
        `feature_names`.

    Returns
    -------
    numpy.ndarray
        One row per input row holding the class probabilities returned by
        model.predict_proba.
    """
    # Re-attach the column names before predicting: the model was fit on a
    # DataFrame, while the explainer is expected to pass plain arrays.
    rows = pd.DataFrame(xgdf, columns=feature_names)
    # The previous body referenced an undefined name (`data`) and assembled
    # the two columns from hard 0/1 predictions instead of probabilities.
    return np.asarray(model.predict_proba(rows))

In [None]:
# Explain the row labelled i using its real feature values. The previous
# astype(int) truncated continuous features (e.g. lat / long / amt) before
# they reached the model.
exp = explainer.explain_instance(xgdf.loc[i,feature_names].astype(float).values, prob, num_features=5)

In [None]:
#exp = explainer.explain_instance(X_test.to_numpy()[i], model.predict_proba, num_features=10, top_labels=1)

In [None]:
exp.show_in_notebook(show_table=True, show_all=True)