# About the data

Feature explanations:
* `distance_from_home`, numeric - the distance from home at which the transaction happened.
* `distance_from_last_transaction`, numeric - the distance from where the last transaction happened.
* `ratio_to_median_purchase_price`, numeric - ratio of the transaction's purchase price to the median purchase price.
* `repeat_retailer`, binary - whether the transaction happened at the same retailer.
* `used_chip`, binary - whether the transaction was made through the chip (credit card).
* `used_pin_number`, binary - whether the transaction was made using a PIN number.
* `online_order`, binary - whether the transaction is an online order.
* `fraud`, binary - whether the transaction is fraudulent.

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time

# Load the card-transaction CSV and remember its row count (used later for ratios).
data_path = 'card_transdata.csv'
data = pd.read_csv(data_path)
n = len(data)

# Preview the first rows
data.head()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0


In [None]:
# Print a concise summary of the frame: columns, non-null counts, dtypes and memory usage.
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every numeric column.
data.describe()

In [None]:
# Data distribution: one count plot per binary feature, split by the fraud label
binary_features = ('repeat_retailer', 'used_chip', 'used_pin_number', 'online_order')
for col in binary_features:
    sns.countplot(x=col, hue='fraud', data=data)
    plt.show()

In [None]:
# Unbalanced data: share of all transactions that are labelled as fraud
fraud_count = len(data[data['fraud'] == 1])
print('Percentage of fraud in whole dataset:', 100 * fraud_count / n)

Notes from the bar plots:
* Transactions made using a PIN number do not appear to be subject to fraud.
* Most of the fraud occurred in online orders.

In [None]:
# Restrict to fraudulent transactions once, then measure how many of them
# were made without a PIN and how many were online orders.
fraud_rows = data[data['fraud'] == 1]
fraud_total = len(fraud_rows)
fraud_without_pin = len(fraud_rows[fraud_rows['used_pin_number'] == 0])
fraud_online = len(fraud_rows[fraud_rows['online_order'] == 1])

print('Percentage of fraud not using PIN:', 100 * fraud_without_pin / fraud_total)
print('Percentage of fraud in online orders:', 100 * fraud_online / fraud_total)

Since the target data are unbalanced, we will set as our baseline an unsophisticated model that marks every online transaction as fraudulent, as this makes much more sense than considering whether the transaction occurred using a PIN, even though the latter seems to have a higher impact on the target with these data.

In [None]:
# correlation heatmap
corr = data.corr()

# Build the mask from explicit boolean arrays. The previous expression OR-ed the
# float output of np.triu(corr) with a boolean frame, which relies on implicit
# float truthiness and is not a valid bitwise operation between float and bool.
# Hidden cells: the redundant upper triangle (diagonal included) ...
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool))
# ... and correlations whose absolute value is negligible (< 0.01).
negligible = (corr.abs() < 0.01).to_numpy()

sns.heatmap(data=corr, annot=True, mask=upper_triangle | negligible)
plt.show()

# Preprocessing and Hyperparameter Tuning

In [None]:
# Preprocessing and Splitting
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

# Every column except the target is a feature.
features_names = data.columns.tolist()
features_names.remove('fraud')

# Split BEFORE scaling so that no statistic of the test rows reaches the scaler.
# random_state makes the split reproducible; stratify keeps the (unbalanced)
# fraud ratio identical in the train and test partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    data[features_names], data['fraud'],
    test_size=0.2, random_state=5, stratify=data['fraud'])

# Learn the min/max of each feature from the training rows only.
MMsc = MinMaxScaler()
MMsc.fit(X_train_raw)

# Scaled copy of the whole dataset, kept under the original names (X_sc, y_sc).
# Test rows may fall slightly outside [0, 1] because the scaler never saw them.
X_sc = pd.DataFrame(MMsc.transform(data[features_names]),
                    columns=features_names, index=data.index)
y_sc = data['fraud'].copy()

# Select the scaled rows that belong to each partition (same order as y_train / y_test).
X_train = X_sc.loc[X_train_raw.index]
X_test = X_sc.loc[X_test_raw.index]

In [None]:
# Random Forest Classifier Tuning
# The grid search is expensive, so it is skipped by default instead of being
# kept as commented-out code. Set RUN_RFC_TUNING to True to re-run it.
RUN_RFC_TUNING = False

if RUN_RFC_TUNING:
    from sklearn.model_selection import GridSearchCV

    # Candidate hyperparameters; random_state is pinned so runs are comparable.
    n_estimators_range = range(3, 8)
    max_depth_range = range(3, 13, 3)
    param_grid = {'n_estimators': list(n_estimators_range),
                  'max_depth': list(max_depth_range),
                  'random_state': [5]}

    start_time = time.time()

    # NOTE(review): accuracy is kept as in the original search, but it is a weak
    # criterion on an unbalanced target - consider 'f1' or 'recall'.
    rfc_grid = GridSearchCV(estimator=RandomForestClassifier(),
                            param_grid=param_grid, scoring='accuracy')
    rfc_grid.fit(X_train, y_train)

    elapsed_time = time.time() - start_time

    print('RFC tuning time: ', elapsed_time)
    print('Best score obtained with RFC: ' + str(rfc_grid.best_score_))
    print('by setting', rfc_grid.best_params_)

    # 3D scatter of the mean cross-validated score over the searched grid.
    results = rfc_grid.cv_results_
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')

    # cv_results_ stores parameters as masked object arrays; cast them for plotting.
    x = np.asarray(results['param_max_depth'], dtype=float)
    y = np.asarray(results['param_n_estimators'], dtype=float)
    z = results['mean_test_score']
    ax.scatter(x, y, z)
    ax.set_xlabel('max_depth')
    ax.set_ylabel('n_estimators')
    ax.set_zlabel('mean_test_score')
    plt.show()

# Model Evaluation and Features Importances

In [None]:
# Fit the random forest, predict on the held-out test set and summarise the results
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

rfc = RandomForestClassifier(n_estimators=5, max_depth=12, random_state=5)
rfc.fit(X_train, y_train)
y_rfc = rfc.predict(X_test)

# Confusion matrix of true vs. predicted labels
cm = confusion_matrix(y_test, y_rfc)
cm_display = ConfusionMatrixDisplay(cm)
cm_display.plot(cmap='Blues', colorbar=False)
plt.title('RFC Confusion Matrix')
plt.show()

# Classification report as a table (one column per class / average)
report_dict = classification_report(y_test, y_rfc, output_dict=True)
report = pd.DataFrame(data=report_dict)
report

Now that we have built our model and ensured it has almost perfect predictive power, we focus on its interpretability. We will look at the feature importances of the RandomForestClassifier used for classification, and then look at how the base estimator in the forest combined the features in order to get to its results (we suggest downloading the decision path to take a closer look).

In [None]:
#Feature importances
# Forest-level importances and their spread across the individual trees;
# both arrays follow the column order of X_train.
importances = rfc.feature_importances_
std = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)

# Bars are drawn in descending order of importance.
rfc_importances = pd.Series(importances, index=X_train.columns).sort_values(ascending=False)
# Re-order the per-feature spread by label so each error bar lands on its own
# feature; passing the unsorted array would attach it to the wrong bar.
std_sorted = pd.Series(std, index=X_train.columns)[rfc_importances.index]

fig, ax = plt.subplots()
rfc_importances.plot.bar(yerr=std_sorted.to_numpy(), ax=ax)
ax.set_title("Feature importances using MDI") # MDI = mean decrease in impurity
ax.set_ylabel("Mean decrease in impurity")
plt.show()

print('Mean features importance')
for feature, importance in rfc_importances.items():
    print(feature+': '+str(importance))

In [None]:
from sklearn import tree
import graphviz

# Inspect a tree that the fitted forest actually uses. The previous code took
# rfc.base_estimator_ - the unfitted template, deprecated and then removed in
# recent scikit-learn releases - and refitted it, which produced an
# unrestricted tree that is not part of the forest.
base_tree = rfc.estimators_[0]

# class_names must be a list in ascending order of the class labels (0, 1);
# passing the string 'fraud' would be indexed character by character.
dot_data = tree.export_graphviz(base_tree, out_file=None,
                                feature_names=list(X_train.columns),
                                class_names=['not fraud', 'fraud'],
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)

# Save the labelled tree to disk (same file name as before) and display it inline.
graph.render("card fraud")
graph