In [1]:
import numpy as np
import pandas as pd

In [5]:
# Execute the shared helper scripts inside this kernel so the names they define
# become available to the cells below.
# NOTE(review): read_from_files, get_train_test_set, scaleData, seed_everything and the
# `datetime` / `torch` modules used further down are neither defined nor imported in the
# visible cells -- presumably they all come from these two scripts; confirm.
%run ../shared_functions.py
%run ../my_shared_functions.py

# Input directory (relative path into the handbook's simulated-data-transformed folder)
# and the last date handed to the loader.
DIR_INPUT = '../../fraud-detection-handbook/simulated-data-transformed/data/'
END_DATE = "2018-09-14"

print("Load  files")
# read_from_files is called with DIR_INPUT, the literal date "2018-06-11" and END_DATE
# (presumably the begin/end of the loading window -- confirm against the helper).
# %time reports how long the call takes.
%time transactions_df=read_from_files(DIR_INPUT, "2018-06-11", END_DATE)
# Report the number of loaded rows and the sum of the TX_FRAUD column.
print("{0} transactions loaded, containing {1} fraudulent transactions".format(len(transactions_df),
                                                                    transactions_df.TX_FRAUD.sum()))

# Name of the label column and the list of feature columns; the order of input_features
# is what later defines INPUT_FEATURE_INDEX.
output_feature="TX_FRAUD"
input_features=['TX_AMOUNT','TX_DURING_WEEKEND', 'TX_DURING_NIGHT', 'CUSTOMER_ID_NB_TX_1DAY_WINDOW',
       'CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW', 'CUSTOMER_ID_NB_TX_7DAY_WINDOW',
       'CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW', 'CUSTOMER_ID_NB_TX_30DAY_WINDOW',
       'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW', 'TERMINAL_ID_NB_TX_1DAY_WINDOW',
       'TERMINAL_ID_RISK_1DAY_WINDOW', 'TERMINAL_ID_NB_TX_7DAY_WINDOW',
       'TERMINAL_ID_RISK_7DAY_WINDOW', 'TERMINAL_ID_NB_TX_30DAY_WINDOW',
       'TERMINAL_ID_RISK_30DAY_WINDOW']

# Split configuration passed to get_train_test_set: the parsed start date plus the
# train / delay / test deltas.
# NOTE(review): the deltas look like period lengths in days -- confirm against the helper.
BEGIN_DATE = "2018-08-08"
start_date_training = datetime.datetime.strptime(BEGIN_DATE, "%Y-%m-%d")
delta_train=7
delta_delay=7
delta_test=7
# delta_valid mirrors delta_test; it is not referenced anywhere in the visible cells.
delta_valid = delta_test

# The second element returned by get_train_test_set is bound to valid_df here.
(train_df, valid_df)=get_train_test_set(transactions_df,start_date_training,
                            delta_train=delta_train,delta_delay=delta_delay,delta_test=delta_test)

# NOTE(review): SEQ_LEN is not used in the visible cells -- possibly consumed later on.
SEQ_LEN = 5

# By default, scales input data
# train_df / valid_df are rebound to whatever scaleData returns, so the unscaled split is
# no longer reachable afterwards and every cell below works on the scaled frames.
(train_df, valid_df)=scaleData(train_df, valid_df,input_features)

# Select "cuda" when torch reports that CUDA is available, otherwise fall back to "cpu".
if torch.cuda.is_available():
    DEVICE = "cuda" 
else:
    DEVICE = "cpu"
print("Selected device is",DEVICE)

# NOTE(review): seed_everything presumably seeds the random number generators; it runs
# after loading, splitting and scaling, so it can only affect the steps that follow.
SEED = 42
seed_everything(SEED)

Load  files
CPU times: total: 484 ms
Wall time: 4.79 s
919767 transactions loaded, containing 8195 fraudulent transactions
Selected device is cuda


In [6]:
# Stack the train and validation splits row-wise into one frame; the correlation
# cells below are computed on this combined frame.
full_df = pd.concat(objs=[train_df, valid_df], axis=0)

Pearson correlation between TX_FRAUD and input features

In [19]:
# Pairwise Pearson correlation matrix over the input features plus the label column.
full_df_corr = full_df.loc[:, input_features + [output_feature]].corr(method='pearson')
# Show the features ordered by absolute correlation with the label, strongest first
# (the label itself appears at the top with a value of 1.0).
full_df_corr.loc[:, 'TX_FRAUD'].abs().sort_values(ascending=False)

TX_FRAUD                               1.000000
TERMINAL_ID_RISK_7DAY_WINDOW           0.485194
TERMINAL_ID_RISK_1DAY_WINDOW           0.385799
TERMINAL_ID_RISK_30DAY_WINDOW          0.234498
TX_AMOUNT                              0.134427
CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW     0.097321
CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW     0.050848
CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW    0.019095
TERMINAL_ID_NB_TX_30DAY_WINDOW         0.008379
TERMINAL_ID_NB_TX_7DAY_WINDOW          0.008243
TERMINAL_ID_NB_TX_1DAY_WINDOW          0.003747
TX_DURING_NIGHT                        0.002192
CUSTOMER_ID_NB_TX_30DAY_WINDOW         0.002006
TX_DURING_WEEKEND                      0.001259
CUSTOMER_ID_NB_TX_7DAY_WINDOW          0.001074
CUSTOMER_ID_NB_TX_1DAY_WINDOW          0.000619
Name: TX_FRAUD, dtype: float64

Kendall Tau correlation between TX_FRAUD and input features

In [7]:
# Pairwise Kendall Tau correlation matrix over the input features plus the label column.
# The full matrix is kept because a later cell indexes it by the 'TX_FRAUD' column.
full_df_corr_kendall = full_df.loc[:, input_features + [output_feature]].corr(method='kendall')
# Show the features ordered by absolute Kendall Tau with the label, strongest first
# (the label itself appears at the top with a value of 1.0).
full_df_corr_kendall.loc[:, 'TX_FRAUD'].abs().sort_values(ascending=False)

TX_FRAUD                               1.000000
TERMINAL_ID_RISK_1DAY_WINDOW           0.331542
TERMINAL_ID_RISK_7DAY_WINDOW           0.250657
TERMINAL_ID_RISK_30DAY_WINDOW          0.133213
TX_AMOUNT                              0.030936
CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW     0.027232
CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW     0.020854
CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW    0.012846
TERMINAL_ID_NB_TX_30DAY_WINDOW         0.008227
TERMINAL_ID_NB_TX_7DAY_WINDOW          0.007720
TERMINAL_ID_NB_TX_1DAY_WINDOW          0.002931
TX_DURING_NIGHT                        0.002192
CUSTOMER_ID_NB_TX_30DAY_WINDOW         0.001714
TX_DURING_WEEKEND                      0.001259
CUSTOMER_ID_NB_TX_7DAY_WINDOW          0.001173
CUSTOMER_ID_NB_TX_1DAY_WINDOW          0.000349
Name: TX_FRAUD, dtype: float64

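The top of the correlation ranking differs between the two methods: Pearson ranks TERMINAL_ID_RISK_7DAY_WINDOW first, whereas Kendall Tau ranks TERMINAL_ID_RISK_1DAY_WINDOW first. Kendall Tau is the preferred method here because it is rank-based and therefore does not assume a linear relationship with the binary TX_FRAUD label.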

Features affected by each fraud scenario:
1. Scenario 1: TX_AMOUNT (t-0)
2. Scenario 2: TERMINAL_ID_RISK_1DAY_WINDOW, TERMINAL_ID_RISK_7DAY_WINDOW, TERMINAL_ID_RISK_30DAY_WINDOW
3. Scenario 3: CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW, CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW, CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW, TX_AMOUNT

Number of frauds per scenario:
- Number of frauds from scenario 1: 978
- Number of frauds from scenario 2: 9099
- Number of frauds from scenario 3: 4604

That would mean that the most influential features correlated with TX_FRAUD are the TERMINAL_ID_RISK features; the second most influential would be TX_AMOUNT (if the numbers of frauds from scenarios 1 and 3 are summed: 978 + 4604 = 5582), followed by the CUSTOMER_ID_AVG_AMOUNT features.

The same ordering can be seen in the Kendall coefficient values computed above. Within each feature family, the 1DAY/7DAY/30DAY windows are ordered according to their Kendall coefficient values.

In [8]:
# Disabled earlier export: it would write the ranking Series (feature names as the CSV
# index) to the same file that the last cell of this notebook writes with the additional
# INPUT_FEATURE_INDEX column. Left commented out so it does not overwrite that file.
# full_df_corr_kendall['TX_FRAUD'].abs().sort_values(ascending=False).iloc[1:].to_csv('ground_truth_feature_ranking.csv', index=True)

In [9]:
# Ground-truth feature ranking: one row per input feature with columns
# FEATURE_NAME and FEATURE_CORRELATION, ordered by absolute Kendall Tau
# correlation with the TX_FRAUD label (strongest first).
ground_truth_ranking = (
    full_df_corr_kendall['TX_FRAUD']
    # Remove the label's self-correlation by name instead of by position
    # (`.iloc[1:]` after sorting): a positional drop silently removes a real
    # feature if the label does not happen to sort first (e.g. a tie at 1.0).
    .drop(labels='TX_FRAUD')
    .abs()
    .sort_values(ascending=False)
    # Name the index and the values explicitly so the resulting column names do
    # not depend on reset_index() emitting the default 'index' column name.
    .rename_axis('FEATURE_NAME')
    .rename('FEATURE_CORRELATION')
    .reset_index()
)

In [11]:
# Lookup table pairing every feature name with its position in input_features.
input_feature_df = (
    pd.DataFrame({'FEATURE_NAME': input_features})
    .assign(INPUT_FEATURE_INDEX=range(len(input_features)))
)
input_feature_df

Unnamed: 0,FEATURE_NAME,INPUT_FEATURE_INDEX
0,TX_AMOUNT,0
1,TX_DURING_WEEKEND,1
2,TX_DURING_NIGHT,2
3,CUSTOMER_ID_NB_TX_1DAY_WINDOW,3
4,CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW,4
5,CUSTOMER_ID_NB_TX_7DAY_WINDOW,5
6,CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW,6
7,CUSTOMER_ID_NB_TX_30DAY_WINDOW,7
8,CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW,8
9,TERMINAL_ID_NB_TX_1DAY_WINDOW,9


In [12]:
# Attach each feature's position in input_features to the ranking and write the
# result to CSV (without the DataFrame index).
# validate='one_to_one' makes pandas raise a MergeError if a feature name is
# duplicated on either side, instead of silently writing duplicated rows.
(
    ground_truth_ranking
    .merge(input_feature_df, on='FEATURE_NAME', how='left', validate='one_to_one')
    .to_csv('ground_truth_feature_ranking.csv', index=False)
)