In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Set Matplotlib defaults
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 5))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'

In [4]:
customers_og_df = pd.read_csv('./data/customers.csv')
customers_og_df = customers_og_df.set_index("CUSTOMER_ID")

terminals_og_df = pd.read_csv('./data/terminals.csv')
terminals_og_df = terminals_og_df.set_index("TERMINAL_ID")

In [5]:
transactions_train_og = pd.read_csv('./data/transactions_train.csv', usecols=['TX_ID','TERMINAL_ID','CUSTOMER_ID','TX_FRAUD'])

In [7]:
transactions_train_merged  = pd.merge(transactions_train_og,customers_og_df,on='CUSTOMER_ID')
transactions_train_merged.count()

TX_ID            1066440
CUSTOMER_ID      1066440
TERMINAL_ID      1066440
TX_FRAUD         1066440
x_customer_id    1066440
y_customer_id    1066440
dtype: int64

In [8]:
transactions_train_merged  = pd.merge(transactions_train_merged,terminals_og_df,on='TERMINAL_ID')
transactions_train_merged.count()

TX_ID             1066440
CUSTOMER_ID       1066440
TERMINAL_ID       1066440
TX_FRAUD          1066440
x_customer_id     1066440
y_customer_id     1066440
x_terminal_id     1066440
y_terminal__id    1066440
dtype: int64

In [9]:
transactions_train_merged.head()

Unnamed: 0,TX_ID,CUSTOMER_ID,TERMINAL_ID,TX_FRAUD,x_customer_id,y_customer_id,x_terminal_id,y_terminal__id
0,c6dde46458f3d1cfeb9256f8add45a14fb349f8f,7323088226725338,55875360,0,27.712334,18.147509,27.628549,20.873479
1,988e8385f2934716ff5e57566617e077678b7ec9,7323088226725338,55875360,0,27.712334,18.147509,27.628549,20.873479
2,0726d4ff29dbcd4e21c4dbe5291ee4165d15d1c6,4362790219307564,55875360,0,30.27067,20.798376,27.628549,20.873479
3,998ec1dfa3664fbf40f8f2f5ad38e1bd71841baf,2376754785900600,55875360,0,31.512472,22.541913,27.628549,20.873479
4,a6df8a36e41707014adf056496e329709bee902a,2376754785900600,55875360,1,31.512472,22.541913,27.628549,20.873479


In [10]:
transactions_train_merged.set_index('TX_ID')

Unnamed: 0_level_0,CUSTOMER_ID,TERMINAL_ID,TX_FRAUD,x_customer_id,y_customer_id,x_terminal_id,y_terminal__id
TX_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c6dde46458f3d1cfeb9256f8add45a14fb349f8f,7323088226725338,55875360,0,27.712334,18.147509,27.628549,20.873479
988e8385f2934716ff5e57566617e077678b7ec9,7323088226725338,55875360,0,27.712334,18.147509,27.628549,20.873479
0726d4ff29dbcd4e21c4dbe5291ee4165d15d1c6,4362790219307564,55875360,0,30.270670,20.798376,27.628549,20.873479
998ec1dfa3664fbf40f8f2f5ad38e1bd71841baf,2376754785900600,55875360,0,31.512472,22.541913,27.628549,20.873479
a6df8a36e41707014adf056496e329709bee902a,2376754785900600,55875360,1,31.512472,22.541913,27.628549,20.873479
...,...,...,...,...,...,...,...
70072c846b4aca1c1dda9a1537ba43ae831db35f,7760361189551290,26508045,0,60.422202,49.066317,65.339428,49.604206
cee6f272e7d83e61f647d268d728409e1b1d9ea3,4712914414618850,26508045,0,61.728967,49.043844,65.339428,49.604206
208ed1fcbefbbb2fc71851a2a04eaad9c2cc4ddc,3473580532934853,26508045,0,65.997976,49.402127,65.339428,49.604206
eece6a88d031195d838725f91143c6841054616c,84525978769760,26508045,0,68.423755,47.196565,65.339428,49.604206


### Adding a column called distTC: which is the dsitance between the terminal and the customer

In [12]:
transactions_train_merged['distTC'] = np.sqrt( (transactions_train_merged.x_customer_id-transactions_train_merged.x_terminal_id)**2 + (transactions_train_merged.y_customer_id-transactions_train_merged.y_terminal__id)**2 )

In [13]:
transactions_train_merged.head()

Unnamed: 0,TX_ID,CUSTOMER_ID,TERMINAL_ID,TX_FRAUD,x_customer_id,y_customer_id,x_terminal_id,y_terminal__id,distTC
0,c6dde46458f3d1cfeb9256f8add45a14fb349f8f,7323088226725338,55875360,0,27.712334,18.147509,27.628549,20.873479,2.727257
1,988e8385f2934716ff5e57566617e077678b7ec9,7323088226725338,55875360,0,27.712334,18.147509,27.628549,20.873479,2.727257
2,0726d4ff29dbcd4e21c4dbe5291ee4165d15d1c6,4362790219307564,55875360,0,30.27067,20.798376,27.628549,20.873479,2.643188
3,998ec1dfa3664fbf40f8f2f5ad38e1bd71841baf,2376754785900600,55875360,0,31.512472,22.541913,27.628549,20.873479,4.227118
4,a6df8a36e41707014adf056496e329709bee902a,2376754785900600,55875360,1,31.512472,22.541913,27.628549,20.873479,4.227118


In [15]:
fraudalent_cases = transactions_train_merged[(transactions_train_merged['TX_FRAUD'] == 1)][['TX_ID','TX_FRAUD','distTC']]
authentic_cases = transactions_train_merged[(transactions_train_merged['TX_FRAUD'] == 0)][['TX_ID','TX_FRAUD','distTC']]

In [18]:
fraudalent_cases.describe()

Unnamed: 0,TX_FRAUD,distTC
count,27564.0,27564.0
mean,1.0,3.302074
std,0.0,1.188337
min,1.0,0.0
25%,1.0,2.47214
50%,1.0,3.496804
75%,1.0,4.304897
max,1.0,4.999809


In [19]:
authentic_cases.describe()

Unnamed: 0,TX_FRAUD,distTC
count,1038876.0,1038876.0
mean,0.0,3.307505
std,0.0,1.190866
min,0.0,0.0
25%,0.0,2.462597
50%,0.0,3.505528
75%,0.0,4.315153
max,0.0,4.99999


In [25]:
sns.barplot(x = np.arange(fraudalent_cases.size()), y= fraudalent_cases['distTC'])

KeyboardInterrupt: 