In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
# Read 'card_transdata.csv' (path relative to the working directory) into `df`.
# NOTE(review): this cell's execution count (27) is later than the head/tail
# previews below (23, 22), so those displayed outputs predate this load;
# restart the kernel and run all cells to refresh them.
df = pd.read_csv('card_transdata.csv')

In [23]:
# Preview the first 10 rows to see the columns and some sample values.
df.head(10)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,True,True,False,False,False
1,10.829943,0.175592,1.294219,True,False,False,False,False
2,5.091079,0.805153,0.427715,True,False,False,True,False
3,2.247564,5.600044,0.362663,True,True,False,True,False
4,44.190936,0.566486,2.222767,True,True,False,True,False
5,5.586408,13.261073,0.064768,True,False,False,False,False
6,3.724019,0.956838,0.278465,True,False,False,True,False
7,4.848247,0.320735,1.27305,True,False,True,False,False
8,0.876632,2.503609,1.516999,False,False,False,False,False
9,8.839047,2.970512,2.361683,True,False,False,True,False


In [22]:
# Preview the last 10 rows as well.
df.tail(10)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
999990,20.334489,11.437333,0.699527,True,True,False,True,False
999991,26.807213,0.102574,2.746114,True,False,False,True,False
999992,9.844674,0.250108,0.814046,True,False,False,True,False
999993,4.846253,2.84445,0.86774,True,False,False,True,False
999994,3.295884,0.085712,0.831991,True,False,False,True,False
999995,2.207101,0.112651,1.626798,True,True,False,False,False
999996,19.872726,2.683904,2.778303,True,True,False,False,False
999997,2.914857,1.472687,0.218075,True,True,False,True,False
999998,4.258729,0.242023,0.475822,True,False,False,True,False
999999,58.108125,0.31811,0.38692,True,True,False,True,False


In [None]:
# Initial Questions:

# 1. What kind of high level information does this dataset give us? Why would this dataset be needed?
    # To understand trends in fraudulent transactions.
# 2. What features does it have? What are their data types? How many observations do we have?
    # Answered later.
# 3. What is the granularity of the data?
    # Transaction Level

# Assumptions:

    # We would be able to understand which types of transactions are more likely to be fraud after analysing this dataset.
    # We can do this by focusing on the properties/features of the transaction.
    # Fraudulent transactions are more likely to happen if chip is not used, and/or pin is not used.
    # For online orders, we would expect fraud rate to be lower.
    # We would expect more fraud where the ratio to the median purchase price is close to 1 (roughly 1 ± 0.3)?

# Concerns around dataset:

    # Documentation of the repeat_retailer column is sparse, which reduces the usability of the dataset.
    # We do not have a transaction_id either which would have been helpful in identifying duplicate data.

In [30]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

distance_from_home                float64
distance_from_last_transaction    float64
ratio_to_median_purchase_price    float64
repeat_retailer                   float64
used_chip                         float64
used_pin_number                   float64
online_order                      float64
fraud                             float64
dtype: object

In [31]:
# The flag columns are stored as floats, which is inconvenient because we want
# to treat them as categorical (0/1) variables rather than continuous ones.
# Hence, we change their data types to 'clean' the data.

# Name the flag columns once so the selection and the assignment cannot drift apart.
flag_cols = ['repeat_retailer', 'used_chip', 'used_pin_number', 'online_order', 'fraud']

# astype(int) truncates silently (e.g. 0.7 -> 0), so confirm that every flag is
# exactly 0 or 1 before casting; a NaN would also fail this check.
assert df[flag_cols].isin([0, 1]).all().all(), "flag columns contain values other than 0/1"

df[flag_cols] = df[flag_cols].astype(int)
print(df.head(1))

# Now we have all columns in data types we need.

   distance_from_home  distance_from_last_transaction  \
0           57.877857                         0.31114   

   ratio_to_median_purchase_price  repeat_retailer  used_chip  \
0                         1.94594                1          1   

   used_pin_number  online_order  fraud  
0                0             0      0  


In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,26.628792,5.036519,1.824182,0.881536,0.350399,0.100608,0.650552,0.087403
std,65.390784,25.843093,2.799589,0.323157,0.477095,0.300809,0.476796,0.282425
min,0.004874,0.000118,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.878008,0.296671,0.475673,1.0,0.0,0.0,0.0,0.0
50%,9.96776,0.99865,0.997717,1.0,0.0,0.0,1.0,0.0
75%,25.743985,3.355748,2.09637,1.0,1.0,0.0,1.0,0.0
max,10632.723672,11851.104565,267.802942,1.0,1.0,1.0,1.0,1.0


In [None]:
# From the above statistics, we care most about the mean: for the 0/1 flag columns it is the share of transactions with the flag set, which gives us key benchmarks:
# Most transactions are from repeat retailers (~88%).
# Most transactions do not use a chip (only ~35% do).
# Most transactions do not use a PIN number (only ~10% do).
# Most transactions are online orders (~65%).
# Fraudulent transactions make up ~9% of the data.

In [39]:
# The meaning of repeat_retailer is not clear enough for us to interpret it,
# so we leave it out of the analysis.

# errors='ignore' keeps this cell re-runnable once the column is already gone.
df = df.drop(columns=['repeat_retailer'], errors='ignore')
df.head(1)


Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1,0,0,0


In [41]:
# Looking for duplicate data

# duplicated() flags every row that repeats an earlier row across all columns.
duplicate_rows_df = df[df.duplicated()]
print("number of duplicate rows: ", duplicate_rows_df.shape)

# No duplicates found. However, should some appear in the future, we drop them
# anyway. drop_duplicates() returns a new frame instead of modifying df, so the
# result must be assigned back for the drop to take effect.

df = df.drop_duplicates()

# Display the de-duplicated frame (pandas truncates the repr for large frames).
df

number of duplicate rows:  (0, 7)


Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.311140,1.945940,1,0,0,0
1,10.829943,0.175592,1.294219,0,0,0,0
2,5.091079,0.805153,0.427715,0,0,1,0
3,2.247564,5.600044,0.362663,1,0,1,0
4,44.190936,0.566486,2.222767,1,0,1,0
...,...,...,...,...,...,...,...
999995,2.207101,0.112651,1.626798,1,0,0,0
999996,19.872726,2.683904,2.778303,1,0,0,0
999997,2.914857,1.472687,0.218075,1,0,1,0
999998,4.258729,0.242023,0.475822,0,0,1,0


In [42]:
# Count the missing (null/NaN) entries in each column

print(df.isna().sum())

# Every column reports zero missing values.

distance_from_home                0
distance_from_last_transaction    0
ratio_to_median_purchase_price    0
used_chip                         0
used_pin_number                   0
online_order                      0
fraud                             0
dtype: int64


In [None]:
# Scatterplots