In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

In [2]:
def split_data(df, target=None):
    
    if target == None:
        target = list(df.columns)[-1]
    return df.drop(columns=[target], axis=1), df[[target]]

def nan_ratio_col(df, col):
    nan_ratio = sum(df[col].isna()) / len(df[col])
    return nan_ratio

# Initial data handling
The first step to a data science project is ensuring you understand your data. Copied from Kaggle, the features are:
- distance_from_home - the distance from home where the transaction happened.
- distance_from_last_transaction - the distance from the last transaction that happened.
- ratio_to_median_purchase_price - Ratio of purchased price to median purchase price.
- repeat_retailer - If the transaction happened from the same retailer.
- used_chip - If the transaction is through chip (credit card).
- used_pin_number - If the transaction used PIN.
- online_order - If the transaction is an online order.
- fraud - If the transaction is fraudulent.

In [3]:
df = pd.read_csv("card_transdata.csv")
assert (len(df) == len(df.drop_duplicates()))

If we split the dataset before dropping duplicates, a duplicate sample may be in both the train and test/validation splits. This is known as data leakage and it reduces the reliability of our evaluation metrics. To avoid this you must drop all duplicates before splitting your dataset. In this dataset, there are no duplicate values.


In [4]:
X, y = split_data(df)

# Feature engineering

I have decided to test two engineered features. The goal of feature engineering is to transform one or more features into a new feature that gives our ML model more predictive power by reducing noise or making complex patterns more simple (Typically by combining features in a meaningful way).

Feature engineering ideas:

- jumping_transaction
- chip_and_pin
- high_risk_distance
- price_spike
- risk_score
- no_security_online
- distance_consistency

Descriptions are in the corresponding cells.

In [5]:
# jumping_transaction
distant_transaction_threshold = X["distance_from_last_transaction"].quantile(.8)
distant_transaction = X["distance_from_last_transaction"] > distant_transaction_threshold
X['jumping_transaction'] = (distant_transaction & X["online_order"]).astype(int)

In [6]:
#chip_and_pin
X["chip_and_pin"] = ((X["used_chip"]==1) & (X["used_pin_number"]==1))
X["chip_and_pin"].value_counts()

chip_and_pin
False    964947
True      35053
Name: count, dtype: int64

In [7]:
# Captures if a transaction is unusually far from both home AND last transaction
# Could indicate stolen card being used far from normal patterns
X['high_risk_distance'] = ((X['distance_from_home'] > X['distance_from_home'].quantile(0.75)) & 
                          (X['distance_from_last_transaction'] > X['distance_from_last_transaction'].quantile(0.75))).astype(int)

In [8]:
# Identifies transactions with unusually high purchase amounts
# Fraudsters often make large purchases once they get hold of a card
X['price_spike'] = (X['ratio_to_median_purchase_price'] > X['ratio_to_median_purchase_price'].quantile(0.9)).astype(int)

In [9]:
# Combines multiple risk factors: online order, no chip, no PIN, and non-repeat retailer
# More risk factors = higher score
X['risk_score'] = ((1-X['used_chip']) + 
                   (1-X['used_pin_number']) + 
                   (X['online_order']) + 
                   (1-X['repeat_retailer']))

In [10]:
# Flags online transactions with no security features
X['no_security_online'] = (
    (X['online_order'] == 1) & 
    (X['used_pin_number'] == 0) & 
    (X['used_chip'] == 0)
)

In [11]:
# Measures if the transaction location is consistent with previous patterns
X['distance_consistency'] = np.abs(X['distance_from_home'] - X['distance_from_last_transaction']) / (X['distance_from_home'] + 1)

# Handling imbalanced dataset
The data is incredibly imbalanced. With fraudulent cases making up only ~10% of our samples. Oversampling would create excess noise, which I think is unnecessary so I'll use undersampling. With only 5 features and more than 87,000 samples in the minority class, I believe the model will have enough data to learn to differentiate each class. 

In [23]:
y.value_counts()

fraud
0.0      912597
1.0       87403
Name: count, dtype: int64

In [12]:
tl_undersample_pipeline = Pipeline([
    ('tomek', TomekLinks()),
    ('rus', RandomUnderSampler(random_state=42))
])

X_resample, y_resample = tl_undersample_pipeline.fit_resample(X, y)

In [24]:
y_resample.value_counts()

fraud
0.0      87403
1.0      87403
Name: count, dtype: int64

It's important to tune your model based on the results on a test dataset, and to keep a separate final validation dataset to report to stakeholders. If you tune your model on test data and report the test metrics as your final model metrics, the model will inevitably underperform that score when applied to real unseen data. That is why we keep a validation dataset hidden until evaluation.

We will keep the validation set for final metrics and use cross-validation with the train set to tune. 

In [13]:
# splitting train and val datasets on resampled data
# Use resampled data because we have enough data to get accurate metrics without including the full dataset
X_train_rs, X_val_rs, y_train_rs, y_val_rs = train_test_split(
    X_resample, y_resample, test_size=0.2, random_state=42)


In [14]:
dtc = DecisionTreeClassifier(random_state=42)
folds = 3

# Feature selection

Now that we have engineered our features, and undersampled the dataset, we want to actualyl figure out which featrues to keep. We will automate this process using forward selection. 

In [15]:
feature_selector = SFS(
    dtc,
    direction="backward",
    scoring="recall",
    n_jobs=-1,
    cv=folds,
    tol=None, 
    
)

feature_selector.fit_transform(X_train_rs, y_train_rs)

array([[ 0.9223234 ,  1.7251314 ,  7.2202343 , ...,  0.        ,
         3.        ,  0.        ],
       [ 2.06008492,  3.66930104,  1.56741663, ...,  0.        ,
         1.        ,  0.        ],
       [ 4.48090284,  0.11588125,  6.06085202, ...,  0.        ,
         3.        ,  1.        ],
       ...,
       [12.23050329,  1.55378339,  4.68407779, ...,  0.        ,
         3.        ,  1.        ],
       [16.29903161,  2.05343662,  4.38366834, ...,  0.        ,
         3.        ,  1.        ],
       [41.34026287,  3.43021101,  4.62273095, ...,  0.        ,
         2.        ,  0.        ]])

In [16]:
selected_indices = feature_selector.get_support(indices=True)
selected_features = X_train_rs.columns[selected_indices]


In [17]:
initial_features = set(X_train_rs.columns)
final_features = set(list(selected_features))
unselected_features = initial_features - final_features
print(f"We removed the following features: {unselected_features}")


We removed the following features: {'jumping_transaction', 'price_spike', 'repeat_retailer', 'distance_consistency', 'high_risk_distance', 'chip_and_pin', 'online_order'}


In [18]:
X_train_rs = X_train_rs[selected_features]
X_train_rs

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,used_chip,used_pin_number,risk_score,no_security_online
179805,0.922323,1.725131,7.220234,0.0,0.0,3.0,False
593681,2.060085,3.669301,1.567417,1.0,0.0,1.0,False
605854,4.480903,0.115881,6.060852,0.0,0.0,3.0,True
239627,2.904422,1.259832,1.373941,0.0,0.0,2.0,False
641242,14.203281,2.062613,0.622513,0.0,0.0,3.0,True
...,...,...,...,...,...,...,...
373502,10.470896,0.253114,7.054097,0.0,0.0,3.0,True
187111,248.303907,0.587331,3.213880,0.0,0.0,3.0,True
508999,12.230503,1.553783,4.684078,0.0,0.0,3.0,True
678624,16.299032,2.053437,4.383668,0.0,0.0,3.0,True


# Modelling
A good definition for Recall is "the fraction of relevant instances that were retrieved". In the case of fraud detection, we want to optimise the number of fraudulent transactions that our model detects. False positives are OK. 

However, in the case of fraud detection for B2B transfers, we would want to balance customer experience with rejecting fraudulent requests. For that, we could use a metric like F2 which incorporates recall and precision but weight recall more heavily, or alternatively use probability thresholds.

In [31]:
cv_table = cross_validate(
    estimator=dtc, 
    X=X_train_rs, 
    y=y_train_rs, 
    scoring="recall", 
    cv=3, 
    n_jobs=-1,
    return_estimator=True)

In [32]:
pd.DataFrame(cv_table)["test_score"].mean()

0.9998712722591718

In [26]:
print(f"No feature engineering    {0.9729957805907173}")
print(f"W  feature engineering    {0.9998712722591718}")

No feature engineering    0.9729957805907173
W  feature engineering    0.9998712722591718


# Evaluating
Here we get the final metric, the one that will be reported to stakeholders.

In [37]:
predictions = cv_table["estimator"][0].predict(X_val_rs[selected_features])
recall = recall_score(predictions, y_val_rs)
print(f"The final score for the best version of the model is a recall of {recall}")

The final score for the best version of the model is a recall of 0.9997712717291857
