In [1]:
import os # Environment-variable lookup (optional dataset path override).

import pandas as pd # Data manipulation and analysis.
import numpy as np # Numerical operations and array handling.
import matplotlib.pyplot as plt # More control, lower-level, basic plotting.
import seaborn as sns # Higher-level, more aesthetically pleasing plots.
from scipy import stats # Statistical functions and tests.


# Lift pandas' display truncation for both columns and rows.
# NOTE: with no row limit, echoing a whole large DataFrame renders every row,
# so inspect big frames through .head() / .info() rather than displaying them directly.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

### Load all CSV data

In [2]:
# Location of the raw transactions CSV. The default is the original local path;
# set the FRAUD_DATA_PATH environment variable to point at the file on another
# machine instead of editing this cell.
DATA_PATH = os.environ.get(
    'FRAUD_DATA_PATH',
    '/Users/xiaowenrou/Downloads/COMP647/financial_fraud_detection_dataset.csv',
)

# Read the CSV into a DataFrame and print its schema (column names and dtypes).
df = pd.read_csv(DATA_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 18 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   transaction_id               object 
 1   timestamp                    object 
 2   sender_account               object 
 3   receiver_account             object 
 4   amount                       float64
 5   transaction_type             object 
 6   merchant_category            object 
 7   location                     object 
 8   device_used                  object 
 9   is_fraud                     bool   
 10  fraud_type                   object 
 11  time_since_last_transaction  float64
 12  spending_deviation_score     float64
 13  velocity_score               int64  
 14  geo_anomaly_score            float64
 15  payment_channel              object 
 16  ip_address                   object 
 17  device_hash                  object 
dtypes: bool(1), float64(4), int64(1), object(1

### Remove irrelevant data

When building a fraud detection model, every feature must be available in real time when a transaction is processed in production, and it must help distinguish legitimate transactions from fraudulent ones.

- `transaction_id`: The transaction ID is a unique identifier generated by the system and does not contain any predictive information.
- `timestamp`: The dataset already contains `time_since_last_transaction`, a derived feature that is more valuable than the original timestamp.
- `sender_account` & `receiver_account`: Account identifiers can lead to model overfitting: most transactions involve new account combinations that have never been seen before.
- `fraud_type`: We only need binary classification, not classification of fraud types, so remove this column.
- `ip_address`: IP addresses are dynamically allocated, so the same user may use different IP addresses, and different users may share the same IP address.

In [4]:
# Columns excluded from modelling: identifiers, the raw timestamp, the
# fraud sub-type label, and the IP address.
columns_to_remove = (
    'transaction_id timestamp sender_account '
    'receiver_account fraud_type ip_address'
).split()

# Drop them along the column axis and show the reduced schema.
df = df.drop(columns_to_remove, axis='columns')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 12 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   amount                       float64
 1   transaction_type             object 
 2   merchant_category            object 
 3   location                     object 
 4   device_used                  object 
 5   is_fraud                     bool   
 6   time_since_last_transaction  float64
 7   spending_deviation_score     float64
 8   velocity_score               int64  
 9   geo_anomaly_score            float64
 10  payment_channel              object 
 11  device_hash                  object 
dtypes: bool(1), float64(4), int64(1), object(6)
memory usage: 424.4+ MB


### Find and handle all rows containing null values
In this section, we use different data checking strategies for different data types.

- For categorical variables (`transaction_type`, `merchant_category`, `location`, `device_used`, `payment_channel`, `device_hash`), a complete deletion strategy is adopted: when a value is missing in any of these fields, the entire row containing the missing value is deleted. Forcibly filling a missing category would introduce noise and could mislead model learning.

- For numeric variables (`amount`, `time_since_last_transaction`, `spending_deviation_score`, `velocity_score`, `geo_anomaly_score`), missing values are filled rather than dropped. `time_since_last_transaction`: missing values are filled with -1 to specifically identify first-time transaction users. All other numeric variables are filled with the column median.

In [5]:
# Per-column null counts before any cleaning.
missing_values = df.isnull().sum()
print(missing_values)

rows_before = len(df)

# Handling categorical variables: remove the row if any of them is missing.
# A single dropna over all present columns removes exactly the rows that dropping
# column by column would, and the null counts computed above are reused so the
# frame is neither rescanned nor copied when nothing is missing.
categorical_cols = ['transaction_type', 'merchant_category', 'location', 'device_used', 'payment_channel', 'device_hash']
present_categorical_cols = [col for col in categorical_cols if col in df.columns]
if missing_values[present_categorical_cols].any():
    df = df.dropna(subset=present_categorical_cols)

# Handling numeric variables: fill missing values instead of dropping rows.
numerical_cols = ['amount', 'time_since_last_transaction', 'spending_deviation_score', 'velocity_score', 'geo_anomaly_score']
for col in numerical_cols:
    if col not in df.columns or not df[col].isnull().any():
        continue
    if col == 'time_since_last_transaction':
        # For first-time transaction users, -1 means no transaction history.
        # NOTE(review): assumes genuine values never equal -1 - TODO confirm against the data.
        fill_value = -1
    else:
        # Other columns are filled with the column median.
        fill_value = df[col].median()
    df[col] = df[col].fillna(fill_value)

rows_after = len(df)
print(f"{rows_before - rows_after} rows are removed from data set")
df.info()


amount                              0
transaction_type                    0
merchant_category                   0
location                            0
device_used                         0
is_fraud                            0
time_since_last_transaction    896513
spending_deviation_score            0
velocity_score                      0
geo_anomaly_score                   0
payment_channel                     0
device_hash                         0
dtype: int64
0 rows are removed from data set
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 12 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   amount                       float64
 1   transaction_type             object 
 2   merchant_category            object 
 3   location                     object 
 4   device_used                  object 
 5   is_fraud                     bool   
 6   time_since_last_transaction  float64
 7   sp

### Remove Duplicates and Constant Features

- Duplicate value handling strategy: Use a complete deletion strategy, retaining the first record in each set of duplicate data. This approach is simple and direct, avoiding errors that may be introduced by complex duplicate data merging logic.
- Constant feature identification strategy: The number of unique values in each feature column is checked with `nunique()`. If the number of unique values is less than or equal to 1, the feature is considered a constant feature. All constant feature columns are deleted, but the target variable `is_fraud` is explicitly excluded from this check. Even in extremely imbalanced datasets, the target variable should not be mistakenly deleted because it is the core basis for model learning.

In [6]:
# Flag every row that exactly repeats an earlier row (first occurrences are not flagged).
duplicate_mask = df.duplicated()
duplicated_values = duplicate_mask.sum()
print(f"Duplicated rows: {duplicated_values}")

# Remove duplicate rows if found, keeping the first occurrence of each.
# Reusing the mask avoids the second full-frame scan drop_duplicates() would do.
if duplicated_values > 0:
    df = df[~duplicate_mask]
    print(f"{duplicated_values} duplicated rows are removed from data set")

# Remove constant features: columns with at most one distinct value
# (nunique() does not count NaN). The target `is_fraud` is never considered.
constant_features = [
    col for col in df.columns
    if col != 'is_fraud' and df[col].nunique() <= 1
]

if constant_features:
    df = df.drop(columns=constant_features)
    print(f"{len(constant_features)} constant features are removed: {constant_features}")

df.info()

Duplicated rows: 0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 12 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   amount                       float64
 1   transaction_type             object 
 2   merchant_category            object 
 3   location                     object 
 4   device_used                  object 
 5   is_fraud                     bool   
 6   time_since_last_transaction  float64
 7   spending_deviation_score     float64
 8   velocity_score               int64  
 9   geo_anomaly_score            float64
 10  payment_channel              object 
 11  device_hash                  object 
dtypes: bool(1), float64(4), int64(1), object(6)
memory usage: 424.4+ MB


In [8]:
# Share of rows flagged as fraud: the mean of the boolean target column.
fraud_rate = df['is_fraud'].mean()
print(f"fraud rate in example data: {fraud_rate:.4%}")

fraud rate in example data: 3.5911%
