In [3]:
import os # Environment-variable lookup for configurable paths.

import pandas as pd # Data manipulation and analysis.
import numpy as np # Numerical operations and array handling.
import matplotlib.pyplot as plt # More control, lower-level, basic plotting.
import seaborn as sns # Higher-level, more aesthetically pleasing plots.
from scipy import stats # Statistical functions and tests.


# Lift pandas' display truncation so DataFrame output shows every column and every row.
# NOTE: with the row limit removed, displaying a very large frame in full will flood
# the output — prefer slices such as .head() or .sample() when inspecting data.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

### 1. Load all CSV data

In [4]:
# Location of the raw transactions CSV. Set the FRAUD_DATA_PATH environment
# variable to point at the file on another machine; the fallback is the
# original local path, so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    'FRAUD_DATA_PATH',
    'C:\\Users\\liuj4o\\Downloads\\financial_fraud_detection_dataset.csv',
)

# Read the full dataset, then discard the 'fraud_type' column.
# NOTE(review): presumably dropped because it would leak the 'is_fraud' label — confirm.
df = pd.read_csv(DATA_PATH)
df = df.drop(columns='fraud_type')

# Summarise column dtypes and memory usage as a sanity check on the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 17 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   transaction_id               object 
 1   timestamp                    object 
 2   sender_account               object 
 3   receiver_account             object 
 4   amount                       float64
 5   transaction_type             object 
 6   merchant_category            object 
 7   location                     object 
 8   device_used                  object 
 9   is_fraud                     bool   
 10  time_since_last_transaction  float64
 11  spending_deviation_score     float64
 12  velocity_score               int64  
 13  geo_anomaly_score            float64
 14  payment_channel              object 
 15  ip_address                   object 
 16  device_hash                  object 
dtypes: bool(1), float64(4), int64(1), object(11)
memory usage: 615.1+ MB


### 2. Find and remove rows that contain null values (nulls in `time_since_last_transaction` are kept)

In [None]:
# Count nulls per column and print the full breakdown.
missing_values = df.isnull().sum()
print(missing_values)

# Nulls in 'time_since_last_transaction' are tolerated, so only the remaining
# columns decide whether any rows are dropped. Checking just those columns
# avoids a pointless full-frame copy when the tolerated column is the only
# one with nulls.
required_columns = [c for c in df.columns if c != 'time_since_last_transaction']
if missing_values[required_columns].sum() > 0:
    df = df.dropna(subset=required_columns)

transaction_id                      0
timestamp                           0
sender_account                      0
receiver_account                    0
amount                              0
transaction_type                    0
merchant_category                   0
location                            0
device_used                         0
is_fraud                            0
time_since_last_transaction    896513
spending_deviation_score            0
velocity_score                      0
geo_anomaly_score                   0
payment_channel                     0
ip_address                          0
device_hash                         0
dtype: int64


### 3. Remove Duplicates and Constant Features

In [11]:
# Flag rows that exactly repeat an earlier row, report how many there are,
# and remove them only when at least one was found.
duplicate_mask = df.duplicated()
n_duplicates = duplicate_mask.sum()
print(n_duplicates)
if n_duplicates:
    df = df.drop_duplicates()

0


In [None]:
# A column counts as constant when it holds exactly one distinct value
# (Series.nunique ignores NaN by default, so nulls are not counted).
constant_features = [col for col in df.columns if df[col].nunique() == 1]
print("Constant features:", constant_features)

# Any non-empty list triggers the drop — including the case where exactly
# one column is constant.
if constant_features:
    df = df.drop(columns=constant_features)

Constant features: []


In [None]:
# Inspect the target column: which distinct values occur and how they are stored.
fraud_flag = df['is_fraud']
print(f"\nis_fraud unique values: {fraud_flag.unique()}")
print(f"is_fraud data type: {fraud_flag.dtype}")


is_fraud unique values: [False  True]
is_fraud data type: bool


In [19]:
# Columns expected to hold numeric data.
numeric_columns = ['amount', 'spending_deviation_score', 'velocity_score', 'geo_anomaly_score']
print("\nNumeric columns verification:")
for col in numeric_columns:
    if col not in df.columns:
        continue  # silently skip names that are absent from the frame
    column_values = df[col]
    # Coerce unparseable entries to NaN; the check is True only when no value
    # ends up null after coercion.
    fully_numeric = pd.to_numeric(column_values, errors='coerce').notna().all()
    print(f"{col}: {column_values.dtype} - Can convert to numeric: {fully_numeric}")


Numeric columns verification:
amount: float64 - Can convert to numeric: True
spending_deviation_score: float64 - Can convert to numeric: True
velocity_score: int64 - Can convert to numeric: True
geo_anomaly_score: float64 - Can convert to numeric: True


In [None]:
# Mean of the 'is_fraud' column, printed as a percentage with four decimals.
fraud_rate = df['is_fraud'].mean()
print(f"fraud rate in example data: {fraud_rate:.4%}")

fraud rate in example data: 3.5911%
