# Fraud Detection - Data Preprocessing

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import numpy as np

import warnings
warnings.filterwarnings('ignore')

## Load Raw Data

In [None]:
df = pd.read_csv("../DATA/raw/financial_fraud_detection_dataset.csv")
print(f"Initial shape: {df.shape}")
df.head()

## Handle Missing Values

In [None]:
# Check for missing values
print("Missing values per column:")
print(df.isna().sum())

Missing values per column:
transaction_id                       0
timestamp                            0
sender_account                       0
receiver_account                     0
amount                               0
transaction_type                     0
merchant_category                    0
location                             0
device_used                          0
is_fraud                             0
fraud_type                     4820447
time_since_last_transaction     896513
spending_deviation_score             0
velocity_score                       0
geo_anomaly_score                    0
payment_channel                      0
ip_address                           0
device_hash                          0
dtype: int64


In [None]:
imputer = SimpleImputer(strategy="mean")
df[df.select_dtypes(include=np.number).columns] = imputer.fit_transform(
    df.select_dtypes(include=np.number)
)

In [None]:
df.isna().sum()

transaction_id                       0
timestamp                            0
sender_account                       0
receiver_account                     0
amount                               0
transaction_type                     0
merchant_category                    0
location                             0
device_used                          0
is_fraud                             0
fraud_type                     4820447
time_since_last_transaction          0
spending_deviation_score             0
velocity_score                       0
geo_anomaly_score                    0
payment_channel                      0
ip_address                           0
device_hash                          0
dtype: int64

## Drop Unnecessary Columns

In [None]:
columns_to_drop = ['transaction_id', 'fraud_type']
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])
print(f"Shape after dropping columns: {df.shape}")

Shape after dropping columns: (5000000, 16)


## Parse Datetime Features

In [None]:
# Convert timestamp to datetime and extract temporal features
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
df['hour'] = df['timestamp'].dt.hour
df['day'] = df['timestamp'].dt.day
df['day_of_week'] = df['timestamp'].dt.weekday
df['month'] = df['timestamp'].dt.month

print("Temporal features extracted:")
print(df[['timestamp', 'hour', 'day', 'day_of_week', 'month']].head())

Temporal features extracted:
                   timestamp  hour   day  day_of_week  month
0 2023-08-22 09:22:43.516168   9.0  22.0          1.0    8.0
1 2023-08-04 01:58:02.606711   1.0   4.0          4.0    8.0
2 2023-05-12 11:39:33.742963  11.0  12.0          4.0    5.0
3 2023-10-10 06:04:43.195112   6.0  10.0          1.0   10.0
4 2023-09-24 08:09:02.700162   8.0  24.0          6.0    9.0


In [None]:
df[df['hour'].isna()]

Unnamed: 0,timestamp,sender_account,receiver_account,amount,transaction_type,merchant_category,location,device_used,is_fraud,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel,ip_address,device_hash,hour,day,day_of_week,month
1891874,NaT,ACC375307,ACC630432,195.84,deposit,grocery,Dubai,web,False,4429.861012,0.87,5.0,0.85,wire_transfer,50.135.186.243,D8972916,,,,
2833396,NaT,ACC969484,ACC369350,4.55,payment,travel,Dubai,web,False,-352.835279,1.2,9.0,0.58,wire_transfer,228.70.5.198,D3068177,,,,
3214871,NaT,ACC444490,ACC483161,28.2,transfer,online,Singapore,atm,False,6481.381027,-1.32,12.0,0.32,ACH,234.146.217.89,D1189472,,,,


## Convert Target Variable

In [None]:
# Convert is_fraud to integer
df['is_fraud'] = df['is_fraud'].astype(int)
print(f"Target variable distribution:\n{df['is_fraud'].value_counts()}")

Target variable distribution:
is_fraud
0    4820447
1     179553
Name: count, dtype: int64


## Sort Data Chronologically

In [None]:
# Sort data by timestamp - CRITICAL for time-based features
df = df.sort_values('timestamp').reset_index(drop=True)
print(f"Data sorted by timestamp. Shape: {df.shape}")

Data sorted by timestamp. Shape: (5000000, 20)


## Encode Categorical Variables

In [None]:
# Apply label encoding to categorical columns
print("Encoding categorical variables...")
for col in df.select_dtypes(include=["object","category"]).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    print(f"  - Encoded: {col}")

df.head()

Encoding categorical variables...
  - Encoded: sender_account
  - Encoded: receiver_account
  - Encoded: transaction_type
  - Encoded: merchant_category
  - Encoded: location
  - Encoded: device_used
  - Encoded: payment_channel
  - Encoded: ip_address
  - Encoded: device_hash


Unnamed: 0,timestamp,sender_account,receiver_account,amount,transaction_type,merchant_category,location,device_used,is_fraud,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel,ip_address,device_hash,hour,day,day_of_week,month
0,2023-01-01 00:09:26.241974,158871,89037,12.25,2,1,1,0,0,-7559.966086,-0.32,2.0,0.96,1,4781010,3735305,0.0,1.0,6.0,1.0
1,2023-01-01 00:11:36.452582,376957,4992,1347.27,0,6,5,1,0,1.525799,-0.68,5.0,0.67,1,232516,550645,0.0,1.0,6.0,1.0
2,2023-01-01 00:12:48.028557,727819,491398,20.79,1,6,6,3,0,-4568.663237,0.14,8.0,0.42,3,2541431,3740925,0.0,1.0,6.0,1.0
3,2023-01-01 00:21:19.560899,545869,270273,740.73,0,7,0,1,0,1.525799,-2.09,20.0,0.86,1,609531,1437485,0.0,1.0,6.0,1.0
4,2023-01-01 00:23:15.259766,880977,66830,228.67,0,1,1,1,0,-48.7387,0.16,18.0,0.82,1,736628,1085200,0.0,1.0,6.0,1.0


In [None]:
df.isna().sum()

timestamp                      3
sender_account                 0
receiver_account               0
amount                         0
transaction_type               0
merchant_category              0
location                       0
device_used                    0
is_fraud                       0
time_since_last_transaction    0
spending_deviation_score       0
velocity_score                 0
geo_anomaly_score              0
payment_channel                0
ip_address                     0
device_hash                    0
hour                           3
day                            3
day_of_week                    3
month                          3
dtype: int64

## Data Quality Checks

In [None]:
# Check for any remaining issues
print("Data Quality Summary:")
print(f"Total rows: {len(df)}")
print(f"Total columns: {len(df.columns)}")
print(f"Missing values: {df.isna().sum().sum()}")
print(f"Duplicate rows: {df.duplicated().sum()}")

Data Quality Summary:
Total rows: 5000000
Total columns: 20
Missing values: 15
Duplicate rows: 0


In [None]:
df = df.dropna(subset=['timestamp', 'hour', 'day', 'day_of_week', 'month'])

In [None]:
# Check for any remaining issues
print("Data Quality Summary:")
print(f"Total rows: {len(df)}")
print(f"Total columns: {len(df.columns)}")
print(f"Missing values: {df.isna().sum().sum()}")
print(f"Duplicate rows: {df.duplicated().sum()}")

Data Quality Summary:
Total rows: 4999997
Total columns: 20
Missing values: 0
Duplicate rows: 0


In [None]:
# Display data types
print("\nData Types:")
print(df.dtypes)


Data Types:
timestamp                      datetime64[us]
sender_account                          int64
receiver_account                        int64
amount                                float64
transaction_type                        int64
merchant_category                       int64
location                                int64
device_used                             int64
is_fraud                                int64
time_since_last_transaction           float64
spending_deviation_score              float64
velocity_score                        float64
geo_anomaly_score                     float64
payment_channel                         int64
ip_address                              int64
device_hash                             int64
hour                                  float64
day                                   float64
day_of_week                           float64
month                                 float64
dtype: object


In [None]:
# Display basic statistics
df.describe()

Unnamed: 0,timestamp,sender_account,receiver_account,amount,transaction_type,merchant_category,location,device_used,is_fraud,time_since_last_transaction,spending_deviation_score,velocity_score,geo_anomaly_score,payment_channel,ip_address,device_hash,hour,day,day_of_week,month
count,4999997,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0,4999997.0
mean,2023-07-03 00:10:10.418651,448352.0,448373.5,358.9344,1.499402,3.50039,3.500437,1.499932,0.03591062,1.523688,-0.0003882662,10.50132,0.5000292,1.500378,2498523.0,1917832.0,11.49665,15.72276,2.999544,6.527235
min,2023-01-01 00:09:26.241974,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,-8777.814,-5.26,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
25%,2023-04-02 17:32:38.758834,224259.0,224204.0,26.57,0.0,1.0,2.0,1.0,0.0,-1920.275,-0.68,5.0,0.25,0.0,1249276.0,958762.0,5.0,8.0,1.0,4.0
50%,2023-07-03 01:27:16.670582,448329.0,448492.0,138.67,1.0,4.0,4.0,1.0,0.0,1.525799,0.0,11.0,0.5,2.0,2498488.0,1917991.0,11.0,16.0,3.0,7.0
75%,2023-10-02 05:43:06.393417,672569.0,672411.0,503.89,2.0,5.0,6.0,3.0,0.0,1923.793,0.67,16.0,0.75,3.0,3747773.0,2876538.0,17.0,23.0,5.0,10.0
max,2024-01-01 22:58:30.131850,896512.0,896638.0,3520.57,3.0,7.0,7.0,3.0,1.0,8757.758,5.02,20.0,1.0,3.0,4997067.0,3835722.0,23.0,31.0,6.0,12.0
std,,258792.0,258806.9,469.9334,1.117896,2.291134,2.291127,1.117982,0.1860673,3240.097,1.000807,5.766844,0.288635,1.118295,1442522.0,1107195.0,6.921329,8.796412,2.003172,3.447722


## Save Processed Data


Preprocessed data saved to: ../DATA/processed.csv
Final shape: (4999997, 20)
