# 02_preprocessing - Data Cleaning and Preprocessing

This notebook demonstrates simple preprocessing steps using `src/preprocessing.py`. Each step is in its own cell for clarity.

In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
import pandas as pd
from IPython.display import display

# Folder holding the raw input files, resolved relative to the notebook's working directory.
RAW_DIR = os.path.abspath(os.path.join('..', 'data', 'raw'))
# os.listdir() yields names in arbitrary order, so sort them: the loading cell picks
# CSV_LIST[0] and must select the same file on every run and every machine.
# isdir() (rather than exists()) keeps listdir() from failing if RAW_DIR is a plain file.
CSV_LIST = sorted(f for f in os.listdir(RAW_DIR) if f.lower().endswith('.csv')) if os.path.isdir(RAW_DIR) else []
print('Found CSV files:', CSV_LIST)


Found CSV files: ['PS_20174392719_1491204439457_log.csv', 'sample.csv']


In [2]:
# Read the first discovered CSV into `df` via load_dataset from src/load_data.
if CSV_LIST:
    # Only the first entry of CSV_LIST is loaded; any other CSVs are ignored.
    dataset_path = os.path.join(RAW_DIR, CSV_LIST[0])
    from load_data import load_dataset
    df = load_dataset(dataset_path)
    print('\nLoaded into variable `df`.')
else:
    # Nothing to load: `df` stays undefined and later cells must cope with that.
    print('No CSV in data/raw. Please add dataset file and re-run.')


Loading dataset from: c:\Users\mazen\Desktop\Uni\Term 9\Data Mining\Anti-Money Laundering (AML) Detection System\aml_project\data\raw\PS_20174392719_1491204439457_log.csv

Dataset loaded successfully!
Shape: 6362620 rows, 11 columns

First 5 rows:
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    

In [3]:
# Preview the first rows of the loaded frame.
try:
    head_rows = df.head()
except NameError:
    # `df` only exists when the loading cell found and read a CSV.
    print('`df` is not defined. Load the CSV first.')
else:
    display(head_rows)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
# Show shape and columns of the loaded frame.
try:
    print('Shape:', df.shape)
    print('Columns:', df.columns.tolist())
except NameError:
    # Report the missing frame instead of silently producing no output,
    # matching the hint printed by the "Show first rows" cell.
    print('`df` is not defined. Load the CSV first.')


Shape: (6362620, 11)
Columns: ['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']


In [5]:
# DataFrame info: column names, dtypes and memory usage.
try:
    # info() writes its summary to stdout itself and returns None, so it is
    # called directly; wrapping it in print() appended a stray "None" line.
    df.info()
except NameError:
    # Report the missing frame instead of silently producing no output.
    print('`df` is not defined. Load the CSV first.')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
None


In [6]:
# Basic descriptive statistics for every column (numeric and non-numeric).
try:
    display(df.describe(include='all'))
except NameError:
    # Report the missing frame instead of silently producing no output.
    print('`df` is not defined. Load the CSV first.')


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620,6362620.0,6362620,6362620.0,6362620.0,6362620,6362620.0,6362620.0,6362620.0,6362620.0
unique,,5,,6353307,,,2722362,,,,
top,,CASH_OUT,,C1677795071,,,C1286084959,,,,
freq,,2237500,,3,,,113,,,,
mean,243.3972,,179861.9,,833883.1,855113.7,,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,,603858.2,,2888243.0,2924049.0,,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0
25%,156.0,,13389.57,,0.0,0.0,,0.0,0.0,0.0,0.0
50%,239.0,,74871.94,,14208.0,0.0,,132705.7,214661.4,0.0,0.0
75%,335.0,,208721.5,,107315.2,144258.4,,943036.7,1111909.0,0.0,0.0


In [7]:
# Missing values and uniques, per column.
try:
    # Count of NaN cells per column, largest first.
    miss = df.isna().sum().sort_values(ascending=False)
    # Number of distinct values per column, smallest first.
    uniques = df.nunique().sort_values()
except NameError:
    # Report the missing frame instead of silently producing no output.
    print('`df` is not defined. Load the CSV first.')
else:
    # Displayed outside the try so an unrelated NameError is not masked.
    display(miss.head(20))
    display(uniques.head(20))


step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

isFlaggedFraud          2
isFraud                 2
type                    5
step                  743
oldbalanceOrg     1845844
newbalanceOrig    2682586
nameDest          2722362
newbalanceDest    3555499
oldbalanceDest    3614697
amount            5316900
nameOrig          6353307
dtype: int64

In [8]:
# Preprocessing: run the pipeline exposed by src.preprocessing on `df`.
from preprocessing import preprocess, save_cleaned

print('Before shape:', df.shape)

# Destination handed to preprocess() through its save_path argument.
cleaned_csv_path = os.path.abspath(os.path.join('..', 'data', 'processed', 'cleaned.csv'))

# NOTE(review): the column listing printed for `df` contains no 'Date' or 'Time'
# column - confirm preprocess() tolerates date_col/time_col names that are absent.
cleaned = preprocess(
    df,
    drop_thresh=0.6,
    date_col='Date',
    time_col='Time',
    save_path=cleaned_csv_path,
)
print('After shape:', cleaned.shape)


Before shape: (6362620, 11)
After shape: (6362620, 15)


In [9]:
# Show first rows of cleaned data and confirm the cleaned CSV is actually on disk.
try:
    display(cleaned.head())
except NameError:
    print('No cleaned dataframe found. Run the preprocessing cell first.')
else:
    # Same location the preprocessing cell passes to preprocess() as save_path.
    cleaned_file = os.path.abspath(os.path.join('..', 'data', 'processed', 'cleaned.csv'))
    # Only claim success when the file exists; having `cleaned` in memory does
    # not by itself prove that anything was written.
    if os.path.isfile(cleaned_file):
        print('Cleaned file saved to data/processed/cleaned.csv')
    else:
        print('Cleaned file not found at data/processed/cleaned.csv; check the save step.')


Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,757869,170136.0,160296.36,1662094,0.0,0.0,0,0,False,False,False,True,False
1,1,1864.28,2188998,21249.0,19384.72,1733924,0.0,0.0,0,0,False,False,False,True,False
2,1,181.0,1002156,181.0,0.0,439685,0.0,0.0,1,0,False,False,False,False,True
3,1,181.0,5828262,181.0,0.0,391696,21182.0,0.0,1,0,False,True,False,False,False
4,1,11668.14,3445981,41554.0,29885.86,828919,0.0,0.0,0,0,False,False,False,True,False


Cleaned file saved to data/processed/cleaned.csv
