## **Group Members:**
| Name                    | Matric Number |
|-------------------------|---------------|
| **GOH JING YANG** | A22EC0052     |
| **LOO JIA CHANG**           | A22EC0074     | **bold text**

## Pandas

In [1]:
#  Install Pandas (if not already installed)
!pip install pandas


Defaulting to user installation because normal site-packages is not writeable


In [17]:
#  Import libraries
import pandas as pd
import time
import psutil
import os

# Get process for memory tracking
process = psutil.Process(os.getpid())
mem_before = 0
mem_before = process.memory_info().rss / 1024 ** 2
start_time = time.time()


#  STEP 1: Load Full Dataset (Unoptimized)
df_pd = pd.read_csv("Transactions.csv", low_memory=False)

#  STEP 2: Inspect the Dataset Structure
print(" Dataset Shape:", df_pd.shape)
print(" Column Names:", df_pd.columns.tolist())
print("\n Data Types:\n", df_pd.dtypes)

#  STEP 3: Basic Data Cleaning (No Optimization Applied)
df_pd = df_pd.drop_duplicates()
df_pd = df_pd.dropna()

# Record memory usage after processing
mem_after =0
mem_after = process.memory_info().rss / 1024 ** 2
# Record end time for execution measurement
end_time = time.time()


 Dataset Shape: (7483766, 24)
 Column Names: ['transaction_id', 'customer_id', 'card_number', 'timestamp', 'merchant_category', 'merchant_type', 'merchant', 'amount', 'currency', 'country', 'city', 'city_size', 'card_type', 'card_present', 'device', 'channel', 'device_fingerprint', 'ip_address', 'distance_from_home', 'high_risk_merchant', 'transaction_hour', 'weekend_transaction', 'velocity_last_hour', 'is_fraud']

 Data Types:
 transaction_id          object
customer_id             object
card_number              int64
timestamp               object
merchant_category       object
merchant_type           object
merchant                object
amount                 float64
currency                object
country                 object
city                    object
city_size               object
card_type               object
card_present              bool
device                  object
channel                 object
device_fingerprint      object
ip_address              object
distance_

In [7]:
# Print Preview Data
print("\n Data Preview:\n", df_pd.head())


 Data Preview:
   transaction_id customer_id       card_number  \
0    TX_a0ad2a2a  CUST_72886  6646734767813109   
1    TX_3599c101  CUST_70474   376800864692727   
2    TX_a9461c6d  CUST_10715  5251909460951913   
3    TX_7be21fc4  CUST_16193   376079286931183   
4    TX_150f490b  CUST_87572  6172948052178810   

                          timestamp merchant_category merchant_type  \
0  2024-09-30 00:00:01.034820+00:00        Restaurant     fast_food   
1  2024-09-30 00:00:01.764464+00:00     Entertainment        gaming   
2  2024-09-30 00:00:02.273762+00:00           Grocery      physical   
3  2024-09-30 00:00:02.297466+00:00               Gas         major   
4  2024-09-30 00:00:02.544063+00:00        Healthcare       medical   

         merchant     amount currency    country  ...   device channel  \
0       Taco Bell     294.87      GBP         UK  ...  iOS App  mobile   
1           Steam    3368.97      BRL     Brazil  ...     Edge     web   
2     Whole Foods  102582.38     

In [19]:
#  STEP 4: Display Performance Metrics
print("\n Performance Measurement")
print(f"Execution Time: {end_time - start_time:.2f} seconds")
print(f"Memory Used: {mem_after - mem_before:.2f} MB")


 Performance Measurement
Execution Time: 259.60 seconds
Memory Used: 1564.06 MB


## Dask

In [15]:
#  Install Dask (if not already installed)
!pip install dask

Defaulting to user installation because normal site-packages is not writeable


In [1]:
# Import libraries for Dask
import dask.dataframe as dd
import time
import psutil
import os

# Get process for memory tracking
process = psutil.Process(os.getpid())
mem_after = 0
mem_before = 0
mem_before = process.memory_info().rss / 1024 ** 2
start_time = time.time()

#Define the required columns    
required_columns = [
    "transaction_id", 
    "customer_id", 
    "card_number", 
    "timestamp", 
    "merchant_category", 
    "merchant_type", 
    "merchant", 
    "amount", 
    "currency"
]

#  STEP 1: Load Full Dataset (Unoptimized)
df_dask = dd.read_csv("Transactions.csv" ,usecols=required_columns)

#  STEP 2: Inspect the Dataset Structure
print(" Dataset Shape:", df_dask.shape)
print(" Column Names:", df_dask.columns.tolist())
print("\n Data Types:\n", df_dask.dtypes)

# Optimize data types
df_dask = df_dask.astype({
    "transaction_id": "int64",
    "customer_id": "int64",
    "card_number": "object",
    "timestamp": "datetime64[ns]",
    "merchant_category": "category",
    "merchant_type": "category",
    "merchant": "object",
    "amount": "float64",
    "currency": "category"
})

#  STEP 3: Basic Data Cleaning (No Optimization Applied)
df_dask = df_dask.drop_duplicates()
df_dask = df_dask.dropna()

# Record memory usage after processing
mem_after = process.memory_info().rss / 1024 ** 2
# Record end time for execution measurement
end_time = time.time()



 Dataset Shape: (<dask_expr.expr.Scalar: expr=ReadCSV(accb4a7).size() // 9, dtype=int32>, 9)
 Column Names: ['transaction_id', 'customer_id', 'card_number', 'timestamp', 'merchant_category', 'merchant_type', 'merchant', 'amount', 'currency']

 Data Types:
 transaction_id       string[pyarrow]
customer_id          string[pyarrow]
card_number                    int64
timestamp            string[pyarrow]
merchant_category    string[pyarrow]
merchant_type        string[pyarrow]
merchant             string[pyarrow]
amount                       float64
currency             string[pyarrow]
dtype: object


In [2]:
#  STEP 4: Display Memory Usage and Execution Time
print("\n Performance Measurement")
print(f"Execution Time: {end_time - start_time:.2f} seconds")
print(f"Memory Used: {mem_after - mem_before:.2f} MB")


 Performance Measurement
Execution Time: 2.29 seconds
Memory Used: 16.00 MB


## Polars

In [16]:
#  Install Polars (if not already installed)
!pip install polars

Defaulting to user installation because normal site-packages is not writeable


In [9]:
# Import libraries for Polars
import polars as pl
import time
import psutil
import os

# Get process for memory tracking
process = psutil.Process(os.getpid())
mem_after = 0
mem_before = 0
mem_before = process.memory_info().rss / 1024 ** 2
start_time = time.time()

required_columns = [
    "transaction_id", 
    "customer_id", 
    "card_number", 
    "timestamp", 
    "merchant_category", 
    "merchant_type", 
    "merchant", 
    "amount", 
    "currency"
]

#  STEP 1: Load Full Dataset (Unoptimized)
df_polars = pl.read_csv("Transactions.csv",columns=required_columns)

#  STEP 2: Inspect the Dataset Structure
print(" Dataset Shape:", df_polars.shape)
print(" Column Names:", df_polars.columns)
print("\n Data Types:\n", df_polars.dtypes)

# Optimize data types
df_polars = df_polars.with_columns([
    pl.col("transaction_id").cast(pl.Utf8),  # Keep as string since it contains "TX_" prefix
    pl.col("customer_id").cast(pl.Utf8),     # Keep as string since it contains "CUST_" prefix
    pl.col("card_number").cast(pl.Utf8),
    pl.col("timestamp").str.replace(r"\+00:00$", "").str.to_datetime(format="%Y-%m-%d %H:%M:%S%.f"),  # Remove timezone and parse
    pl.col("merchant_category").cast(pl.Categorical),
    pl.col("merchant_type").cast(pl.Categorical),
    pl.col("merchant").cast(pl.Utf8),
    pl.col("amount").cast(pl.Float64),
    pl.col("currency").cast(pl.Categorical)
])


#  STEP 3: Basic Data Cleaning (No Optimization Applied)
df_polars = df_polars.unique()
df_polars = df_polars.drop_nulls()


# Record memory usage after processing
mem_after = process.memory_info().rss / 1024 ** 2
# Record end time for execution measurement
end_time = time.time()



 Dataset Shape: (7483766, 9)
 Column Names: ['transaction_id', 'customer_id', 'card_number', 'timestamp', 'merchant_category', 'merchant_type', 'merchant', 'amount', 'currency']

 Data Types:
 [String, String, Int64, String, String, String, String, Float64, String]


In [11]:
#  STEP 4: Display Memory Usage and Execution Time
print("\n Performance Measurement")
print(f"Execution Time: {end_time - start_time:.2f} seconds")
print(f"Memory Used: {mem_after - mem_before:.2f} MB")


 Performance Measurement
Execution Time: 17.87 seconds
Memory Used: 2425.53 MB
