# Importing Dataset from Kaggle

In [None]:
from google.colab import files

# Upload kaggle.json (the Kaggle API token) through the Colab file picker.
# files.upload() returns a {filename: file-bytes} dict. Binding it to a name
# keeps the cell from echoing that dict, which would otherwise write the
# token's contents (username + API key) into the saved notebook output.
uploaded = files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"nicolelimty","key":"<REDACTED>"}'}

In [None]:
# Create the ~/.kaggle directory (-p: no error if it already exists).
!mkdir -p ~/.kaggle
# Copy the uploaded kaggle.json token into that directory.
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# Download the ismetsemedov/transactions dataset with the Kaggle CLI;
# it arrives as transactions.zip in the current working directory.
!kaggle datasets download -d ismetsemedov/transactions


Dataset URL: https://www.kaggle.com/datasets/ismetsemedov/transactions
License(s): apache-2.0
Downloading transactions.zip to /content
 98% 739M/754M [00:02<00:00, 245MB/s]
100% 754M/754M [00:02<00:00, 303MB/s]


In [None]:
import zipfile

# Unpack every member of the downloaded archive into the "transactions" folder.
# The context manager closes the archive once extraction finishes.
with zipfile.ZipFile("transactions.zip", mode="r") as archive:
    archive.extractall(path="transactions")


# **Polars**

In [None]:
pip install polars




In [None]:
import time

import polars as pl

# Location of the CSV extracted from the Kaggle archive.
# Point this at a different path (e.g. a mounted Google Drive folder) if the
# file lives elsewhere.
file_path = "transactions/synthetic_fraud_data.csv"


In [None]:
# Baseline: read the whole CSV with default settings, then report load time
# and in-memory size.
# perf_counter() is the clock meant for measuring elapsed intervals; unlike
# time.time() it is monotonic and unaffected by system clock adjustments.
start_time = time.perf_counter()

# Load the CSV file into a Polars DataFrame
df_polars = pl.read_csv(file_path)

end_time = time.perf_counter()

print("Polars")
print("Time:", end_time - start_time, "seconds")
# estimated_size('mb') gives the frame's estimated footprint in megabytes
print("Memory:", df_polars.estimated_size('mb'), "MB")

print(df_polars.head())

Polars
Time: 11.984588861465454 seconds
Memory: 2528.1607761383057 MB
shape: (5, 24)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ transacti ┆ customer_ ┆ card_numb ┆ timestamp ┆ … ┆ transacti ┆ weekend_t ┆ velocity_ ┆ is_fraud │
│ on_id     ┆ id        ┆ er        ┆ ---       ┆   ┆ on_hour   ┆ ransactio ┆ last_hour ┆ ---      │
│ ---       ┆ ---       ┆ ---       ┆ str       ┆   ┆ ---       ┆ n         ┆ ---       ┆ bool     │
│ str       ┆ str       ┆ i64       ┆           ┆   ┆ i64       ┆ ---       ┆ str       ┆          │
│           ┆           ┆           ┆           ┆   ┆           ┆ bool      ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ TX_a0ad2a ┆ CUST_7288 ┆ 664673476 ┆ 2024-09-3 ┆ … ┆ 0         ┆ false     ┆ {'num_tra ┆ false    │
│ 2a        ┆ 6         ┆ 7813109   ┆ 0 00:00:0 ┆   ┆           ┆           ┆ nsactions ┆          │
│     

## Columns Selection

In [None]:
# Load only the columns needed for analysis so the remaining columns are never
# materialised in memory.
# perf_counter() is the clock meant for measuring elapsed intervals; unlike
# time.time() it is monotonic and unaffected by system clock adjustments.
start_time = time.perf_counter()

df_less_data_polars = pl.read_csv(
    file_path,
    columns=['transaction_id', 'customer_id', 'timestamp', 'amount', 'high_risk_merchant', 'merchant_category', 'is_fraud']
)

end_time = time.perf_counter()

print("Polars - Load Less Data")
print("Time:", end_time - start_time, "seconds")
print("Memory:", df_less_data_polars.estimated_size('mb'), "MB")

# Preview the first rows of the reduced frame (rich display as the cell's last expression).
df_less_data_polars.head()

Polars - Load Less Data
Time: 6.203439474105835 seconds
Memory: 494.2481870651245 MB


transaction_id,customer_id,timestamp,merchant_category,amount,high_risk_merchant,is_fraud
str,str,str,str,f64,bool,bool
"""TX_a0ad2a2a""","""CUST_72886""","""2024-09-30 00:00:01.034820+00:…","""Restaurant""",294.87,False,False
"""TX_3599c101""","""CUST_70474""","""2024-09-30 00:00:01.764464+00:…","""Entertainment""",3368.97,True,True
"""TX_a9461c6d""","""CUST_10715""","""2024-09-30 00:00:02.273762+00:…","""Grocery""",102582.38,False,False
"""TX_7be21fc4""","""CUST_16193""","""2024-09-30 00:00:02.297466+00:…","""Gas""",630.6,False,False
"""TX_150f490b""","""CUST_87572""","""2024-09-30 00:00:02.544063+00:…","""Healthcare""",724949.27,False,True


## Chunking

In [None]:
# perf_counter() is the clock meant for measuring elapsed intervals; unlike
# time.time() it is monotonic and unaffected by system clock adjustments.
start_time = time.perf_counter()

# Stand-in for chunked processing: read just the first 50,000 rows.
# This loads a single slice only; it does not iterate over the rest of the file.
chunk = pl.read_csv(file_path, n_rows=50000)

end_time = time.perf_counter()

print("Polars - Simulated Chunking (First 50k rows)")
print("Time:", end_time - start_time, "seconds")
print("Memory:", chunk.estimated_size('mb'), "MB")


Polars - Simulated Chunking (First 50k rows)
Time: 3.688019275665283 seconds
Memory: 16.971336364746094 MB


## Optimize Data Type

In [None]:
# perf_counter() is the clock meant for measuring elapsed intervals; unlike
# time.time() it is monotonic and unaffected by system clock adjustments.
start_time = time.perf_counter()

# Downcast numeric columns to narrower types after loading.
# is_fraud is deliberately left untouched: it is already parsed as Boolean,
# and casting it to Categorical would make the column larger, not smaller.
# NOTE(review): Int32 / Int8 assume the column values fit those ranges
# (transaction_hour presumably 0-23) - confirm against the data.
df_opt_polars = pl.read_csv(file_path).with_columns([
    pl.col("amount").cast(pl.Float32),
    pl.col("distance_from_home").cast(pl.Int32),
    pl.col("transaction_hour").cast(pl.Int8),
])

end_time = time.perf_counter()

print("Polars - Data Type Optimization")
print("Time:", end_time - start_time, "seconds")
print("Memory:", df_opt_polars.estimated_size('mb'), "MB")


Polars - Data Type Optimization
Time: 14.409408569335938 seconds
Memory: 2448.760739326477 MB


## Sampling

In [None]:
# perf_counter() is the clock meant for measuring elapsed intervals; unlike
# time.time() it is monotonic and unaffected by system clock adjustments.
start_time = time.perf_counter()

df_sample_polars = pl.read_csv(file_path)

# Manual stratified sample: split the rows by class, draw 10% from each with a
# fixed seed, then stack the two samples so the class ratio is preserved.
# is_fraud is a Boolean column, so filter on it directly instead of comparing
# it to the strings "true"/"false" (which depends on implicit type coercion).
fraud_true = df_sample_polars.filter(pl.col("is_fraud")).sample(fraction=0.1, seed=42)
fraud_false = df_sample_polars.filter(~pl.col("is_fraud")).sample(fraction=0.1, seed=42)

stratified_sample_polars = pl.concat([fraud_true, fraud_false])

end_time = time.perf_counter()

print("Polars - Stratified Sampling (Manual)")
print("Time:", end_time - start_time, "seconds")
print("Sample Size:", stratified_sample_polars.height)
print("Fraud Distribution:\n", stratified_sample_polars["is_fraud"].value_counts())
print("Memory:", stratified_sample_polars.estimated_size('mb'), "MB")


Polars - Stratified Sampling (Manual)
Time: 17.19397258758545 seconds
Sample Size: 748375
Fraud Distribution:
 shape: (2, 2)
┌──────────┬────────┐
│ is_fraud ┆ count  │
│ ---      ┆ ---    │
│ bool     ┆ u32    │
╞══════════╪════════╡
│ false    ┆ 598904 │
│ true     ┆ 149471 │
└──────────┴────────┘
Memory: 252.81647300720215 MB
