# Import from Kaggle

In [1]:
from google.colab import files
files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"nicolelimty","key":"95d2d88da306148c6583aaa87fe6df73"}'}

In [2]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [3]:
!kaggle datasets download -d ismetsemedov/transactions


Dataset URL: https://www.kaggle.com/datasets/ismetsemedov/transactions
License(s): apache-2.0
Downloading transactions.zip to /content
 93% 702M/754M [00:02<00:00, 282MB/s]
100% 754M/754M [00:02<00:00, 300MB/s]


In [4]:
import zipfile

with zipfile.ZipFile("transactions.zip", "r") as zip_ref:
    zip_ref.extractall("transactions")


# Dask Parallel Processing

In [8]:
import dask.dataframe as dd
import time
file_path = "transactions/synthetic_fraud_data.csv"

In [9]:
# Load with Dask
start_time_dask = time.time()
df_dask = dd.read_csv(file_path)
#df_dask_computed = df_dask.compute() # Trigger computation to bring data into memory
end_time_dask = time.time()

print("Load with Dask:")
print("Time:", end_time_dask - start_time_dask, "seconds")
#print("Memory:", memory_usage(df_dask))

Load with Dask:
Time: 0.0668489933013916 seconds


In [10]:
df_dask.head()

Unnamed: 0,transaction_id,customer_id,card_number,timestamp,merchant_category,merchant_type,merchant,amount,currency,country,...,device,channel,device_fingerprint,ip_address,distance_from_home,high_risk_merchant,transaction_hour,weekend_transaction,velocity_last_hour,is_fraud
0,TX_a0ad2a2a,CUST_72886,6646734767813109,2024-09-30 00:00:01.034820+00:00,Restaurant,fast_food,Taco Bell,294.87,GBP,UK,...,iOS App,mobile,e8e6160445c935fd0001501e4cbac8bc,197.153.60.199,0,False,0,False,"{'num_transactions': 1197, 'total_amount': 334...",False
1,TX_3599c101,CUST_70474,376800864692727,2024-09-30 00:00:01.764464+00:00,Entertainment,gaming,Steam,3368.97,BRL,Brazil,...,Edge,web,a73043a57091e775af37f252b3a32af9,208.123.221.203,1,True,0,False,"{'num_transactions': 509, 'total_amount': 2011...",True
2,TX_a9461c6d,CUST_10715,5251909460951913,2024-09-30 00:00:02.273762+00:00,Grocery,physical,Whole Foods,102582.38,JPY,Japan,...,Firefox,web,218864e94ceaa41577d216b149722261,10.194.159.204,0,False,0,False,"{'num_transactions': 332, 'total_amount': 3916...",False
3,TX_7be21fc4,CUST_16193,376079286931183,2024-09-30 00:00:02.297466+00:00,Gas,major,Exxon,630.6,AUD,Australia,...,iOS App,mobile,70423fa3a1e74d01203cf93b51b9631d,17.230.177.225,0,False,0,False,"{'num_transactions': 764, 'total_amount': 2201...",False
4,TX_150f490b,CUST_87572,6172948052178810,2024-09-30 00:00:02.544063+00:00,Healthcare,medical,Medical Center,724949.27,NGN,Nigeria,...,Chrome,web,9880776c7b6038f2af86bd4e18a1b1a4,136.241.219.151,1,False,0,False,"{'num_transactions': 218, 'total_amount': 4827...",True


## Columns Selection

In [None]:

use_cols = [
    'transaction_id', 'customer_id', 'timestamp', 'high_risk_merchant',
    'amount', 'merchant_category', 'is_fraud'
]

start_time = time.time()

# Load selected columns using Dask (parallel reading)
df_less_data = dd.read_csv(file_path, usecols=use_cols)

# Compute the Dask DataFrame to bring it into memory
#df_less_data_computed = df_less_data.compute()

end_time = time.time()

# Check memory usage using pandas
#memory = df_less_data_computed.memory_usage(deep=True).sum() / 1024**2

print("Load Less Data (Dask - Parallel)")
print("Time:", end_time - start_time, "seconds")
#print(f"Memory: {memory:.2f} MB")
#df_less_data_computed.head()


Load Less Data (Dask - Parallel)
Time: 0.025038480758666992 seconds


## Chunking

In [None]:
#Dask's blocksize is based on file size (bytes)
total_rows = 7483766
desired_chunk_rows = 50000

# 1. Get the total file size in bytes
file_size_bytes = os.path.getsize(file_path)

# 2. Calculate the average row size in bytes
average_row_size_bytes = file_size_bytes / total_rows

# 3. Estimate the blocksize for the desired number of rows
estimated_blocksize = average_row_size_bytes * desired_chunk_rows

print(estimated_blocksize)

19603980.24203322


In [None]:
start_time = time.time()

# Read CSV using Dask with the estimated blocksize
df_dask = dd.read_csv(file_path, int(estimated_blocksize))

# Get only the first partition
#first_partition = df_dask.get_partition(0).compute()
first_partition = df_dask.get_partition(0)

end_time = time.time()

print(f"Chunking (Dask - First Partition with Estimated Blocksize of {estimated_blocksize:.2f} bytes)")
print("Time:", end_time - start_time, "seconds")


#print("Memory of first chunk:", memory_usage(first_partition))
print("First chunk size:", len(first_partition))

Chunking (Dask - First Partition with Estimated Blocksize of 19603980.24 bytes)
Time: 0.07792377471923828 seconds
First chunk size: 50102


## Optimized Datatype

In [None]:
df_opt = dd.read_csv(file_path)
df_opt.dtypes

Unnamed: 0,0
transaction_id,string[pyarrow]
customer_id,string[pyarrow]
card_number,int64
timestamp,string[pyarrow]
merchant_category,string[pyarrow]
merchant_type,string[pyarrow]
merchant,string[pyarrow]
amount,float64
currency,string[pyarrow]
country,string[pyarrow]


In [None]:
start_time = time.time()

# Load the dataset
df_opt = dd.read_csv(file_path)

# Convert eligible 'object' columns to 'category'
for col in df_opt.select_dtypes(include='object').columns:
    nunique = df_opt[col].nunique()
    total = len(df_opt[col])
    if nunique / total < 0.5:
        df_opt[col] = df_opt[col].astype('category')

# Downcast numeric columns
df_opt['amount'] = df_opt['amount'].astype('float32')
df_opt['distance_from_home'] = df_opt['distance_from_home'].astype('int32')
df_opt['transaction_hour'] = df_opt['transaction_hour'].astype('int32')

# Trigger computation
#df_opt_pd = df_opt.compute()

end_time = time.time()

# Show results
print("Optimize Data Types (Dask)")
print("Time:", end_time - start_time, "seconds")
#print("Memory Usage:", df_opt_pd.memory_usage(deep=True).sum() / 1024 ** 2, "MB")

Optimize Data Types (Dask)
Time: 0.02937626838684082 seconds


In [None]:
df_opt.dtypes

Unnamed: 0,0
transaction_id,string[pyarrow]
customer_id,string[pyarrow]
card_number,int64
timestamp,string[pyarrow]
merchant_category,string[pyarrow]
merchant_type,string[pyarrow]
merchant,string[pyarrow]
amount,float32
currency,string[pyarrow]
country,string[pyarrow]


## Sampling

In [None]:
start_time = time.time()

# Define a function to sample from a Pandas DataFrame
def sample_group(df_group):
    # Sample 10% from each group. Adjust 'frac' as needed.
    return df_group.sample(frac=0.1, random_state=42)

fraud_df_dask = df_dask[df_dask['is_fraud'] == 1]
not_fraud_df_dask = df_dask[df_dask['is_fraud'] == 0]

# Sample from each group. Adjust frac to get the desired overall sample size and stratification.
# For a 10% overall sample with preserved ratio, you'd sample 10% from each.
sampled_fraud_dask = fraud_df_dask.sample(frac=0.1, random_state=42)
sampled_not_fraud_dask = not_fraud_df_dask.sample(frac=0.1, random_state=42)

# Concatenate the samples
stratified_sample_dask = dd.concat([sampled_fraud_dask, sampled_not_fraud_dask])

# Trigger computation
#stratified_sample_computed = stratified_sample_dask.compute()

end_time = time.time()

print("Stratified Sampling (Dask - Conceptual)")
print("Time:", end_time - start_time, "seconds")
#print("Sample Size:", len(stratified_sample_computed))
print("Sample Size:", len(stratified_sample_dask))
#print("Fraud Rate:\n", stratified_sample_computed['is_fraud'].value_counts(normalize=True))
#print("Memory:", memory_usage(stratified_sample_computed))

Stratified Sampling (Dask - Conceptual)
Time: 0.01505422592163086 seconds
Sample Size: 748384
