# Importing Dataset from Kaggle

In [None]:
from google.colab import files

# files.upload() opens Colab's file picker and returns {filename: file_bytes}.
# Bind the result to a name instead of leaving the call as the cell's last
# expression: a bare call makes the notebook echo the returned dict as cell
# output, which for kaggle.json writes the API key into the saved notebook.
uploaded = files.upload()


Saving kaggle.json to kaggle (1).json


{'kaggle (1).json': b'{"username":"nicolelimty","key":"<REDACTED>"}'}

In [None]:
# Install the Kaggle API token: create ~/.kaggle if it does not exist, copy
# kaggle.json into it, and restrict the copied file to owner read/write (600).
# ~/.kaggle/kaggle.json is presumably where the kaggle CLI looks for
# credentials — confirm against the Kaggle API documentation.
# NOTE(review): the upload log in this notebook shows the file being saved as
# "kaggle (1).json", so the `kaggle.json` copied here would be an earlier
# upload — confirm it is the intended token.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# Download the `ismetsemedov/transactions` dataset with the kaggle CLI.
# In the recorded run the download was skipped because a more recently
# modified local transactions.zip already existed (the CLI suggests --force
# to re-download).
!kaggle datasets download -d ismetsemedov/transactions


Dataset URL: https://www.kaggle.com/datasets/ismetsemedov/transactions
License(s): apache-2.0
transactions.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
import zipfile

# Unpack every member of the downloaded archive into the "transactions"
# directory; the context manager closes the archive afterwards.
with zipfile.ZipFile("transactions.zip", mode="r") as archive:
    archive.extractall(path="transactions")


# **Pandas**

In [1]:
import pandas as pd
import time

# CSV read by the loading experiments in this notebook. The relative path
# points into the "transactions" directory; replace it with your actual file
# path if the data lives elsewhere (e.g. on Google Drive).
file_path = "transactions/synthetic_fraud_data.csv"

def memory_usage(df):
    """Return the memory footprint of ``df`` as a string such as ``"12.34 MB"``.

    The per-column byte counts are taken with ``deep=True``, summed, converted
    to mebibytes (1024**2 bytes) and formatted with two decimals.
    """
    total_bytes = df.memory_usage(deep=True).sum()
    megabytes = total_bytes / 1024**2
    return f"{megabytes:.2f} MB"


## No applied strategies

In [None]:
# Baseline: read the whole CSV with pandas defaults and record wall-clock
# load time and deep memory footprint for comparison with the other cells.
start_time = time.time()

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

end_time = time.time()

# Calculate time taken
time_taken = end_time - start_time

# Calculate memory usage in MB. The deep scan walks every object column, so it
# is done once here and the result is reused for the printout below rather
# than scanning the frame a second time.
memory_used = df.memory_usage(deep=True).sum() / (1024 * 1024)

print("No Applied Strategies")
print("Time:", time_taken, "seconds")
print("Memory:", f"{memory_used:.2f} MB")

No Applied Strategies
Time: 82.33181047439575 seconds
Memory: 9062.15 MB


## Column Selection

In [None]:
# Only these columns are parsed from the CSV; the rest are skipped at read time.
use_cols = [
    'transaction_id',
    'customer_id',
    'timestamp',
    'amount',
    'merchant_category',
    'high_risk_merchant',
    'is_fraud',
]

# Time just the read_csv call.
start_time = time.time()
df_less_data = pd.read_csv(file_path, usecols=use_cols)
end_time = time.time()

print("Load Less Data")
print("Time:", end_time - start_time, "seconds")
print("Memory:", memory_usage(df_less_data))

# Last expression of the cell: show the first five rows.
df_less_data.head(n=5)


Load Less Data
Time: 48.43134427070618 seconds
Memory: 2133.99 MB


Unnamed: 0,transaction_id,customer_id,timestamp,merchant_category,amount,high_risk_merchant,is_fraud
0,TX_a0ad2a2a,CUST_72886,2024-09-30 00:00:01.034820+00:00,Restaurant,294.87,False,False
1,TX_3599c101,CUST_70474,2024-09-30 00:00:01.764464+00:00,Entertainment,3368.97,True,True
2,TX_a9461c6d,CUST_10715,2024-09-30 00:00:02.273762+00:00,Grocery,102582.38,False,False
3,TX_7be21fc4,CUST_16193,2024-09-30 00:00:02.297466+00:00,Gas,630.6,False,False
4,TX_150f490b,CUST_87572,2024-09-30 00:00:02.544063+00:00,Healthcare,724949.27,False,True


## Chunking

In [None]:
chunk_size = 50000  # Number of rows per chunk

start_time = time.time()

# With chunksize set, read_csv returns a lazy reader that keeps the file open.
# Using it as a context manager closes the file as soon as the first chunk has
# been pulled, instead of abandoning an open reader by breaking out of a loop.
with pd.read_csv(file_path, chunksize=chunk_size) as reader:
    chunk = next(reader)  # Just read the first chunk

# Kept as a one-element list so code expecting `chunks[0]` keeps working.
chunks = [chunk]

end_time = time.time()

print("Chunking (First Chunk Only)")
print("Time:", end_time - start_time, "seconds")
print("Memory of first chunk:", memory_usage(chunks[0]))
print("First chunk size:", len(chunks[0]))


Chunking (First Chunk Only)
Time: 0.5931453704833984 seconds
Memory of first chunk: 60.63 MB
First chunk size: 50000


## Optimize Data Types

In [2]:
# Load the full CSV while overriding the inferred dtype of four columns, then
# report load time and memory in the same format as the other experiments.
start_time = time.time()

df_opt_pandas = pd.read_csv(
    file_path,
    dtype={
        'amount': 'float32',
        # NOTE(review): int32 assumes this column has no missing or fractional
        # values — confirm against the data.
        'distance_from_home': 'int32',
        'transaction_hour': 'int8',
        # NOTE(review): per the pandas docs, dtype='category' in read_csv
        # yields string categories, so this column would hold 'True'/'False'
        # strings rather than booleans — confirm this is intended.
        'is_fraud': 'category'
    }
)

end_time = time.time()

print("Pandas - Data Type Optimization")
print("Time:", end_time - start_time, "seconds")
# Use the shared helper so the figure is computed and formatted ("x.xx MB")
# exactly like the other cells, instead of repeating the formula inline.
print("Memory:", memory_usage(df_opt_pandas))

Pandas - Data Type Optimization
Time: 84.25377464294434 seconds
Memory: 8955.09714794159 MB


## Sampling

### Stratified Sampling

In [2]:
from sklearn.model_selection import train_test_split

# Read the full dataset and turn the label column into a categorical; both
# steps happen before the timer starts, so only the split itself is timed.
df_sample = pd.read_csv(file_path)
df_sample['is_fraud'] = df_sample['is_fraud'].astype('category')

start_time = time.time()

# Hold out 10% of the rows, stratified on the label so the held-out part keeps
# the fraud / non-fraud proportions; the seed is fixed for repeatability.
split_options = dict(
    test_size=0.1,
    stratify=df_sample['is_fraud'],
    random_state=42,
)
_, stratified_sample = train_test_split(df_sample, **split_options)

end_time = time.time()

print("Stratified Sampling")
print("Time:", end_time - start_time, "seconds")
print("Sample Size:", len(stratified_sample))
print("Fraud Rate:\n", stratified_sample['is_fraud'].value_counts(normalize=True))
print("Memory:", memory_usage(stratified_sample))


Stratified Sampling
Time: 37.037365436553955 seconds
Sample Size: 748377
Fraud Rate:
 is_fraud
False    0.800272
True     0.199728
Name: proportion, dtype: float64
Memory: 911.93 MB


In [3]:
# Select the rows labelled as fraud and report how many there are.
# The explicit comparison is kept because the sampling cell converts
# `is_fraud` to a categorical column, so it is not used directly as a mask.
fraud_mask = df_sample['is_fraud'] == True
fraudulent_transactions = df_sample.loc[fraud_mask]
print(len(fraudulent_transactions))


1494719
