## Importing Libraries

In [0]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
np.random.seed(42)

### Pandas memory usage

In [0]:
def generate_synthetic_data(num_records) -> pd.DataFrame:
    """
    Generates a synthetic dataset of customer transactions.

    Args:
        num_records (int): The number of records to generate.

    Returns:
        pd.DataFrame: A DataFrame containing the synthetic data with the following columns:
            - customer_id (int): A randomly generated customer ID.
            - category (str): A randomly chosen category from a predefined list.
            - amount (float): A transaction amount generated from a normal distribution with a mean of 100 and a standard deviation of 50.
            - transaction_date (datetime): A transaction date randomly chosen within the last year.
    """ 
    categories = ['Electronics', 'Clothing', 'Food', 'Books', 'Home']
    customer_ids = np.random.randint(1, 10000, num_records)
    categories = np.random.choice(categories, num_records)
    amounts = np.random.normal(100, 50, num_records)  # Mean of 100, std dev of 50
    dates = [datetime.now() - timedelta(days=int(x)) for x in np.random.randint(0, 365, num_records)]

    return pd.DataFrame({
        'customer_id': customer_ids,
        'category': categories,
        'amount': amounts,
        'transaction_date': dates
    })


In [0]:
generate_synthetic_data(15)

Unnamed: 0,customer_id,category,amount,transaction_date
0,7271,Books,107.12323,2024-12-26 23:27:18.782482
1,861,Home,98.267391,2024-11-04 23:27:18.782494
2,5391,Electronics,156.716963,2024-06-03 23:27:18.782495
3,5192,Books,94.762723,2024-04-14 23:27:18.782497
4,5735,Clothing,73.743857,2024-08-11 23:27:18.782498
5,6266,Home,195.638564,2024-07-15 23:27:18.782499
6,467,Books,-1.335981,2025-01-05 23:27:18.782500
7,4427,Electronics,155.971181,2024-09-09 23:27:18.782502
8,5579,Electronics,138.959632,2024-07-31 23:27:18.782503
9,8323,Food,44.945112,2024-05-24 23:27:18.782505


In [0]:
def compare_pandas_storage(sizes: list[int], column_info: bool = False) -> None:
    """
    Compares the memory usage of pandas DataFrames with different sizes of synthetic data.
    This function generates synthetic data for various sizes.

    The sizes of the datasets are defined in the `sizes` list, and the memory usage is
    reported in megabytes (MB).
    """
    for size in sizes:
        start_time = time.time()
        df = generate_synthetic_data(size)
        end_time = time.time()
        creation_time = end_time - start_time
        
        memory_usage = df.memory_usage(deep=True).sum() / (1024 * 1024)  # Convert to MB
        print(f"\nDataset with {size:,} records:")
        print(f"Memory usage: {memory_usage:.2f} MB")
        print(f"Data creation time: {creation_time:.2f} seconds")
        
        if column_info:
            print("\nMemory usage by column:")
            for column in df.columns:
                column_memory = df[column].memory_usage(deep=True) / (1024 * 1024)
                print(f"{column}: {column_memory:.2f} MB")

def pandas_analysis(df):
    """Perform analysis using Pandas"""
    start_time = time.time()
    
    # Calculate average amount by category
    result = df.groupby('category')['amount'].agg(['mean', 'count']).reset_index()
    print(result)
    
    end_time = time.time()
    processing_time = end_time - start_time
    
    return result, processing_time

In [0]:
compare_pandas_storage([10_000, 100_000, 1_000_000])



Dataset with 10,000 records:
Memory usage: 0.83 MB
Data creation time: 0.02 seconds

Dataset with 100,000 records:
Memory usage: 8.33 MB
Data creation time: 0.23 seconds

Dataset with 1,000,000 records:
Memory usage: 83.35 MB
Data creation time: 2.15 seconds


In [0]:


print("Starting performance comparison...\n")
sizes = [1_000_000, 10_000_000, 100_000_000, 1_000_000_000] # 1M (84 MB), 10M (840 MB), 100M (8.4 GB), 1B (84 GB)
for size in sizes:
    print(f"\nGenerating {size:,} records...")
    
    # Pandas Analysis
    print(f"Running Pandas analysis... for {size:,} records")
    pandas_df = generate_synthetic_data(size)
    pandas_result, pandas_time = pandas_analysis(pandas_df)
    print(f"Pandas processing time for {size:,} records: {pandas_time:.2f} seconds")
    print("Pandas Results:")
    print(pandas_result)


[0;31m---------------------------------------------------------------------------[0m
[0;31mThe Python process exited with exit code 137 (SIGKILL: Killed). This may have been caused by an OOM error. Check your command's memory usage.[0m
[0;31m[0m
[0;31m[0m
[0;31m[0m
[0;31mThe last 10 KB of the process's stderr and stdout can be found below. See driver logs for full logs.[0m
[0;31m---------------------------------------------------------------------------[0m
[0;31mLast messages on stderr:[0m
[0;31mMon Feb 17 23:24:10 2025 Connection to spark from PID  4145[0m
[0;31mMon Feb 17 23:24:10 2025 Initialized gateway on port 40233[0m
[0;31mMon Feb 17 23:24:10 2025 Connected to spark.[0m
[0;31m---------------------------------------------------------------------------[0m
[0;31mLast messages on stdout:[0m
[0;31mNOTE: When using the `ipython kernel` entry point, Ctrl-C will not work.[0m
[0;31m[0m
[0;31mTo exit, you will have to explicitly quit this process, by either 