In [None]:
import gzip  # <-- 1. IMPORT THE GZIP MODULE
import os
import shutil
import threading
import time

import dask.dataframe as dd
import numpy as np
import pandas as pd

# --- Step 1: Generate a Large CSV File ---
# Builds 'large_dataset.csv' with the columns id, timestamp, category, value.
# Rows are created and written one chunk at a time, so memory use is bounded
# by chunk_size rather than by total_rows.
print("Generating a large CSV file...")

# Total number of rows to write and how many rows to build per chunk
total_rows = 100_000_000
chunk_size = 500_000
# Ceiling division: a final, shorter chunk covers any remainder so that no
# rows are dropped when total_rows is not a multiple of chunk_size.
num_chunks = -(-total_rows // chunk_size)

# Labels sampled at random for the 'category' column
categories = ['A', 'B', 'C', 'D', 'E']

columns = ['id', 'timestamp', 'category', 'value']
# Timestamp of row 0; row n is stamped base_timestamp + n seconds
base_timestamp = pd.Timestamp('2023-01-01')

# newline='' disables newline translation on the handle, as the pandas docs
# ask for when a text-mode file object is handed to to_csv.
with open('large_dataset.csv', 'w', newline='') as f:
    # Header row, terminated the same way to_csv terminates data rows
    f.write(','.join(columns) + os.linesep)

    for i in range(num_chunks):
        first_id = i * chunk_size
        rows_in_chunk = min(chunk_size, total_rows - first_id)

        chunk_df = pd.DataFrame({
            'id': range(first_id, first_id + rows_in_chunk),
            # Continue the time series where the previous chunk stopped
            # instead of restarting at base_timestamp for every chunk.
            'timestamp': pd.date_range(
                start=base_timestamp + pd.Timedelta(seconds=first_id),
                periods=rows_in_chunk,
                freq='s',
            ),
            'category': np.random.choice(categories, rows_in_chunk),
            'value': np.random.rand(rows_in_chunk) * 100,
        })
        # Append the chunk; the header has already been written above
        chunk_df.to_csv(f, header=False, index=False)
        print(f"Generated chunk {i+1}/{num_chunks}")

print("\n'large_dataset.csv' created successfully.")


# --- Step 2: Create the compressed file without loading the whole original file into memory ---
print("\nStep 2: Compressing the file (this may take a while)...")
# Stream the raw bytes of the CSV straight into a gzip file. copyfileobj moves
# the data in fixed-size buffers, so memory use stays small, and because the
# rows are never parsed and re-serialised the decompressed content is
# byte-for-byte identical to 'large_dataset.csv'.
with open('large_dataset.csv', 'rb') as f_in, \
        gzip.open('large_dataset.csv.gz', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

print("File compressed to 'large_dataset.csv.gz'")
# Both sizes are expressed in GB (bytes / 1024**3); later cells print them
original_size = os.path.getsize('large_dataset.csv') / (1024**3) # in GB
compressed_size = os.path.getsize('large_dataset.csv.gz') / (1024**3) # in GB
print(f"Original file size: {original_size:.2f} GB")
print(f"Compressed file size: {compressed_size:.2f} GB")

Generating a large CSV file...
Generated chunk 1/200
Generated chunk 2/200
Generated chunk 3/200
Generated chunk 4/200
Generated chunk 5/200
Generated chunk 6/200
Generated chunk 7/200
Generated chunk 8/200
Generated chunk 9/200
Generated chunk 10/200
Generated chunk 11/200
Generated chunk 12/200
Generated chunk 13/200
Generated chunk 14/200
Generated chunk 15/200
Generated chunk 16/200
Generated chunk 17/200
Generated chunk 18/200
Generated chunk 19/200
Generated chunk 20/200
Generated chunk 21/200
Generated chunk 22/200
Generated chunk 23/200
Generated chunk 24/200
Generated chunk 25/200
Generated chunk 26/200
Generated chunk 27/200
Generated chunk 28/200
Generated chunk 29/200
Generated chunk 30/200
Generated chunk 31/200
Generated chunk 32/200
Generated chunk 33/200
Generated chunk 34/200
Generated chunk 35/200
Generated chunk 36/200
Generated chunk 37/200
Generated chunk 38/200
Generated chunk 39/200
Generated chunk 40/200
Generated chunk 41/200
Generated chunk 42/200
Generated ch

In [None]:
!pip install psutil



In [None]:
import psutil

def get_memory_usage():
    """Report how much resident memory (RSS) this process holds, in MB.

    Returns:
        The resident set size reported by psutil for the current PID,
        divided by 1024 ** 2.
    """
    bytes_per_mb = 1024 ** 2
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / bytes_per_mb
# --- Method 1: Using Chunking ---
# Stream the CSV in pieces of 50,000 rows and keep running totals, so the mean
# of 'value' is obtained without ever holding the full file in memory.
print("\n--- Method 1: Using pandas.read_csv(chunksize) ---")
start_time = time.time()
initial_mem = get_memory_usage()
peak_mem = initial_mem
total_sum = 0
total_count = 0

chunk_iterator = pd.read_csv('large_dataset.csv', chunksize=50000)
for chunk in chunk_iterator:
    total_sum += chunk['value'].sum()
    total_count += len(chunk)
    # Sample RSS once per chunk and remember the largest reading so far
    current_mem = get_memory_usage()
    peak_mem = max(peak_mem, current_mem)

average_value = total_sum / total_count
end_time = time.time()

# Report the result together with its time, storage and memory cost
for report_line in (
    f"Calculated Average Value: {average_value:.4f}",
    f"Execution Time: {end_time - start_time:.2f} seconds",
    f"Storage Impact: Uses the original file size on disk ({original_size:.2f}) GB.",
    f"Peak RAM used: {peak_mem - initial_mem:.2f} MB",
    "Memory Impact: Very low. Only one chunk is in memory at a time.",
):
    print(report_line)


--- Method 1: Using pandas.read_csv(chunksize) ---
Execution Time: 66.49 seconds
Calculated Average Value: 50.0031
Storage Impact: Uses the original file size on disk (4.57) GB.
Peak RAM used: 3.93 MB
Memory Impact: Very low. Only one chunk is in memory at a time.


In [None]:
# --- Method 2: Using Dask ---
# Compute the mean of 'value' with Dask, tracking the highest RSS observed
# while the computation is running.
print("\n--- Method 2: Using Dask ---")
start_time = time.time()
initial_mem = get_memory_usage()
peak_mem = initial_mem

# A single RSS reading taken after .compute() returns is not a peak: memory
# used for intermediate data may already have been released by then. Poll RSS
# from a background thread for the duration of the computation instead.
_stop_sampling = threading.Event()


def _track_peak_memory(poll_seconds=0.05):
    """Poll RSS until told to stop, keeping the maximum in the global peak_mem."""
    global peak_mem
    while not _stop_sampling.is_set():
        peak_mem = max(peak_mem, get_memory_usage())
        _stop_sampling.wait(poll_seconds)


_sampler = threading.Thread(target=_track_peak_memory, daemon=True)
_sampler.start()
try:
    # Dask reads the CSV file and creates a Dask DataFrame (lazy evaluation)
    ddf = dd.read_csv('large_dataset.csv')

    # Perform the aggregation. This is still lazy.
    average_value_dask = ddf['value'].mean()

    # .compute() triggers the actual calculation
    average_value = average_value_dask.compute()
finally:
    # Always stop the sampler, even if the computation raised
    _stop_sampling.set()
    _sampler.join()

# One last reading in case usage rose between the final poll and completion
peak_mem = max(peak_mem, get_memory_usage())
end_time = time.time()

print(f"Calculated Average Value: {average_value:.4f}")
print(f"Execution Time: {end_time - start_time:.2f} seconds")
print(f"Storage Impact: Uses the original file size on disk ({original_size:.2f}) GB.")
print(f"Peak RAM used: {peak_mem - initial_mem:.2f} MB")
print("Memory Impact: Low. Dask intelligently manages memory and processes data in parallel.")



--- Method 2: Using Dask ---
Calculated Average Value: 50.0031
Execution Time: 73.22 seconds
Storage Impact: Uses the original file size on disk (4.57) GB.
Peak RAM used: 17.03 MB
Memory Impact: Low. Dask intelligently manages memory and processes data in parallel.


In [None]:

# --- Method 3: Using Compression (Read in one go) ---
# Load the whole gzip-compressed CSV into a single DataFrame and take the mean
# of 'value'. Disk usage is low, but the full decompressed table sits in RAM.
print("\n--- Method 3: Using Compression (Reading entire file at once) ---")

# --- Read the entire compressed file into memory at once ---

print("This will only work if you have enough available RAM.")

start_time = time.time()
initial_mem = get_memory_usage()
try:
    # This reads the whole .gz file into one DataFrame
    df_compressed = pd.read_csv('large_dataset.csv.gz', compression='gzip')

    # Now perform the calculation on the in-memory DataFrame
    average_value = df_compressed['value'].mean()
except MemoryError:
    end_time = time.time()
    execution_time = end_time - start_time
    print("\nERROR: Out of memory!")
    print(f"The script crashed after {execution_time:.2f} seconds.")
    print("The decompressed file is too large to fit into your system's RAM.")
    print("To process this file, you must use a memory-efficient method like Method 1 (chunking) or Method 2 (Dask).")
else:
    # NOTE(review): this is the RSS while the DataFrame is still held, read
    # once after loading; transient parsing overhead above it is not captured.
    peak_mem = get_memory_usage()
    end_time = time.time()
    execution_time = end_time - start_time

    print(f"Calculated Average Value: {average_value:.4f}")
    print(f"Execution Time: {execution_time:.2f} seconds")
    # compressed_size is computed in GB (bytes / 1024**3), so label it as GB
    print(f"Storage Impact: Uses the compressed file size on disk (~{compressed_size:.2f} GB).")
    print(f"Peak RAM used: {peak_mem - initial_mem:.2f} MB")
    print("Memory Impact: HIGH. The entire decompressed dataset is loaded into RAM.")


2. Comparison in Terms of Time, RAM and Storage
Now, let's summarize the findings in a comparison table. The exact execution times will vary based on hardware (CPU speed, number of cores, disk type - SSD vs. HDD), but the relative performance trends are generally consistent.

| Feature | `pandas.read_csv(chunksize)` | `Dask` | `Compression`|
| :--- | :--- | :--- | :--- |
| **Execution Time** | **Fastest** (depends on chunk size) | **Fast** (parallel processing) | **Slowest** (decompression overhead) |
| **Storage (Disk)** | High (Original file size) | High (Original file size) | **Lowest** (Compressed file size, often 5-6x smaller) |
| **Memory (RAM) Usage** | **Lowest** (size of one chunk) | **Low** (managed intelligently; slightly higher overhead than chunking) | **High** (size of the decompressed file, i.e. the original file size) |
| **Ease of Use** | Easy (standard pandas), but needs an additional loop to process each chunk | Easy (familiar pandas-like API) | Easy (two-step process, but straightforward) |
| **Best For** | Simple, sequential tasks; when you can't install new libraries. | Complex analyses, aggregations, and operations that can be parallelized. | Saving disk space and speeding up I/O-bound tasks, especially on slower disks. Ideal for archiving datasets. However, reading the whole file at once is heavy on RAM, so it is not recommended on low-RAM devices. |

