In [None]:
import pandas as pd
import time
import memory_profiler
import gc

# Input CSV that the chunked-read benchmark in this cell iterates over.
csv_file = "E:\\IBM\\LI-Medium_Trans.csv"

def clear_memory(settle_seconds: float = 5) -> None:
    """Run a garbage-collection pass, then pause before returning.

    Args:
        settle_seconds: How long to sleep after collecting, in seconds.
            Defaults to 5. Zero or a negative value skips the pause.
    """
    gc.collect()
    # NOTE(review): the pause presumably lets process memory settle before
    # the next measurement -- confirm it is still required.
    if settle_seconds > 0:
        time.sleep(settle_seconds)

# Benchmark: stream the CSV in 1000-row chunks, discarding each chunk, and
# report the elapsed time and the change in sampled process memory.
print("Using chunking...")
# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()
memory_before = memory_profiler.memory_usage()[0]

chunk = None  # keeps the name bound even if the reader yields no chunks
for chunk in pd.read_csv(csv_file, chunksize=1000):
    pass  # reading/parsing is the only work being measured

chunk_time = time.perf_counter() - start_time
# Sampled while the last chunk is still referenced by `chunk`.
chunk_memory = memory_profiler.memory_usage()[0] - memory_before

# Drop the reference to the last chunk so the collection pass in
# clear_memory() can reclaim it.
chunk = None
clear_memory()

print(f"Chunking - Time: {chunk_time:.2f}s, Memory: {chunk_memory:.2f}MB")


In [None]:
import dask.dataframe as dd
import time
import memory_profiler
import gc

# Input CSV that the Dask benchmark in this cell loads.
csv_file = "E:\\IBM\\LI-Medium_Trans.csv"

def clear_memory(settle_seconds: float = 5) -> None:
    """Run a garbage-collection pass, then pause before returning.

    Args:
        settle_seconds: How long to sleep after collecting, in seconds.
            Defaults to 5. Zero or a negative value skips the pause.
    """
    gc.collect()
    # NOTE(review): the pause presumably lets process memory settle before
    # the next measurement -- confirm it is still required.
    if settle_seconds > 0:
        time.sleep(settle_seconds)

# Benchmark: load the whole CSV through Dask and report the elapsed time and
# the change in sampled process memory.
print("Using Dask...")
# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()
memory_before = memory_profiler.memory_usage()[0]

df_dask = dd.read_csv(csv_file)
# Keep a reference to the computed result: if it were discarded right away,
# the frame would be released before the memory sample below and the reported
# delta would not include the loaded data.
df_loaded = df_dask.compute()

dask_time = time.perf_counter() - start_time
dask_memory = memory_profiler.memory_usage()[0] - memory_before

# Release both the computed frame and the Dask graph before collecting.
del df_loaded, df_dask
clear_memory()

print(f"Dask - Time: {dask_time:.2f}s, Memory: {dask_memory:.2f}MB")


In [None]:
import pandas as pd
import time
import memory_profiler

# Source CSV and the gzip-compressed copy that this cell produces from it.
csv_file = "E:\\IBM\\LI-Large_Trans.csv"  
gz_file = "E:\\IBM\\LI-Large_Trans.csv.gz" 

print("Using compression with large data...")
print("Compressing CSV file using Pandas chunking...")

# Re-write the CSV as gzip, 1000 rows at a time, so the full file never has
# to be held in memory during compression.
with pd.read_csv(csv_file, chunksize=1000) as reader:
    for i, chunk in enumerate(reader):
        first_chunk = i == 0
        chunk.to_csv(
            gz_file,
            index=False,
            compression="gzip",
            # Truncate on the first chunk and append afterwards; appending
            # every chunk would stack a second copy of the data (and a second
            # header row) onto a file left over from a previous run.
            mode="w" if first_chunk else "a",
            header=first_chunk,
        )

print(f"File compressed and saved as: {gz_file}")
print("Reading compressed file...")

# Only the read of the compressed file is timed; the compression pass above
# is excluded from the measurement.
# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()
memory_before = memory_profiler.memory_usage()[0]

df_compressed = pd.read_csv(gz_file, compression="gzip")

read_time = time.perf_counter() - start_time
# df_compressed is still referenced here, so the delta includes the loaded frame.
memory_used = memory_profiler.memory_usage()[0] - memory_before

print(f"\nRead Time: {read_time:.2f} seconds")
print(f"Memory Used: {memory_used:.2f} MB")
