In [1]:
import pandas as pd
import os

# Read the large CSV file
df = pd.read_csv('../datasets/merged_emails.csv')

# Get file size in MB
file_size = os.path.getsize('../datasets/merged_emails.csv') / (1024 * 1024)
print(f"Original file size: {file_size:.2f} MB")

# Calculate number of chunks needed (target < 25MB per chunk)
num_chunks = int(file_size / 20) + 1  # Use 20MB as target to stay well under 25MB
chunk_size = len(df) // num_chunks

print(f"Splitting into {num_chunks} chunks of approximately {chunk_size} rows each")

# Create output directory if it doesn't exist
output_dir = '../datasets/email_chunks'
os.makedirs(output_dir, exist_ok=True)

# Split and save chunks
for i in range(num_chunks):
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size if i < num_chunks - 1 else len(df)
    
    chunk_df = df.iloc[start_idx:end_idx]
    output_file = f'{output_dir}/merged_emails_part_{i+1:02d}.csv'
    chunk_df.to_csv(output_file, index=False)
    
    # Check chunk size
    chunk_size_mb = os.path.getsize(output_file) / (1024 * 1024)
    print(f"Part {i+1}: {len(chunk_df)} rows, {chunk_size_mb:.2f} MB")

print(f"\nFiles saved to: {output_dir}")

Original file size: 227.99 MB
Splitting into 12 chunks of approximately 10945 rows each
Part 1: 10945 rows, 21.57 MB
Part 2: 10945 rows, 17.48 MB
Part 3: 10945 rows, 17.26 MB
Part 4: 10945 rows, 13.74 MB
Part 5: 10945 rows, 20.11 MB
Part 6: 10945 rows, 13.74 MB
Part 7: 10945 rows, 24.99 MB
Part 8: 10945 rows, 19.32 MB
Part 9: 10945 rows, 18.29 MB
Part 10: 10945 rows, 21.24 MB
Part 11: 10945 rows, 21.42 MB
Part 12: 10951 rows, 18.84 MB

Files saved to: ../datasets/email_chunks
