<a href="https://colab.research.google.com/github/Masta-cynat/OscarWebScraper/blob/main/UpdatedCleaningScript.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os

# Load the whole semicolon-separated CSV for a quick inspection.
# low_memory=True is the pandas default: the parser processes the file in
# internal chunks to reduce memory use while parsing.
df = pd.read_csv("lifebear.csv", sep=';', low_memory=True)

# Display file details
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()
print(df.describe())

# Function to split CSV by size
def split_csv_by_size(file_path, max_size_in_mb, output_prefix):
    """Split a ';'-separated CSV file into several smaller CSV files.

    The input is read in chunks of 10,000 rows. Chunks are buffered until
    their combined in-memory footprint (``DataFrame.memory_usage(deep=True)``)
    reaches ``max_size_in_mb``; the buffer is then written to
    ``{output_prefix}_part_{n}.csv`` and a new part is started. Rows left
    over at the end are written as a final, smaller part.

    Note that the threshold is applied to the in-memory size of the data,
    which is only a proxy for the size of the resulting file on disk.

    Args:
        file_path: Path of the ';'-separated CSV file to split.
        max_size_in_mb: In-memory size (in MB) at which a part is flushed.
        output_prefix: Prefix of the output files; parts are numbered from 1.
    """
    # Convert max size to bytes
    max_size_in_bytes = max_size_in_mb * 1024 * 1024

    # Read in 10,000 rows per chunk so the whole file is never parsed at once
    chunk_iter = pd.read_csv(file_path, sep=';', chunksize=10000, low_memory=True)

    pending = []       # Chunks collected for the part currently being built
    pending_bytes = 0  # In-memory size of the chunks in ``pending``
    part_num = 1       # Number of the next part to write

    for chunk in chunk_iter:
        pending.append(chunk)
        # Count only the newly read chunk: measuring the whole accumulated
        # buffer on every iteration would count earlier chunks repeatedly
        # and flush parts much too early.
        pending_bytes += int(chunk.memory_usage(deep=True).sum())

        if pending_bytes >= max_size_in_bytes:
            _write_csv_part(pending, output_prefix, part_num)

            # Reset for the next part
            part_num += 1
            pending = []
            pending_bytes = 0

    # Save any remaining rows that didn't reach the size threshold
    if any(not chunk.empty for chunk in pending):
        _write_csv_part(pending, output_prefix, part_num)


def _write_csv_part(chunks, output_prefix, part_num):
    """Concatenate ``chunks``, write them as one part file and report its size."""
    output_file = f"{output_prefix}_part_{part_num}.csv"
    # Concatenate once per part instead of once per chunk to avoid
    # repeatedly copying the growing buffer.
    part = pd.concat(chunks, ignore_index=True)
    part.to_csv(output_file, index=False, sep=';')
    # Report the real on-disk size for every part.
    size_in_mb = os.path.getsize(output_file) / (1024 * 1024)
    print(f"Saved {output_file} (size: {size_in_mb:.2f} MB)")

# Usage: input CSV path, size threshold per part (in MB), and output file prefix
file_path, max_size_in_mb, output_prefix = 'lifebear.csv', 150, 'chunk'

# Run the split with the settings above
split_csv_by_size(
    file_path=file_path,
    max_size_in_mb=max_size_in_mb,
    output_prefix=output_prefix,
)
