In [None]:
import pandas as pd
import os

def split_by_entity_and_batch(file_path, output_folder, batch_size=1000):
    """Split a CSV into per-entity CSV files of at most ``batch_size`` rows.

    The input is grouped by its ``entity_name`` column. For each entity,
    every complete run of ``batch_size`` rows is written to
    ``<entity>_batch_<k>.csv`` (``k`` starts at 1), and any rows left over
    after the last complete batch are written to ``<entity>_leftover.csv``.
    All files are written without the DataFrame index.

    Note: rows whose ``entity_name`` is missing are not written to any file,
    because ``DataFrame.groupby`` drops NaN group keys by default.

    Args:
        file_path: Path of the CSV file to read. It must contain an
            ``entity_name`` column.
        output_folder: Directory that receives the output files. It is
            created if it does not exist.
        batch_size: Number of rows in each full batch file. Defaults to
            1000, the value that was previously hard-coded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If the CSV has no ``entity_name`` column (raised by pandas).
    """
    # Reject a non-positive size up front; otherwise divmod below would fail
    # with an unhelpful ZeroDivisionError (or produce nonsense for negatives).
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Step 1: Load the CSV file
    df = pd.read_csv(file_path)

    # Step 2: Group by entity_name and process each group
    for entity, group_df in df.groupby('entity_name'):
        # Step 3: Work out how many full batches fit and how many rows remain
        num_batches, leftover_rows = divmod(len(group_df), batch_size)

        # Step 4: Save each full batch of `batch_size` rows to its own CSV file
        for batch in range(num_batches):
            start = batch * batch_size
            batch_df = group_df.iloc[start:start + batch_size]
            batch_path = os.path.join(output_folder, f'{entity}_batch_{batch + 1}.csv')
            batch_df.to_csv(batch_path, index=False)

        # Step 5: Save leftover rows (if any)
        if leftover_rows > 0:
            leftover_df = group_df.iloc[num_batches * batch_size:]
            leftover_path = os.path.join(output_folder, f'{entity}_leftover.csv')
            leftover_df.to_csv(leftover_path, index=False)

# Example usage: point these at your own input CSV and destination directory.
file_path = 'test.csv'  # CSV to split; it must contain an 'entity_name' column
output_folder = 'split_data_2'  # Directory that receives the split files
split_by_entity_and_batch(file_path=file_path, output_folder=output_folder)


In [None]:
import shutil
from google.colab import files

# Step 1: Zip the folder. make_archive appends the '.zip' extension itself
# and returns the path of the archive it created.
folder_to_zip = 'split_data_2'  # the directory the split CSV files were written to
archive_path = shutil.make_archive(folder_to_zip, 'zip', folder_to_zip)

# Step 2: Download the zipped folder (Colab only). Passing the returned path
# keeps the download in sync with the archive that was just written, instead
# of retyping the file name by hand.
files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>