In [5]:
import pandas as pd
import os
import gc

# --- Configuration ---
# Folder holding the raw CSV inputs (change this if your data lives elsewhere).
CSV_DATA_DIR = "../data/"

# Folder that receives the converted Parquet files.
PARQUET_OUTPUT_DIR = "../data_parquet/"

# Ensure the output folder is present; exist_ok=True makes a re-run a no-op.
os.makedirs(PARQUET_OUTPUT_DIR, exist_ok=True)

# --- CSV -> Parquet conversion helper ---
def convert_csv_to_parquet(csv_path, parquet_path, label, chunksize=500000):
    """Convert one CSV file to a single Parquet file, reading the CSV in chunks.

    The CSV is parsed chunk by chunk, the chunks are concatenated into one
    DataFrame, and that DataFrame is written with the fastparquet engine.
    The file is first written to ``parquet_path + '.tmp'`` and only renamed
    to ``parquet_path`` once the write succeeded, so an interrupted run never
    leaves a partial file that the "already exists" check would later skip.

    Args:
        csv_path: Path of the CSV file to read.
        parquet_path: Path of the Parquet file to create.
        label: Short name used in progress messages (e.g. 'train', 'test').
        chunksize: Number of CSV rows parsed per chunk.

    Returns:
        True if the file was converted, False if ``parquet_path`` already
        existed and the conversion was skipped.
    """
    if os.path.exists(parquet_path):
        print(f"{parquet_path} already exists. Skipping conversion.")
        return False

    print(f"Converting {csv_path} to parquet...")

    # Parse the CSV in chunks. The reader is used as a context manager so the
    # underlying file handle is closed even if parsing fails part-way.
    # No gc.collect() per chunk: every chunk is still referenced by the list,
    # so there is nothing to free at that point.
    chunk_list = []
    with pd.read_csv(csv_path, chunksize=chunksize) as reader:
        for i, chunk in enumerate(reader):
            print(f"  Processing {label} chunk {i+1}...")
            chunk_list.append(chunk)

    # Concatenate all chunks into one DataFrame
    print(f"Concatenating {label} chunks...")
    full_df = pd.concat(chunk_list, ignore_index=True)
    del chunk_list  # Drop the per-chunk frames now that they have been copied
    gc.collect()

    # Write to a temporary file, then move it into place.
    tmp_path = parquet_path + '.tmp'
    print(f"Saving to {parquet_path}...")
    try:
        full_df.to_parquet(tmp_path, engine='fastparquet')
        os.replace(tmp_path, parquet_path)
    finally:
        # Only still present if the write or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    del full_df  # Free up memory
    gc.collect()
    print(f"{label.capitalize()} data conversion complete.")
    return True


# --- Process train_data.csv ---
train_csv_path = os.path.join(CSV_DATA_DIR, 'train_data.csv')
train_parquet_path = os.path.join(PARQUET_OUTPUT_DIR, 'train.parquet')
convert_csv_to_parquet(train_csv_path, train_parquet_path, label='train')

print()  # Blank line between the two conversions in the cell output

# --- Process test_data.csv ---
test_csv_path = os.path.join(CSV_DATA_DIR, 'test_data.csv')
test_parquet_path = os.path.join(PARQUET_OUTPUT_DIR, 'test.parquet')
convert_csv_to_parquet(test_csv_path, test_parquet_path, label='test')

print("\nAll conversions finished!")

Converting ../data/train_data.csv to parquet...
  Processing train chunk 1...
  Processing train chunk 2...
  Processing train chunk 3...
  Processing train chunk 4...
  Processing train chunk 5...
  Processing train chunk 6...
  Processing train chunk 7...
  Processing train chunk 8...
  Processing train chunk 9...
  Processing train chunk 10...
  Processing train chunk 11...
  Processing train chunk 12...
Concatenating train chunks...
Saving to ../data_parquet/train.parquet...
Train data conversion complete.

Converting ../data/test_data.csv to parquet...
  Processing test chunk 1...
  Processing test chunk 2...
  Processing test chunk 3...
  Processing test chunk 4...
  Processing test chunk 5...
  Processing test chunk 6...
  Processing test chunk 7...
  Processing test chunk 8...
  Processing test chunk 9...
  Processing test chunk 10...
  Processing test chunk 11...
  Processing test chunk 12...
  Processing test chunk 13...
  Processing test chunk 14...
  Processing test chunk 1

The Parquet conversion works well: storage drops by about 60%, from roughly 50 GB of CSV to about 20 GB of Parquet.


In [None]:

import numpy as np


# --- Memory Reduction Function (with detailed printout) ---
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of a DataFrame to smaller dtypes.

    The DataFrame is modified in place and the same object is returned.

    * Signed integer columns are cast to the smallest of int8/int16/int32
      that can hold the column's min and max (bounds inclusive).
    * Unsigned integer columns are cast to the smallest of
      uint8/uint16/uint32 in the same way.
    * Float columns wider than 32 bits are cast to float32 when their
      min and max lie inside the float32 range. This keeps the range but
      reduces precision.
    * Every other column (object/string, bool, datetime, categorical,
      pandas extension dtypes, float16/float32) is left untouched.

    Args:
        df: DataFrame to optimize (mutated in place).
        verbose: If True, print memory usage before and after.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print(f'Memory usage of dataframe is {start_mem:.2f} MB')

    # Candidate target types, smallest first.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float32_info = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable Int64, string, ...) are not
        # plain NumPy dtypes; leave them alone.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in 'iu':
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                # Only ever shrink: stop once the candidate is as wide as
                # the current dtype.
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break
                limits = np.iinfo(candidate)
                # For an empty column min/max are NaN, both comparisons are
                # False and the column keeps its dtype.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == 'f' and col_type.itemsize > 4:
            c_min = df[col].min()
            c_max = df[col].max()
            # min()/max() skip NaN; an all-NaN column or one containing
            # +/-inf fails this check and keeps its dtype.
            if c_min >= float32_info.min and c_max <= float32_info.max:
                df[col] = df[col].astype(np.float32)
        # Anything else (object, bool, datetime, timedelta, complex, ...)
        # is intentionally not converted.

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print(f'Memory usage after optimization is: {end_mem:.2f} MB')
        # Guard against a zero-byte frame to avoid ZeroDivisionError.
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f'Decreased by {decrease:.1f}%')

    return df

# --- Define File Paths ---
PARQUET_DATA_DIR = '../data_parquet/'
CSV_DATA_DIR = '../data/'

# Define paths for original and optimized Parquet files
train_parquet_path = os.path.join(PARQUET_DATA_DIR, 'train.parquet')
train_downcasted_parquet_path = os.path.join(PARQUET_DATA_DIR, 'train_optimized.parquet')

# --- Load (or build) the optimized training data ---
# If the optimized file is already on disk, read it directly instead of
# loading the much larger original file and repeating the downcast.
if os.path.exists(train_downcasted_parquet_path):
    print(f"Optimized file {train_downcasted_parquet_path} already exists. Loading it instead of re-optimizing...")
    train_df_optimized = pd.read_parquet(train_downcasted_parquet_path, engine='fastparquet')
    print(f"Optimized Parquet loaded. Shape: {train_df_optimized.shape}")
else:
    print("Loading original training data from Parquet...")
    train_df = pd.read_parquet(train_parquet_path, engine='fastparquet')
    print(f"Original Parquet loaded. Shape: {train_df.shape}")

    # reduce_mem_usage prints the before/after memory when verbose
    print("\nOptimizing memory usage...")
    train_df_optimized = reduce_mem_usage(train_df)

    # reduce_mem_usage returns the DataFrame it was given, so train_df and
    # train_df_optimized are the same object; drop the redundant name.
    del train_df
    gc.collect()

    print(f"\nSaving optimized DataFrame to {train_downcasted_parquet_path}...")
    train_df_optimized.to_parquet(train_downcasted_parquet_path, engine='fastparquet')
    print("Optimized DataFrame saved.")

# --- Load Training Labels ---
print("\nLoading training labels...")
train_labels_path = os.path.join(CSV_DATA_DIR, 'train_labels.csv')
train_labels = pd.read_csv(train_labels_path)
print(f"Training labels loaded. Shape: {train_labels.shape}")

# --- Display Head of Optimized Data ---
print("\nShowing first few rows of optimized training data:")
print(train_df_optimized.head())

Loading original training data from Parquet...
Original Parquet loaded. Shape: (5531451, 190)

Optimizing memory usage...
Memory usage of dataframe is 8018.31 MB
Memory usage after optimization is: 4077.73 MB
Decreased by 49.1%

Saving optimized DataFrame to ../data_parquet/train_downcasted.parquet...
Optimized DataFrame saved.

Loading training labels...
Training labels loaded. Shape: (458913, 2)

Showing first few rows of optimized training data:
                                         customer_ID         S_2       P_2  \
0  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-03-09  0.938469   
1  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-04-07  0.936665   
2  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-05-28  0.954180   
3  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-06-13  0.960384   
4  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-07-16  0.947248   

       D_39       B_1       B_2       R_1       S_3      D_41       B_3  ...  

In [10]:
# --- Test-set file locations ---
ORIGINAL_TEST_PATH = os.path.join(PARQUET_DATA_DIR, 'test.parquet')
# Destination for the downcasted copy of the test set
OPTIMIZED_TEST_PATH = os.path.join(PARQUET_DATA_DIR, 'test_optimized.parquet')

# --- Build the optimized test file unless it is already on disk ---
if os.path.exists(OPTIMIZED_TEST_PATH):
    print(f"{OPTIMIZED_TEST_PATH} already exists. Skipping.")
else:
    # Step 1: read the un-optimized Parquet file
    print(f"Loading original test data from {ORIGINAL_TEST_PATH}...")
    test_df = pd.read_parquet(ORIGINAL_TEST_PATH, engine='fastparquet')

    # Step 2: downcast numeric columns, then drop the input name
    print("\nOptimizing test data memory usage...")
    test_df_optimized = reduce_mem_usage(test_df)
    del test_df
    gc.collect()

    # Step 3: write the result and release it
    print(f"\nSaving optimized test data to {OPTIMIZED_TEST_PATH}...")
    test_df_optimized.to_parquet(OPTIMIZED_TEST_PATH, engine='fastparquet')
    del test_df_optimized
    gc.collect()

    print("\nTest data optimization complete.")

Loading original test data from ../data_parquet/test.parquet...

Optimizing test data memory usage...
Memory usage of dataframe is 16472.74 MB
Memory usage after optimization is: 8377.25 MB
Decreased by 49.1%

Saving optimized test data to ../data_parquet/test_optimized.parquet...

Test data optimization complete.
