# TPC-DS Query Execution Time Data Cleaning

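This notebook cleans the query execution time data from three separate runs of a TPC benchmark on HeavyDB. Note that the title says TPC-DS, while the configuration cell below currently points `tpc_types` at the `tpc-h/` directories. The cleaning process involves: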
1. Fixing formatting issues (e.g., query 24, where a value appears on a separate line of its own)
2. Handling missing time values
3. Ensuring consistent data formats

In [6]:
import io
import os
import re

import pandas as pd
# Dataset sizes, kept as strings because they are concatenated into
# directory names ("<size>gb/") and input file names ("query<size>_...").
nums_list = [
    "1",
    "5",
    "10",
    "20",
    "50",
]

# Benchmark root directories; the trailing slash matters because the size
# folder name is appended by plain string concatenation.
tpc_types = [
    "tpc-h/",
]

## Define Cleaning Function

This function handles the common cleaning tasks for all CSV files.

In [None]:
def clean_csv_file(file_path):
    """Clean one query execution time CSV file.

    The raw file is read line by line, stray lines made up solely of digits
    (the query 24 artifact, where a value lands on a line of its own) are
    dropped, and the remaining text is parsed with pandas. Rows whose
    ``time (ms)`` value is missing or not numeric are then removed.

    Parameters
    ----------
    file_path : str or path-like
        Path of the raw CSV file. After stripping whitespace from the header
        names it must provide ``query`` and ``time (ms)`` columns.

    Returns
    -------
    pandas.DataFrame
        Only the ``query`` and ``time (ms)`` columns, with ``time (ms)``
        converted to numeric and containing no NaN.

    Raises
    ------
    KeyError
        If the ``query`` or ``time (ms)`` column is absent.
    """
    # Read the raw text first so malformed lines can be filtered out before
    # pandas ever sees them.
    with open(file_path, 'r') as f:
        lines = f.readlines()

    # A digit-only line is a lone number split off from a row. It is dropped,
    # not merged back into the previous line. The header (index 0) is always
    # kept, whatever it contains.
    fixed_lines = [
        line
        for i, line in enumerate(lines)
        if not (i > 0 and re.match(r'^\d+$', line.strip()))
    ]

    # Parse from an in-memory buffer: no temporary file is created, so nothing
    # is left behind on disk if pandas raises while parsing.
    df = pd.read_csv(
        io.StringIO(''.join(fixed_lines)),
        sep=',',
        skipinitialspace=True,
    )

    # Header cells may carry trailing whitespace that skipinitialspace does
    # not remove.
    df.columns = [col.strip() for col in df.columns]

    # Non-numeric or empty timings become NaN and are then discarded.
    df['time (ms)'] = pd.to_numeric(df['time (ms)'], errors='coerce')
    df = df.dropna(subset=['time (ms)'])

    return df[['query', 'time (ms)']]

## Process Each File

Apply the cleaning function to each input file and save the cleaned results.

In [7]:


for tpc_type in tpc_types:
    for num in nums_list:
        # Each dataset size has its own "<benchmark>/<size>gb/" folder.
        base_dir = f'{tpc_type}{num}gb/'

        # Raw timing file of every run, keyed by run name.
        input_files = {
            run_name: os.path.join(base_dir, f'{run_name}/query{num}_execution_times.csv')
            for run_name in ('run1', 'run2', 'run3')
        }

        # Cleaned files are written to a "cleaned" folder next to the runs.
        output_dir = os.path.join(base_dir, 'cleaned')
        os.makedirs(output_dir, exist_ok=True)

        output_files = {
            run_name: os.path.join(output_dir, f'{run_name}_cleaned.csv')
            for run_name in input_files
        }

        # Clean every run and write the result out; the cleaned frames of the
        # current dataset size are also kept in memory.
        cleaned_dfs = {}
        for run in input_files:
            file_path = input_files[run]
            print(f"Cleaning {run} data...")
            df = clean_csv_file(file_path)
            cleaned_dfs[run] = df

            destination = output_files[run]
            df.to_csv(destination, index=False)
            print(f"Saved cleaned data to {destination}")

        print("\nCleaning complete!")

Cleaning run1 data...
Saved cleaned data to tpc-h/1gb/cleaned/run1_cleaned.csv
Cleaning run2 data...
Saved cleaned data to tpc-h/1gb/cleaned/run2_cleaned.csv
Cleaning run3 data...
Saved cleaned data to tpc-h/1gb/cleaned/run3_cleaned.csv

Cleaning complete!
Cleaning run1 data...
Saved cleaned data to tpc-h/5gb/cleaned/run1_cleaned.csv
Cleaning run2 data...
Saved cleaned data to tpc-h/5gb/cleaned/run2_cleaned.csv
Cleaning run3 data...
Saved cleaned data to tpc-h/5gb/cleaned/run3_cleaned.csv

Cleaning complete!
Cleaning run1 data...
Saved cleaned data to tpc-h/10gb/cleaned/run1_cleaned.csv
Cleaning run2 data...
Saved cleaned data to tpc-h/10gb/cleaned/run2_cleaned.csv
Cleaning run3 data...
Saved cleaned data to tpc-h/10gb/cleaned/run3_cleaned.csv

Cleaning complete!
Cleaning run1 data...
Saved cleaned data to tpc-h/20gb/cleaned/run1_cleaned.csv
Cleaning run2 data...
Saved cleaned data to tpc-h/20gb/cleaned/run2_cleaned.csv
Cleaning run3 data...
Saved cleaned data to tpc-h/20gb/cleaned/run