# Filter Duplicate Query IDs

This notebook processes the query times CSV file to:
1. Filter out duplicate query_ids (keep only the first occurrence)
2. Remove the user_time and sys_time columns (only `query_id` and `real_time(s)` are kept)
3. Convert the real time from seconds to milliseconds and store it in a `time(ms)` column
4. Save the filtered data to a new CSV file

In [25]:
# Import required libraries
import pandas as pd
import os
# Dataset sizes; each value is substituted into the `result_log_{nums}GB` directory name.
nums_list = [
    "1",
    "5",
    "10",
    "20",
    "50",
    "100",
]

# Benchmark path prefixes. The trailing slash matters: the paths below are
# built by plain string concatenation, not os.path.join.
tpc_types = [
    "tpc-ds/",
    "tpc-h/",
]

# Run identifiers; each value is substituted into the `time_{time}` directory name.
times = [
    "1",
    "2",
    "3",
]

In [23]:
def process_query_times(input_file, output_file):
    """Deduplicate a query-times CSV and save the real time in milliseconds.

    The input is read with pandas, whitespace around header names and string
    values is removed, and only the first row for each ``query_id`` is kept.
    The ``real_time(s)`` column is multiplied by 1000 and written out as
    ``time(ms)`` next to ``query_id``; every other column is dropped.

    Args:
        input_file: Path of the CSV to read. It must contain the columns
            ``query_id`` and ``real_time(s)``.
        output_file: Path of the CSV to write (no index column).

    Returns:
        The filtered DataFrame with columns ``query_id`` and ``time(ms)``,
        or ``None`` when ``input_file`` is not an existing regular file.

    Raises:
        KeyError: If a required column is missing from the input.
        ValueError: If ``real_time(s)`` holds a value that is not numeric.
    """
    # Check if the file exists (a directory with that name does not count)
    if os.path.isfile(input_file):
        print(f"File found: {input_file}")
    else:
        print(f"File not found: {input_file}")
        return None

    # Read the CSV file
    df = pd.read_csv(input_file, skipinitialspace=True)

    # skipinitialspace only drops blanks that follow a delimiter, so header
    # names can still carry trailing whitespace; normalise them before lookup.
    df = df.rename(columns=lambda name: name.strip() if isinstance(name, str) else name)

    missing = [name for name in ('query_id', 'real_time(s)') if name not in df.columns]
    if missing:
        raise KeyError(f"{input_file} is missing required column(s): {missing}")

    # Strip whitespace from string columns if needed
    for col in df.select_dtypes(include=['object']).columns:
        if hasattr(df[col], 'str'):
            df[col] = df[col].str.strip()

    # Filter out duplicates (keep first occurrence)
    first_rows = df.drop_duplicates(subset=['query_id'], keep='first')

    # Force a numeric dtype: multiplying a text column by 1000 would repeat
    # each string instead of scaling the value.
    seconds = pd.to_numeric(first_rows['real_time(s)'])

    # Build the result as a fresh frame rather than assigning a column into a
    # slice of a slice, which can trigger SettingWithCopyWarning.
    df_filtered = pd.DataFrame({
        'query_id': first_rows['query_id'],
        'time(ms)': seconds * 1000,
    })

    # Save the filtered data to a new CSV file
    df_filtered.to_csv(output_file, index=False)

    return df_filtered

In [26]:
# Visit every benchmark / dataset-size / run combination and filter its log.
for tpc_type in tpc_types:
    for nums in nums_list:
        for time in times:
            # The raw CSV and its filtered copy sit side by side in the run directory
            input_file = f'{tpc_type}result_log/result_log_{nums}GB/time_{time}/query_times.csv'
            output_file = f'{tpc_type}result_log/result_log_{nums}GB/time_{time}/filtered_query_times.csv'

            result_df = process_query_times(input_file, output_file)

            # process_query_times returns None when the input file is absent
            if result_df is None:
                print(f"Failed to process {input_file}.")
                continue
            print(f"Processed {input_file} successfully.")

File found: tpc-ds/result_log/result_log_1GB/time_1/query_times.csv
Processed tpc-ds/result_log/result_log_1GB/time_1/query_times.csv successfully.
File found: tpc-ds/result_log/result_log_1GB/time_2/query_times.csv
Processed tpc-ds/result_log/result_log_1GB/time_2/query_times.csv successfully.
File found: tpc-ds/result_log/result_log_1GB/time_3/query_times.csv
Processed tpc-ds/result_log/result_log_1GB/time_3/query_times.csv successfully.
File found: tpc-ds/result_log/result_log_5GB/time_1/query_times.csv
Processed tpc-ds/result_log/result_log_5GB/time_1/query_times.csv successfully.
File found: tpc-ds/result_log/result_log_5GB/time_2/query_times.csv
Processed tpc-ds/result_log/result_log_5GB/time_2/query_times.csv successfully.
File found: tpc-ds/result_log/result_log_5GB/time_3/query_times.csv
Processed tpc-ds/result_log/result_log_5GB/time_3/query_times.csv successfully.
File found: tpc-ds/result_log/result_log_10GB/time_1/query_times.csv
Processed tpc-ds/result_log/result_log_10GB