In [111]:
import pandas as pd
import re
# Data sizes (in GB) to process. They are stored as strings because they are
# spliced directly into directory and file names such as "result_log_10GB".
nums_list = [str(size_gb) for size_gb in (1, 2, 5, 10, 20)]
# Benchmark root directories; the trailing slash lets each one be used as a
# plain string prefix when building paths.
tpc_types = ["tpc-ds/", "tpc-h/"]

In [112]:
def average_query_metrics(paths, columns_to_use, output_path):
    """Average per-query metrics across several runs and write them to a CSV.

    Each CSV in ``paths`` is read (the parser treats everything after a ``/``
    on a line as a comment), reduced to ``columns_to_use``, and has its
    ``query_id`` values converted to whitespace-stripped strings. The runs are
    then stacked, the numeric columns are averaged per ``query_id`` and rounded
    to two decimals, and the rows are ordered by the first integer that
    appears in ``query_id``.

    Args:
        paths: Iterable of CSV file paths, one per run.
        columns_to_use: Column names to keep; must include ``"query_id"``.
        output_path: Destination CSV path (written without the index column).

    Returns:
        pandas.DataFrame: One row per ``query_id`` holding the averaged
        metrics, in the same order as the written file.

    Raises:
        ValueError: If ``paths`` yields no files.
    """
    # Read, filter, and clean data
    frames = [
        pd.read_csv(path, comment='/')[columns_to_use]
        .assign(query_id=lambda df: df['query_id'].astype(str).str.strip())
        for path in paths
    ]
    if not frames:
        # pd.concat would raise a ValueError as well, but with a message that
        # does not point at the real cause.
        raise ValueError("average_query_metrics needs at least one input path")

    # Merge data and calculate average by query_id
    average_metrics = (
        pd.concat(frames)
        .groupby('query_id', as_index=False)
        .mean(numeric_only=True)
        .round(2)
    )

    # Numeric sort key: the first run of digits in each id. Ids without any
    # digit (e.g. the string 'nan' produced by a missing id) become NaN and are
    # placed last instead of raising. The key lives in its own Series so it can
    # never collide with a real column.
    query_number = pd.to_numeric(
        average_metrics['query_id'].str.extract(r'(\d+)', expand=False)
    )

    # A stable sort keeps ids that share a number (such as "query14a" and
    # "query14b") in the alphabetical order produced by groupby.
    row_order = query_number.sort_values(kind='mergesort').index
    average_metrics = average_metrics.loc[row_order]

    # Write to file
    average_metrics.to_csv(output_path, index=False)

    return average_metrics

In [113]:
def worst_query_metrics(paths, columns_to_use, output_path):
    """Take the per-query maximum of each metric across runs and write a CSV.

    Each CSV in ``paths`` is read (the parser treats everything after a ``/``
    on a line as a comment), reduced to ``columns_to_use``, and has its
    ``query_id`` values converted to whitespace-stripped strings. The runs are
    then stacked, the maximum of every numeric column is taken per
    ``query_id`` and rounded to two decimals, and the rows are ordered by the
    first integer that appears in ``query_id``.

    Args:
        paths: Iterable of CSV file paths, one per run.
        columns_to_use: Column names to keep; must include ``"query_id"``.
        output_path: Destination CSV path (written without the index column).

    Returns:
        pandas.DataFrame: One row per ``query_id`` holding the maximum
        metrics, in the same order as the written file.

    Raises:
        ValueError: If ``paths`` yields no files.
    """
    # Read, filter, and clean data
    frames = [
        pd.read_csv(path, comment='/')[columns_to_use]
        .assign(query_id=lambda df: df['query_id'].astype(str).str.strip())
        for path in paths
    ]
    if not frames:
        # pd.concat would raise a ValueError as well, but with a message that
        # does not point at the real cause.
        raise ValueError("worst_query_metrics needs at least one input path")

    # Merge data and take the maximum by query_id
    worst_metrics = (
        pd.concat(frames)
        .groupby('query_id', as_index=False)
        .max(numeric_only=True)
        .round(2)
    )

    # Numeric sort key: the first run of digits in each id. Ids without any
    # digit (e.g. the string 'nan' produced by a missing id) become NaN and are
    # placed last instead of raising. The key lives in its own Series so it can
    # never collide with a real column.
    query_number = pd.to_numeric(
        worst_metrics['query_id'].str.extract(r'(\d+)', expand=False)
    )

    # A stable sort keeps ids that share a number (such as "query14a" and
    # "query14b") in the alphabetical order produced by groupby.
    row_order = query_number.sort_values(kind='mergesort').index
    worst_metrics = worst_metrics.loc[row_order]

    # Write to file
    worst_metrics.to_csv(output_path, index=False)

    return worst_metrics

In [114]:
# Required columns, kept from every per-run CSV; the same for every benchmark
# and data size.
columns_to_use = ["query_id", "cpu_used(%)", "ram_used(gb)", "time(ms)"]

for tpc_type in tpc_types:
    for nums in nums_list:
        # Directory that holds the timed runs for this benchmark and data size.
        size_dir = f"{tpc_type}result_log/result_log_{nums}GB/"

        # Paths to the 3 per-run CSV files (time_1 .. time_3) of each kind.
        average_paths = [
            f"{size_dir}time_{run}/average_query_sys_params.csv" for run in (1, 2, 3)
        ]
        worst_paths = [
            f"{size_dir}time_{run}/worst_query_sys_params.csv" for run in (1, 2, 3)
        ]

        # Combined results are written next to the time_* run directories.
        average_output_path = f"{size_dir}average_query_case_{nums}GB.csv"
        worst_output_path = f"{size_dir}worst_query_case_{nums}GB.csv"

        # Process average metrics
        average_metrics = average_query_metrics(
            average_paths, columns_to_use, average_output_path
        )

        # Process worst metrics
        worst_metrics = worst_query_metrics(
            worst_paths, columns_to_use, worst_output_path
        )