In [1]:
from pathlib import Path
import json
import pandas as pd
import numpy as np
import re
import ast
import pandas as pd

#### Get Latencies and Token-Counts

In [4]:
import re
import pandas as pd

def get_beam_data(file_path, start_idx=5010):
    """Parse per-question token counts and latencies from a beam-search log.

    The log is treated as a sequence of blocks, each terminated by a line of
    the form ``Total beam search time: X.XX seconds.``.  Inside a block, the
    last ``Total number of tokens generated thus far: N`` line is read as the
    cumulative token count at the end of that block.

    Parameters
    ----------
    file_path : str
        Path of the log file to read.
    start_idx : int, default 5010
        ``sb_idx`` assigned to the first non-empty block; the i-th non-empty
        block gets ``start_idx + i``.

    Returns
    -------
    pandas.DataFrame
        One row per block that contains a token-count line, with columns
        ``sb_idx``, ``beam_tc`` (cumulative count minus the previous recorded
        cumulative count) and ``beam_latency`` (seconds, or ``None`` for a
        trailing block that is not followed by a time line).  The frame is
        empty, with no columns, when no block contains a token-count line.
    """
    with open(file_path, 'r') as f:
        content = f.read()

    # The fractional part is optional so an integer number of seconds also parses.
    time_pattern = re.compile(r'Total beam search time: (\d+(?:\.\d+)?) seconds\.')
    token_pattern = re.compile(r'Total number of tokens generated thus far: (\d+)')

    # Walk the time lines once and pair every block with the time line that
    # actually terminates it.  Searching for the block's text again (as
    # str.find would) can land on an earlier, textually identical span and
    # pick up the wrong latency.
    segments = []
    cursor = 0
    for time_match in time_pattern.finditer(content):
        segments.append((content[cursor:time_match.start()], float(time_match.group(1))))
        cursor = time_match.end()
    # Whatever follows the final time line is an unterminated block: no latency.
    segments.append((content[cursor:], None))

    # Whitespace-only blocks are dropped before numbering, so they do not
    # consume an sb_idx.
    segments = [(text, latency) for text, latency in segments if text.strip()]

    data = []
    prev_total_tokens = 0

    for i, (block, beam_latency) in enumerate(segments):
        token_matches = token_pattern.findall(block)
        if not token_matches:
            # The block still occupies index i, but there is nothing to record.
            continue

        # The counter in the log is cumulative; the per-block count is the
        # increase since the previous recorded block.
        current_total_tokens = int(token_matches[-1])

        data.append({
            'sb_idx': start_idx + i,
            'beam_tc': current_total_tokens - prev_total_tokens,
            'beam_latency': beam_latency
        })

        prev_total_tokens = current_total_tokens

    return pd.DataFrame(data)

In [5]:
# Parse a single beam-search log and preview the first rows.
# get_beam_data numbers the blocks of the file starting at sb_idx 5010.
Beam_8_5010_5020 = get_beam_data('/dccstor/gma2/jhjenny9/search-and-learn/Beam-8-5010-5020.out')
Beam_8_5010_5020.head()

Unnamed: 0,sb_idx,beam_tc,beam_latency
0,5010,13276,30.93
1,5011,5643,11.27
2,5012,9461,28.21
3,5013,3837,8.98
4,5014,5405,12.53


In [8]:
import re
import pandas as pd
import os
import glob

def parse_beam_data_from_filename(file_path):
    """Parse a beam-search log whose name encodes beam size and question range.

    The file name must start with ``Beam-<N>-<start>-<end>.out``, for example
    ``Beam-8-5010-5020.out`` gives ``N=8`` and ``start=5010``.  The ``<end>``
    field must be present in the name but is not otherwise used.

    The log is treated as a sequence of blocks, each terminated by a line of
    the form ``Total beam search time: X.XX seconds.``.  Inside a block, the
    last ``Total number of tokens generated thus far: N`` line is read as the
    cumulative token count at the end of that block.

    Parameters
    ----------
    file_path : str
        Path of the log file to read.

    Returns
    -------
    pandas.DataFrame
        One row per block that contains a token-count line, with columns
        ``sb_idx`` (``start`` plus the position of the block among the
        non-empty blocks), ``beam_tc`` (cumulative count minus the previous
        recorded cumulative count), ``beam_latency`` (seconds, or ``None``
        for a trailing block that is not followed by a time line) and ``N``.
        The frame is empty, with no columns, when no block contains a
        token-count line.

    Raises
    ------
    ValueError
        If the file name does not match the expected pattern.
    """
    filename = os.path.basename(file_path)
    match = re.match(r'Beam-(\d+)-(\d+)-(\d+)\.out', filename)

    if not match:
        raise ValueError(f"Invalid filename format: {filename}")

    beam_size = int(match.group(1))
    start_idx = int(match.group(2))

    with open(file_path, 'r') as f:
        content = f.read()

    # The fractional part is optional so an integer number of seconds also parses.
    time_pattern = re.compile(r'Total beam search time: (\d+(?:\.\d+)?) seconds\.')
    token_pattern = re.compile(r'Total number of tokens generated thus far: (\d+)')

    # Walk the time lines once and pair every block with the time line that
    # actually terminates it.  Searching for the block's text again (as
    # str.find would) can land on an earlier, textually identical span and
    # pick up the wrong latency.
    segments = []
    cursor = 0
    for time_match in time_pattern.finditer(content):
        segments.append((content[cursor:time_match.start()], float(time_match.group(1))))
        cursor = time_match.end()
    # Whatever follows the final time line is an unterminated block: no latency.
    segments.append((content[cursor:], None))

    # Whitespace-only blocks are dropped before numbering, so they do not
    # consume an sb_idx.
    segments = [(text, latency) for text, latency in segments if text.strip()]

    data = []
    prev_total_tokens = 0

    for i, (block, beam_latency) in enumerate(segments):
        token_matches = token_pattern.findall(block)
        if not token_matches:
            # The block still occupies index i, but there is nothing to record.
            continue

        # The counter in the log is cumulative; the per-block count is the
        # increase since the previous recorded block.
        current_total_tokens = int(token_matches[-1])

        data.append({
            'sb_idx': start_idx + i,
            'beam_tc': current_total_tokens - prev_total_tokens,
            'beam_latency': beam_latency,
            'N': beam_size
        })

        prev_total_tokens = current_total_tokens

    return pd.DataFrame(data)

def process_all_beam_files(directory_path):
    """Parse every ``Beam-*.out`` file in a directory into a single DataFrame.

    Files are visited in sorted path order.  Each one is handed to
    ``parse_beam_data_from_filename``; a file that raises is reported on
    stdout and skipped, so one bad file does not abort the run.

    Parameters
    ----------
    directory_path : str
        Directory searched (non-recursively) for ``Beam-*.out`` files.

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated with a fresh index, or an empty
        DataFrame when no file produced a frame.
    """
    log_paths = sorted(glob.glob(os.path.join(directory_path, "Beam-*.out")))

    frames = []
    for log_path in log_paths:
        try:
            frames.append(parse_beam_data_from_filename(log_path))
            print(f"Processed: {os.path.basename(log_path)}")
        except Exception as err:
            # Best effort: report the failure and carry on with the next file.
            print(f"Error processing {log_path}: {err}")

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

In [9]:
# Build the combined table from every Beam-*.out log in the run directory.
directory = "/dccstor/gma2/jhjenny9/search-and-learn"
df = process_all_beam_files(directory)

if df.empty:
    # An empty frame comes back when no file produced any rows.
    print("No data found or error occurred")
else:
    print("\nCombined DataFrame:")
    print(df)
    print(f"\nTotal rows: {len(df)}")

Processed: Beam-16-10000-11000.out
Processed: Beam-16-11000-12000.out
Processed: Beam-16-12000-13000.out
Processed: Beam-16-13000-14000.out
Processed: Beam-16-14000-15000.out
Processed: Beam-16-15000-16000.out
Processed: Beam-16-16000-17000.out
Processed: Beam-16-17000-18000.out
Processed: Beam-16-18000-19000.out
Processed: Beam-16-19000-20000.out
Processed: Beam-16-20000-21000.out
Processed: Beam-16-21000-22000.out
Processed: Beam-16-22000-23000.out
Processed: Beam-16-23000-24000.out
Processed: Beam-16-24000-25000.out
Processed: Beam-16-25000-26000.out
Processed: Beam-16-26000-27000.out
Processed: Beam-16-27000-28000.out
Processed: Beam-16-28000-29000.out
Processed: Beam-16-29000-30000.out
Processed: Beam-16-30000-32000.out
Processed: Beam-16-32000-34000.out
Processed: Beam-16-34000-36000.out
Processed: Beam-16-36000-38000.out
Processed: Beam-16-38000-40000.out
Processed: Beam-16-40000-42000.out
Processed: Beam-16-42000-44000.out
Processed: Beam-16-44000-46000.out
Processed: Beam-16-4

In [None]:
# Mean and row count of beam_latency and beam_tc for each N (the beam size parsed from the file name).
# TODO: debug this on a smaller dataset (N=8,16,32) to make sure the data are correct.
df.groupby("N")[["beam_latency", "beam_tc"]].agg(["mean", "count"])

Unnamed: 0_level_0,beam_latency,beam_latency,beam_tc,beam_tc
Unnamed: 0_level_1,mean,count,mean,count
N,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
8,64.168285,21867,36144.985595,21867
16,36.347403,22890,19570.513718,22890
32,77.906351,24174,44421.821378,24174


In [14]:
import pandas as pd

# Load the BERT feature table from CSV.
bert_features_path = "/dccstor/gma2/jhjenny9/search-and-learn/feature-data/bert-features/16000/df_test_16000.csv"
bert_df = pd.read_csv(bert_features_path)
# NOTE: this is not the last expression of the cell, so the preview is not displayed.
bert_df.head()

# Keep only the rows whose method_beam_search flag equals True.
bert_df_beam = bert_df[bert_df["method_beam_search"] == True]



In [None]:
# (rows, columns) of the beam-search subset.
# TODO: have parse_beam_data_from_filename return only the rows whose sb_idx matches a row of
# this beam-search subset (presumably matched on an sb_idx column of bert_df_beam; confirm).
bert_df_beam.shape

(5282, 10)