In [1]:
from pathlib import Path
import json
import pandas as pd
import numpy as np
import re
import ast
import pandas as pd

# Load in chat data.

In [3]:
from datasets import load_dataset

# Load the lmsys/chatbot_arena_conversations dataset from HuggingFace
chat_data = load_dataset("lmarena-ai/arena-human-preference-55k", split="train")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# HuggingFace datasets.Dataset doesn't have .head(), so use .select or .to_pandas()
# Show first 5 rows as a DataFrame
chat_data_df = chat_data.to_pandas()
chat_data_df.shape


(57477, 9)

In [5]:
chat_data_df.head()


Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


In [6]:
# Only keep games where one of the specified models is the winner
target_models = [
    "gpt-4-0125-preview",
    "gpt-4-1106-preview",
    "gpt-4-0314",
    "gpt-4-0613",
    "qwen1.5-73b-chat"
]

# Assuming chat_data_df has a 'winner' column with model names
strong_winners = chat_data_df[
    ((chat_data_df["winner_model_a"] == 1) & (chat_data_df["model_a"].isin(target_models))) |
    ((chat_data_df["winner_model_b"] == 1) & (chat_data_df["model_b"].isin(target_models))) |
    ((chat_data_df["winner_tie"] == 1) & (chat_data_df["model_a"].isin(target_models)) | (chat_data_df["model_b"].isin(target_models)))
]

In [7]:
# all conversations where the winner is one of the top 5 models. this should, in hopes, provide a good
# signal for what a "good" answer is.
strong_winners.shape # 15242

(15242, 9)

In [10]:
# Read the .out file as a string for later parsing
out_path = "/dccstor/gma2/jhjenny9/search-and-learn/data/beam_numina/pita2240.out"

#### Get Latencies and Token-Counts

In [11]:
import re
import pandas as pd

def get_beam_data(file_path):
    with open(file_path, 'r') as f:
        content = f.read()
    
    # Split content into question blocks
    # Each block ends with "Total beam search time: X.XX seconds."
    blocks = re.split(r'Total beam search time: \d+\.\d+ seconds\.', content)
    
    # Remove empty blocks and the last one (which might be incomplete)
    blocks = [block.strip() for block in blocks if block.strip()]
    
    data = []
    prev_total_tokens = 0
    
    for i, block in enumerate(blocks):
        # Extract the last "Total number of tokens generated thus far: X" before the beam search time
        token_matches = re.findall(r'Total number of tokens generated thus far: (\d+)', block)
        
        if token_matches:
            current_total_tokens = int(token_matches[-1])
            
            # Calculate tokens for this question (difference from previous)
            beam_tc = current_total_tokens - prev_total_tokens
            
            # Extract beam latency from the next block's start (since we split on it)
            if i < len(blocks) - 1:
                # Look for the beam search time in the original content
                # Find the position of this block in the original content
                block_start = content.find(block)
                block_end = block_start + len(block)
                
                # Look for "Total beam search time:" after this block
                time_match = re.search(r'Total beam search time: (\d+\.\d+) seconds\.', content[block_end:])
                if time_match:
                    beam_latency = float(time_match.group(1))
                else:
                    beam_latency = None
            else:
                # For the last block, we need to look at the end of the file
                time_match = re.search(r'Total beam search time: (\d+\.\d+) seconds\.$', content)
                beam_latency = float(time_match.group(1)) if time_match else None
            
            # Question ID is 5010 + i
            sb_idx = 5010 + i
            
            data.append({
                'sb_idx': sb_idx,
                'beam_tc': beam_tc,
                'beam_latency': beam_latency
            })
            
            prev_total_tokens = current_total_tokens
    
    return pd.DataFrame(data)

In [12]:
# Parse the file
Beam_8_5010_5020 = get_beam_data('/dccstor/gma2/jhjenny9/search-and-learn/Beam-8-5010-5020.out')
Beam_8_5010_5020.head()

Unnamed: 0,sb_idx,beam_tc,beam_latency
0,5010,13276,30.93
1,5011,5643,11.27
2,5012,9461,28.21
3,5013,3837,8.98
4,5014,5405,12.53


In [13]:
import re
import pandas as pd
import os
import glob

def parse_beam_data_from_filename(file_path):
    # Extract beam size and question range from filename
    # Example: "Beam-8-5010-5020.out" -> beam_size=8, start=5010, end=5020
    filename = os.path.basename(file_path)
    match = re.match(r'Beam-(\d+)-(\d+)-(\d+)\.out', filename)
    
    if not match:
        raise ValueError(f"Invalid filename format: {filename}")
    
    beam_size = int(match.group(1))
    start_idx = int(match.group(2))
    end_idx = int(match.group(3))
    
    with open(file_path, 'r') as f:
        content = f.read()
    
    # Split content into question blocks
    blocks = re.split(r'Total beam search time: \d+\.\d+ seconds\.', content)
    blocks = [block.strip() for block in blocks if block.strip()]
    
    data = []
    prev_total_tokens = 0
    
    for i, block in enumerate(blocks):
        # Extract the last "Total number of tokens generated thus far: X" before the beam search time
        token_matches = re.findall(r'Total number of tokens generated thus far: (\d+)', block)
        
        if token_matches:
            current_total_tokens = int(token_matches[-1])
            
            # Calculate tokens for this question (difference from previous)
            beam_tc = current_total_tokens - prev_total_tokens
            
            # Extract beam latency from the next block's start
            if i < len(blocks) - 1:
                block_start = content.find(block)
                block_end = block_start + len(block)
                
                time_match = re.search(r'Total beam search time: (\d+\.\d+) seconds\.', content[block_end:])
                if time_match:
                    beam_latency = float(time_match.group(1))
                else:
                    beam_latency = None
            else:
                # For the last block, look at the end of the file
                time_match = re.search(r'Total beam search time: (\d+\.\d+) seconds\.$', content)
                beam_latency = float(time_match.group(1)) if time_match else None
            
            # Question ID is start_idx + i
            sb_idx = start_idx + i
            
            data.append({
                'sb_idx': sb_idx,
                'beam_tc': beam_tc,
                'beam_latency': beam_latency,
                'N': beam_size
            })
            
            prev_total_tokens = current_total_tokens
    
    return pd.DataFrame(data)

def process_all_beam_files(directory_path):
    # Find all Beam-*.out files
    pattern = os.path.join(directory_path, "Beam-*.out")
    files = glob.glob(pattern)
    
    all_data = []
    
    for file_path in sorted(files):
        try:
            df = parse_beam_data_from_filename(file_path)
            all_data.append(df)
            print(f"Processed: {os.path.basename(file_path)}")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    
    if all_data:
        # Combine all DataFrames
        combined_df = pd.concat(all_data, ignore_index=True)
        return combined_df
    else:
        return pd.DataFrame()

In [14]:
# Process all files in the directory
directory = "/dccstor/gma2/jhjenny9/search-and-learn"
df = process_all_beam_files(directory)

if not df.empty:
    print("\nCombined DataFrame:")
    print(df)
    print(f"\nTotal rows: {len(df)}")
else:
    print("No data found or error occurred")

Processed: Beam-16-10000-11000.out
Processed: Beam-16-11000-12000.out
Processed: Beam-16-12000-13000.out
Processed: Beam-16-13000-14000.out
Processed: Beam-16-14000-15000.out
Processed: Beam-16-15000-16000.out
Processed: Beam-16-16000-17000.out
Processed: Beam-16-17000-18000.out
Processed: Beam-16-18000-19000.out
Processed: Beam-16-19000-20000.out
Processed: Beam-16-20000-21000.out
Processed: Beam-16-21000-22000.out
Processed: Beam-16-22000-23000.out
Processed: Beam-16-23000-24000.out
Processed: Beam-16-24000-25000.out
Processed: Beam-16-25000-26000.out
Processed: Beam-16-26000-27000.out
Processed: Beam-16-27000-28000.out
Processed: Beam-16-28000-29000.out
Processed: Beam-16-29000-30000.out
Processed: Beam-16-30000-32000.out
Processed: Beam-16-32000-34000.out
Processed: Beam-16-34000-36000.out
Processed: Beam-16-36000-38000.out
Processed: Beam-16-38000-40000.out
Processed: Beam-16-40000-42000.out
Processed: Beam-16-42000-44000.out
Processed: Beam-16-44000-46000.out
Processed: Beam-16-4

In [15]:
df.groupby("N")[["beam_latency", "beam_tc"]].agg(["mean", "count"]) # debug this on a smaller dataset (N=8,16,32) to make sure the data are correct.

Unnamed: 0_level_0,beam_latency,beam_latency,beam_tc,beam_tc
Unnamed: 0_level_1,mean,count,mean,count
N,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
8,58.915149,15972,32975.299587,15972
16,36.347403,22890,19570.513718,22890
32,77.906351,24174,44421.821378,24174


In [16]:
import pandas as pd

bert_features_path = "/dccstor/gma2/jhjenny9/search-and-learn/feature-data/bert-features/16000/df_test_16000.csv"
bert_df = pd.read_csv(bert_features_path)
bert_df.head()

bert_df_beam = bert_df[bert_df["method_beam_search"] == True]



In [17]:
bert_df_beam.shape # ask the parse_beam_data_from_filename script to only retrieve the rows where sb_idx matches one of the values in method_beam_search.

(5282, 10)

In [18]:
# read in this file.
import json
import pandas as pd

beam_completions_path = "/dccstor/gma2/jhjenny9/search-and-learn/data/Numina_Beam/Numina_beam_search_4_4_40_fourth_chunk/beam_search_completions.jsonl"
with open(beam_completions_path, "r") as f:
    completions = [json.loads(line) for line in f]
df_completions = pd.DataFrame(completions)
print(df_completions.shape)

(5000, 31)


#### Beam Times for Small vs. Large N Beam.

In [19]:
import re
import numpy as np

def extract_average_beam_search_time(log_file_path):
    """
    Extract all "Total beam search time:" values from a log file and calculate the average.
    
    Args:
        log_file_path (str): Path to the log file
        
    Returns:
        dict: Dictionary containing:
            - 'times': List of all beam search times in seconds
            - 'average': Average beam search time in seconds
            - 'count': Number of beam search operations
            - 'min': Minimum beam search time
            - 'max': Maximum beam search time
            - 'std': Standard deviation of beam search times
    """
    # Pattern to match "Total beam search time: X.XX seconds."
    pattern = r'Total beam search time: ([\d.]+) seconds\.'
    
    times = []
    
    try:
        with open(log_file_path, 'r') as file:
            for line in file:
                match = re.search(pattern, line)
                if match:
                    time_value = float(match.group(1))
                    times.append(time_value)
    except FileNotFoundError:
        print(f"Error: File '{log_file_path}' not found.")
        return None
    except Exception as e:
        print(f"Error reading file: {e}")
        return None
    
    if not times:
        print("No beam search times found in the log file.")
        return None
    
    # Calculate statistics
    times_array = np.array(times)
    result = {
        'times': times,
        'average': np.mean(times_array),
        'count': len(times),
        'min': np.min(times_array),
        'max': np.max(times_array),
        'std': np.std(times_array)
    }
    
    return result


In [20]:
# Test the function
result = extract_average_beam_search_time('/dccstor/gma2/jhjenny9/search-and-learn/large-beam-2.out')

print(f"Beam Time for N=128")
print(f"Found {result['count']} beam search operations")
print(f"Average time: {result['average']:.2f} seconds")
print(f"Min time: {result['min']:.2f} seconds")
print(f"Max time: {result['max']:.2f} seconds")
print(f"Standard deviation: {result['std']:.2f} seconds")
print(f"\nAll times: {result['times']}")

Beam Time for N=128
Found 10 beam search operations
Average time: 388.20 seconds
Min time: 133.41 seconds
Max time: 616.34 seconds
Standard deviation: 123.20 seconds

All times: [297.57, 403.46, 403.98, 415.98, 133.41, 616.34, 359.12, 514.49, 431.28, 306.37]


In [21]:
# Test the function
result = extract_average_beam_search_time('/dccstor/gma2/jhjenny9/search-and-learn/large-beam.out')

print(f"Beam Time for N=32")
print(f"Found {result['count']} beam search operations")
print(f"Average time: {result['average']:.2f} seconds")
print(f"Min time: {result['min']:.2f} seconds")
print(f"Max time: {result['max']:.2f} seconds")
print(f"Standard deviation: {result['std']:.2f} seconds")
print(f"\nAll times: {result['times']}")

Beam Time for N=32
Found 100 beam search operations
Average time: 81.80 seconds
Min time: 10.66 seconds
Max time: 264.00 seconds
Standard deviation: 57.81 seconds

All times: [24.76, 19.27, 99.12, 95.19, 26.03, 78.23, 13.39, 31.76, 134.76, 11.11, 56.41, 121.37, 109.11, 18.25, 264.0, 23.91, 62.78, 65.45, 64.4, 48.95, 118.92, 120.7, 24.39, 70.16, 160.42, 25.36, 167.64, 103.07, 88.3, 13.57, 27.39, 85.95, 21.11, 37.36, 41.51, 78.14, 77.44, 123.13, 122.26, 24.13, 20.46, 99.25, 21.17, 75.69, 24.67, 15.8, 19.05, 43.24, 115.7, 135.53, 67.32, 125.27, 73.35, 81.99, 224.16, 52.6, 103.65, 67.62, 130.81, 175.54, 21.22, 120.32, 176.65, 16.03, 105.63, 90.8, 23.85, 31.28, 36.32, 26.99, 91.97, 111.23, 217.35, 101.09, 35.78, 12.05, 51.53, 110.73, 189.84, 19.03, 11.21, 144.93, 16.41, 42.49, 98.03, 197.02, 102.67, 90.83, 139.4, 173.64, 77.79, 120.7, 82.18, 32.01, 40.53, 134.08, 104.03, 177.38, 222.48, 10.66]
