# visualize surprisal development


In [1]:
from pathlib import Path
import pandas as pd

# Absolute path of the project checkout on the author's machine.
ROOT = Path("/Users/jliu/workspace/RAG/")
# Directory under ROOT that holds the surprisal result CSVs.
surprisal_path = ROOT.joinpath("results", "surprisal")

In [8]:
def format_csv(file_path:Path)->pd.DataFrame:
    """Reshape a long-format surprisal CSV into one column per step.

    The CSV must contain a ``step`` and a ``surprisal`` column. The rows
    whose ``step`` equals 0 form the base table (all columns except
    ``surprisal``; the ``step`` column itself is kept). For every distinct
    step value, a new column labelled with that step value is added, holding
    that step's ``surprisal`` values.

    Values are attached by position, not by key: the i-th row of each step is
    assumed to describe the same item as the i-th row of step 0.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The step-0 rows without ``surprisal``, plus one surprisal column per
        step.

    Raises:
        ValueError: If a step has a different number of rows than step 0.
    """
    df = pd.read_csv(file_path)
    #df = df.drop('ablated_neurons', axis=1)
    # Convert different steps into different columns, using the step-0 rows
    # as the base table.
    df_h = df[df["step"] == 0].drop(columns="surprisal")
    expected_rows = len(df_h)
    for step, df_group in df.groupby("step"):
        values = df_group["surprisal"].to_list()
        # Fail with an explicit message instead of pandas' generic
        # length-mismatch error, so the offending step is identifiable.
        if len(values) != expected_rows:
            raise ValueError(
                f"step {step} has {len(values)} rows but step 0 has "
                f"{expected_rows}; cannot align surprisal values by position"
            )
        df_h[step] = values
    return df_h

In [9]:
# Read the example CSV from the surprisal results directory and reshape it
# with format_csv.
file_path = surprisal_path.joinpath("pythia-example.csv")
df_h = format_csv(file_path)

In [3]:
import pandas as pd
import numpy as np
import typing as t
from pathlib import Path

def select_matching_rows(
    df_a: pd.DataFrame,
    df_b: pd.DataFrame,
    match_columns: list[str]
) -> pd.DataFrame:
    """Left-join df_a onto df_b using the given key columns.

    Every row of df_b is kept. Where a row of df_a has the same values in all
    ``match_columns``, its remaining columns are appended; where no row of
    df_a matches, those columns are filled with NaN. A df_b row that matches
    several df_a rows appears once per match.

    Args:
        df_a: Frame supplying the extra columns.
        df_b: Frame whose rows determine the rows of the result.
        match_columns: Column names, present in both frames, to join on.

    Returns:
        The merged frame: df_b's columns followed by df_a's non-key columns.

    Raises:
        ValueError: If a name in ``match_columns`` is missing from either
            frame.
    """
    # Validate that match_columns exist in both DataFrames
    for col in match_columns:
        if col not in df_a.columns or col not in df_b.columns:
            raise ValueError(f"Column '{col}' not found in both DataFrames")

    # Left merge with df_b on the left so all of its rows survive. No merge
    # indicator is requested: it was never returned, and asking for one makes
    # pandas reject inputs that already have a '_merge' column.
    return pd.merge(df_b, df_a, on=match_columns, how="left")

def test_select_matching_rows() -> None:
    """Exercise select_matching_rows on two small frames and check the result.

    Prints both inputs and the merged output, then asserts that every row of
    B is present and that only the row matching A on all key columns received
    a score.
    """
    # Create sample DataFrame A
    data_a = {
        "id": [1, 2, 3, 4, 5],
        "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
        "age": [25, 30, 35, 40, 45],
        "score": [95, 85, 75, 90, 80]
    }
    df_a = pd.DataFrame(data_a)

    # Create sample DataFrame B: Bob matches A on every key column, Charlie
    # differs from A in 'age', and Frank does not occur in A at all.
    data_b = {
        "id": [2, 3, 6],
        "name": ["Bob", "Charlie", "Frank"],
        "age": [30, 356, 17]
    }
    df_b = pd.DataFrame(data_b)

    print("DataFrame A:")
    print(df_a)
    print("\nDataFrame B:")
    print(df_b)

    # Match B against A on 'id', 'name' and 'age'
    result = select_matching_rows(df_a, df_b, ["id", "name", "age"])

    print("\nResult after selecting rows:")
    print(result)

    # All rows of B are kept, in order; only Bob picks up a score from A.
    assert result["id"].tolist() == [2, 3, 6]
    assert result["score"].iloc[0] == 85
    assert result["score"].iloc[1:].isna().all()

# Run the demonstration when this code executes as the main module rather
# than being imported.
if __name__ == "__main__":
    test_select_matching_rows()

DataFrame A:
   id     name  age  score
0   1    Alice   25     95
1   2      Bob   30     85
2   3  Charlie   35     75
3   4    David   40     90
4   5      Eve   45     80

DataFrame B:
   id     name  age
0   2      Bob   30
1   3  Charlie  356
2   6    Frank   17

Result after selecting rows:
   id     name  age  score
0   2      Bob   30   85.0
1   3  Charlie  356    NaN
2   6    Frank   17    NaN
