In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO
from ipywidgets import interact, widgets
from xlwings.utils import chunk
import numpy as np

%matplotlib inline

In [2]:
from pathlib import Path
base_path = Path("/mnt/data/kolomyttseva/long-contex-eval/output/rag_results/chunk_score")

full_file_path = base_path / "full_file"
fixed_line_path = base_path / "fixed_line"
langchain_path = base_path / "langchain"

'''
fixed_line:
    - chunk_completion_file.jsonl
        - scorer: bm25
        - splitter: word_splitter
        - chunk_lines_size = [8, 16, 32, 64, 128]
        - chunk_completion_file = [True, False]
        - completion_last_chunk_size = 32
    - completion_last_chunk_size.jsonl
        - scorer: bm25
        - splitter: word_splitter
        - chunk_lines_size = [8, 16, 32, 64, 128]
        - chunk_completion_file = True
        - completion_last_chunk_size = [8, 16, 32, 64, 128]

full_file:
    - bm25_word_splitter.jsonl
        - scorer: bm25
        - splitter: word_splitter
    - dense_scorer_word_splitter.jsonl
        - scorer: dense
        - splitter: word_splitter
    - scorers_splitters.jsonl
        - scorer: iou/bm25
        - splitter: line_splitter/word_splitter/model_tokenizer

langchain:
    - langchain.jsonl
        - scorer: bm25
        - splitter: word_splitter
        - chunk_lines_size = [8, 16, 32, 64, 128]
        - chunk_completion_file = True
        - completion_last_chunk_size = 32
'''

'\nfixed_line:\n    - chunk_completion_file.jsonl\n        - scorer: bm25\n        - splitter: word_splitter\n        - chunk_lines_size = [8, 16, 32, 64, 128]\n        - chunk_completion_file = [True, False]\n        - completion_last_chunk_size = 32\n    - completion_last_chunk_size.jsonl\n        - scorer: bm25\n        - splitter: word_splitter\n        - chunk_lines_size = [8, 16, 32, 64, 128]\n        - chunk_completion_file = True\n        - completion_last_chunk_size = [8, 16, 32, 64, 128]\n\nfull_file:\n    - bm25_word_splitter.jsonl\n        - scorer: bm25\n        - splitter: word_splitter\n    - dense_scorer_word_splitter.jsonl\n        - scorer: dense\n        - splitter: word_splitter\n    - scorers_splitters.jsonl\n        - scorer: iou/bm25\n        - splitter: line_splitter/word_splitter/model_tokenizer\n\nlangchain:\n    - langchain.jsonl\n        - scorer: bm25\n        - splitter: word_splitter\n        - chunk_lines_size = [8, 16, 32, 64, 128]\n        - chunk_comp

In [3]:
def read_results(file, group_columns):

    dataframes = []

    with open(file, 'r') as file:
        for line in file:
            json_data = StringIO(line.strip())
            df = pd.read_json(json_data)
            df["em"] = df["scores"].apply(lambda x: x["exact_match_valid"]["mean"])
            grouped = df.groupby(group_columns).agg({
                'context_len_config': list,
                'em': list,
            }).reset_index()
            dataframes.append(grouped)
    final_dataframe = pd.concat(dataframes, ignore_index=True)

    return final_dataframe

In [4]:
def filter_target_columns(results, group_columns, delete_columns=[]):    
    target_columns = dict()

    for column in group_columns:
        column_values = results[column].unique().tolist()
        if len(column_values) > 1:
            target_columns[column] = sorted(column_values)

    # del target_columns["stride"]
    for del_col in delete_columns:
        del target_columns[del_col]
    return target_columns

In [12]:
def plot_dropdown(results: pd.DataFrame, plot_by, fontsize=11, **kwargs):
    filter_cond = ' & '.join(
        [f'{key}==@params["{key}"]' if isinstance(value, (int, float)) 
         else f'{key}=="{value}"' for key, value in kwargs.items()]
    )

    params = {key: value for key, value in kwargs.items()}
    
    filtered_df = results.query(filter_cond, local_dict={'params': params})

    fig, ax = plt.subplots(figsize=(6, 4))
    filtered_df = filtered_df.sort_values(by=plot_by)
    for idx, row in filtered_df.iterrows():
        name = f"{row['chunker']}_{row[plot_by]}"
        ax.plot(row['context_len_config'], row['em'], label=name)
    ax.legend(loc='lower right')
    
    plt.xlabel('Context length', fontsize=fontsize)
    plt.ylabel('EM', fontsize=fontsize)
    plt.xticks(fontsize=fontsize)
    plt.yticks(fontsize=fontsize)
    plt.grid(True)
    plt.ylim(0.1, 0.63)
    
    title = ', '.join([f'{key} = {value}' for key, value in kwargs.items()])
    plt.title(f"EM for {title}", fontsize=fontsize)
    plt.show()

In [19]:
def plot_dropdown_with_group_by(results: pd.DataFrame, plot_by, group_by, fontsize=11, **kwargs):
    params = {key: value for key, value in kwargs.items()}
    unique_groups = results[group_by].unique()

    fig, axes = plt.subplots(1, len(unique_groups), figsize=(6 * len(unique_groups), 4))

    if len(unique_groups) == 1:
        axes = [axes]

    for ax, group_value in zip(axes, unique_groups):
        params[group_by] = group_value

        filter_cond = ' & '.join(
            [f'{key}==@params["{key}"]' if isinstance(val, (int, float, np.integer, np.bool_)) 
             else f'{key}=="{val}"' for key, val in params.items()]
        )
        filtered_df = results.query(filter_cond, local_dict={'params': params})
        filtered_df = filtered_df.sort_values(by=plot_by)

        for idx, row in filtered_df.iterrows():
            name = f"{row['chunker']}_{row[plot_by]}"
            ax.plot(row['context_len_config'], row['em'], label=name)
        ax.legend(loc='lower right')
        
        ax.set_xlabel('Context length', fontsize=fontsize)
        ax.set_ylabel('EM', fontsize=fontsize)
        ax.tick_params(axis='both', which='major', labelsize=fontsize)
        ax.grid(True)
        ax.set_ylim(0.1, 0.63)
        ax.set_title(f"{group_by} = {group_value}", fontsize=fontsize)

    plt.tight_layout()
    plt.show()

In [20]:
def get_group_columns(path):
    df = pd.read_json(path, orient="records", lines=True)

    drop_columns = ['context_len_config', 'count', 'context_len_mean', 'time_gen_per_item', 'scores', 'time_data_load_per_item']
    if 'stride' in df.columns:
        drop_columns.append('stride')

    # Drop all columns for which we don't want aggregation
    group_columns = df.columns.drop(drop_columns).tolist()
    return group_columns

In [21]:
def make_interaction(results, group_columns, dropdown, additional_params, delete_columns):
    target_columns = filter_target_columns(results, group_columns, delete_columns)

    additional_params['results'] = widgets.fixed(results)

    # Merge additional_params with target_columns
    all_params = {**target_columns, **additional_params}

    interact(dropdown, **all_params)

### Analyze scorers and splitters

chunk_score:
- chunker: full_file
- scorer: iou/bm25/dense
- splitter: line_splitter/word_splitter/model_tokenizer

In [22]:
full_file_scorers_splitters = full_file_path / "scorers_splitters.jsonl"
full_file_dense_word = full_file_path / "dense_scorer_word_splitter.jsonl"

# Drop all columns for which we don't want aggregation
group_columns = get_group_columns(full_file_scorers_splitters)

df_full_file_scorers_splitters = read_results(full_file_scorers_splitters, group_columns)
# Drop all rows where n_grams_max > 1
df_full_file_scorers_splitters = df_full_file_scorers_splitters[df_full_file_scorers_splitters['n_grams_max'] == 1]

df_full_file_dense_word = read_results(full_file_dense_word, group_columns)

results_splitters_scorers = pd.concat([df_full_file_scorers_splitters, df_full_file_dense_word], ignore_index=True)

#### Analyse scorers for each splitter

In [23]:
additional_params = {
    'plot_by': 'scorer',
}
make_interaction(results_splitters_scorers, group_columns, plot_dropdown, additional_params, delete_columns=["scorer"])

interactive(children=(Text(value='scorer', description='plot_by'), IntSlider(value=11, description='fontsize',…

In [25]:
additional_params = {
    'plot_by': 'scorer',
    'group_by': 'splitter'
}
make_interaction(results_splitters_scorers, group_columns, plot_dropdown_with_group_by, additional_params, delete_columns=["scorer", "splitter"])

interactive(children=(Text(value='scorer', description='plot_by'), Text(value='splitter', description='group_b…

#### Analyse splitters for each scorer

In [26]:
additional_params = {
    'plot_by': 'splitter',
}
make_interaction(results_splitters_scorers, group_columns, plot_dropdown, additional_params, delete_columns=["splitter"])

interactive(children=(Text(value='splitter', description='plot_by'), IntSlider(value=11, description='fontsize…

In [27]:
additional_params = {
    'plot_by': 'splitter',
    'group_by': 'scorer'
}
make_interaction(results_splitters_scorers, group_columns, plot_dropdown_with_group_by, additional_params, delete_columns=["scorer", "splitter"])

interactive(children=(Text(value='splitter', description='plot_by'), Text(value='scorer', description='group_b…

### Analyse completion_last_chunk_size

chunk_score:
- chunker: fixed_line
- scorer: bm25
- splitter: word_splitter
- chunk_lines_size = [8, 16, 32, 64, 128]
- chunk_completion_file = True
- completion_last_chunk_size = [8, 16, 32, 64, 128]

In [28]:
fixed_line_completion_last_chunk_size = fixed_line_path / "completion_last_chunk_size.jsonl"

# Drop all columns for which we don't want aggregation
group_columns = get_group_columns(fixed_line_completion_last_chunk_size)

results_completion_last_chunk_size = read_results(fixed_line_completion_last_chunk_size, group_columns)

In [29]:
additional_params = {
    'plot_by': 'chunk_lines_size',
}
make_interaction(results_completion_last_chunk_size, group_columns, plot_dropdown, additional_params, delete_columns=["chunk_lines_size"])

interactive(children=(Text(value='chunk_lines_size', description='plot_by'), IntSlider(value=11, description='…

In [30]:
additional_params = {
    'plot_by': 'chunk_lines_size',
    'group_by': 'completion_last_chunk_size'
}
make_interaction(results_completion_last_chunk_size, group_columns, plot_dropdown_with_group_by, additional_params, delete_columns=["chunk_lines_size", "completion_last_chunk_size"])

interactive(children=(Text(value='chunk_lines_size', description='plot_by'), Text(value='completion_last_chunk…

### Analyse chunk_completion_file

chunk_score:
- chunker: fixed_line
- scorer: bm25
- splitter: word_splitter
- chunk_lines_size = [8, 16, 32, 64, 128]
- chunk_completion_file = [True, False]
- completion_last_chunk_size = 32

In [31]:
fixed_line_chunk_completion_file = fixed_line_path / "chunk_completion_file.jsonl"

# Drop all columns for which we don't want aggregation
group_columns = get_group_columns(fixed_line_chunk_completion_file)

results_chunk_completion_file = read_results(fixed_line_chunk_completion_file, group_columns)

In [18]:
additional_params = {
    'plot_by': 'chunk_lines_size',
}
make_interaction(results_chunk_completion_file, group_columns, plot_dropdown, additional_params, delete_columns=["chunk_lines_size"])

interactive(children=(Text(value='chunk_lines_size', description='plot_by'), IntSlider(value=11, description='…

In [19]:
additional_params = {
    'plot_by': 'chunk_lines_size',
    'group_by': 'chunk_completion_file',
}
make_interaction(results_chunk_completion_file, group_columns, plot_dropdown_with_group_by, additional_params, delete_columns=["chunk_lines_size", "chunk_completion_file"])

interactive(children=(Text(value='chunk_lines_size', description='plot_by'), Text(value='chunk_completion_file…

### Analyse chunkers

chunk_score:
- chunker: full_file/fixed_line/langchain
- scorer: bm25
- splitter: word_splitter

setup only for fixed_line/langchain:
- chunk_lines_size = [8, 16, 32, 64, 128]
- chunk_completion_file = True
- completion_last_chunk_size = 32

In [20]:
fixed_line = fixed_line_path / "chunk_completion_file.jsonl"
full_file = full_file_path / "bm25_word_splitter.jsonl"
langchain = langchain_path / "langchain.jsonl"

# Drop all columns for which we don't want aggregation
group_columns = get_group_columns(full_file)

df_fixed_line = read_results(fixed_line, group_columns)
# Drop all rows where chunk_completion_file is False
df_fixed_line = df_fixed_line[df_fixed_line['chunk_completion_file'] == True]
df_full_file = read_results(full_file, group_columns)
df_langchain = read_results(langchain, group_columns)

results_full_file_fixed_line = pd.concat([df_full_file, df_fixed_line], ignore_index=True)
results_full_file_langchain = pd.concat([df_full_file, df_langchain], ignore_index=True)

#### Full_file VS Fixed_Line

In [21]:
additional_params = {
    'plot_by': 'chunk_lines_size',
}
make_interaction(results_full_file_fixed_line, group_columns, plot_dropdown, additional_params, delete_columns=["chunk_lines_size", "chunk_completion_file", "chunker"])

interactive(children=(Text(value='chunk_lines_size', description='plot_by'), IntSlider(value=11, description='…

#### Full_file VS Langchain

In [None]:
additional_params = {
    'plot_by': 'chunk_lines_size',
}
make_interaction(results_full_file_langchain, group_columns, plot_dropdown, additional_params, delete_columns=["chunk_lines_size", "chunk_completion_file", "chunker"])

interactive(children=(Text(value='chunk_lines_size', description='plot_by'), IntSlider(value=11, description='…