Results:
- For long contexts (> 8000), a longer completion length works better: >= 128, with chunk size 128
- For short contexts (< 4000), a shorter completion length is enough: 64 (or 32), with chunk size 128
- Full_file is only good at 16k context; otherwise chunking is better
- Fixed_line is slightly better than Langchain Chunker

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO
from ipywidgets import interact, widgets
from xlwings.utils import chunk
import numpy as np

from plotter import read_results_path, get_group_columns_path, filter_target_columns, plot_dropdown, plot_dropdown_with_group_by, make_interaction

In [2]:
import os
from pathlib import Path

# Root directory that holds the chunk_score result files. The default is the
# original absolute location; set the RAG_RESULTS_DIR environment variable to
# point the notebook at a copy of the results without editing this cell.
# An empty value is treated the same as an unset variable.
base_path = Path(
    os.environ.get("RAG_RESULTS_DIR")
    or "/mnt/data/kolomyttseva/long-contex-eval/output/rag_results/python/chunk_score"
)

# One sub-directory per chunker.
full_file_path = base_path / "full_file"
fixed_line_path = base_path / "fixed_line"
langchain_path = base_path / "langchain"

# Layout of the result files under each chunker directory. This string is the
# last expression of the cell, so it is also echoed as the cell output.
'''
fixed_line:
    - chunk_completion_file.jsonl
        - scorer: bm25
        - splitter: word_splitter
        - chunk_lines_size = [8, 16, 32, 64, 128]
        - chunk_completion_file = [True, False]
        - completion_last_chunk_size = 32
    - completion_last_chunk_size.jsonl
        - scorer: bm25
        - splitter: word_splitter
        - chunk_lines_size = [8, 16, 32, 64, 128]
        - chunk_completion_file = True
        - completion_last_chunk_size = [8, 16, 32, 64, 128]

full_file:
    - bm25_word_splitter.jsonl
        - scorer: bm25
        - splitter: word_splitter
    - dense_scorer_word_splitter.jsonl
        - scorer: dense
        - splitter: word_splitter
    - scorers_splitters.jsonl
        - scorer: iou/bm25
        - splitter: line_splitter/word_splitter/model_tokenizer

langchain:
    - langchain.jsonl
        - scorer: bm25
        - splitter: word_splitter
        - chunk_lines_size = [8, 16, 32, 64, 128]
        - chunk_completion_file = True
        - completion_last_chunk_size = 32
'''

'\nfixed_line:\n    - chunk_completion_file.jsonl\n        - scorer: bm25\n        - splitter: word_splitter\n        - chunk_lines_size = [8, 16, 32, 64, 128]\n        - chunk_completion_file = [True, False]\n        - completion_last_chunk_size = 32\n    - completion_last_chunk_size.jsonl\n        - scorer: bm25\n        - splitter: word_splitter\n        - chunk_lines_size = [8, 16, 32, 64, 128]\n        - chunk_completion_file = True\n        - completion_last_chunk_size = [8, 16, 32, 64, 128]\n\nfull_file:\n    - bm25_word_splitter.jsonl\n        - scorer: bm25\n        - splitter: word_splitter\n    - dense_scorer_word_splitter.jsonl\n        - scorer: dense\n        - splitter: word_splitter\n    - scorers_splitters.jsonl\n        - scorer: iou/bm25\n        - splitter: line_splitter/word_splitter/model_tokenizer\n\nlangchain:\n    - langchain.jsonl\n        - scorer: bm25\n        - splitter: word_splitter\n        - chunk_lines_size = [8, 16, 32, 64, 128]\n        - chunk_comp

### Analyse scorers and splitters

chunk_score:
- chunker: full_file
- scorer: iou/bm25/dense
- splitter: line_splitter/word_splitter/model_tokenizer

In [3]:
# Result files of the full_file chunker: the scorer/splitter sweep and the
# separate dense-scorer run.
full_file_scorers_splitters = full_file_path.joinpath("scorers_splitters.jsonl")
full_file_dense_word = full_file_path.joinpath("dense_scorer_word_splitter.jsonl")

# Grouping columns are taken from the sweep file and reused for both reads.
group_columns = get_group_columns_path(full_file_scorers_splitters)

# Load the sweep and keep only the rows with n_grams_max == 1.
df_full_file_scorers_splitters = read_results_path(
    full_file_scorers_splitters, group_columns
).loc[lambda frame: frame['n_grams_max'] == 1]

df_full_file_dense_word = read_results_path(full_file_dense_word, group_columns)

# Stack both result sets into one frame with a fresh index.
results_splitters_scorers = pd.concat(
    [df_full_file_scorers_splitters, df_full_file_dense_word],
    ignore_index=True,
)

#### Analyse scorers for each splitter

In [4]:
# Interactive view of the 'em' metric, plotted by scorer.
plot_params = dict(plot_by='scorer')
make_interaction(
    results=results_splitters_scorers,
    group_columns=group_columns,
    dropdown=plot_dropdown,
    plot_params=plot_params,
    metrics=['em'],
)

interactive(children=(Text(value='scorer', description='plot_by'), Dropdown(description='metric', options=('em…

In [9]:
# Interactive view of the 'em' metric, plotted by scorer and grouped by splitter.
plot_params = dict(plot_by='scorer', group_by='splitter')
make_interaction(
    results=results_splitters_scorers,
    group_columns=group_columns,
    dropdown=plot_dropdown_with_group_by,
    plot_params=plot_params,
    metrics=['em'],
)

interactive(children=(Text(value='scorer', description='plot_by'), Text(value='splitter', description='group_b…

#### Analyse splitters for each scorer

In [None]:
# Interactive view of the 'em' metric, plotted by splitter.
plot_params = dict(plot_by='splitter')
make_interaction(
    results=results_splitters_scorers,
    group_columns=group_columns,
    dropdown=plot_dropdown,
    plot_params=plot_params,
    metrics=['em'],
)

In [None]:
# Interactive view of the 'em' metric, plotted by splitter and grouped by scorer.
plot_params = dict(plot_by='splitter', group_by='scorer')
make_interaction(
    results=results_splitters_scorers,
    group_columns=group_columns,
    dropdown=plot_dropdown_with_group_by,
    plot_params=plot_params,
    metrics=['em'],
)

### Analyse completion_last_chunk_size

chunk_score:
- chunker: fixed_line
- scorer: bm25
- splitter: word_splitter
- chunk_lines_size = [8, 16, 32, 64, 128]
- chunk_completion_file = True
- completion_last_chunk_size = [8, 16, 32, 64, 128]

In [None]:
# Results of the fixed_line chunker for the completion_last_chunk_size sweep.
fixed_line_completion_last_chunk_size = fixed_line_path.joinpath(
    "completion_last_chunk_size.jsonl"
)

# Grouping columns are derived from the same file that is loaded below.
group_columns = get_group_columns_path(fixed_line_completion_last_chunk_size)

results_completion_last_chunk_size = read_results_path(
    fixed_line_completion_last_chunk_size,
    group_columns,
)

In [None]:
# Interactive view of the 'em' metric, plotted by chunk_lines_size.
plot_params = dict(plot_by='chunk_lines_size')
make_interaction(
    results=results_completion_last_chunk_size,
    group_columns=group_columns,
    dropdown=plot_dropdown,
    plot_params=plot_params,
    metrics=['em'],
)

In [None]:
# Interactive view of the 'em' metric, plotted by chunk_lines_size and grouped
# by completion_last_chunk_size.
plot_params = dict(
    plot_by='chunk_lines_size',
    group_by='completion_last_chunk_size',
)
make_interaction(
    results=results_completion_last_chunk_size,
    group_columns=group_columns,
    dropdown=plot_dropdown_with_group_by,
    plot_params=plot_params,
    metrics=['em'],
)

### Analyse chunk_completion_file

chunk_score:
- chunker: fixed_line
- scorer: bm25
- splitter: word_splitter
- chunk_lines_size = [8, 16, 32, 64, 128]
- chunk_completion_file = [True, False]
- completion_last_chunk_size = 32

In [None]:
# Results of the fixed_line chunker for the chunk_completion_file sweep.
fixed_line_chunk_completion_file = fixed_line_path.joinpath(
    "chunk_completion_file.jsonl"
)

# Grouping columns are derived from the same file that is loaded below.
group_columns = get_group_columns_path(fixed_line_chunk_completion_file)

results_chunk_completion_file = read_results_path(
    fixed_line_chunk_completion_file,
    group_columns,
)

In [None]:
# Interactive view of the 'em' metric, plotted by chunk_lines_size.
plot_params = dict(plot_by='chunk_lines_size')
make_interaction(
    results=results_chunk_completion_file,
    group_columns=group_columns,
    dropdown=plot_dropdown,
    plot_params=plot_params,
    metrics=['em'],
)

In [None]:
# Interactive view of the 'em' metric, plotted by chunk_lines_size and grouped
# by chunk_completion_file.
plot_params = dict(
    plot_by='chunk_lines_size',
    group_by='chunk_completion_file',
)
make_interaction(
    results=results_chunk_completion_file,
    group_columns=group_columns,
    dropdown=plot_dropdown_with_group_by,
    plot_params=plot_params,
    metrics=['em'],
)

### Analyse chunkers

chunk_score:
- chunker: full_file/fixed_line/langchain
- scorer: bm25
- splitter: word_splitter

Setup (applies only to fixed_line/langchain):
- chunk_lines_size = [8, 16, 32, 64, 128]
- chunk_completion_file = True
- completion_last_chunk_size = 32

In [None]:
# One result file per chunker being compared.
fixed_line = fixed_line_path.joinpath("chunk_completion_file.jsonl")
full_file = full_file_path.joinpath("bm25_word_splitter.jsonl")
langchain = langchain_path.joinpath("langchain.jsonl")

# Grouping columns are derived from the full_file results and reused for all
# three reads.
group_columns = get_group_columns_path(full_file)

# fixed_line: keep only the rows where chunk_completion_file is True.
df_fixed_line = read_results_path(fixed_line, group_columns).loc[
    lambda frame: frame['chunk_completion_file'].eq(True)
]
df_full_file = read_results_path(full_file, group_columns)
df_langchain = read_results_path(langchain, group_columns)

# Pair the full_file results with each chunked variant, with a fresh index.
results_full_file_fixed_line = pd.concat(
    [df_full_file, df_fixed_line],
    ignore_index=True,
)
results_full_file_langchain = pd.concat(
    [df_full_file, df_langchain],
    ignore_index=True,
)

#### Full_file VS Fixed_Line

In [None]:
# Interactive view of the 'em' metric for full_file + fixed_line results,
# plotted by chunk_lines_size; the listed columns are passed as delete_columns.
plot_params = dict(plot_by='chunk_lines_size')
make_interaction(
    results=results_full_file_fixed_line,
    group_columns=group_columns,
    dropdown=plot_dropdown,
    plot_params=plot_params,
    metrics=['em'],
    delete_columns=["chunk_completion_file", "chunker"],
)

#### Full_file VS Langchain

In [None]:
# Interactive view of the 'em' metric for full_file + langchain results,
# plotted by chunk_lines_size; the listed columns are passed as delete_columns.
plot_params = dict(plot_by='chunk_lines_size')
make_interaction(
    results=results_full_file_langchain,
    group_columns=group_columns,
    dropdown=plot_dropdown,
    plot_params=plot_params,
    metrics=['em'],
    delete_columns=["chunk_completion_file", "chunker"],
)