In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pandas as pd

# Load the annotated QA pairs.
df = pd.read_csv('data/annotated_qa_pairs_with_llm_annotations.csv')

# Presentation order of the prompts; rows are sorted by a prompt's position here.
order = [
    "single-paragraph",
    "multiple-paragraphs",
    "sentence-level-cot",
    "sentence-level-cot-with-search",
    "legal-sentence-level-cot-with-search-v1",
    "legal-sentence-level-cot-with-search-v2",
]

# Map every prompt id to its rank in `order`, then sort the rows by that rank.
prompt_rank = dict(zip(order, range(len(order))))
df = df.sort_values(by='prompt_id', key=lambda column: column.map(prompt_rank))

# Prompt ids in the order they occur after sorting, and one sub-frame per id.
prompt_ids = df['prompt_id'].unique()
dfs = [df.loc[df['prompt_id'] == prompt_id] for prompt_id in prompt_ids]

# One bar colour per prompt.
color_scheme = ["#519DE9", "#7CC674", "#F6D173", "#8481DD", "#EF9234", "#73C5C5"]

# Rating columns that get a panel each.
categories = ['question_relevance', 'question_fluency', 'answer_fluency', 'comprehensiveness', 'conciseness']

# Panel title for every rating column.
category_to_label = dict(
    question_relevance='Question Relevance',
    question_fluency='Question Fluency',
    answer_fluency='Answer Fluency',
    comprehensiveness='Answer Comprehensiveness',
    conciseness='Answer Conciseness',
)

# 2x3 grid: the first five panels show one rating category each; the last
# panel is filled by the acceptance summary further down in this cell.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(20, 20))
fig.subplots_adjust(hspace=0.5, wspace=0.3)
# plt.suptitle('Comparison of QA Generation Prompts', fontsize=16)


def to_values(df, category):
    """Count how many rows of ``df`` carry each rating 1-5 in ``category``.

    Args:
        df: DataFrame holding the rating column.
        category: Name of the rating column to count.

    Returns:
        A list of five ints: the number of rows rated 1, 2, 3, 4 and 5, in
        that order. Ratings that never occur count as 0. Values outside 1-5
        are ignored so the result always lines up with the five x positions.
    """
    counts = df[category].value_counts()
    return counts.reindex(range(1, 6), fill_value=0).tolist()


indices = np.arange(5)  # left anchor of each rating group (ratings 1..5)
width = 0.15            # width of a single bar inside a group
# Bar j of a group is centred at indices + width * j, so the middle of the
# whole group sits at half the offset of the last bar.
group_centers = indices + width * (len(prompt_ids) - 1) / 2

for i, category in enumerate(categories):
    ax = axes.flatten()[i]

    # One bar per prompt inside every rating group.
    for j, prompt_id in enumerate(prompt_ids):
        ax.bar(indices + width * j, to_values(dfs[j], category), width=width, label=prompt_id, color=color_scheme[j % len(color_scheme)])

    ax.set_title(category_to_label[category])
    ax.set_xlabel('Rating')
    ax.set_ylabel('Count')
    ax.set_xticks(group_centers)
    ax.set_xticklabels(['1', '2', '3', '4', '5'])
    ax.legend()

# Accept a pair only when every rating is >= 4.
accepted = df[
    (df['question_relevance'] >= 4)
    & (df['question_fluency'] >= 4)
    & (df['answer_fluency'] >= 4)
    & (df['comprehensiveness'] >= 4)
    & (df['conciseness'] >= 4)
]
# accepted = df[(df['comprehensiveness'] >= 4) & (df['conciseness'] >= 4)]

accepted_by_prompt = accepted.groupby('prompt_id').size()
total_by_prompt = df.groupby('prompt_id').size()

# Share of accepted rows among all rows (row counts, not element counts).
print("Percentage accepted total", len(accepted) / len(df))
print("Accepted by prompt")
print(accepted_by_prompt)
# groupby().size() drops prompts without any accepted row; align to the totals
# with 0 so those prompts show a ratio of 0 instead of NaN.
print(accepted_by_prompt.reindex(total_by_prompt.index, fill_value=0) / total_by_prompt)

# Bring the counts into presentation order as a plain list. reindex (unlike
# plain label indexing) does not raise when a prompt has no accepted rows.
accepted_by_prompt = accepted_by_prompt.reindex(order, fill_value=0).tolist()

ax = axes.flatten()[-1]

# One bar per prompt, labelled by its 1-based position in `order`.
bar_positions = [str(position) for position in range(1, len(order) + 1)]
bars = ax.bar(bar_positions, accepted_by_prompt, color=color_scheme)

# Attach the prompt id to each bar so the legend can name it.
for bar, prompt_id in zip(bars, order):
    bar.set_label(prompt_id)

ax.set_title('Total Pairs with all Scores >=4')
ax.set_ylabel('Count')
ax.legend()

In [None]:
# Load the annotated QA pairs.
df = pd.read_csv('data/annotated_qa_pairs.csv')

# Prompts in the order their rows are printed.
order = ["single-paragraph", "multiple-paragraphs", "sentence-level-cot", "sentence-level-cot-with-search", "legal-sentence-level-cot-with-search-v1", "legal-sentence-level-cot-with-search-v2"]

# Rating columns, in the same order as the header cells printed below.
rating_columns = ['question_relevance', 'question_fluency', 'answer_fluency', 'comprehensiveness', 'conciseness']

# Print an '&'-separated table: one header line, then one line per prompt
# with the mean of every rating column over that prompt's rows.
print("prompt & Question Relevance & Question Fluency & Answer Fluency & Comprehensiveness & Conciseness ")
for prompt_id in order:
    prompt_rows = df[df['prompt_id'] == prompt_id]
    mean_cells = [f"{prompt_rows[column].mean()}" for column in rating_columns]
    print(" & ".join([prompt_id, *mean_cells]))


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Read the annotated QA pairs
df = pd.read_csv('data/annotated_qa_pairs_with_llm_annotations.csv')

# Presentation order of the prompts
order = [
    "single-paragraph",
    "multiple-paragraphs",
    "sentence-level-cot",
    "sentence-level-cot-with-search",
    "legal-sentence-level-cot-with-search-v1",
    "legal-sentence-level-cot-with-search-v2",
]

# Sort the rows by the position of their prompt id in `order`
position_in_order = {prompt_name: position for position, prompt_name in enumerate(order)}
df = df.sort_values(by='prompt_id', key=lambda column: column.map(position_in_order))

# Prompt ids as they occur after sorting, plus one sub-frame per prompt id
prompt_ids = df['prompt_id'].unique()
dfs = [df.loc[df['prompt_id'] == prompt_id] for prompt_id in prompt_ids]

# Bar colours, one per prompt
color_scheme = ["#519DE9", "#73C5C5", "#8481DD", "#F6D173", "#EF9234", "#7CC674"]

# Rating columns that get a panel each
categories = ['question_relevance', 'question_fluency', 'answer_fluency', 'comprehensiveness', 'conciseness']

# Panel titles, paired with `categories` position by position
category_to_label = dict(zip(
    categories,
    [
        'Question Relevance',
        'Question Fluency',
        'Answer Fluency',
        'Answer Comprehensiveness',
        'Answer Conciseness',
    ],
))

# Create a 3x2 grid of subplots: five rating panels plus one panel that the
# acceptance summary further down in this cell fills.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 15), constrained_layout=True)


def to_values(df, category):
    """Count how many rows of ``df`` carry each rating 1-5 in ``category``.

    Args:
        df: DataFrame holding the rating column.
        category: Name of the rating column to count.

    Returns:
        A list of five ints: the number of rows rated 1, 2, 3, 4 and 5, in
        that order. Ratings that never occur count as 0. Values outside 1-5
        are ignored so the result always lines up with the five x positions.
    """
    counts = df[category].value_counts()
    return counts.reindex(range(1, 6), fill_value=0).tolist()


indices = np.arange(5)  # left anchor of each rating group (ratings 1..5)
width = 0.15            # width of a single bar inside a group
# Bar j of a group is centred at indices + width * j, so the middle of the
# whole group sits at half the offset of the last bar.
group_centers = indices + width * (len(prompt_ids) - 1) / 2

# Plotting loop
for i, category in enumerate(categories):
    ax = axes.flatten()[i]

    # Plot one bar per prompt inside every rating group
    for j, prompt_id in enumerate(prompt_ids):
        ax.bar(indices + width * j, to_values(dfs[j], category), width=width, label=prompt_id, color=color_scheme[j % len(color_scheme)])

    # Configure subplot aesthetics
    ax.set_title(category_to_label[category], fontsize=12)
    ax.set_xlabel('Rating', fontsize=10)
    ax.set_ylabel('Count', fontsize=10)
    ax.set_xticks(group_centers)
    ax.set_xticklabels(['1', '2', '3', '4', '5'])

# Place a single legend inside the first subplot, with a larger font
axes[0, 0].legend(fontsize=12, loc='upper left')

# Accept a pair only when every rating is >= 4.
accepted = df[
    (df['question_relevance'] >= 4)
    & (df['question_fluency'] >= 4)
    & (df['answer_fluency'] >= 4)
    & (df['comprehensiveness'] >= 4)
    & (df['conciseness'] >= 4)
]
# accepted = df[(df['comprehensiveness'] >= 4) & (df['conciseness'] >= 4)]

accepted_by_prompt = accepted.groupby('prompt_id').size()
total_by_prompt = df.groupby('prompt_id').size()

# Share of accepted rows among all rows (row counts, not element counts).
print("Percentage accepted total", len(accepted) / len(df))
print("Accepted by prompt")
print(accepted_by_prompt)
# groupby().size() drops prompts without any accepted row; align to the totals
# with 0 so those prompts show a ratio of 0 instead of NaN.
print(accepted_by_prompt.reindex(total_by_prompt.index, fill_value=0) / total_by_prompt)

# Bring the counts into presentation order as a plain list. reindex (unlike
# plain label indexing) does not raise when a prompt has no accepted rows.
accepted_by_prompt = accepted_by_prompt.reindex(order, fill_value=0).tolist()

ax = axes.flatten()[-1]

# One bar per prompt, labelled by its 1-based position in `order`.
bar_positions = [str(position) for position in range(1, len(order) + 1)]
bars = ax.bar(bar_positions, accepted_by_prompt, color=color_scheme)

# Attach the prompt id to each bar as its artist label.
for bar, prompt_id in zip(bars, order):
    bar.set_label(prompt_id)

ax.set_title('Total Pairs with all Scores >=4')
ax.set_ylabel('Count')

plt.show()