In [None]:
import pandas as pd
import os

# Root folder that holds the annotation CSVs
directory_path = "doc_outputs/annotations"

# Annotation files to load, given relative to `directory_path`.
# The detailed-relevance and interactive files are commented out and therefore not loaded.
csv_files = [
    'ablations/doc_docnocontrol.csv',
    'ablations/doc_docnooutline.csv',
   # 'detailed_relevance/doc_docnocontrol_detailedrelevance.csv',
   # 'interactive/doc_re3_interactive.csv',
    'main/doc_re3.csv',
    'main/doc_rollinggpt.csv',
    'main/doc_rollingopt.csv'
]

# Load every listed file into its own DataFrame, in list order
dataframes_list = [
    pd.read_csv(os.path.join(directory_path, relative_path))
    for relative_path in csv_files
]

# Stack the per-file frames into a single frame with a fresh 0..n-1 index;
# `combined_df` is the table the rest of the notebook works on
combined_df = pd.concat(dataframes_list, ignore_index=True)


In [None]:
# Helper intended for visualization: shortens a long passage for display
def abridge_text(passage, keep_every=10):
    """Shorten a passage by keeping one sentence out of every `keep_every`.

    The passage is split on '.'; empty fragments (such as the one produced by
    a trailing period) are ignored. The 1st, (keep_every + 1)th,
    (2 * keep_every + 1)th, ... sentences are kept verbatim. Each run of
    omitted sentences is replaced by a marker of the form "... [N words]",
    where N is the number of whitespace-separated words that were omitted.

    Args:
        passage: The text to abridge. Every '\\n\\n' is replaced by a single
            space before splitting.
        keep_every: Keep one sentence out of this many. Must be >= 1.
            Defaults to 10, the interval the sentence loop already used.

    Returns:
        The kept sentences and omission markers joined with '. '. An empty
        passage yields an empty string.

    Raises:
        ValueError: If `keep_every` is smaller than 1.
    """
    if keep_every < 1:
        raise ValueError("keep_every must be >= 1")

    # Collapse paragraph breaks so they do not end up inside a sentence
    passage = passage.replace('\n\n', ' ')

    # Drop empty fragments so a trailing '.' does not count as an extra sentence
    sentences = [piece.strip() for piece in passage.split('.')]
    sentences = [sentence for sentence in sentences if sentence]

    abridged_text = []
    skipped_words = 0  # words omitted since the last kept sentence

    for position, sentence in enumerate(sentences):
        if position % keep_every == 0:
            # Report what was left out before showing the next kept sentence
            if skipped_words > 0:
                abridged_text.append(f"... [{skipped_words} words]")
            abridged_text.append(sentence)
            # The kept sentence is displayed, so it must not be counted as omitted
            skipped_words = 0
        else:
            skipped_words += len(sentence.split())

    # Report omitted sentences that follow the last kept one
    if skipped_words > 0:
        abridged_text.append(f"... [{skipped_words} words]")

    return '. '.join(abridged_text)

# Sample input for abridge_text. This cell only defines the string;
# it does not call abridge_text on it.
text = "this was a great day. How are you. I am great. Are you great? Yeah i am fine, but i lost my bike. Oh that sucks."

In [None]:
# calculate dataset statistics
import pandas as pd

# Work on the combined annotation table under the shorter name `df`
df = combined_df


def _mean_words(series):
    """Return the mean number of whitespace-separated words per entry of `series`."""
    return series.apply(lambda entry: len(entry.split())).mean()


# Average word count for each text column
avg_premise_length = _mean_words(df['premise'])
avg_outline_item_length = _mean_words(df['outline_item'])
avg_passage1_length = _mean_words(df['passage1'])
avg_passage2_length = _mean_words(df['passage2'])

# Report the averages rounded to two decimals
print(f"Average word count of premise: {avg_premise_length:.2f}")
print(f"Average word count of outline item: {avg_outline_item_length:.2f}")
print(f"Average word count of passage1: {avg_passage1_length:.2f}")
print(f"Average word count of passage2: {avg_passage2_length:.2f}")


In [None]:
import pandas as pd

# Assuming 'combined_df' is the DataFrame containing the reviewer preferences

# Define a mapping function
def map_preference_to_score(preference):
    """Translate a reviewer's answer into the label of the preferred story.

    Returns "story1" for 'Passage A', "story2" for 'Passage B', and the
    integer 0 for every other answer (no clear preference).
    """
    answer_to_story = (
        ('Passage A', "story1"),
        ('Passage B', "story2"),
    )
    for answer, story in answer_to_story:
        if preference == answer:
            return story
    # 'Neither', 'Both', or anything else: no clear preference
    return 0

# Survey questions whose answers get mapped to a story label
question_columns = [
    'Which passage seems more interesting?',
    'Which passage has a more coherent overall plot?',
    'Which passage is better focused on the given sub-event?'
]

# For each question, store the mapped answers in a sibling "<question> Score" column
for question in question_columns:
    mapped_answers = combined_df[question].apply(map_preference_to_score)
    combined_df[f"{question} Score"] = mapped_answers

# combined_df now has one extra "... Score" column per question


In [None]:
# Columns common to both preference tables, and the names they are given on output
# (note that 'outline_item' is exposed under the name 'premise')
shared_source_columns = ['outline_item', 'passage1', 'passage2']
shared_output_names = ['premise', 'story1', 'story2']

# Relevance table: the shared columns plus the mapped sub-event-focus answer
df_relevance = combined_df[
    shared_source_columns + ['Which passage is better focused on the given sub-event? Score']
]
df_relevance.columns = shared_output_names + ['relevance_preference']

# Coherence table: the shared columns plus the mapped plot-coherence answer
df_coherence = combined_df[
    shared_source_columns + ['Which passage has a more coherent overall plot? Score']
]
df_coherence.columns = shared_output_names + ['coherence_preference']

In [None]:
# Keep only rows with a clear relevance preference (value other than 0), then renumber
has_relevance_choice = df_relevance['relevance_preference'] != 0
df_relevance = df_relevance[has_relevance_choice].reset_index(drop=True)

# Same filtering for the coherence table
has_coherence_choice = df_coherence['coherence_preference'] != 0
df_coherence = df_coherence[has_coherence_choice].reset_index(drop=True)

In [None]:
# Row counts after filtering; each table was filtered on its own preference
# column, so the two counts may differ
print(len(df_relevance), len(df_coherence), sep='\n')

In [None]:
# Write the relevance preferences to disk without the row index
csv_file_path = 'relevance_human_data.csv'
df_relevance.to_csv(path_or_buf=csv_file_path, index=False)

# Load the file again to check what was written
df_reloaded = pd.read_csv(filepath_or_buffer=csv_file_path)

# Show the first five rows of the reloaded table
df_reloaded.head(n=5)

In [None]:
# Write the coherence preferences to disk without the row index
csv_file_path = 'coherence_human_data.csv'
df_coherence.to_csv(path_or_buf=csv_file_path, index=False)

# Load the file again to check what was written
df_reloaded = pd.read_csv(filepath_or_buffer=csv_file_path)

# Show the first five rows of the reloaded table
df_reloaded.head(n=5)