In [1]:
import ast
import glob
from pathlib import Path

import pandas as pd

In [2]:
def parse_dataframe(df):
    """Group index-term occurrences by the (page, paragraph) they appear in.

    Args:
        df: DataFrame providing the columns ``page``, ``index_name``,
            ``position`` and ``paragraph``. ``position`` is parsed as a
            string of the form ``"(start, end)"``.

    Returns:
        dict mapping ``(page, paragraph)`` tuples to a list of dicts with the
        keys ``index_name``, ``midpoint``, ``start_pos`` and ``end_pos``.
        Rows whose position cannot be parsed are reported on stdout and
        skipped.
    """
    entries = {}
    for index, row in df.iterrows():
        page_number = row['page']
        index_name = row['index_name']
        position_range = row['position']
        paragraph_number = row['paragraph']

        try:
            # Extract start and end positions from "(start, end)"
            start_pos, end_pos = map(int, position_range.strip('()').split(','))
        except (ValueError, AttributeError):
            # ValueError: non-numeric parts or not exactly two values.
            # AttributeError: the cell is not a string (e.g. an empty cell
            # read as a NaN float, which has no .strip()).
            print("Error parsing position range in row:", index)
            print("Page:", page_number)
            print("Position Range:", position_range)
            print("Paragraph:", paragraph_number)
            continue

        # Midpoint of the occurrence, used later to measure distances
        midpoint = (start_pos + end_pos) / 2

        unique_identifier = (page_number, paragraph_number)
        entries.setdefault(unique_identifier, []).append({
            "index_name": index_name,
            "midpoint": midpoint,
            "start_pos": start_pos,
            "end_pos": end_pos,
        })

    return entries


In [3]:
# Function to calculate distances between midpoints for each combination of different names within the same paragraph on the same page
def calculate_closest_pairs(entries):
    """Find, for every pair of distinct names, their closest co-occurrence.

    Args:
        entries: dict mapping ``(page, paragraph)`` keys to lists of
            occurrence dicts carrying ``index_name``, ``midpoint``,
            ``start_pos`` and ``end_pos``.

    Returns:
        dict keyed by the alphabetically sorted name pair. Each value records
        the ``page`` and ``paragraph`` of the closest co-occurrence, the
        midpoint ``distance``, both ``mid_positions`` and the combined
        ``positions`` span (smallest start, largest end).
    """
    best = {}
    for location, occurrences in entries.items():
        distinct_names = {occurrence['index_name'] for occurrence in occurrences}
        if len(distinct_names) <= 1:
            # A single name cannot co-occur with another one
            continue

        for offset, first in enumerate(occurrences):
            for second in occurrences[offset + 1:]:
                gap = abs(first["midpoint"] - second["midpoint"])
                if first["index_name"] == second["index_name"]:
                    continue

                pair_key = tuple(sorted([first["index_name"], second["index_name"]]))
                # Only a strictly smaller gap replaces an earlier record
                if pair_key in best and not gap < best[pair_key]["distance"]:
                    continue

                span_start = min(first["start_pos"], second["start_pos"])
                span_end = max(first["end_pos"], second["end_pos"])
                best[pair_key] = {
                    "page": location[0],
                    "paragraph": location[1],
                    "distance": gap,
                    "mid_positions": (first["midpoint"], second["midpoint"]),
                    "positions": (span_start, span_end),
                }
    return best

In [4]:
def find_closest_pairs(df):
    """Return the closest co-occurrence of every name pair found in ``df``.

    Passes the result of ``parse_dataframe(df)`` straight to
    ``calculate_closest_pairs`` and returns what that call produces.
    """
    return calculate_closest_pairs(parse_dataframe(df))

In [5]:
def load_source_text(csv_text_file):
    """Read the source-text CSV at ``csv_text_file`` into a DataFrame."""
    return pd.read_csv(csv_text_file)

# Function to extract text within the given positions from the source text
def extract_text(source_text, paragraph, page, positions):
    """Return the ``[start:end]`` slice of one paragraph's text.

    Args:
        source_text: DataFrame with the columns ``page``, ``paragraph_id``
            and ``text``.
        paragraph: value to match against ``paragraph_id``.
        page: value to match against ``page``.
        positions: ``(start_pos, end_pos)`` pair used to slice the text.

    Returns:
        The sliced text of the last matching row that has non-missing text,
        or ``""`` when no such row exists.
    """
    start_pos, end_pos = positions

    # Select matching rows with a vectorized mask instead of iterating the
    # whole frame row by row; this function is called once per name pair.
    is_match = (source_text['page'] == page) & (source_text['paragraph_id'] == paragraph)
    candidate_texts = source_text.loc[is_match, 'text'].dropna()

    if candidate_texts.empty:
        return ""

    # When several rows share the same page/paragraph the last one wins
    return candidate_texts.iloc[-1][start_pos:end_pos]


In [7]:
# Build one co-occurrence snippet table per reference CSV
output_dir = Path("../../data/cooccurrence_text_snippets")
# Create the output folder up front so to_csv does not fail on a fresh checkout
output_dir.mkdir(parents=True, exist_ok=True)

for file in glob.glob("../../data/references/*.csv"):
    # Load the index references into a pandas DataFrame
    df = pd.read_csv(file)

    # Load source text (the volume CSV shares the reference file's stem)
    source_text_file = f"../../data/volumes/{Path(file).stem}.csv"
    source_text_input = load_source_text(source_text_file)

    # Closest co-occurrence for every pair of distinct names
    closest_pairs = find_closest_pairs(df)

    if not closest_pairs:
        # Without any pair the frame below has no columns and drop() would
        # raise KeyError, aborting the remaining files.
        print("No co-occurring names found in:", file)
        continue

    for name_pair, data in closest_pairs.items():
        try:
            text = extract_text(source_text_input, data['paragraph'], data['page'], data['positions'])
            data['text_snippet'] = text
        except TypeError:
            # Report the location whose text could not be sliced and move on
            print(data['paragraph'], data['page'], data['positions'])

    # One row per name pair; the distance bookkeeping is not exported
    df2 = pd.DataFrame.from_dict(closest_pairs).transpose()
    df2 = df2.drop(columns=['mid_positions', 'distance'])
    df2.to_csv(output_dir / f"{Path(file).stem}.csv")

## To test how many of the references are actually in the text

In [10]:
# Load source text
source_text_file = "../../data/volumes/0.csv"

# Reference CSV to verify against the source text
csv_file = "../../data/references/0_v1.csv"

# Load both CSVs into pandas DataFrames
df_references = pd.read_csv(csv_file)
df_volumes = pd.read_csv(source_text_file)

# Index the paragraph texts by (page, paragraph_id) once, so each reference
# is a dictionary lookup instead of a full row-by-row scan of the volume.
texts_by_location = {}
for page, paragraph_id, text in zip(df_volumes['page'], df_volumes['paragraph_id'], df_volumes['text']):
    texts_by_location.setdefault((page, paragraph_id), []).append(text)

checked = 0
found = 0
for index, row in df_references.iterrows():
    page_ref = row['page']
    paragraph_ref = row['paragraph']

    try:
        # Parsing is the step that can actually fail, so it is the guarded one
        start_pos, end_pos = ast.literal_eval(row['position'])
    except (ValueError, SyntaxError, TypeError):
        print("Error parsing position range in row:", index)
        print("Page:", page_ref)
        print("Position Range:", row['position'])
        print("Paragraph:", paragraph_ref)
        continue

    for text in texts_by_location.get((page_ref, paragraph_ref), []):
        checked += 1
        # A paragraph with missing text (NaN) cannot contain the reference
        snippet = text[start_pos:end_pos] if isinstance(text, str) else None
        if row['reference'] == snippet:
            found += 1
            print(f"Yay: {snippet}")
        else:
            print("Nay!")

# Summary answering "how many of the references are actually in the text"
print(f"{found} of {checked} reference checks matched the source text")



KeyboardInterrupt: 