In [214]:
import ast

import pandas as pd

In [66]:

# Function to parse character position ranges, page numbers, and paragraph numbers from a pandas DataFrame
def parse_dataframe(df):
    """Group index references by the (page, paragraph) they occur in.

    Args:
        df: DataFrame with 'page', 'index_name', 'position' and 'paragraph'
            columns. 'position' is parsed as a string of the form
            "(start, end)".

    Returns:
        dict mapping (page, paragraph) -> list of dicts with the keys
        'index_name', 'midpoint', 'start_pos' and 'end_pos', in row order.

    Rows whose position cannot be parsed (malformed text, or a missing /
    non-string cell) are reported on stdout and skipped.
    """
    entries = {}
    for index, row in df.iterrows():
        page_number = row['page']
        index_name = row['index_name']
        position_range = row['position']
        paragraph_number = row['paragraph']

        try:
            # Extract start and end positions from "(start, end)"
            start_pos, end_pos = map(int, position_range.strip('()').split(','))
        except (ValueError, AttributeError):
            # ValueError: malformed text or wrong number of values.
            # AttributeError: the cell is not a string (e.g. NaN for an empty cell).
            print("Error parsing position range in row:", index)
            print("Page:", page_number)
            print("Position Range:", position_range)
            print("Paragraph:", paragraph_number)
            continue

        # Midpoint of the character range, used later to measure distances
        midpoint = (start_pos + end_pos) / 2

        unique_identifier = (page_number, paragraph_number)
        entries.setdefault(unique_identifier, []).append(
            {"index_name": index_name, "midpoint": midpoint, "start_pos": start_pos, "end_pos": end_pos}
        )

    return entries


In [67]:
# Function to calculate distances between midpoints for each combination of different names within the same paragraph on the same page
def calculate_closest_pairs(entries):
    """Find, for every pair of distinct names, their closest co-occurrence.

    Args:
        entries: dict mapping (page, paragraph) -> list of mention dicts with
            the keys 'index_name', 'midpoint', 'start_pos' and 'end_pos'.

    Returns:
        dict mapping an alphabetically sorted (name, name) tuple to a dict
        with 'page', 'paragraph', 'distance' (absolute difference of the two
        midpoints), 'mid_positions' and 'positions' (earliest start and
        latest end of the two mentions). Only the smallest-distance
        occurrence of each name pair is kept; ties keep the first one seen.
    """
    best = {}
    for location, mentions in entries.items():
        # A paragraph mentioning a single name cannot produce a pair.
        if len({mention['index_name'] for mention in mentions}) <= 1:
            continue

        for offset, first in enumerate(mentions):
            for second in mentions[offset + 1:]:
                gap = abs(first["midpoint"] - second["midpoint"])
                if first["index_name"] == second["index_name"]:
                    continue

                key = tuple(sorted([first["index_name"], second["index_name"]]))
                # Keep the existing record unless this occurrence is strictly closer.
                if key in best and not gap < best[key]["distance"]:
                    continue

                best[key] = {
                    "page": location[0],
                    "paragraph": location[1],
                    "distance": gap,
                    "mid_positions": (first["midpoint"], second["midpoint"]),
                    "positions": (
                        min(first["start_pos"], second["start_pos"]),
                        max(first["end_pos"], second["end_pos"]),
                    ),
                }
    return best

In [64]:
def find_closest_pairs(df):
    """Parse the references in `df` and return the closest mention per name pair.

    Thin pipeline: the result of parse_dataframe(df) is passed straight to
    calculate_closest_pairs, whose return value is returned unchanged.
    """
    return calculate_closest_pairs(parse_dataframe(df))

In [69]:
# Location of the sample references CSV
csv_file = "../../data/references/0.csv"

# Read the references and compute the closest mention for every name pair
df = pd.read_csv(csv_file)
closest_pairs = find_closest_pairs(df)

# Report one line per name pair
print("Closest pairs based on character position for each combination of different names within the same paragraph on the same page:")
for name_pair in closest_pairs:
    data = closest_pairs[name_pair]
    print(f"Names: {name_pair}, Positions: {data['positions']}, Distance: {data['distance']}, Page: {data['page']}, Paragraph: {data['paragraph']}")

Closest pairs based on character position for each combination of different names within the same paragraph on the same page:
Names: ('Cimabue, Giovanni', 'Giotto'), Positions: (3774, 3794), Distance: 0.5, Page: 8, Paragraph: 81
Names: ('Pisano, Giovanni', 'Pisano, Niccola'), Positions: (355, 362), Distance: 2.0, Page: 36, Paragraph: 126
Names: ('Como, Guido da', 'Tafi, Andrea'), Positions: (1144, 3314), Distance: 2160.5, Page: 48, Paragraph: 155
Names: ('Cimabue, Giovanni', 'Tafi, Andrea'), Positions: (3317, 3335), Distance: 11.5, Page: 50, Paragraph: 158
Names: ('Giotto', 'Tafi, Andrea'), Positions: (936, 976), Distance: 36.0, Page: 51, Paragraph: 162
Names: ('Cimabue, Giovanni', 'Gaddi, Gaddo'), Positions: (1756, 1773), Distance: 11.0, Page: 56, Paragraph: 167
Names: ('Gaddi, Gaddo', 'Tafi, Andrea'), Positions: (1756, 1805), Distance: 41.0, Page: 56, Paragraph: 167
Names: ('Margaritone', 'Tedesco, Jacopo (Lapo)'), Positions: (1660, 1672), Distance: 9.0, Page: 65, Paragraph: 181
Name

In [203]:
def load_source_text(csv_text_file):
    """Read the source-text CSV at `csv_text_file` and return it as a DataFrame."""
    return pd.read_csv(csv_text_file)

# Function to extract text within the given positions from the source text
def extract_text(source_text, paragraph, page, positions):
    """Return the slice of source text located by page, paragraph and positions.

    Args:
        source_text: DataFrame with 'pages', 'paragraph_id' and 'text'
            columns. 'pages' holds one page number or several separated by
            whitespace.
        paragraph: paragraph id to match against 'paragraph_id'.
        page: page number that must appear in the row's 'pages'.
        positions: (start, end) character offsets into the row's 'text'.

    Returns:
        row['text'][start:end] for the matching row, or "" when no row
        matches. If several rows match, the last one wins. Offsets beyond
        the end of the text yield a shortened or empty string, as with any
        Python slice.
    """
    start_pos, end_pos = positions
    output = ""
    for index, row in source_text.iterrows():
        # str() keeps this working when pandas has parsed 'pages' as integers
        # (every row a single page); split() covers one page or many.
        page_numbers = [int(page1) for page1 in str(row['pages']).split()]
        paragraph_numbers = [int(row['paragraph_id'])]

        if page in page_numbers and paragraph in paragraph_numbers:
            # Diagnostic output: requested range and the length of the matched text
            print(start_pos, end_pos, page, paragraph)
            output = row['text'][start_pos:end_pos]
            print(len(row['text']))
    return output


In [204]:
# Load the source-text CSV into `source_text_input`, which the text-extraction
# cell further down passes to extract_text().
source_text_file = "../../data/volumes/0.csv"
source_text_input = load_source_text(source_text_file)


In [205]:
# Location of the sample references CSV
csv_file = "../../data/references/0.csv"

# Read the references and compute the closest mention for every name pair
df = pd.read_csv(csv_file)
closest_pairs = find_closest_pairs(df)

# Show the stretch of source text spanned by each closest pair
for name_pair in closest_pairs:
    data = closest_pairs[name_pair]
    text = extract_text(
        source_text_input,
        paragraph=data['paragraph'],
        page=data['page'],
        positions=data['positions'],
    )
    print(f"Names: {name_pair}, Text: {text}")

3774 3794 8 81
1936
Names: ('Cimabue, Giovanni', 'Giotto'), Text: 
355 362 36 126
1982
Names: ('Pisano, Giovanni', 'Pisano, Niccola'), Text: his son
1144 3314 48 155
3442
Names: ('Como, Guido da', 'Tafi, Andrea'), Text: Andrea, when more practised, afterwards made, as will be said below, the Christ that is over the side of the principal chapel. But having made mention of S. Giovanni, I will not pass by in silence that this ancient temple is all wrought, both without and within, with marbles of the Corinthian Order, and that it is not only designed and executed perfectly in all its parts and with all its proportions, but also very well adorned with doors and with windows, and enriched with two columns of granite on each wall-face, each eleven braccia high, in order to make the three spaces over which are the architraves, that rest on the said columns in order to support the whole mass of the double vaulted roof, which has been praised by modern architects as something remarkable, and de

In [224]:
# Load source text
source_text_file = "../../data/volumes/0.csv"

#Sample CSV filename
csv_file = "../../data/references/0.csv"

# Load both CSVs into pandas DataFrames
df_references = pd.read_csv(csv_file)
df_volumes = pd.read_csv(source_text_file)

# Parse every volume row's page list and paragraph id once, instead of
# re-parsing the whole volume for each reference row.
volume_rows = []
for index_2, row_2 in df_volumes.iterrows():
    # str() keeps this working when pandas has parsed 'pages' as integers;
    # split() covers one page or several whitespace-separated pages.
    page_numbers = [int(page1) for page1 in str(row_2['pages']).split()]
    paragraph_numbers = [int(row_2['paragraph_id'])]
    volume_rows.append((page_numbers, paragraph_numbers, row_2['text']))

# Check that each reference string equals the text slice its position points at.
for index, row in df_references.iterrows():
    page_ref = row['page']
    paragraph_ref = row['paragraph']

    # The parse is the step that can fail, so it is what the handler guards;
    # a bad row is reported and skipped rather than aborting the cell.
    try:
        start_pos, end_pos = ast.literal_eval(row['position'])
    except (ValueError, SyntaxError, TypeError):
        print("Error parsing position range in row:", index)
        print("Page:", page_ref)
        print("Position Range:", row['position'])
        print("Paragraph:", paragraph_ref)
        continue

    for page_numbers, paragraph_numbers, volume_text in volume_rows:
        if page_ref in page_numbers and paragraph_ref in paragraph_numbers:
            print(start_pos, end_pos, page_numbers, paragraph_numbers, row['index_name'], len(volume_text))
            if row['reference'] == volume_text[start_pos:end_pos]:
                print(f"Yay: {volume_text[start_pos:end_pos]}")
            else:
                print("Nay!")



397 472 [4] [76] Cimabue, Giovanni 3097
Yay: Giovanni, surnamed Cimabue, of the family, noble in those times, of Cimabue
474 476 [4] [76] Cimabue, Giovanni 3097
Yay: He
512 515 [4] [76] Cimabue, Giovanni 3097
Yay: his
607 609 [4] [76] Cimabue, Giovanni 3097
Yay: he
625 632 [4] [76] Cimabue, Giovanni 3097
Yay: himself
678 681 [4] [76] Cimabue, Giovanni 3097
Yay: his
758 765 [4] [76] Cimabue, Giovanni 3097
Yay: Cimabue
792 795 [4] [76] Cimabue, Giovanni 3097
Yay: his
1497 1504 [4] [76] Cimabue, Giovanni 3097
Yay: Cimabue
1527 1530 [4] [76] Cimabue, Giovanni 3097
Yay: his
1569 1572 [4] [76] Cimabue, Giovanni 3097
Yay: him
1703 1706 [4] [76] Cimabue, Giovanni 3097
Yay: his
1805 1808 [4] [76] Cimabue, Giovanni 3097
Yay: him
1819 1826 [4] [76] Cimabue, Giovanni 3097
Yay: himself
1873 1876 [4] [76] Cimabue, Giovanni 3097
Yay: his
1903 1905 [4] [76] Cimabue, Giovanni 3097
Yay: he
1977 1984 [4] [76] Cimabue, Giovanni 3097
Yay: himself
2034 2037 [4] [76] Cimabue, Giovanni 3097
Yay: him
2054 2056