In [38]:
import datetime
import pandas as pd
from itertools import combinations
from collections import Counter
import math

In [39]:
# Read the two input CSVs: the biography page ranges and the reference data.
df_ranges = pd.read_csv('../../data/biographies/ranges/biography_page_ranges_II.csv')
df_data = pd.read_csv(
    '../../data/references/1.csv'
)

In [40]:
# Display the page-range table that was just loaded, as a visual check.
df_ranges

Unnamed: 0,Name,Page Range
0,Berna,1-6
1,Duccio,7-12
2,Antonio Viniziano,13-20
3,Jacopo Di Casentino,21-26
4,Spinello Aretino,27-40
5,Gherardo Starnina,41-46
6,Lippo,47-52
7,Don Lorenzo Monaco,53-58
8,Taddeo Bartoli,59-64
9,Lorenzo Di Bicci,65-74


In [41]:
# Split every "<start>-<end>" string in 'Page Range' into its two captured groups.
_page_bounds = df_ranges['Page Range'].str.extract(r'(\d+)-(\d+)')
df_ranges[['Start Page', 'End Page']] = _page_bounds

# Coerce the bound columns to nullable integers; unparseable values become <NA>.
for _col in ('Start Page', 'End Page'):
    df_ranges[_col] = pd.to_numeric(df_ranges[_col], errors='coerce').astype('Int64')

# Apply the same nullable-integer coercion to the reference table's location columns.
for _col in ('page', 'paragraph'):
    df_data[_col] = pd.to_numeric(df_data[_col], errors='coerce').astype('Int64')

In [42]:
def expand_range(start_page, end_page):
    """Return every page number from ``start_page`` to ``end_page`` inclusive.

    Args:
        start_page: First page of the range; may be missing (NaN / ``pd.NA``).
        end_page: Last page of the range; may be missing (NaN / ``pd.NA``).

    Returns:
        A list of ints covering the closed interval, or an empty list when
        either bound is missing (or when ``start_page`` exceeds ``end_page``,
        because the underlying ``range`` is then empty).
    """
    # Guard clause: a missing bound means there is nothing to expand.
    if pd.isna(start_page) or pd.isna(end_page):
        return []
    first, last = int(start_page), int(end_page)
    # +1 because range() excludes its upper bound and the last page must be included.
    return [page_number for page_number in range(first, last + 1)]

def _pages_for_row(row):
    """Expand one df_ranges row into the list of pages between its bounds."""
    return expand_range(row['Start Page'], row['End Page'])


# Attach the per-row page list, then explode it so each page gets its own row.
df_ranges['Page'] = df_ranges.apply(_pages_for_row, axis=1)

df_expanded = df_ranges.explode('Page')

df_expanded

Unnamed: 0,Name,Page Range,Start Page,End Page,Page
0,Berna,1-6,1,6,1
0,Berna,1-6,1,6,2
0,Berna,1-6,1,6,3
0,Berna,1-6,1,6,4
0,Berna,1-6,1,6,5
...,...,...,...,...,...
23,Michelozzo Michelozzi,257-272,257,272,268
23,Michelozzo Michelozzi,257-272,257,272,269
23,Michelozzo Michelozzi,257-272,257,272,270
23,Michelozzo Michelozzi,257-272,257,272,271


In [43]:
# Left-join each reference row onto the expanded page table by page number,
# then drop the right-hand join key, which duplicates 'page' for matched rows.
merged_df = (
    df_data
    .merge(df_expanded, how='left', left_on='page', right_on='Page')
    .drop(columns=['Page'])
)

merged_df

Unnamed: 0,page,index_name,position,reference,paragraph,Name,Page Range,Start Page,End Page
0,3,Berna,"(411, 538)","poor young Berna of Siena, who, although he di...",1,Berna,1-6,1,6
1,3,Berna,"(452, 454)",he,1,Berna,1-6,1,6
2,3,Berna,"(504, 506)",he,1,Berna,1-6,1,6
3,3,Berna,"(555, 557)",he,1,Berna,1-6,1,6
4,3,Berna,"(640, 642)",he,1,Berna,1-6,1,6
...,...,...,...,...,...,...,...,...,...
2540,270,"Filarete, Antonio","(324, 327)",his,249,Michelozzo Michelozzi,257-272,257,272
2541,271,Michelozzo Michelozzi,"(14, 17)",his,249,Michelozzo Michelozzi,257-272,257,272
2542,271,Michelozzo Michelozzi,"(183, 193)",Michelozzo,249,Michelozzo Michelozzi,257-272,257,272
2543,271,Michelozzo Michelozzi,"(224, 227)",his,249,Michelozzo Michelozzi,257-272,257,272


In [44]:
# Map each 'Name' value to the sub-frame of merged_df rows carrying that name.
subdfs = dict(tuple(merged_df.groupby('Name')))

In [45]:
# Display the name -> sub-DataFrame mapping built from merged_df.
subdfs

{'Antonio Viniziano':    page          index_name      position          reference  paragraph  \
 55   16  Viniziano, Antonio      (49, 52)                his          8   
 56   16  Viniziano, Antonio     (94, 101)            himself          8   
 57   16  Viniziano, Antonio    (123, 125)                 he          8   
 58   16  Viniziano, Antonio    (312, 314)                 he          8   
 59   16  Viniziano, Antonio    (394, 396)                 he          8   
 60   16  Viniziano, Antonio    (988, 995)            Antonio          8   
 61   16  Viniziano, Antonio  (1261, 1263)                 he          8   
 62   16  Viniziano, Antonio  (1485, 1487)                 he          8   
 63   16  Viniziano, Antonio  (1697, 1699)                 he          8   
 64   16  Viniziano, Antonio        (0, 7)            Antonio          9   
 65   16  Viniziano, Antonio    (289, 296)            Antonio          9   
 66   17  Viniziano, Antonio  (1091, 1098)            Antonio      

In [46]:
def dice_coefficient(artist1, artist2, df):
    """Compute the paragraph-level Dice coefficient for two index names.

    Rows of ``df`` are grouped by ``'paragraph'``. The coefficient is
    ``2 * |paragraphs containing both| / (|paragraphs with artist1| +
    |paragraphs with artist2|)``.

    Args:
        artist1: Value looked up among the ``'index_name'`` entries.
        artist2: Second value looked up among the ``'index_name'`` entries.
        df: DataFrame with ``'paragraph'``, ``'index_name'`` and ``'page'`` columns.

    Returns:
        A tuple ``(dice, shared_paragraphs)``. ``dice`` is ``0`` when neither
        name occurs in any paragraph. ``shared_paragraphs`` is a list of
        ``(page, paragraph)`` tuples for the paragraphs containing both names,
        where ``page`` is the first ``'page'`` value seen in that paragraph group.
    """
    # One row per paragraph: the set of names it mentions and its first page.
    per_paragraph = df.groupby('paragraph').agg({'index_name': set, 'page': 'first'})

    first_total = 0
    second_total = 0
    shared_paragraphs = []

    for paragraph_id, record in per_paragraph.iterrows():
        mentioned = record['index_name']
        has_first = artist1 in mentioned
        has_second = artist2 in mentioned
        # Booleans add as 0/1, so these tally the paragraphs mentioning each name.
        first_total += has_first
        second_total += has_second
        if has_first and has_second:
            shared_paragraphs.append((record['page'], paragraph_id))

    combined = first_total + second_total
    if combined > 0:
        return 2 * len(shared_paragraphs) / combined, shared_paragraphs
    # Neither name appears anywhere: define the coefficient as 0 rather than divide by zero.
    return 0, shared_paragraphs

In [47]:
def calculate_dice_coefficients(df):
    """Compute the Dice coefficient for every unordered pair of index names.

    Args:
        df: DataFrame with an ``'index_name'`` column; it is passed unchanged
            to ``dice_coefficient`` for each pair.

    Returns:
        A dict keyed by ``(artist1, artist2)`` tuples. Each value is a dict
        with keys ``'coefficient'`` and ``'shared_paragraphs'`` holding the two
        items returned by ``dice_coefficient`` for that pair.
    """
    unique_artists = df['index_name'].unique()
    results = {}

    # combinations() yields each unordered pair once, in order of first appearance.
    for pair in combinations(unique_artists, 2):
        score, paragraphs = dice_coefficient(pair[0], pair[1], df)
        results[pair] = dict(coefficient=score, shared_paragraphs=paragraphs)

    return results


In [48]:
# For each biography sub-frame, compute pairwise Dice coefficients and write the
# pairs with a positive coefficient to one CSV, one row per shared paragraph.
for name, sub_df in subdfs.items():
    print(f"Name: {name}")
    dice_coefficients = calculate_dice_coefficients(sub_df)
    # Accumulates one output record per (pair, shared paragraph).
    rows = []
    dice_activity = "dice_activity_biography-level_" + str(datetime.date.today())
    start_time = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    for pair, info in dice_coefficients.items():
        artist1, artist2 = pair
        dice_coefficient_value = info['coefficient']
        if dice_coefficient_value > 0.0:  # Filter out values <= 0.0
            # Timestamp taken when this pair's rows are recorded.
            end_time = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
            shared_paragraphs = info['shared_paragraphs']
            for page, paragraph in shared_paragraphs:
                rows.append(
                    {'artist1': artist1, 'artist2': artist2, 'dice_coefficient': dice_coefficient_value, 'page': page,
                     'paragraph': paragraph, 'activity': dice_activity, 'start': start_time, 'end': end_time})
    # Write once per biography, after all pairs are collected. The write used to
    # sit inside the loop above and rewrote the same file for every qualifying
    # pair. As before, no file is produced when no pair has a positive coefficient.
    if rows:
        result_df = pd.DataFrame(rows)
        # Save the DataFrame to a CSV file named after the biography.
        result_df.to_csv(f"../../data/biographies/dice/{name}_dice_coefficients.csv", index=False)

Name: Antonio Viniziano
Name: Berna
Name: Dello
Name: Don Lorenzo Monaco
Name: Donato [Donatello]
Name: Duccio
Name: Filippo Brunelleschi [Filippo di Ser Brunellesco]
Name: Gherardo Starnina
Name: Jacopo Di Casentino
Name: Jacopo della Quercia [Jacopo della Fonte]
Name: Lippo
Name: Lorenzo Di Bicci
Name: Lorenzo Ghiberti [Lorenzo di Cione Ghiberti or Lorenzo di Bartoluccio Ghiberti]
Name: Luca Della Robbia
Name: Masaccio
Name: Masolino Da Panicale
Name: Michelozzo Michelozzi
Name: Nanni D'Antonio Di Banco
Name: Niccolò Aretino [Niccolò d'Arezzo or Niccolò di Piero Lamberti]
Name: Paolo Uccello
Name: Parri Spinelli
Name: Spinello Aretino
Name: Taddeo Bartoli
Name: The Author's Preface to the Second Part


In [49]:
def calculate_pmi(artist1, artist2, df):
    """Compute pointwise mutual information (base 2) for two index names.

    Rows of ``df`` are grouped by ``('page', 'paragraph')``; each group counts
    as one unit. Probabilities are the fraction of units mentioning the first
    name, the second name, and both.

    Args:
        artist1: Value looked up among the ``'index_name'`` entries.
        artist2: Second value looked up among the ``'index_name'`` entries.
        df: DataFrame with ``'page'``, ``'paragraph'`` and ``'index_name'`` columns.

    Returns:
        A tuple ``(pmi, shared_paragraphs)``. ``pmi`` is
        ``log2(p_both / (p_artist1 * p_artist2))``, or ``-inf`` when any of the
        three probabilities is zero. ``shared_paragraphs`` is a ``Counter``
        keyed by ``(page, paragraph)`` with a count of 1 for every unit that
        mentions both names.

    Raises:
        ZeroDivisionError: If ``df`` yields no ``(page, paragraph)`` groups.
    """
    # Series indexed by (page, paragraph) whose values are the sets of names mentioned.
    names_by_unit = df.groupby(['page', 'paragraph'])['index_name'].apply(set)
    total_paragraphs = len(names_by_unit)

    first_total = 0
    second_total = 0
    shared_paragraphs = Counter()

    for unit_key, present in names_by_unit.items():
        has_first = artist1 in present
        has_second = artist2 in present
        # Booleans add as 0/1, so these tally the units mentioning each name.
        first_total += has_first
        second_total += has_second
        if has_first and has_second:
            shared_paragraphs[unit_key] += 1

    p_artist1 = first_total / total_paragraphs
    p_artist2 = second_total / total_paragraphs
    p_both = sum(shared_paragraphs.values()) / total_paragraphs

    # All three probabilities must be non-zero for the logarithm to be defined.
    if p_artist1 and p_artist2 and p_both:
        return math.log2(p_both / (p_artist1 * p_artist2)), shared_paragraphs
    return float('-inf'), shared_paragraphs

In [50]:
def calculate_pmi_values(df):
    """Compute PMI for every unordered pair of index names in ``df``.

    Args:
        df: DataFrame with an ``'index_name'`` column; it is passed unchanged
            to ``calculate_pmi`` for each pair.

    Returns:
        A dict keyed by ``(artist1, artist2)`` tuples. Each value is a dict
        with keys ``'PMI'`` and ``'Shared_pages_paragraphs'`` holding the two
        items returned by ``calculate_pmi`` for that pair.
    """
    unique_artists = df['index_name'].unique()
    results = {}

    # combinations() yields each unordered pair once, in order of first appearance.
    for pair in combinations(unique_artists, 2):
        score, shared_units = calculate_pmi(pair[0], pair[1], df)
        results[pair] = dict(PMI=score, Shared_pages_paragraphs=shared_units)

    return results


In [51]:
# For each biography sub-frame, compute pairwise PMI and write the pairs with a
# positive score to one CSV, one row per shared (page, paragraph) unit.
for name, sub_df in subdfs.items():
    pmi_values = calculate_pmi_values(sub_df)
    
    pmi_activity = "pmi_activity_biography-level_" + str(datetime.date.today())
    start_time = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    
    # Accumulates one output record per (pair, shared unit).
    rows = []
    for pair, data in pmi_values.items():
        artist1, artist2 = pair
        pmi = data['PMI']
        # Pairs that never co-occur have PMI -inf and are excluded here too.
        if pmi > 0:
            # Timestamp taken when this pair's rows are recorded.
            end_time = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
            shared_pages_paragraphs = data['Shared_pages_paragraphs']
            for (page, paragraph), count in shared_pages_paragraphs.items():
                rows.append({'artist1': artist1, 'artist2': artist2, 'shared_count': count, 'pmi_score': pmi, 'page': page,
                     'paragraph': paragraph, 'activity': pmi_activity, 'start': start_time, 'end': end_time})
    # Write once per biography, after all pairs are collected. The write used to
    # sit inside the loop above and rewrote the same file for every qualifying
    # pair. As before, no file is produced when no pair has a positive PMI.
    if rows:
        result_df = pd.DataFrame(rows)
        # Save the DataFrame to a CSV file named after the biography.
        result_df.to_csv(f"../../data/biographies/pmi/{name}_pmi_values.csv", index=False)

Name: Antonio Viniziano
Name: Berna
Name: Dello
Name: Don Lorenzo Monaco
Name: Donato [Donatello]
Name: Duccio
Name: Filippo Brunelleschi [Filippo di Ser Brunellesco]
Name: Gherardo Starnina
Name: Jacopo Di Casentino
Name: Jacopo della Quercia [Jacopo della Fonte]
Name: Lippo
Name: Lorenzo Di Bicci
Name: Lorenzo Ghiberti [Lorenzo di Cione Ghiberti or Lorenzo di Bartoluccio Ghiberti]
Name: Luca Della Robbia
Name: Masaccio
Name: Masolino Da Panicale
Name: Michelozzo Michelozzi
Name: Nanni D'Antonio Di Banco
Name: Niccolò Aretino [Niccolò d'Arezzo or Niccolò di Piero Lamberti]
Name: Paolo Uccello
Name: Parri Spinelli
Name: Spinello Aretino
Name: Taddeo Bartoli
Name: The Author's Preface to the Second Part
