In [None]:
import pandas as pd
from itertools import combinations
from collections import Counter
import math

In [None]:
# Range table: a later cell reads its 'Page Range' column.
df_ranges = pd.read_csv(filepath_or_buffer='../data/biographies/ranges/biography_page_ranges_I.csv')
# Reference table: a later cell reads its 'page' and 'paragraph' columns.
df_data = pd.read_csv(filepath_or_buffer='../data/references/0.csv')

In [None]:
# Split every 'Page Range' string shaped like "<digits>-<digits>" into its two
# bounds; strings that do not match the pattern produce missing values.
page_bounds = df_ranges['Page Range'].str.extract(r'(\d+)-(\d+)')
page_bounds.columns = ['Start Page', 'End Page']
for bound_col in page_bounds.columns:
    # Nullable Int64 stores integers while still allowing missing entries
    df_ranges[bound_col] = pd.to_numeric(page_bounds[bound_col], errors='coerce').astype('Int64')

# Same numeric coercion for the location columns of the reference table
for ref_col in ('page', 'paragraph'):
    df_data[ref_col] = pd.to_numeric(df_data[ref_col], errors='coerce').astype('Int64')

In [None]:
def expand_range(start_page, end_page):
    """Return the page numbers from ``start_page`` through ``end_page``, inclusive.

    Parameters
    ----------
    start_page, end_page
        Range bounds. Each is converted with ``int`` before use.

    Returns
    -------
    list of int
        The consecutive pages of the range. The list is empty when either
        bound is missing (per ``pd.isna``) or when ``start_page`` is greater
        than ``end_page``.
    """
    # Guard clause: without both bounds there is nothing to expand
    if pd.isna(start_page) or pd.isna(end_page):
        return []

    first_page = int(start_page)
    last_page = int(end_page)
    return [*range(first_page, last_page + 1)]

# Attach to every range row the list of page numbers it covers
df_ranges['Page'] = df_ranges.apply(
    lambda range_row: expand_range(range_row['Start Page'], range_row['End Page']),
    axis=1,
)

# Unroll the lists so that each covered page gets its own row
df_expanded = df_ranges.explode(column='Page')

df_expanded

In [None]:
# Left join on the page number: every reference row is kept and picks up the
# columns of the expanded range rows whose 'Page' equals its 'page'. The helper
# 'Page' key is dropped afterwards because it duplicates 'page' on matched rows.
merged_df = (
    df_data
    .merge(df_expanded, how='left', left_on='page', right_on='Page')
    .drop(columns=['Page'])
)

merged_df

In [None]:
# Map every distinct 'Name' value to the sub-DataFrame of rows that carry it
subdfs = dict(iter(merged_df.groupby('Name')))

In [None]:
# Display the name -> sub-DataFrame mapping built from merged_df
subdfs

In [None]:
def dice_coefficient(artist1, artist2, df):
    """Compute the Dice coefficient of two artists over the paragraphs of ``df``.

    A paragraph is identified by its ``(page, paragraph)`` pair, the same key
    ``calculate_pmi`` uses. Keying on the paragraph number alone would merge
    rows from different pages that happen to share a paragraph number and
    report co-occurrences that never happened.

    Parameters
    ----------
    artist1, artist2
        Values looked up among the entries of ``df['index_name']``.
    df : pandas.DataFrame
        Must provide the columns 'page', 'paragraph' and 'index_name'.

    Returns
    -------
    tuple
        ``(dice, shared_paragraphs)``. ``dice`` is
        ``2 * shared / (count1 + count2)``, where the counts are the numbers of
        paragraphs mentioning each artist; it is ``0`` when neither artist
        appears. ``shared_paragraphs`` is the list of ``(page, paragraph)``
        tuples in which both artists appear.
    """
    # Set of artists mentioned in each distinct (page, paragraph)
    artists_by_paragraph = df.groupby(['page', 'paragraph'])['index_name'].apply(set)

    artist1_count = 0
    artist2_count = 0
    shared_paragraphs = []

    for (page, paragraph), artists in artists_by_paragraph.items():
        has_artist1 = artist1 in artists
        has_artist2 = artist2 in artists
        if has_artist1:
            artist1_count += 1
        if has_artist2:
            artist2_count += 1
        if has_artist1 and has_artist2:
            shared_paragraphs.append((page, paragraph))

    denominator = artist1_count + artist2_count
    # Neither artist present: report 0 rather than dividing by zero
    dice = 2 * len(shared_paragraphs) / denominator if denominator > 0 else 0
    return dice, shared_paragraphs

In [ ]:
def calculate_dice_coefficients(df):
    """Compute the Dice coefficient for every unordered pair of artists in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'index_name' column supplies the artist names; it is also
        passed unchanged to ``dice_coefficient``.

    Returns
    -------
    dict
        Maps each ``(artist1, artist2)`` tuple to a dict with the keys
        'coefficient' and 'shared_paragraphs', holding the two values returned
        by ``dice_coefficient`` for that pair.
    """
    pair_scores = {}
    # combinations() yields each unordered pair exactly once, as a 2-tuple
    for pair in combinations(df['index_name'].unique(), 2):
        coefficient, shared = dice_coefficient(pair[0], pair[1], df)
        pair_scores[pair] = {'coefficient': coefficient, 'shared_paragraphs': shared}
    return pair_scores


In [ ]:
for name, sub_df in subdfs.items():
    print(f"Name: {name}")
    dice_coefficients = calculate_dice_coefficients(sub_df)
    # One output row per (artist pair, shared paragraph)
    rows = []
    for pair, info in dice_coefficients.items():
        artist1, artist2 = pair
        dice_coefficient_value = info['coefficient']
        if dice_coefficient_value > 0.0:  # Filter out values <= 0.0
            shared_paragraphs = info['shared_paragraphs']
            for page, paragraph in shared_paragraphs:
                rows.append(
                    {'Artist 1': artist1, 'Artist 2': artist2, 'Dice Coefficient': dice_coefficient_value, 'Page': page,
                     'Paragraph': paragraph})
    # Build and save the frame once per name, after all pairs are collected,
    # instead of rewriting the same CSV for every positive pair. No file is
    # written when no pair has a positive coefficient.
    if rows:
        result_df = pd.DataFrame(rows)
        result_df.to_csv(f"../data/biographies/dice/{name}_dice_coefficients.csv", index=False)

In [None]:
def calculate_pmi(artist1, artist2, df):
    """Compute the pointwise mutual information (PMI) of two artists.

    Probabilities are estimated over the distinct ``(page, paragraph)`` groups
    of ``df``: the share of groups mentioning ``artist1``, the share mentioning
    ``artist2``, and the share mentioning both.

    Parameters
    ----------
    artist1, artist2
        Values looked up among the entries of ``df['index_name']``.
    df : pandas.DataFrame
        Must provide the columns 'page', 'paragraph' and 'index_name'.

    Returns
    -------
    tuple
        ``(pmi, shared_paragraphs)``. ``pmi`` is
        ``log2(p_both / (p_artist1 * p_artist2))``, or ``float('-inf')`` when
        the two artists never share a paragraph (including when ``df`` yields
        no paragraph groups at all). ``shared_paragraphs`` is a ``Counter``
        keyed by the ``(page, paragraph)`` of every shared paragraph, each
        with count 1.
    """
    # Set of artists mentioned in each distinct (page, paragraph)
    artists_by_paragraph = df.groupby(['page', 'paragraph'])['index_name'].apply(set)
    total_paragraphs = len(artists_by_paragraph)

    artist1_count = 0
    artist2_count = 0
    shared_paragraphs = Counter()

    for key, artists in artists_by_paragraph.items():
        has_artist1 = artist1 in artists
        has_artist2 = artist2 in artists
        if has_artist1:
            artist1_count += 1
        if has_artist2:
            artist2_count += 1
        if has_artist1 and has_artist2:
            shared_paragraphs[key] += 1

    # No shared paragraph covers every zero-probability case: no paragraph
    # groups at all (which would otherwise divide by zero), an artist that
    # never appears, or two artists that never co-occur.
    if not shared_paragraphs:
        return float('-inf'), shared_paragraphs

    p_artist1 = artist1_count / total_paragraphs
    p_artist2 = artist2_count / total_paragraphs
    p_both = sum(shared_paragraphs.values()) / total_paragraphs
    pmi = math.log2(p_both / (p_artist1 * p_artist2))
    return pmi, shared_paragraphs

In [ ]:
def calculate_pmi_values(df):
    """Compute the PMI for every unordered pair of artists in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'index_name' column supplies the artist names; it is also
        passed unchanged to ``calculate_pmi``.

    Returns
    -------
    dict
        Maps each ``(artist1, artist2)`` tuple to a dict with the keys 'PMI'
        and 'Shared_pages_paragraphs', holding the two values returned by
        ``calculate_pmi`` for that pair.
    """
    pair_scores = {}
    # combinations() yields each unordered pair exactly once, as a 2-tuple
    for pair in combinations(df['index_name'].unique(), 2):
        score, shared = calculate_pmi(pair[0], pair[1], df)
        pair_scores[pair] = {'PMI': score, 'Shared_pages_paragraphs': shared}
    return pair_scores


In [ ]:
for name, sub_df in subdfs.items():
    print(f"Name: {name}")
    pmi_values = calculate_pmi_values(sub_df)
    # One output row per (artist pair, shared paragraph); only pairs with a
    # strictly positive PMI are exported
    rows = []
    for pair, data in pmi_values.items():
        artist1, artist2 = pair
        pmi = data['PMI']
        if pmi > 0:
            shared_pages_paragraphs = data['Shared_pages_paragraphs']
            for (page, paragraph), count in shared_pages_paragraphs.items():
                rows.append({'Artist 1': artist1, 'Artist 2': artist2, 'PMI': pmi, 'Page': page, 'Paragraph': paragraph,
                             'Shared_count': count})
    # Build and save the frame once per name, after all pairs are collected,
    # instead of rewriting the same CSV for every qualifying pair. No file is
    # written when no pair has a positive PMI.
    if rows:
        result_df = pd.DataFrame(rows)
        result_df.to_csv(f"../data/biographies/pmi/{name}_pmi_values.csv", index=False)