# Experiments in Modeling Issue Segmentation

## Load Libraries and Datasets

In [23]:
import os
import re
import pandas as pd
from tqdm import tqdm
import altair as alt
alt.data_transformers.disable_max_rows()
from collections import deque
import warnings
warnings.filterwarnings('ignore')
from rich.console import Console
from rich.table import Table
import numpy as np
import scipy.stats as stats
from minineedle import needle, core
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
console = Console()
import sys

sys.path.append("..")
from segmentation_scripts.utils import read_csv_file, get_data_directory_path

In [24]:
# Resolve the project's data directory, then load the CSV of pre-identified
# periodicals (with full metadata) from the HathiTrust-pcc-datasets folder.
data_directory_path = get_data_directory_path()
preidentified_periodicals_df = read_csv_file(os.path.join(data_directory_path, "HathiTrust-pcc-datasets", "datasets", "preidentified_periodicals_with_full_metadata.csv"))

In [3]:
# Show the publication and volume directory of the first record as a list of dicts.
preidentified_periodicals_df[0:1][['publication_directory', 'volume_directory']].to_dict(orient='records')

[{'publication_directory': 'datasets/ht_ef_datasets/la_documentation_arabe',
  'volume_directory': 'inu_30000093395964'}]

In [4]:
# Walk the annotated dataset tree and record every CSV file whose name
# contains 'individual', keeping its name, directory and full path.
matching_files = []
for directory, _, files in tqdm(os.walk("../datasets/annotated_ht_ef_datasets/"), desc="Counting matching files"):
	for file in files:
		if file.endswith(".csv") and 'individual' in file:
			# NOTE(review): os.walk has just listed this name, so the exists()
			# check looks redundant — confirm before removing.
			if os.path.exists(os.path.join(directory, file)):
				matching_files.append({"file": file, "directory": directory, "file_path": os.path.join(directory, file)})
matching_files_df = pd.DataFrame(matching_files)
console.print(f"Found {len(matching_files_df)} matching files.", style="bright_green")

# Load each matching file in turn, ordered by page number.
for index, row in matching_files_df.iterrows():
	file = row['file']
	directory = row['directory']
	file_path = row['file_path']
	# `index` is zero-based, so the first file is reported as "Number 0".
	console.print(f"Processing file: {file_path}. Number {index} out of {len(matching_files_df)}", style="bright_white")
	full_df = read_csv_file(file_path)
	full_df = full_df.sort_values(by=['page_number'])
	# NOTE(review): numb_of_issues is computed but not used in this cell.
	numb_of_issues = full_df.start_issue.nunique()

Counting matching files: 95it [00:00, 16934.08it/s]


In [129]:
# Load the annotated token file for a single volume and report its size.
full_df = read_csv_file("../datasets/annotated_ht_ef_datasets/liberator/mdp_39015065389036/mdp_39015065389036_annotated_individual_tokens.csv")
console.print(f"Volume has this many tokens: {len(full_df)}")
console.print(f"Volume has this many issues: {full_df.start_issue.nunique()}")
console.print(f"Volume has this many pages: {full_df.page_number.nunique()}")
# Keep the annotated 'issue_number' as 'original_issue_number' and derive a
# zero-based integer code per distinct value in 'temp_issue_number'.
full_df = full_df.sort_values(by=['page_number'])
full_df = full_df.rename(columns={'issue_number': 'original_issue_number'})
full_df['temp_issue_number'] = pd.factorize(full_df['original_issue_number'])[0]

# Expand the 'count' column: each row is repeated `count` times, so rows with
# a count of 0 disappear from the expanded frame.
expanded_df = full_df.loc[full_df.index.repeat(full_df['count'])].reset_index(drop=True)
console.print(f"Expanded volume has this many tokens: {len(expanded_df)}")

# Calculate the number of (expanded) tokens per page
tokens_per_page = expanded_df.groupby('page_number').size().reset_index(name='tokens_per_page')

# Attach the per-page token count to every expanded row
expanded_df = expanded_df.merge(tokens_per_page, on='page_number', how='left')
# Pages that vanished during expansion are appended back from full_df; those
# rows have no 'tokens_per_page' value.
missing_pages = full_df[~full_df.page_number.isin(expanded_df.page_number.unique())]
expanded_df = pd.concat([expanded_df, missing_pages], ignore_index=True)
expanded_df = expanded_df.reset_index(drop=True)


In [130]:
def generate_table(df, table_title):
    """Print ``df`` as a Rich table titled ``table_title``.

    Column headers are the DataFrame column names with underscores turned
    into spaces and the first letter capitalised. Missing cell values are
    rendered as empty strings; everything else is converted with ``str``.
    The table is written to the notebook-level ``console``.
    """
    table = Table(title=table_title)

    # One centred, non-wrapping cyan column per DataFrame column.
    for column in df.columns:
        header = column.replace("_", " ").capitalize()
        table.add_column(header, justify="center", style="cyan", no_wrap=True)

    # One table row per DataFrame row.
    for _, record in df.iterrows():
        cells = ["" if pd.isna(value) else str(value) for value in record]
        table.add_row(*cells)

    console.print(table)

### Clean Page Numbers

In [132]:
def filter_integers(token: str) -> bool:
    """Return True when ``token`` is made up of digit characters only."""
    matched = re.match(r'^\d+$', token)
    return matched is not None

def calculate_digit_coverage(rows):
    """Return how many rows of ``rows`` have a non-missing ``implied_zero``."""
    return rows['implied_zero'].count()

def clean_digits(df: pd.DataFrame) -> pd.DataFrame:
    """Keep plausible page-number tokens and one placeholder row for other pages.

    Every row of the result carries a ``page_type``:

    * ``digit`` - an all-digit token whose value is below ``max page + 25``
      and not larger than the page it appears on; ``implied_zero`` holds
      ``page_number - number`` for these rows.
    * ``non_digit`` - first row of a page that has no digit token at all.
    * ``negative_na_digit`` - first row of a page that is covered by neither
      of the two categories above.
    * ``digit_too_large`` - first too-large digit row of a page that has no
      digit passing the filter.
    * ``added`` - first row of any page still missing after the concat.

    A ``digits_per_page`` column (count of non-missing ``implied_zero`` values
    on the page) is merged onto every row.

    Args:
        df: Token-level rows with at least ``token`` and ``page_number``.
            The frame is copied, so the caller's data is not modified.

    Returns:
        The combined rows sorted by ``page_number``.
    """
    max_page = df.page_number.max()

    # Work on a copy so the column rewrites below stay local to this call.
    df = df.copy()
    df['token'] = df['token'].astype(str)
    # Guard on the column that is actually written (the original tested
    # 'volume_name', which either raised KeyError or skipped the fill).
    if 'volume_number' in df.columns:
        df['volume_number'] = df['volume_number'].fillna(0)

    # Identify digit tokens
    subset_digits = df[df['token'].str.isdigit()].copy()
    possible_pages = subset_digits[subset_digits['token'].apply(filter_integers)].copy()
    # Use the regex-filtered subset when it is strictly smaller
    if len(possible_pages) < len(subset_digits):
        subset_digits = possible_pages
    non_digits_pages = df[(~df['token'].str.isdigit()) & (~df.page_number.isin(subset_digits.page_number))].copy()

    # Keep a single representative row per page without digits
    non_digits_pages = non_digits_pages.groupby('page_number').first().reset_index()
    console.print(f"Number of digits in this volume: {len(subset_digits)}")
    console.print(f"Number of non-digit pages in this volume: {len(non_digits_pages)}")

    subset_digits['number'] = subset_digits['token'].astype(int, errors='ignore')
    # A printed page number cannot plausibly exceed the last scan page by much.
    max_possible_number = max_page + 25
    filtered_subset_digits = subset_digits[(subset_digits['number'] < max_possible_number) & (subset_digits['number'] <= subset_digits.page_number)].copy()
    non_filtered_subset_digits = subset_digits[(subset_digits['number'] >= max_possible_number) & (~subset_digits.page_number.isin(filtered_subset_digits.page_number))].groupby('page_number').first().reset_index()
    console.print(f"Number of digits in this volume after filtering for max page length: {len(filtered_subset_digits)}")
    console.print(f"Number of pages without digits in this volume after filtering for max page length: {len(non_filtered_subset_digits)}")

    # Offset between the scan page index and the number printed on it
    filtered_subset_digits['implied_zero'] = filtered_subset_digits['page_number'].astype(int) - filtered_subset_digits['number']

    # Copy before adding 'page_type' so the assignment never targets a view.
    final_subset_digits = filtered_subset_digits[filtered_subset_digits['implied_zero'] >= 0].copy()
    console.print(f"Number of digits in this volume after filtering for max page length and implied zero: {len(final_subset_digits)}")
    # NOTE(review): a page whose digits are all too large satisfies this
    # condition as well as the 'digit_too_large' one, so it is emitted under
    # both page types - confirm that is intended.
    remaining_missing_pages = df[(~df.page_number.isin(final_subset_digits.page_number)) & (~df.page_number.isin(non_digits_pages.page_number))].copy()
    remaining_missing_pages = remaining_missing_pages.groupby('page_number').first().reset_index()
    console.print(f"Number of pages without digits in this volume after filtering for max page length and digit pages: {len(remaining_missing_pages)}")

    # Label each group and stack them into one frame
    non_digits_pages['page_type'] = 'non_digit'
    remaining_missing_pages['page_type'] = 'negative_na_digit'
    final_subset_digits['page_type'] = 'digit'
    non_filtered_subset_digits['page_type'] = 'digit_too_large'
    full_df_with_digits = pd.concat([final_subset_digits, non_digits_pages, remaining_missing_pages, non_filtered_subset_digits]).sort_values(by=['page_number']).reset_index(drop=True)

    console.print(f"Number of pages after including non-digit pages: {full_df_with_digits.page_number.nunique()}")

    # Safety net: re-add any page that none of the groups covered
    if full_df_with_digits.page_number.nunique() != df.page_number.nunique():
        added_pages = df[~df.page_number.isin(full_df_with_digits.page_number)].copy()
        added_pages = added_pages.groupby('page_number').first().reset_index()
        added_pages['page_type'] = 'added'
        full_df_with_digits = pd.concat([full_df_with_digits, added_pages]).sort_values(by=['page_number']).reset_index(drop=True)
        console.print(f"Number of pages after including added pages: {full_df_with_digits.page_number.nunique()}")

    # Calculate the number of digits per page
    tqdm.pandas(desc="Calculating digits per page")
    digits_per_page = full_df_with_digits.groupby('page_number').progress_apply(calculate_digit_coverage).reset_index(name='digits_per_page')
    full_df_with_digits = full_df_with_digits.merge(digits_per_page, on='page_number', how='left')

    return full_df_with_digits

# Example usage: clean the expanded volume, order it by page, and tabulate
# how many cleaned rows fall under each annotated start_issue.
subset_digits = clean_digits(expanded_df)
subset_digits = subset_digits.sort_values(by=['page_number'])
counts_per_annotated_issue = subset_digits.start_issue.value_counts().reset_index()

generate_table(counts_per_annotated_issue, "Counts per Annotated Issue")

Calculating digits per page: 100%|██████████| 151/151 [00:00<00:00, 15859.67it/s]


In [133]:
# One row per distinct (page, token count, digit count, issue) combination,
# with the share of tokens on the page that are retained digits.
distribution_df = subset_digits[['page_number', 'tokens_per_page', 'digits_per_page', 'start_issue']].drop_duplicates()
distribution_df = distribution_df.sort_values(by='page_number').reset_index(drop=True)
distribution_df['digit_ratio'] = distribution_df['digits_per_page'] / distribution_df['tokens_per_page']

# Calculate the mean digit ratio per issue
mean_digit_ratio_per_issue = distribution_df.groupby('start_issue')['digit_ratio'].mean().reset_index(name='mean_digit_ratio')
generate_table(mean_digit_ratio_per_issue, "Mean Digit Ratio per Issue")

In [134]:
# Scatter plot of digit ratio against page number, coloured by annotated issue.
alt.Chart(distribution_df).mark_circle().encode(
	x=alt.X("page_number").scale(zero=False),
	y=alt.Y('digit_ratio').scale(zero=False),
	color='start_issue:N',
	tooltip=['page_number', 'digit_ratio', 'start_issue', 'tokens_per_page', 'digits_per_page']
).properties(
	width=400,
	height=200
)

In [135]:
# Reduce the annotations to one row per distinct page/issue/type combination.
annotated_df = full_df[['page_number', 'start_issue', 'end_issue', 'type_of_page']].drop_duplicates()
# Group by 'start_issue' and aggregate first page, last page and row count
grouped_df = annotated_df.groupby('start_issue').agg(
	first_page=('page_number', 'min'),
	last_page=('page_number', 'max'),
	number_of_pages=('page_number', 'count')
).reset_index()
grouped_df = grouped_df.sort_values(by='first_page')
# NOTE(review): this only prints a message; nothing is actually skipped and
# the cell continues with the single issue.
if len(grouped_df) <= 1:
	console.print("Only one issue found. Skipping volume.", style="bright_red")
# The issue with the smallest first page is taken as the annotated first issue.
annotated_first_issue = grouped_df[0:1]
annotated_first_issue

Unnamed: 0,start_issue,first_page,last_page,number_of_pages
0,1968-02-01,11,34,23


### Detect Likely First Issue

In [139]:
def run_global_sequence_alignment(window: list, target_sequence: list, placeholder: int = -1) -> tuple:
    """Globally align the implied-zero values of ``window`` with ``target_sequence``.

    Args:
        window: Sequence of pairs; the second item of each pair is the
            implied-zero value (possibly missing).
        target_sequence: Sequence the observed values are aligned against.
        placeholder: Value substituted for missing implied-zero entries.

    Returns:
        ``(score, aligned_observed, aligned_target)`` from minineedle's
        Needleman-Wunsch aligner, or ``(0, [], [])`` when the window holds
        nothing but placeholders or the aligner raises ``ZeroDivisionError``.
    """
    observed_sequence = []
    for entry in window:
        value = entry[1]
        observed_sequence.append(placeholder if pd.isna(value) else int(value))

    # Nothing to align when every entry is a placeholder.
    if not any(value != placeholder for value in observed_sequence):
        return 0, [], []

    aligner = needle.NeedlemanWunsch(observed_sequence, target_sequence)
    aligner.change_matrix(core.ScoreMatrix(match=6, miss=-0.5, gap=-1))

    try:
        aligner.align()
        aligned_observed, aligned_target = aligner.get_aligned_sequences(core.AlignmentFormat.list)
        return aligner.get_score(), aligned_observed, aligned_target
    except ZeroDivisionError:
        return 0, [], []

def sequence_alignment_issue_detection_global(df: pd.DataFrame, threshold_sizes: list, placeholder: int = -1) -> pd.DataFrame:
    """Score candidate first-issue page ranges with global sequence alignment.

    For every span length in ``range(threshold_sizes[0], threshold_sizes[1])``
    and each of five start offsets from the first page, the implied-zero
    values of the pages in the span are aligned against the range of page
    numbers ``[start, end)``. Candidates with a positive alignment score are
    kept.

    Args:
        df: Rows with ``page_number`` and ``implied_zero`` columns. The frame
            is not modified; the integer cast happens on a derived copy.
        threshold_sizes: Two-item ``[min, max)`` bounds for the span length.
        placeholder: Value used for pages without an implied-zero value.

    Returns:
        DataFrame with columns ``alignment_score``, ``aligned_observed``,
        ``aligned_target``, ``threshold_size``, ``start_page``, ``end_page``.
    """
    # assign() returns a new frame, so the caller's column dtype is untouched.
    df = df.assign(page_number=df['page_number'].astype(int))
    df = df.sort_values(by=['page_number', 'implied_zero'])

    all_boundaries = []
    first_page_number = df.page_number.min()
    for threshold_size in tqdm(range(threshold_sizes[0], threshold_sizes[1]), desc="Running Sequence Alignment"):
        for additional_page in range(5):
            current_first_page_number = first_page_number + additional_page
            final_page_number = df[df.page_number == current_first_page_number + threshold_size]
            # Skip spans whose end page does not exist in the volume.
            if final_page_number.empty:
                continue
            final_page_number = final_page_number.page_number.max()
            selected_rows = df[(df.page_number <= final_page_number) & (df.page_number >= current_first_page_number)]
            potential_sequence = list(zip(selected_rows['page_number'], selected_rows['implied_zero']))
            # NOTE(review): the target excludes final_page_number while the
            # selected rows include it - confirm the half-open range is intended.
            target_sequence = list(range(current_first_page_number, final_page_number))

            # Run sequence alignment with placeholders
            alignment_score, aligned_observed, aligned_target = run_global_sequence_alignment(potential_sequence, target_sequence, placeholder=placeholder)

            # Keep only candidates with a positive score
            if alignment_score > 0:
                all_boundaries.append((alignment_score, aligned_observed, aligned_target, threshold_size, current_first_page_number, final_page_number))

    boundaries_df = pd.DataFrame(all_boundaries, columns=['alignment_score', 'aligned_observed', 'aligned_target', 'threshold_size', 'start_page', 'end_page'])
    return boundaries_df
# Drop exact duplicate rows before aligning.
dedup_subset_digits = subset_digits.drop_duplicates()
# Example usage: try span lengths 10..199 with -1 standing in for pages without a digit
sequence_alignment_likely_first_issue_boundaries_df = sequence_alignment_issue_detection_global(dedup_subset_digits, threshold_sizes=[10, 200], placeholder=-1)

Running Sequence Alignment:   0%|          | 0/190 [00:00<?, ?it/s]

Running Sequence Alignment: 100%|██████████| 190/190 [00:11<00:00, 17.13it/s]


In [118]:
# Scratch cell: rebuild the alignment inputs by hand for one annotated issue
# and one fixed span length (mirrors sequence_alignment_issue_detection_global).
df = subset_digits[subset_digits.start_issue == "1970-04-01"]
# NOTE(review): df is a filtered selection of subset_digits, so this assignment
# may trigger pandas' SettingWithCopyWarning (warnings are silenced above).
df['page_number'] = df['page_number'].astype(int)
df = df.sort_values(by=['page_number', 'implied_zero'])
first_page_number = df.page_number.min()
threshold_size = 49
# NaN when no row sits exactly threshold_size pages after the first page.
final_page_number = df[df.page_number == first_page_number + threshold_size].page_number.max()
target_sequence = list(range(first_page_number, final_page_number))  # Generate the target sequence
selected_rows = df[(df.page_number <= final_page_number) & (df.page_number >= first_page_number)]
potential_sequence = list(zip(selected_rows['page_number'], selected_rows['implied_zero']))
placeholder = -1

In [119]:
# Scratch cell: same steps as run_global_sequence_alignment, run inline so the
# score and sequence lengths can be printed.
# Missing implied-zero values are replaced by the placeholder.
observed_sequence = [int(p[1]) if pd.notna(p[1]) else placeholder for p in potential_sequence]


# Create Needleman-Wunsch global alignment instance
alignment = needle.NeedlemanWunsch(observed_sequence, target_sequence)
alignment.change_matrix(core.ScoreMatrix(match=6, miss=-0.5, gap=-1))

try:
	# Run the alignment
	alignment.align()
	aligned_observed, aligned_target = alignment.get_aligned_sequences(core.AlignmentFormat.list)
	alignment_score = alignment.get_score()
	console.print(f"Alignment Score: {alignment_score}, Threshold Size: {threshold_size}, Start Page: {first_page_number}, End Page: {final_page_number}, length of target sequence: {len(target_sequence)}, length of potential sequence: {len(potential_sequence)}")

except ZeroDivisionError:
	console.print("ZeroDivisionError occurred during alignment.", style="bright_red")

In [128]:
# Keep every candidate scoring above the 75th percentile, best score first
# (ties broken by the earlier start page). Despite its name, the result is
# not limited to ten rows.
seventy_five_threshold = sequence_alignment_likely_first_issue_boundaries_df['alignment_score'].quantile(0.75)
top_ten_boundaries = sequence_alignment_likely_first_issue_boundaries_df[sequence_alignment_likely_first_issue_boundaries_df.alignment_score > seventy_five_threshold].sort_values(by=['alignment_score', 'start_page'], ascending=[False, True])
generate_table(top_ten_boundaries[['alignment_score', 'threshold_size', 'start_page', 'end_page']], "Top Ten Likely First Issue Boundaries")

In [62]:
# Display all alignment candidates.
sequence_alignment_likely_first_issue_boundaries_df

Unnamed: 0,alignment_score,aligned_observed,aligned_target,threshold_size,start_page,end_page


In [11]:
# Same aggregation as the earlier annotated-first-issue cell, without the
# single-issue message.
annotated_df = full_df[['page_number', 'start_issue', 'end_issue', 'type_of_page']].drop_duplicates()
# Group by 'start_issue' and aggregate first page, last page and row count
grouped_df = annotated_df.groupby('start_issue').agg(
	first_page=('page_number', 'min'),
	last_page=('page_number', 'max'),
	number_of_pages=('page_number', 'count')
).reset_index()
grouped_df = grouped_df.sort_values(by='first_page')
# The issue with the smallest first page is taken as the annotated first issue.
annotated_first_issue = grouped_df[0:1]
annotated_first_issue

Unnamed: 0,start_issue,first_page,last_page,number_of_pages
0,1965-10-04,7,61,54


In [20]:
def select_likely_first_issue(df: pd.DataFrame, mean_threshold: float) -> pd.Series:
    """Select the most likely first issue based on a weighted score.

    The score of each candidate row is::

        0.4 * alignment_score
        + 0.2 * relative frequency of its start_page among the candidates
        + 0.2 * relative frequency of its end_page among the candidates
        + 0.2 * (1 - |threshold_size - mean_threshold| / mean_threshold)

    Args:
        df: Candidates with ``alignment_score``, ``start_page``, ``end_page``
            and ``threshold_size`` columns. The frame is left unmodified.
        mean_threshold: Reference span length the candidates are compared to.

    Returns:
        The highest-scoring row, including its ``weighted_score``.

    Raises:
        IndexError: If ``df`` has no rows.
    """
    # Weights for alignment score, start-page frequency, end-page frequency
    # and closeness to the mean span length.
    alpha, beta, gamma, delta = 0.4, 0.2, 0.2, 0.2

    # How often each start/end page occurs among the candidates.
    start_page_freq = df['start_page'].value_counts(normalize=True)
    end_page_freq = df['end_page'].value_counts(normalize=True)

    # value_counts drops missing keys, so unmapped rows score 0.
    start_page_score = df['start_page'].map(start_page_freq).fillna(0)
    end_page_score = df['end_page'].map(end_page_freq).fillna(0)
    threshold_score = 1 - (df['threshold_size'] - mean_threshold).abs() / mean_threshold

    weighted_score = (alpha * df['alignment_score'] +
                      beta * start_page_score +
                      gamma * end_page_score +
                      delta * threshold_score)

    # assign() builds a new frame, so the caller's (possibly sliced) DataFrame
    # does not gain a 'weighted_score' column as a side effect.
    scored = df.assign(weighted_score=weighted_score)
    return scored.sort_values(by='weighted_score', ascending=False).iloc[0]

def calculate_confidence_interval(df: pd.DataFrame, column: str, confidence: float = 0.95) -> tuple:
    """Calculate a normal-approximation confidence interval for a column mean.

    Args:
        df: Frame holding the data.
        column: Name of the numeric column to summarise.
        confidence: Two-sided confidence level, e.g. ``0.95``.

    Returns:
        ``(mean, lower_bound, upper_bound, margin_of_error)``. All bounds are
        NaN when the column has fewer than two non-missing values.
    """
    values = df[column]
    mean_val = values.mean()
    std_dev = values.std()
    # mean() and std() skip missing values, so the sample size must do the
    # same; len(df) would understate the standard error when NaNs are present.
    n = values.count()

    # Calculate standard error
    standard_error = std_dev / np.sqrt(n)

    # Two-sided z critical value for the requested confidence level
    z_score = stats.norm.ppf((1 + confidence) / 2)
    margin_of_error = z_score * standard_error

    lower_bound = mean_val - margin_of_error
    upper_bound = mean_val + margin_of_error

    return mean_val, lower_bound, upper_bound, margin_of_error


# Calculate the mean span length of the above-75th-percentile candidates
mean_threshold = top_ten_boundaries['threshold_size'].mean()

# Pick the best-scoring candidate and show it as a one-row table
best_first_issue = select_likely_first_issue(top_ten_boundaries, mean_threshold)
best_first_issue_df = pd.DataFrame([best_first_issue]).reset_index(drop=True)
generate_table(best_first_issue_df[['alignment_score', 'threshold_size', 'start_page', 'end_page']], "Best First Issue Candidate")

# Calculate confidence intervals for threshold_size and alignment_score
# (mean_threshold is overwritten with the same mean here)
mean_threshold, lower_threshold, upper_threshold, margin_error_threshold = calculate_confidence_interval(top_ten_boundaries, 'threshold_size')
mean_score, lower_score, upper_score, margin_error_score = calculate_confidence_interval(top_ten_boundaries, 'alignment_score')

console.print(f"Threshold Size: Mean = {mean_threshold}, CI = ({lower_threshold}, {upper_threshold}), Margin of Error = {margin_error_threshold}", style="bold")
console.print(f"Alignment Score: Mean = {mean_score}, CI = ({lower_score}, {upper_score}), Margin of Error = {margin_error_score}", style="bold")


In [21]:
# Re-run the alignment on subset_digits itself, i.e. without dropping
# duplicate rows first (the de-duplication step is left commented out).
# dedup_subset_digits = subset_digits.drop_duplicates()
# Example usage
sequence_alignment_likely_first_issue_boundaries_df = sequence_alignment_issue_detection_global(subset_digits, threshold_sizes=[10, 200], placeholder=-1)

Threshold Sizes: 100%|██████████| 190/190 [00:47<00:00,  3.97it/s]


In [22]:
# Keep every candidate scoring above the 75th percentile, best score first
# (ties broken by the earlier start page); not limited to ten rows.
seventy_five_threshold = sequence_alignment_likely_first_issue_boundaries_df['alignment_score'].quantile(0.75)
top_ten_boundaries = sequence_alignment_likely_first_issue_boundaries_df[sequence_alignment_likely_first_issue_boundaries_df.alignment_score > seventy_five_threshold].sort_values(by=['alignment_score', 'start_page'], ascending=[False, True])
generate_table(top_ten_boundaries[['alignment_score', 'threshold_size', 'start_page', 'end_page']], "Top Ten Likely First Issue Boundaries")

In [60]:
# Calculate confidence intervals for threshold_size and alignment_score over
# the refreshed candidate set; lower/upper_threshold bound the later searches.
mean_threshold, lower_threshold, upper_threshold, margin_error_threshold = calculate_confidence_interval(top_ten_boundaries, 'threshold_size')
mean_score, lower_score, upper_score, margin_error_score = calculate_confidence_interval(top_ten_boundaries, 'alignment_score')

console.print(f"Threshold Size: Mean = {mean_threshold}, CI = ({lower_threshold}, {upper_threshold}), Margin of Error = {margin_error_threshold}", style="bold")
console.print(f"Alignment Score: Mean = {mean_score}, CI = ({lower_score}, {upper_score}), Margin of Error = {margin_error_score}", style="bold")

In [103]:
def probabilistic_first_issue_detection(df: pd.DataFrame, threshold_sizes: list, window_size: int = 5, score_threshold: float = 0.5) -> pd.DataFrame:
    """Score candidate first-issue spans with a sliding-window heuristic.

    For each span length in ``range(threshold_sizes[0], threshold_sizes[1])``
    and each of five start offsets, the rows of the span are streamed through
    a window of ``window_size`` entries. Once the window is full and holds at
    least one implied-zero value, it earns:

    * 0.7 if the page range of its digit entries exceeds the span length,
    * 0.5 if the implied-zero spread of its digit entries exceeds it,
    * 0.25 times the fraction of entries without a digit,
    * 0.2 times the summed section weights (0.2 per digit entry whose
      ``section`` is not ``"body"``).

    Scores accumulate; whenever the running total reaches
    ``score_threshold`` a candidate is recorded and the total resets.

    Args:
        df: Rows with ``page_number``, ``implied_zero`` and ``section``
            columns. The frame is copied, so the caller's data is untouched.
        threshold_sizes: Two-item ``[min, max)`` bounds for the span length.
        window_size: Number of rows held in the sliding window.
        score_threshold: Running total at which a candidate is recorded.

    Returns:
        DataFrame with columns ``cumulative_score``, ``sliding_window`` (a
        list snapshot of the window at recording time), ``threshold_size``,
        ``start_page`` and ``end_page``.
    """
    df = df.copy()
    df['page_number'] = df['page_number'].astype(int)
    df['implied_zero'] = df['implied_zero'].astype(int, errors='ignore')
    df = df.sort_values(by=['page_number'])

    all_boundaries = []
    # Loop-invariant: the first page of the volume.
    first_page_number = df.page_number.min()

    for threshold_size in tqdm(range(threshold_sizes[0], threshold_sizes[1]), desc="Threshold Sizes"):
        # NaN when no row sits exactly threshold_size pages after the first
        # page; the selection below is then empty.
        final_page_number = df[df.page_number == first_page_number + threshold_size].page_number.max()

        # Vary the start page within a defined range (similar to Needleman-Wunsch approach)
        for additional_page in range(5):
            current_first_page_number = first_page_number + additional_page
            selected_rows = df[(df.page_number <= final_page_number) & (df.page_number >= current_first_page_number)]

            sliding_window = deque(maxlen=window_size)
            cumulative_score = 0

            for _, row in selected_rows.iterrows():
                page_number = row['page_number']
                section_weight = 0.2 if row['section'] != "body" else 0

                # Entries are (page, implied_zero or None, section weight).
                if pd.notna(row['implied_zero']):
                    sliding_window.append((page_number, row['implied_zero'], section_weight))
                else:
                    sliding_window.append((page_number, None, 0))

                # Score only full windows that contain at least one digit entry.
                digit_entries = [entry for entry in sliding_window if entry[1] is not None]
                if len(sliding_window) != window_size or not digit_entries:
                    continue

                digit_pages = [entry[0] for entry in digit_entries]
                digit_zeros = [entry[1] for entry in digit_entries]
                page_range = max(digit_pages) - min(digit_pages)
                implied_zero_diff = max(digit_zeros) - min(digit_zeros)

                score = 0
                if page_range > threshold_size:
                    score += 0.7

                if implied_zero_diff > threshold_size:
                    score += 0.5

                non_digit_count = len(sliding_window) - len(digit_entries)
                if non_digit_count > 0:
                    score += 0.25 * (non_digit_count / window_size)

                window_section_weight = sum(entry[2] for entry in sliding_window)
                if window_section_weight > 0:
                    score += 0.2 * window_section_weight

                # Accumulate scores and evaluate threshold
                cumulative_score += score
                if cumulative_score >= score_threshold:
                    # Store a snapshot: the deque itself keeps changing as
                    # later rows are appended, which would silently rewrite
                    # every previously recorded window.
                    all_boundaries.append((
                        cumulative_score, list(sliding_window), threshold_size,
                        current_first_page_number, final_page_number
                    ))
                    cumulative_score = 0  # Reset cumulative score

    boundaries_df = pd.DataFrame(all_boundaries, columns=[
        'cumulative_score', 'sliding_window', 'threshold_size', 'start_page', 'end_page'
    ])
    return boundaries_df


# Example usage: search only span lengths inside the rounded confidence
# interval of the alignment candidates' threshold_size.
sliding_window_prob_first_issue_df = probabilistic_first_issue_detection(subset_digits, threshold_sizes=[round(lower_threshold), round(upper_threshold)], window_size=5, score_threshold=0.5)

# Keep candidates whose cumulative score is above the 75th percentile, best
# first (not limited to ten rows despite the table title).
seventy_five_threshold = sliding_window_prob_first_issue_df.describe()[['cumulative_score']].T['75%'].values[0]
top_prob_candidates = sliding_window_prob_first_issue_df[sliding_window_prob_first_issue_df.cumulative_score > seventy_five_threshold].sort_values(by=['cumulative_score', 'start_page'], ascending=[False, True])
generate_table(top_prob_candidates[['cumulative_score', 'threshold_size', 'start_page', 'end_page']], "Top Ten Probabilistic First Issue Candidates")

Threshold Sizes: 100%|██████████| 4/4 [00:00<00:00, 21.52it/s]


In [104]:
# Adjusted Raw Scores Initialization
def initialize_raw_scores(df, max_threshold):
    """Build a (page, implied-zero) hit-count matrix from ``df``.

    The matrix has ``max page + 1`` rows and ``max_threshold + 1`` columns.
    Each row of ``df`` adds one to cell ``[page_number, implied_zero]``; rows
    whose ``page_type`` is not ``'digit'`` count towards column 0. Entries
    falling outside the matrix are ignored. An empty ``(0, 0)`` matrix is
    returned when ``df`` has no page numbers.
    """
    max_page = df['page_number'].max()
    if pd.isna(max_page):
        return np.zeros((0, 0), dtype=int)

    raw_scores = np.zeros((max_page + 1, max_threshold + 1), dtype=int)

    columns = zip(df['page_number'], df['implied_zero'], df['page_type'])
    for page_value, zero_value, page_type in columns:
        page = int(page_value)
        number = int(zero_value) if page_type == 'digit' else 0

        # Skip anything that does not land inside the matrix.
        if page < 0 or page > max_page:
            continue
        if number < 0 or number > max_threshold:
            continue
        raw_scores[page, number] += 1

    return raw_scores

# Modified Prefix Sum Calculation for First Issue
def prefix_sums_first_issue(raw_scores, threshold_range, start_pages, updown=0.5, diag=0.25, otherwise=0.01, points=1.0):
    """Score every (span length, start page) window of ``raw_scores``.

    Inside a window each cell receives ``otherwise + points * raw count`` plus
    the best carried-over value from its neighbours: the left or upper cell
    weighted by ``updown``, or the upper-left cell weighted by ``diag``. Rows
    above ``start_page`` are never consulted. The window's score is the sum
    of all its cells.

    Args:
        raw_scores: 2-D count matrix indexed ``[page, implied_zero]``; it is
            not modified.
        threshold_range: Iterable of span lengths (in pages) to evaluate.
        start_pages: Iterable of candidate first pages.
        updown: Weight for the left and upper neighbours.
        diag: Weight for the upper-left neighbour.
        otherwise: Baseline added to every cell.
        points: Weight of a cell's own raw count.

    Returns:
        List of ``(total_score, threshold_size, start_page, end_page)``
        tuples; windows running past the last matrix row are skipped.
    """
    nrows, ncols = raw_scores.shape
    max_score_data = []

    for threshold_size in threshold_range:
        for start_page in start_pages:
            end_page = start_page + threshold_size - 1

            # Ensure the end page doesn't exceed the matrix bounds
            if end_page >= nrows:
                continue

            # Work in floating point: the count matrix is integer-typed, and
            # writing fractional cells into an integer copy would truncate
            # the baseline and every weighted carry-over.
            current_scores = raw_scores.astype(float)

            for i in range(start_page, end_page + 1):
                for j in range(ncols):
                    cell = otherwise + points * raw_scores[i, j]
                    choices = []

                    if j > 0:
                        choices.append(current_scores[i, j-1] * updown)
                    if i > start_page:
                        choices.append(current_scores[i-1, j] * updown)
                        if j > 0:
                            choices.append(current_scores[i-1, j-1] * diag)

                    cell += max(choices, default=0)
                    current_scores[i, j] = cell

            # Calculate the total score for this configuration
            total_score = current_scores[start_page:end_page + 1, :].sum()

            max_score_data.append((total_score, threshold_size, start_page, end_page))

    return max_score_data

# Analyze Prefix Sum Results for First Issue
def detect_first_issue_prefix_sum(df, threshold_range=(10, 50), updown=0.5, diag=0.25, otherwise=0.01, points=1.0):
    """Find the best first-issue window using the prefix-sum scorer.

    Args:
        df: Rows with ``page_number``, ``implied_zero`` and ``page_type``.
        threshold_range: Two-item ``(min, max)`` bounds; span lengths in
            ``range(min, max)`` are evaluated. Lists are accepted as well.
        updown, diag, otherwise, points: Passed through to
            ``prefix_sums_first_issue``.

    Returns:
        ``(best_candidate, results_df)`` where ``results_df`` has columns
        ``total_score``, ``threshold_size`` (recomputed as
        ``end_page - start_page``), ``start_page`` and ``end_page``, and
        ``best_candidate`` is its single highest-scoring row.
    """
    max_score_data = []
    # Loop-invariant values hoisted out of the search loops.
    first_page_number = df.page_number.min()
    threshold_sizes = range(threshold_range[0], threshold_range[1])

    for threshold_size in tqdm(threshold_sizes, desc="Threshold Sizes"):
        # NaN when no row sits exactly threshold_size pages after the first
        # page; the selection below is then empty.
        final_page_number = df[df.page_number == first_page_number + threshold_size].page_number.max()

        # Vary the start page within a defined range (similar to Needleman-Wunsch approach)
        for additional_page in range(5):
            current_first_page_number = first_page_number + additional_page
            selected_rows = df[(df.page_number <= final_page_number) & (df.page_number >= current_first_page_number)]
            # Initialize raw scores matrix
            raw_scores = initialize_raw_scores(selected_rows, max_threshold=threshold_range[1])

            # Extract unique page numbers to use as start pages
            start_pages = selected_rows['page_number'].unique()

            # NOTE(review): the scorer sweeps the full span range again for
            # every outer threshold_size, so the same configuration can be
            # scored repeatedly - confirm this is intended.
            max_score_data.extend(prefix_sums_first_issue(
                raw_scores,
                threshold_sizes,
                start_pages,
                updown,
                diag,
                otherwise,
                points
            ))

    # Convert to DataFrame
    results_df = pd.DataFrame(max_score_data, columns=['total_score', 'threshold_size', 'start_page', 'end_page'])
    results_df['threshold_size'] = results_df.end_page - results_df.start_page
    best_candidate = results_df.sort_values(by='total_score', ascending=False).head(1)

    return best_candidate, results_df

# Example usage with subset_digits DataFrame, restricted to span lengths
# inside the rounded confidence interval of threshold_size
best_first_issue, prefix_all_candidates_df = detect_first_issue_prefix_sum(subset_digits, threshold_range=[round(lower_threshold), round(upper_threshold)])

generate_table(best_first_issue[['total_score', 'threshold_size', 'start_page', 'end_page']], "Best First Issue Candidate")

Threshold Sizes: 100%|██████████| 4/4 [00:00<00:00,  4.08it/s]


In [105]:
# Keep prefix-sum candidates whose total score is above the 75th percentile,
# best first (ties broken by the earlier start page).
seventy_five_threshold = prefix_all_candidates_df.describe()[['total_score']].T['75%'].values[0]
top_prob_candidates = prefix_all_candidates_df[prefix_all_candidates_df.total_score > seventy_five_threshold].sort_values(by=['total_score', 'start_page'], ascending=[False, True])
generate_table(top_prob_candidates[['total_score', 'threshold_size', 'start_page', 'end_page']], "Top Prefix Sum First Issue Candidates")

In [106]:
# Combine the three detectors on (threshold_size, start_page, end_page).
# The outer merge keeps candidates found by either the prefix-sum or the
# alignment method, so either score column may be missing for a row.
top_issues = prefix_all_candidates_df[['total_score', 'threshold_size', 'start_page', 'end_page']].merge(sequence_alignment_likely_first_issue_boundaries_df[['alignment_score', 
        'threshold_size', 'start_page', 'end_page']], on=['threshold_size', 'start_page', 'end_page'], how='outer').sort_values(by=['total_score', 'alignment_score'], ascending=[False, False])

# The inner merge then keeps only candidates the sliding-window method also produced.
top_issues_df = top_issues.merge(sliding_window_prob_first_issue_df[['cumulative_score','threshold_size', 'start_page', 'end_page']], on=['threshold_size', 'start_page', 'end_page'], how='inner').sort_values(by=['total_score', 'cumulative_score'], ascending=[False, False])

top_issues_df = top_issues_df.drop_duplicates()
generate_table(top_issues_df[['threshold_size', 'start_page', 'end_page', 'alignment_score', 'total_score', 'cumulative_score']], "Top First Issue Candidates")

In [107]:
# Weights for combining the raw (un-normalised) scores of the three methods.
weights = {
    'total_score': 0.4,
    'alignment_score': 0.4,
    'cumulative_score': 0.2
}

# Weighted sum of the raw scores; a row missing any of the three scores
# gets a missing composite score.
top_issues_df['composite_score'] = (
    weights['total_score'] * top_issues_df['total_score'] +
    weights['alignment_score'] * top_issues_df['alignment_score'] +
    weights['cumulative_score'] * top_issues_df['cumulative_score']
)

top_issues_df = top_issues_df.sort_values(by='composite_score', ascending=False)
generate_table(top_issues_df[['threshold_size', 'start_page', 'end_page', 'alignment_score', 'total_score', 'cumulative_score', 'composite_score']], "Top First Issue Candidates with Composite Score")

In [108]:


def calculate_combined_score(df, total_weight=0.4, alignment_weight=0.3, cumulative_weight=0.3):
    """Add min-max normalised scores and their weighted sum to ``df``.

    The three score columns (``total_score``, ``alignment_score``,
    ``cumulative_score``) are rescaled with ``MinMaxScaler`` into
    ``norm_*`` columns, and ``combined_score`` is their weighted sum.
    ``df`` is modified in place and also returned.
    """
    score_columns = ['total_score', 'alignment_score', 'cumulative_score']
    normalised_columns = ['norm_total_score', 'norm_alignment_score', 'norm_cumulative_score']

    # Rescale the three score columns together.
    scaler = MinMaxScaler()
    df[normalised_columns] = scaler.fit_transform(df[score_columns])

    # Weighted sum of the normalised scores.
    weighted_total = df['norm_total_score'] * total_weight
    weighted_alignment = df['norm_alignment_score'] * alignment_weight
    weighted_cumulative = df['norm_cumulative_score'] * cumulative_weight
    df['combined_score'] = weighted_total + weighted_alignment + weighted_cumulative

    return df

# Add the normalised and combined score columns to the candidate table
top_issues_df = calculate_combined_score(top_issues_df)

In [109]:
# Compare the raw weighted score (x) with the normalised weighted score (y)
# for every candidate, coloured by span length.
alt.Chart(top_issues_df).mark_circle().encode(
	x='composite_score',
	y='combined_score',
	color='threshold_size:N',
	tooltip=['threshold_size', 'start_page', 'end_page', 'alignment_score', 'total_score', 'cumulative_score', 'composite_score']
)

In [114]:
# Reduce the annotations to one row per distinct page/issue/type combination.
annotated_df = full_df[['page_number', 'start_issue', 'end_issue', 'type_of_page']].drop_duplicates()

# Group by 'start_issue' and aggregate first page, last page and row count
grouped_df = annotated_df.groupby('start_issue').agg(
    first_page=('page_number', 'min'),
    last_page=('page_number', 'max'),
    number_of_pages=('page_number', 'count')
).reset_index()
# Order issues by where they start in the volume.
grouped_df = grouped_df.sort_values(by='first_page')

In [113]:
# Display the annotated issues ordered by their first page. The original line
# was a truncated attribute access that would raise AttributeError.
grouped_df.sort_values(by='first_page')

Unnamed: 0,start_issue,first_page,last_page,number_of_pages
0,1965-01-10,63,118,56
1,1965-01-17,119,176,58
2,1965-02-07,289,344,56
3,1965-02-14,345,400,56
4,1965-02-21,401,455,55
5,1965-02-28,457,512,55
6,1965-03-07,513,567,55
7,1965-03-14,569,624,56
8,1965-03-21,625,680,56
9,1965-03-28,681,736,55


In [115]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def generate_issue_binary(start_page, end_page, total_pages):
    """
    Return a 0/1 integer array of length ``total_pages`` that is 1 on every
    page from ``start_page`` through ``end_page`` (inclusive) and 0 elsewhere.
    """
    first, last = int(start_page), int(end_page)
    issue_binary = np.zeros(total_pages, dtype=int)
    # The slice end is exclusive, hence last + 1 to include the final page.
    issue_binary[slice(first, last + 1)] = 1
    return issue_binary

def calculate_first_issue_accuracy(top_issues_df, grouped_df, total_pages):
	"""
	Score every candidate span in ``top_issues_df`` against the first annotated issue.

	Each row of ``top_issues_df`` is one predicted span (``start_page`` to
	``end_page``, inclusive). The reference span is the first row of
	``grouped_df`` (``first_page`` to ``last_page``); the caller is expected to
	have ordered ``grouped_df`` so that this row is the first issue. Both spans
	are turned into per-page 0/1 arrays of length ``total_pages`` and compared
	page by page.

	Parameters:
		top_issues_df: DataFrame with ``start_page`` and ``end_page`` columns.
		grouped_df: DataFrame with ``first_page``, ``last_page`` and
			``number_of_pages`` columns.
		total_pages: length of the per-page arrays.

	Returns:
		A copy of ``top_issues_df`` with the columns ``accuracy``, ``precision``,
		``recall``, ``f1``, ``actual_start_page``, ``actual_end_page``,
		``actual_total_volume_pages`` and ``actual_issue_length`` filled in for
		every row. ``top_issues_df`` itself is not modified. If
		``top_issues_df`` has no rows the copy is returned unchanged.
	"""
	metrics_df = top_issues_df.copy()
	# Nothing to score: hand back the plain copy without touching grouped_df.
	if top_issues_df.shape[0] == 0:
		return metrics_df

	# The reference issue is identical for every candidate, so extract its
	# boundaries and build its per-page array once instead of once per row.
	actual_first_issue = grouped_df.iloc[0]
	actual_start_page = int(actual_first_issue['first_page'])
	actual_end_page = int(actual_first_issue['last_page'])
	actual_issue_length = int(actual_first_issue['number_of_pages'])
	actual_issues_binary = generate_issue_binary(actual_start_page, actual_end_page, total_pages)

	for index, row in tqdm(top_issues_df.iterrows(), total=top_issues_df.shape[0], desc="Calculating Metrics"):
		# Boundaries of this candidate span
		predicted_start_page = int(row['start_page'])
		predicted_end_page = int(row['end_page'])
		predicted_issues_binary = generate_issue_binary(predicted_start_page, predicted_end_page, total_pages)

		# Page-level agreement between the candidate and the reference issue
		metrics_df.loc[index, 'accuracy'] = accuracy_score(actual_issues_binary, predicted_issues_binary)
		metrics_df.loc[index, 'precision'] = precision_score(actual_issues_binary, predicted_issues_binary)
		metrics_df.loc[index, 'recall'] = recall_score(actual_issues_binary, predicted_issues_binary)
		metrics_df.loc[index, 'f1'] = f1_score(actual_issues_binary, predicted_issues_binary)

		# Record the reference values next to each candidate
		metrics_df.loc[index, 'actual_start_page'] = actual_start_page
		metrics_df.loc[index, 'actual_end_page'] = actual_end_page
		metrics_df.loc[index, 'actual_total_volume_pages'] = total_pages
		metrics_df.loc[index, 'actual_issue_length'] = actual_issue_length

	return metrics_df

# Define the total number of pages in the volume.
# generate_issue_binary uses page numbers directly as array positions, so the
# array length has to be the largest page number plus one.
total_pages = full_df['page_number'].max() + 1

# Calculate accuracy, precision, recall, and F1-score for the first issue detection
metrics_df = calculate_first_issue_accuracy(top_issues_df, grouped_df, total_pages)

Calculating Metrics: 100%|██████████| 200/200 [00:00<00:00, 407.95it/s]


In [116]:
# Order the candidates from highest to lowest F1 before displaying them.
metrics_df = metrics_df.sort_values(by='f1', ascending=False)
# generate_table is defined outside this section; presumably it renders the selected columns as a titled table.
generate_table(metrics_df[['threshold_size', 'start_page', 'end_page', 'accuracy', 'precision', 'recall', 'f1']], "First Issue Detection Metrics")

## Explore Issue Segmentation Metrics

In [5]:
# Read every CSV found anywhere under the first_issue_metrics directory tree
# and keep the resulting frames in a list.
identified_issues_dfs = []
for directory, subdir, files in os.walk("../datasets/first_issue_metrics"):
	for file in files:
		# Anything that is not a CSV is ignored.
		if not file.endswith(".csv"):
			continue
		df = read_csv_file(os.path.join(directory, file))
		identified_issues_dfs += [df]

In [6]:
# Stack the per-file frames into one frame; ignore_index=True gives the result a fresh 0..n-1 index.
identified_issues_df = pd.concat(identified_issues_dfs, ignore_index=True)

In [7]:
# Show the first two rows of the combined frame.
identified_issues_df.head(2)

Unnamed: 0,total_score,threshold_size,start_page,end_page,alignment_score,cumulative_score,composite_score,norm_total_score,norm_alignment_score,norm_cumulative_score,...,actual_total_volume_pages,actual_issue_length,annotated_file_path,sequence_alignment_full_data,final_number_of_candidates,upper_threshold,lower_threshold,sequence_alignment_candidates,probabilistic_candidates,prefix_sum_candidates
0,522.0,61,3,64,123.5,0.65,258.33,1.0,1.0,1.0,...,75.0,24.0,../datasets/annotated_ht_ef_datasets/liberator...,False,27,63.060082,60.748137,73,30,2
1,522.0,61,3,64,123.5,0.54,258.308,1.0,1.0,0.266667,...,75.0,24.0,../datasets/annotated_ht_ef_datasets/liberator...,False,27,63.060082,60.748137,73,30,2


In [8]:
# Best F1 per annotated file (together with its issue length, volume size and
# sequence-alignment flag), ordered from best to worst.
grouped_identified_issues_df = (
	identified_issues_df
	.groupby(['annotated_file_path', 'actual_issue_length', 'actual_total_volume_pages', 'sequence_alignment_full_data'])['f1']
	.max()
	.reset_index()
	.sort_values(by='f1', ascending=False)
)

# The publication title is the fourth "/"-separated component of the annotated file path.
grouped_identified_issues_df['publication_title'] = [
	path.split("/")[3] for path in grouped_identified_issues_df['annotated_file_path']
]

In [11]:
# Point selection on publication_title, bound to the legend.
selection = alt.selection_point(fields=['publication_title'], bind='legend')
# Scatter of issue length against volume size: circle size encodes the best F1,
# color encodes the publication, and points outside the selection are dimmed to 0.1 opacity.
alt.Chart(grouped_identified_issues_df).mark_circle(opacity=0.7).encode(
	x='actual_issue_length',
	y='actual_total_volume_pages',
	color=alt.Color('publication_title', scale=alt.Scale(scheme='viridis')),
	size='f1',
	opacity=alt.condition(selection, alt.value(0.7), alt.value(0.1)),
	# row='publication_title:N',
	tooltip=['actual_issue_length', 'f1', 'actual_total_volume_pages', 'sequence_alignment_full_data']
).add_params(selection).properties(
	width=400,
	height=400
)

In [13]:
# Issue length expressed as a percentage of the volume's total page count.
grouped_identified_issues_df['actual_issue_length_ratio_actual_total_volume_pages'] = (
	grouped_identified_issues_df['actual_issue_length']
	.div(grouped_identified_issues_df['actual_total_volume_pages'])
	.mul(100)
)

In [None]:

# Ordinary least squares fit of best F1 on the issue-length percentage, across all publications.
X = grouped_identified_issues_df['actual_issue_length_ratio_actual_total_volume_pages']
y = grouped_identified_issues_df['f1']
X = sm.add_constant(X)  # Add a constant term for the intercept
model = sm.OLS(y, X).fit()
# Store the fitted values on the frame so the regression line can be drawn from the same data.
grouped_identified_issues_df['fitted'] = model.fittedvalues
r_squared = model.rsquared

# Create the base chart
base = alt.Chart(grouped_identified_issues_df).mark_circle(opacity=0.7, size=100).encode(
    x='actual_issue_length_ratio_actual_total_volume_pages',
    y='f1',
    color=alt.Color('publication_title', scale=alt.Scale(scheme='viridis')),
    tooltip=['actual_issue_length_ratio_actual_total_volume_pages', 'f1', 'actual_total_volume_pages', 'sequence_alignment_full_data']
)

# Add the regression line
regression_line = alt.Chart(grouped_identified_issues_df).mark_line(color='red').encode(
    x='actual_issue_length_ratio_actual_total_volume_pages',
    y='fitted'
)

# Add the R-squared value as text
# The label is placed at x=0.5, y=0.5 (quantitative encodings); the x column is a percentage (ratio * 100).
r_squared_text = alt.Chart(pd.DataFrame({
    'x': [0.5],  # Adjust the position as needed
    'y': [0.5],  # Adjust the position as needed
    'text': [f'R² = {r_squared:.2f}']
})).mark_text(align='left', dx=5, dy=-5, color='black').encode(
    x='x:Q',
    y='y:Q',
    text='text:N'
)

# Combine the base chart, regression line, and R-squared text
chart = base + regression_line + r_squared_text

# Add selection
# The opacity condition is set on the layered chart as a whole, not on the scatter layer alone.
selection = alt.selection_point(fields=['publication_title'], bind='legend')
chart = chart.add_params(selection).encode(
    opacity=alt.condition(selection, alt.value(0.7), alt.value(0.1))
).properties(
    width=400,
    height=400
)

# Last expression of the cell: display the chart.
chart

In [33]:
# Build one small chart per publication: a scatter of best F1 against the
# issue-length percentage, with an OLS regression line and R² label whenever the
# publication has more than one row to fit.
charts = []

for periodical_title in grouped_identified_issues_df['publication_title'].unique():
	# Take an explicit copy: the 'fitted' column added below must be written to an
	# independent frame, not to a boolean-filtered slice of the parent frame
	# (chained assignment).
	subset_grouped_identified_issues_df = grouped_identified_issues_df[
		grouped_identified_issues_df['publication_title'] == periodical_title
	].copy()
	if len(subset_grouped_identified_issues_df) > 1:

		X = subset_grouped_identified_issues_df['actual_issue_length_ratio_actual_total_volume_pages']
		y = subset_grouped_identified_issues_df['f1']
		X = sm.add_constant(X)  # Add a constant term for the intercept
		model = sm.OLS(y, X).fit()
		subset_grouped_identified_issues_df['fitted'] = model.fittedvalues
		r_squared = model.rsquared

		# Create the base chart
		base = alt.Chart(subset_grouped_identified_issues_df).mark_circle(opacity=0.7, size=100).encode(
			x='actual_issue_length_ratio_actual_total_volume_pages',
			y='f1',
			color=alt.Color('f1', scale=alt.Scale(scheme='viridis')),
			tooltip=['actual_issue_length_ratio_actual_total_volume_pages', 'f1', 'actual_total_volume_pages', 'sequence_alignment_full_data']
		)

		# Add the regression line
		regression_line = alt.Chart(subset_grouped_identified_issues_df).mark_line(color='red').encode(
			x='actual_issue_length_ratio_actual_total_volume_pages',
			y='fitted'
		)

		# Add the R-squared value as text
		r_squared_text = alt.Chart(pd.DataFrame({
			'x': [0.5],  # Adjust the position as needed
			'y': [0.5],  # Adjust the position as needed
			'text': [f'R² = {r_squared:.2f}']
		})).mark_text(align='left', dx=5, dy=-5, color='black').encode(
			x='x:Q',
			y='y:Q',
			text='text:N'
		)

		# Combine the base chart, regression line, and R-squared text
		chart = base + regression_line + r_squared_text

		# Add selection
		# NOTE(review): the selection is bound to the legend on publication_title, but
		# color here encodes f1 — confirm the legend binding has any effect in these charts.
		selection = alt.selection_point(fields=['publication_title'], bind='legend')
		chart = chart.add_params(selection).encode(
			opacity=alt.condition(selection, alt.value(0.7), alt.value(0.1))
		).properties(
			width=200,
			height=200,
			title=periodical_title
		)
	else:
		# A single row cannot support a regression; plot the lone point only.
		chart = alt.Chart(subset_grouped_identified_issues_df).mark_circle().encode(
			x='actual_issue_length_ratio_actual_total_volume_pages',
			y='f1',
			color=alt.Color('f1', scale=alt.Scale(scheme='viridis')),
			tooltip=['actual_issue_length_ratio_actual_total_volume_pages', 'f1', 'actual_total_volume_pages', 'sequence_alignment_full_data']
		).properties(
			width=200,
			height=200,
			title=periodical_title
		)
	charts.append(chart)

In [34]:
# Lay the charts out in two rows: the first five on top, all remaining ones below,
# with an independent x scale.
# NOTE(review): the split at 5 is hard-coded; with five or fewer charts the second row is empty — confirm that renders.
alt.vconcat(alt.hconcat(*charts[0:5]), alt.hconcat(*charts[5:])).resolve_scale(x='independent')