# Latin Master Bibliography Data Exploration

This notebook provides tools for exploring and analyzing the Latin Master Bibliography dataset.

## Overview

The dataset contains deduplicated records of Latin works printed between 1450 and 1900, combining data from several major bibliographic catalogues:
- USTC (Universal Short Title Catalogue)
- VD16/VD17/VD18 (German printing catalogues)
- ESTC (English Short Title Catalogue)

Each row represents a unique Latin work/edition with information about which catalogues attest it and any available digital facsimiles.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Suppress every warning category for the rest of the session so cell output
# stays uncluttered. This is global: warnings raised by any library are hidden.
warnings.simplefilter('ignore')

# Plot appearance: matplotlib's stock style sheet plus seaborn's "husl" colour cycle
plt.style.use('default')
sns.set_palette(palette="husl")

print("Libraries imported successfully!")

## Load and Inspect Data

In [None]:
# Location of the pipeline's final CSV, relative to the notebook's directory
data_path = Path('../data/processed/final/latin_master_bibliography.csv')

try:
    # 'utf-8-sig' transparently drops a leading byte-order mark if one is present
    df = pd.read_csv(data_path, encoding='utf-8-sig')
except FileNotFoundError:
    print(f"Dataset not found at {data_path}")
    print("Please run the pipeline first to generate the dataset.")

    # Fall back to a three-row stand-in so the remaining cells can still run
    sample_columns = {
        'title': ['De Revolutionibus Orbium Coelestium', 'Ars Nova', 'Summa Theologica'],
        'author': ['Copernicus, Nicolaus', 'Anonymous', 'Aquinas, Thomas'],
        'publication_year': [1543, 1320, 1274],
        'publication_place': ['Nuremberg', 'Paris', 'Paris'],
        'language': ['lat', 'lat', 'lat'],
        'source_catalogues': ['VD16;USTC', 'USTC', 'VD16'],
        'has_digital_facsimile': [True, False, True],
    }
    df = pd.DataFrame(sample_columns)
    print("Created sample dataset for demonstration.")
else:
    # Only reached when the CSV was read without error
    print(f"Dataset loaded successfully!")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

In [None]:
# Headline figures for the loaded frame
record_total = len(df)
column_total = len(df.columns)

print("Dataset Overview:")
print(f"Total records: {record_total:,}")
print(f"Columns: {column_total}")

# Preview the leading rows using the notebook's rich table rendering
print("\nFirst 5 records:")
display(df.head(5))

In [None]:
# Summary statistics, restricted to the columns pandas parsed as numeric
numeric_columns = df.select_dtypes(include=[np.number]).columns

if numeric_columns.empty:
    print("No numeric columns found in the dataset.")
else:
    print("Numeric Columns Statistics:")
    numeric_summary = df.loc[:, numeric_columns].describe()
    display(numeric_summary)

## Temporal Analysis

In [None]:
# Analyze publication dates
if 'publication_year' in df.columns:
    # Coerce to numbers before filtering: a CSV column containing blanks or stray
    # text is not loaded as integers, and unparseable entries become NaN here
    # instead of making the range comparison raise.
    year_values = pd.to_numeric(df['publication_year'], errors='coerce')

    # Keep only years inside the 1450-1900 window (NaN never matches), then cast to
    # int so labels read "16th" / "1543" rather than "16.0th" / "1543.0".
    valid_years = year_values[year_values.between(1450, 1900)].astype(int)

    if valid_years.empty:
        print("No publication years between 1450 and 1900 found.")
    else:
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

        # Overall distribution
        ax1.hist(valid_years, bins=50, alpha=0.7, color='steelblue', edgecolor='black')
        ax1.set_title('Distribution of Latin Works by Publication Year (1450-1900)', fontsize=14, fontweight='bold')
        ax1.set_xlabel('Publication Year')
        ax1.set_ylabel('Number of Works')
        ax1.grid(True, alpha=0.3)

        # Century breakdown: years 1401-1500 map to 15, 1501-1600 to 16, and so on
        centuries = (valid_years - 1) // 100 + 1
        century_counts = centuries.value_counts().sort_index()

        bar_positions = range(len(century_counts))
        bars = ax2.bar(bar_positions, century_counts.values,
                       color=['coral', 'lightblue', 'lightgreen', 'gold', 'plum', 'orange'][:len(century_counts)])
        ax2.set_title('Latin Works by Century', fontsize=14, fontweight='bold')
        ax2.set_xlabel('Century')
        ax2.set_ylabel('Number of Works')
        ax2.set_xticks(bar_positions)
        # Every century in the 1450-1900 window (15-19) takes the "th" suffix
        ax2.set_xticklabels([f'{c}th' for c in century_counts.index])

        # Add value labels slightly above each bar (offset = 1% of the tallest bar)
        label_offset = 0.01 * century_counts.max()
        for bar, count in zip(bars, century_counts.values):
            ax2.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + label_offset,
                     f'{count:,}', ha='center', va='bottom')

        plt.tight_layout()
        plt.show()

        # Print statistics
        print(f"Date Range: {valid_years.min()} - {valid_years.max()}")
        print(f"Median Year: {valid_years.median():.0f}")
        print(f"\nCentury Breakdown:")
        for century, count in century_counts.items():
            percentage = (count / len(valid_years)) * 100
            print(f"  {century}th century: {count:,} works ({percentage:.1f}%)")
else:
    print("No publication_year column found.")

## Catalogue Coverage Analysis

In [None]:
# Analyze catalogue coverage
def _split_catalogue_codes(raw):
    """Split a ';'-separated catalogue string into stripped, non-empty codes.

    Missing values yield an empty list, so every record maps to a real list.
    """
    if pd.isna(raw):
        return []
    return [code.strip() for code in str(raw).split(';') if code.strip()]


if 'source_catalogues' in df.columns:
    # One list of codes per record. Stripping makes "VD16; USTC" and "VD16;USTC"
    # count the same catalogue instead of producing a separate " USTC" entry.
    catalogue_lists = df['source_catalogues'].apply(_split_catalogue_codes)

    # One row per (record, catalogue) mention; empty lists explode to NaN and are dropped
    catalogue_series = catalogue_lists.explode().dropna()
    catalogue_counts = catalogue_series.value_counts()

    if catalogue_counts.empty:
        print("No catalogue codes recorded in source_catalogues.")
    else:
        # Create visualization
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # Pie chart: share of all catalogue mentions
        colors = plt.cm.Set3(np.linspace(0, 1, len(catalogue_counts)))
        wedges, texts, autotexts = ax1.pie(catalogue_counts.values, labels=catalogue_counts.index,
                                          autopct='%1.1f%%', colors=colors, startangle=90)
        ax1.set_title('Catalogue Coverage Distribution', fontsize=14, fontweight='bold')

        # Bar chart: absolute number of records attested by each catalogue
        bars = ax2.bar(catalogue_counts.index, catalogue_counts.values, color=colors)
        ax2.set_title('Records per Catalogue', fontsize=14, fontweight='bold')
        ax2.set_xlabel('Catalogue')
        ax2.set_ylabel('Number of Records')
        ax2.tick_params(axis='x', rotation=45)

        # Add value labels slightly above each bar (offset = 1% of the tallest bar)
        label_offset = 0.01 * catalogue_counts.max()
        for bar, count in zip(bars, catalogue_counts.values):
            ax2.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + label_offset,
                     f'{count:,}', ha='center', va='bottom')

        plt.tight_layout()
        plt.show()

        # Print detailed statistics
        print("Catalogue Coverage Statistics:")
        print(f"Total records: {len(df):,}")
        print(f"Unique catalogue mentions: {catalogue_series.nunique()}")
        print(f"\nRecords per catalogue:")
        for catalogue, count in catalogue_counts.items():
            percentage = (count / len(df)) * 100
            print(f"  {catalogue}: {count:,} records ({percentage:.1f}%)")

        # Multi-catalogue overlap. Every element is a list, so len() is always valid;
        # calling len() on the NaN left by str.split for a missing value would raise.
        overlap_stats = catalogue_lists.apply(len)
        uncatalogued = int((overlap_stats == 0).sum())
        print(f"\nMulti-catalogue coverage:")
        print(f"  Single catalogue: {(overlap_stats == 1).sum():,} records")
        print(f"  Multiple catalogues: {(overlap_stats > 1).sum():,} records")
        if uncatalogued:
            # Reported so that the lines above visibly add up to the record total
            print(f"  No catalogue recorded: {uncatalogued:,} records")
        print(f"  Maximum catalogues per record: {overlap_stats.max()}")
else:
    print("No source_catalogues column found.")

## Geographic Distribution

In [None]:
# Analyze publication places
if 'publication_place' not in df.columns:
    print("No publication_place column found.")
else:
    # The 20 most frequent places of publication, most frequent first
    place_counts = df['publication_place'].value_counts().head(20)

    # Horizontal bar chart, one bar per place
    fig, ax = plt.subplots(figsize=(14, 8))
    y_pos = range(len(place_counts))
    bars = ax.barh(y_pos, place_counts.values, color='lightblue', edgecolor='navy')

    ax.set_yticks(y_pos)
    ax.set_yticklabels(place_counts.index)
    ax.set_xlabel('Number of Works')
    ax.set_title('Top 20 Publication Places for Latin Works', fontsize=14, fontweight='bold')

    # Value labels sit just right of each bar end (gap = 1% of the longest bar)
    label_gap = 0.01 * place_counts.max()
    for bar, count in zip(bars, place_counts.values):
        ax.text(bar.get_width() + label_gap, bar.get_y() + bar.get_height() / 2,
                f'{count:,}', ha='left', va='center')

    plt.tight_layout()
    plt.show()

    # Textual version of the same ranking; shares are relative to all records
    record_total = len(df)
    print(f"Total unique publication places: {df['publication_place'].nunique():,}")
    print(f"\nTop 20 Publication Places:")
    for rank, (place, count) in enumerate(place_counts.items(), start=1):
        share = (count / record_total) * 100
        print(f"  {rank:2d}. {place}: {count:,} works ({share:.1f}%)")

## Author Analysis

In [None]:
# Analyze authors
if 'author' not in df.columns:
    print("No author column found.")
else:
    # The 20 most frequent author strings as recorded (no name normalisation is applied)
    author_counts = df['author'].value_counts().head(20)

    # Horizontal bar chart, one bar per author
    fig, ax = plt.subplots(figsize=(14, 10))
    y_pos = range(len(author_counts))
    bars = ax.barh(y_pos, author_counts.values, color='lightcoral', edgecolor='darkred')

    ax.set_yticks(y_pos)
    ax.set_yticklabels(author_counts.index)
    ax.set_xlabel('Number of Works')
    ax.set_title('Top 20 Most Prolific Latin Authors', fontsize=14, fontweight='bold')

    # Value labels sit just right of each bar end (gap = 1% of the longest bar)
    label_gap = 0.01 * author_counts.max()
    for bar, count in zip(bars, author_counts.values):
        ax.text(bar.get_width() + label_gap, bar.get_y() + bar.get_height() / 2,
                f'{count}', ha='left', va='center')

    plt.tight_layout()
    plt.show()

    # Coverage figures: rows with a missing author value are reported as anonymous
    record_total = len(df)
    attributed = df['author'].notna().sum()
    unattributed = df['author'].isna().sum()
    print(f"Total unique authors: {df['author'].nunique():,}")
    print(f"Works with identified authors: {attributed:,} ({attributed/record_total*100:.1f}%)")
    print(f"Anonymous works: {unattributed:,} ({unattributed/record_total*100:.1f}%)")
    print(f"\nTop 20 Authors by Number of Works:")
    for rank, (author, count) in enumerate(author_counts.items(), start=1):
        print(f"  {rank:2d}. {author}: {count} works")

## Digital Facsimile Analysis

In [None]:
# Analyze digital facsimile availability
# Use the first of these columns that exists; the boolean flag is preferred over URLs.
digital_column = next(
    (col for col in ['has_digital_facsimile', 'digital_facsimile_url', 'digital_facsimile_urls']
     if col in df.columns),
    None,
)

if digital_column is None:
    print("No digital facsimile columns found.")
elif len(df) == 0:
    # Avoids a division by zero and an all-zero pie chart further down
    print("Dataset is empty; no digital facsimile statistics to report.")
else:
    # Reduce either kind of column to one boolean mask so that the charts and the
    # printed figures are derived from the same counts.
    if digital_column == 'has_digital_facsimile':
        # Only an explicit True counts; missing flags are treated as "no facsimile"
        has_digital = df[digital_column].eq(True).fillna(False).astype(bool)
    else:
        # A facsimile exists when the URL cell is present and not blank/whitespace
        has_digital = df[digital_column].notna() & (df[digital_column].astype(str).str.strip() != '')

    total_records = len(df)
    digital_count = int(has_digital.sum())
    labels = ['No Digital Facsimile', 'Has Digital Facsimile']
    sizes = [total_records - digital_count, digital_count]
    colors = ['lightcoral', 'lightgreen']

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Pie chart
    wedges, texts, autotexts = ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
                                      colors=colors, startangle=90)
    ax1.set_title('Digital Facsimile Availability', fontsize=14, fontweight='bold')

    # Bar chart with counts
    bars = ax2.bar(labels, sizes, color=colors)
    ax2.set_title('Digital Facsimile Counts', fontsize=14, fontweight='bold')
    ax2.set_ylabel('Number of Works')

    # Add value labels slightly above each bar (offset = 1% of the taller bar)
    label_offset = 0.01 * max(sizes)
    for bar, count in zip(bars, sizes):
        ax2.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + label_offset,
                 f'{count:,}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    # Print statistics
    digital_percentage = (digital_count / total_records) * 100

    print(f"Digital Facsimile Analysis:")
    print(f"  Total works: {total_records:,}")
    print(f"  Works with digital facsimiles: {digital_count:,} ({digital_percentage:.1f}%)")
    print(f"  Works without digital facsimiles: {total_records - digital_count:,} ({100 - digital_percentage:.1f}%)")

## Title Analysis

In [None]:
# Analyze titles
from collections import Counter  # kept in this cell so it runs without edits elsewhere

if 'title' in df.columns:
    # Work on the non-missing titles only, so lengths stay integers and the
    # histogram and min/max are not affected by NaN.
    titles = df['title'].dropna().astype(str)
    title_lengths = titles.str.len()

    # Lower-cased text of all titles, split on whitespace for the word frequency count
    all_titles = ' '.join(titles).lower()

    # Function words to ignore: German, Latin and English. Entries of three letters
    # or fewer are already excluded by the length filter below.
    stop_words = {'de', 'der', 'die', 'das', 'des', 'dem', 'den', 'ein', 'eine', 'einer',
                  'et', 'ad', 'in', 'a', 'ab', 'ex', 'per', 'pro', 'cum', 'sine', 'sub',
                  'the', 'an', 'and', 'or', 'but', 'on', 'at', 'to', 'for'}

    title_words = [word for word in all_titles.split()
                   if len(word) > 3 and word not in stop_words]

    word_counts = Counter(title_words)
    top_words = word_counts.most_common(20)

    if titles.empty:
        print("No non-empty titles to analyze.")
    else:
        # Create visualizations
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

        # Title length distribution
        ax1.hist(title_lengths, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
        ax1.set_title('Title Length Distribution', fontsize=14, fontweight='bold')
        ax1.set_xlabel('Number of Characters')
        ax1.set_ylabel('Number of Works')
        ax1.grid(True, alpha=0.3)

        # Most common words
        ax2.set_title('Top 20 Most Common Words in Titles', fontsize=14, fontweight='bold')
        if top_words:
            # Separate names for the plotted top-20 so the full word list and its
            # Counter stay intact for the vocabulary figure printed below.
            top_labels, top_freqs = zip(*top_words)
            y_pos = range(len(top_labels))
            bars = ax2.barh(y_pos, top_freqs, color='lightgreen', edgecolor='darkgreen')
            ax2.set_yticks(y_pos)
            ax2.set_yticklabels(top_labels)
            ax2.set_xlabel('Frequency')

            # Add value labels just right of each bar end (gap = 1% of the longest bar)
            label_gap = 0.01 * max(top_freqs)
            for bar, count in zip(bars, top_freqs):
                ax2.text(bar.get_width() + label_gap, bar.get_y() + bar.get_height() / 2,
                         f'{count}', ha='left', va='center')
        else:
            # zip(*[]) cannot be unpacked into two names, so show a placeholder instead
            ax2.text(0.5, 0.5, 'No words longer than 3 characters', ha='center', va='center',
                     transform=ax2.transAxes)

        plt.tight_layout()
        plt.show()

        # Print statistics
        print(f"Title Analysis:")
        print(f"  Average title length: {title_lengths.mean():.1f} characters")
        print(f"  Shortest title: {title_lengths.min()} characters")
        print(f"  Longest title: {title_lengths.max()} characters")
        # Size of the whole filtered vocabulary, not just the 20 plotted words
        print(f"  Total unique words: {len(word_counts):,}")
        print(f"\nTop 20 Most Common Words:")
        for i, (word, count) in enumerate(top_words, 1):
            print(f"  {i:2d}. '{word}': {count} occurrences")
else:
    print("No title column found.")

## Summary and Export

In [None]:
# Create a summary report
def _column_stat(column, compute):
    """Return compute(df[column]), or 'N/A' when the frame has no such column."""
    return compute(df[column]) if column in df.columns else 'N/A'


summary = {
    'total_records': len(df),
    'columns': list(df.columns),
    'date_range': {
        # Taken from the raw column; no 1450-1900 filtering is applied here
        'start': _column_stat('publication_year', lambda years: years.min()),
        'end': _column_stat('publication_year', lambda years: years.max()),
    },
    'unique_authors': _column_stat('author', lambda names: names.nunique()),
    'unique_places': _column_stat('publication_place', lambda places: places.nunique()),
    'catalogues': _column_stat(
        'source_catalogues',
        lambda codes: codes.str.split(';').explode().nunique(),
    ),
}

print("Dataset Summary:")
for field, value in summary.items():
    print(f"  {field}: {value}")

# Save a sample of interesting records (CSV files are written to the working directory)
if len(df):
    # Locate the first available digital-facsimile column, flag column first
    digital_col = None
    for candidate in ('has_digital_facsimile', 'digital_facsimile_url', 'digital_facsimile_urls'):
        if candidate in df.columns:
            digital_col = candidate
            break

    if digital_col:
        if digital_col == 'has_digital_facsimile':
            facsimile_mask = df[digital_col] == True
        else:
            # URL columns: present and not the empty string
            facsimile_mask = df[digital_col].notna() & (df[digital_col] != '')
        digital_records = df[facsimile_mask]

        print(f"\nRecords with digital facsimiles: {len(digital_records)}")

        # Export the first ten digitally available works
        if len(digital_records) > 0:
            sample_file = 'sample_digital_facsimiles.csv'
            digital_records.head(10).to_csv(sample_file, index=False, encoding='utf-8-sig')
            print(f"Sample of digital facsimile records saved to {sample_file}")

    # Records whose catalogue string contains a ';' separator, i.e. more than one entry
    if 'source_catalogues' in df.columns:
        multi_mask = df['source_catalogues'].str.contains(';', na=False)
        multi_catalogue = df[multi_mask]
        print(f"\nRecords from multiple catalogues: {len(multi_catalogue)}")

        # Export the first ten multi-catalogue works
        if len(multi_catalogue) > 0:
            sample_file = 'sample_multi_catalogue.csv'
            multi_catalogue.head(10).to_csv(sample_file, index=False, encoding='utf-8-sig')
            print(f"Sample of multi-catalogue records saved to {sample_file}")

print("\nExploration complete! Use the generated visualizations for further analysis.")