# PapyrusVision Hieroglyph Detection - Exploratory Data Analysis

This notebook provides a deep dive into the hieroglyph dataset, focusing on detailed analysis and visualization to inform our modeling strategy.

## Objectives:
1. Load the pre-processed and split data.
2. Perform statistical analysis of annotations.
3. Create detailed visualizations for:
   - Bounding box characteristics (size, aspect ratio).
   - Spatial distribution of hieroglyphs on the papyrus.
   - Class balance and relationships.
4. Generate insights to guide data augmentation and model configuration.

## Setup

In [None]:
# install dependencies
# Upgrade torch, torchvision and cython to the newest available releases
# (-U with no version pins, so the result depends on when the cell is run).
!pip install -U torch torchvision cython
# Install fvcore and the PythonAPI subdirectory of cocoapi directly from GitHub.
!pip install -U 'git+https://github.com/facebookresearch/fvcore.git' 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
import torch, torchvision
# Bare last expression: the notebook displays the torch version that was imported.
torch.__version__

In [None]:
# Clone the Detectron2 sources only when the checkout is not already present:
# an unconditional `git clone` fails with "destination path already exists"
# whenever this cell is re-run in the same runtime.
!test -d detectron2_repo || git clone https://github.com/facebookresearch/detectron2 detectron2_repo
# Editable install of the local checkout.
!pip install -e detectron2_repo

In [None]:
import subprocess
import sys
import torch

# Report the CUDA version exposed by the installed torch build.
# The value is only printed; nothing below branches on it.
cuda_version = torch.version.cuda
print(f"Detected CUDA version: {cuda_version}")

# Install other required packages that are not included with the detectron2 installation command
print("\nInstalling other required packages...")
other_packages = [
    # imaging / plotting
    'opencv-python-headless', 'matplotlib>=3.3.0', 'seaborn>=0.11.0',
    'plotly>=5.0.0',
    # data handling
    'pandas>=1.3.0', 'scikit-learn>=1.0.0',
    # static export, COCO tooling, misc
    'kaleido>=0.2.1', 'pycocotools', 'pillow>=8.0.0',
    'numpy', 'tqdm', 'ipywidgets',
]

# Run pip through the kernel's own interpreter so packages land in this environment.
pip_install_command = [sys.executable, '-m', 'pip', 'install', '-q', *other_packages]

try:
    subprocess.check_call(pip_install_command)
except subprocess.CalledProcessError as install_error:
    # pip itself ran but exited with a non-zero status.
    print(f" Failed to install other packages: {install_error}")
except Exception as unexpected_error:
    # Anything else, e.g. the interpreter could not be launched.
    print(f" Installation of other packages failed: {unexpected_error}")
else:
    print(" All other packages installed successfully!")

In [None]:
# Import libraries
import os
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from PIL import Image
import pandas as pd
from pathlib import Path
from google.colab import drive, files
import zipfile
import shutil
import plotly.express as px

# `sys` is needed for sys.path below; import it here so this cell does not
# depend on the package-installation cell having been executed first.
import sys

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Set up paths (all project locations derive from PROJECT_ROOT)
PROJECT_ROOT = '/content/drive/My Drive/PapyrusNU_Detectron'

DATA_DIR = f'{PROJECT_ROOT}/data'
SCRIPTS_DIR = f'{PROJECT_ROOT}/scripts'
ANNOTATIONS_DIR = f'{DATA_DIR}/annotations'
ANALYSIS_PLOTS_DIR = f'{DATA_DIR}/analysis_plots'
# Output directory for every plot/report written by this notebook.
os.makedirs(ANALYSIS_PLOTS_DIR, exist_ok=True)

# Add scripts to path; guarded so re-running the cell does not append
# the same entry again and again.
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

In [None]:
# Sanity check before importing the project helpers: print the module search
# path and the contents of the scripts directory.
print("Current system path:")
for path in sys.path:
    print(path)

print("\nContents of scripts directory:")
# Reuse the configured constant instead of repeating the Drive path literal,
# so this check always inspects the directory that was added to sys.path.
scripts_dir = SCRIPTS_DIR
# isdir (rather than exists) keeps os.listdir from raising if the path is a file.
if os.path.isdir(scripts_dir):
    # Sorted for a stable, easy-to-scan listing.
    for item in sorted(os.listdir(scripts_dir)):
        print(item)
else:
    print(f"Directory not found: {scripts_dir}")

In [None]:
# Import custom utilities
# Project-local modules rather than installed packages; presumably resolved from
# SCRIPTS_DIR, which an earlier cell appends to sys.path (TODO confirm location).
from dataset_utils import HieroglyphDatasetUtils
from visualization import HieroglyphVisualizer

# Only reached when both imports succeed; an ImportError stops the cell before this line.
print("Custom utilities imported successfully.")

## Data Loading

Load the full, unsplit dataset for a complete overview, as well as the training data split from the previous notebook.

In [None]:
# Load the full dataset annotations and the training split.
full_annotation_file = f'{ANNOTATIONS_DIR}/annotations.json'
train_annotation_file = f'{ANNOTATIONS_DIR}/train_annotations.json'

# Both files are read below, so both are verified up front.
missing_annotation_files = [
    annotation_path
    for annotation_path in (full_annotation_file, train_annotation_file)
    if not os.path.exists(annotation_path)
]
if missing_annotation_files:
    print("Error: Annotation files not found. Please run the data preparation notebook first.")
    # Fail here with the real cause; merely printing would let the following
    # cells crash later with a NameError on `dataset_utils`.
    raise FileNotFoundError(
        "Missing annotation file(s): " + ", ".join(missing_annotation_files)
    )

# Utility wrapper around the full annotation file (project class).
dataset_utils = HieroglyphDatasetUtils(full_annotation_file)
# Raw JSON content of the training split.
with open(train_annotation_file, 'r', encoding='utf-8') as f:
    train_data = json.load(f)
print("Datasets loaded successfully.")

## Annotation Analysis as DataFrame

For deeper analysis, convert the annotations into a Pandas DataFrame.

In [None]:
# Convert annotations to a Pandas DataFrame (rebuilt from scratch on every run,
# so re-executing this cell is idempotent).
annotations_df = pd.DataFrame(dataset_utils.annotations)

# Enrich DataFrame with category names and bbox details
annotations_df['category_name'] = annotations_df['category_id'].apply(
    lambda category_id: dataset_utils.categories[category_id]['name']
)

# Each bbox is indexed as [x, y, width, height]; unpack every component into
# its own column with one loop instead of four copy-pasted lambdas.
bbox_components = ('bbox_x', 'bbox_y', 'bbox_width', 'bbox_height')
for component_index, column_name in enumerate(bbox_components):
    annotations_df[column_name] = annotations_df['bbox'].apply(
        lambda bbox, i=component_index: bbox[i]  # bind the index per iteration
    )

# A zero-height box would make the ratio inf, which then turns the mean into
# inf and ends up as invalid JSON in the summary report. Mask such heights so
# the ratio becomes NaN, which pandas statistics skip.
nonzero_height = annotations_df['bbox_height'].where(annotations_df['bbox_height'] != 0)
annotations_df['aspect_ratio'] = annotations_df['bbox_width'] / nonzero_height

# Box centre = top-left corner plus half the extent.
annotations_df['bbox_center_x'] = annotations_df['bbox_x'] + annotations_df['bbox_width'] / 2
annotations_df['bbox_center_y'] = annotations_df['bbox_y'] + annotations_df['bbox_height'] / 2
# Area of the bounding box (overwrites any 'area' field carried by the annotations).
annotations_df['area'] = annotations_df['bbox_width'] * annotations_df['bbox_height']

print("DataFrame created with annotation details:")
display(annotations_df.head())

## Bounding Box Analysis

In [None]:
# Bounding Box Size vs. Aspect Ratio
# Interactive scatter: one point per annotation, both axes logarithmic.
scatter_settings = {
    'x': 'area',
    'y': 'aspect_ratio',
    'color': 'category_name',
    'hover_data': ['category_name', 'bbox_width', 'bbox_height'],
    'log_x': True,
    'log_y': True,
    'title': 'Hieroglyph Size (Area) vs. Aspect Ratio (Log Scale)',
}
fig = px.scatter(annotations_df, **scatter_settings)

# Points stay coloured per category, but the legend itself is hidden.
fig.update_layout(showlegend=False)
fig.show()

# Keep a standalone interactive copy next to the other analysis plots.
scatter_html_path = f'{ANALYSIS_PLOTS_DIR}/size_vs_aspect_ratio.html'
fig.write_html(scatter_html_path)

In [None]:
# Distribution of Annotation Sizes (Area)
# Histogram with a KDE overlay, drawn on an explicit Axes with a log-scaled x axis.
area_fig, area_ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    data=annotations_df,
    x='area',
    bins=50,
    kde=True,
    log_scale=True,
    ax=area_ax,
)
area_ax.set_title('Distribution of Annotation Areas (Log Scale)')
area_ax.set_xlabel('Area (pixels²)')
area_ax.set_ylabel('Number of Annotations')
# Major and minor grid lines, since the log axis has both.
area_ax.grid(True, which="both", ls="--", c='0.7')
area_fig.savefig(f'{ANALYSIS_PLOTS_DIR}/area_distribution.png')
plt.show()

## Spatial Distribution Analysis

In [None]:
# Spatial Heatmap of Hieroglyph Locations
# Axis limits are taken from the first image record only.
# NOTE(review): this presumes all annotations belong to one image — confirm.
image_info = list(dataset_utils.images.values())[0]
img_width = image_info['width']
img_height = image_info['height']

heatmap_fig, heatmap_ax = plt.subplots(figsize=(16, 9))
# 2-D density estimate of the bounding-box centres.
sns.kdeplot(
    data=annotations_df,
    x='bbox_center_x',
    y='bbox_center_y',
    fill=True,
    cmap="rocket_r",
    thresh=0.05,
    levels=100,
    bw_adjust=0.2,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Spatial Heatmap of Hieroglyph Annotations')
heatmap_ax.set_xlim(0, img_width)
# Upper limit first: flips the y axis so the origin is at the top-left, as in image coordinates.
heatmap_ax.set_ylim(img_height, 0)
heatmap_ax.set_xlabel('Image Width (pixels)')
heatmap_ax.set_ylabel('Image Height (pixels)')
heatmap_ax.set_aspect('equal', adjustable='box')
heatmap_fig.savefig(f'{ANALYSIS_PLOTS_DIR}/spatial_heatmap.png')
plt.show()

This heatmap shows where hieroglyphs are concentrated on the papyrus. We see distinct columns of text, which is expected.

## Advanced Class Analysis

In [None]:
# Gardiner series = the leading run of letters in the sign code,
# e.g. 'A' from 'A1' and 'Aa' from 'Aa1'. Taking only the first character
# would merge the separate two-letter 'Aa' series into 'A'.
category_names = annotations_df['category_name']
series_prefix = category_names.str.extract(r'^([A-Za-z]+)', expand=False)
# Names that do not start with a letter keep the old behaviour (first character),
# so no row is left without a series for the sunburst hierarchy.
annotations_df['gardiner_series'] = series_prefix.fillna(category_names.str[0])

# Two-level hierarchy: series on the inner ring, individual sign classes on the outer ring.
fig = px.sunburst(annotations_df, path=['gardiner_series', 'category_name'],
                  title='Interactive Sunburst of Hieroglyph Categories')
fig.show()
fig.write_html(f'{ANALYSIS_PLOTS_DIR}/class_sunburst_chart.html')

In [None]:
# Box plot of area by Gardiner series
series_fig, series_ax = plt.subplots(figsize=(18, 8))

# Order the boxes from smallest to largest median area.
median_area_by_series = annotations_df.groupby('gardiner_series')['area'].median()
sorted_series = median_area_by_series.sort_values().index

# showfliers=False only suppresses drawing of the outlier points.
sns.boxplot(
    data=annotations_df,
    x='gardiner_series',
    y='area',
    order=sorted_series,
    showfliers=False,
    ax=series_ax,
)
series_ax.set_title('Annotation Area by Gardiner Series (Outliers Removed)')
series_ax.set_xlabel('Gardiner Series')
series_ax.set_ylabel('Area (pixels²)')
plt.setp(series_ax.get_xticklabels(), rotation=45)
series_ax.grid(True, which="both", ls="--", c='0.7')
series_fig.savefig(f'{ANALYSIS_PLOTS_DIR}/area_by_series_boxplot.png')
plt.show()

This plot helps us understand whether certain types of hieroglyphs (e.g., humans, birds, tools) are consistently larger or smaller than others.

## Summary and Insights for Modeling

This analysis provides several key takeaways for the model training strategy:

In [None]:
# Columns summarised by the report, pulled out once for readability.
area_values = annotations_df['area']
aspect_ratio_values = annotations_df['aspect_ratio']
class_names = annotations_df['category_name']
# Number of annotations per class.
class_counts = class_names.value_counts()

# float()/int() casts turn the pandas/NumPy results into plain Python numbers for json.dump.
bbox_insights = {
    "area_mean": float(area_values.mean()),
    "area_median": float(area_values.median()),
    "area_std": float(area_values.std()),
    "aspect_ratio_mean": float(aspect_ratio_values.mean()),
    "aspect_ratio_median": float(aspect_ratio_values.median()),
    "notes": ["Wide variation in both area and aspect ratio, suggesting multi-scale training and flexible anchor generation will be important."],
}

spatial_insights = {
    "distribution_pattern": "Hieroglyphs are arranged in clear vertical columns.",
    "notes": ["The non-uniform distribution means random crops might create samples with no annotations. Spatial augmentation should be done carefully."],
}

class_insights = {
    "total_classes": int(class_names.nunique()),
    "classes_with_one_sample": int((class_counts == 1).sum()),
    "most_common_class": class_counts.idxmax(),
    "notes": ["Extreme class imbalance is a major challenge. Many classes have very few examples. This justifies our stratified split and suggests we might need techniques like class weighting or focal loss during training."],
}

report = {
    "bbox_insights": bbox_insights,
    "spatial_insights": spatial_insights,
    "class_insights": class_insights,
}

# Save the report
report_file = f'{ANALYSIS_PLOTS_DIR}/eda_summary_report.json'
with open(report_file, 'w') as f:
    json.dump(report, f, indent=4)

# Console summary of the report, one section per top-level report key.
bbox_section = report['bbox_insights']
spatial_section = report['spatial_insights']
class_section = report['class_insights']

print(" EXPLORATORY DATA ANALYSIS SUMMARY")
print("=" * 50)

print("Bounding Box Insights:")
print(f"- Mean Area: {bbox_section['area_mean']:.2f}, Median Area: {bbox_section['area_median']:.2f}")
print(f"- Mean Aspect Ratio: {bbox_section['aspect_ratio_mean']:.2f}, Median: {bbox_section['aspect_ratio_median']:.2f}")
print(f"- Note: {bbox_section['notes'][0]}")

print("Spatial Insights:")
print(f"- Pattern: {spatial_section['distribution_pattern']}")
print(f"- Note: {spatial_section['notes'][0]}")

print("Class Balance Insights:")
print(f"- Total Classes: {class_section['total_classes']}")
print(f"- Single-instance classes: {class_section['classes_with_one_sample']}")
print(f"- Note: {class_section['notes'][0]}")

# Closing pointers: where the artefacts were written and which notebook comes next.
print(f"\n Analysis complete. Report and plots saved to {ANALYSIS_PLOTS_DIR}")
print("\n NEXT STEP: Proceed to 03_model_training.ipynb to train the model.")