In [2]:
import re
import json
import os
import csv # Use csv module for robust TSV parsing

# --- Configuration ---
# Both paths are resolved against the notebook's working directory; edit them
# here if the notebook or the data files are moved.

# Tab-separated cross-reference source file.
INPUT_TXT_PATH = './cross_references.txt'

# Destination of the generated JSON.
# NOTE(review): in the recorded run '../data' was created one level ABOVE the
# folder that holds cross_references.txt -- confirm that this is really the
# project's data folder.
OUTPUT_JSON_PATH = '../data/references.json'

print("Preprocessing Step 1 (MVP v3.0): Setup for processing cross_references.txt")

# --- Book Name Normalization Helper ---
# CRITICAL: keys must cover every book spelling that appears in the TXT file,
# written lower-case with periods and spaces removed (e.g. "Gen." -> 'gen').
# Values MUST match the canonical names used in canonicalOrder.js and for BSB.json lookups.
BOOK_NAME_MAP = {
    'gen': 'Genesis', 'exod': 'Exodus', 'lev': 'Leviticus', 'num': 'Numbers', 'deut': 'Deuteronomy',
    'josh': 'Joshua', 'judg': 'Judges', 'ruth': 'Ruth', '1sam': '1 Samuel', '2sam': '2 Samuel',
    '1kgs': '1 Kings', '2kgs': '2 Kings', '1chr': '1 Chronicles', '2chr': '2 Chronicles', 'ezra': 'Ezra',
    'neh': 'Nehemiah',
    # 'esth' is the OSIS-style abbreviation ("Esth.1.1"); 'est' is kept as an
    # alternative spelling. Without 'esth' the lookup misses and the function
    # below silently falls back to the non-canonical name "Esth".
    'est': 'Esther', 'esth': 'Esther',
    'job': 'Job', 'ps': 'Psalms', 'prov': 'Proverbs',
    'eccl': 'Ecclesiastes', 'song': 'Song of Solomon',
    'isa': 'Isaiah', 'jer': 'Jeremiah', 'lam': 'Lamentations', 'ezek': 'Ezekiel', 'dan': 'Daniel',
    'hos': 'Hosea', 'joel': 'Joel', 'amos': 'Amos', 'obad': 'Obadiah', 'jonah': 'Jonah', 'mic': 'Micah',
    'nah': 'Nahum', 'hab': 'Habakkuk', 'zeph': 'Zephaniah', 'hag': 'Haggai', 'zech': 'Zechariah', 'mal': 'Malachi',
    'matt': 'Matthew', 'mark': 'Mark', 'luke': 'Luke', 'john': 'John', 'acts': 'Acts', 'rom': 'Romans',
    '1cor': '1 Corinthians', '2cor': '2 Corinthians', 'gal': 'Galatians', 'eph': 'Ephesians',
    'phil': 'Philippians', 'col': 'Colossians', '1thess': '1 Thessalonians', '2thess': '2 Thessalonians',
    '1tim': '1 Timothy', '2tim': '2 Timothy', 'titus': 'Titus', 'phlm': 'Philemon', 'heb': 'Hebrews', 'jas': 'James',
    '1pet': '1 Peter', '2pet': '2 Peter', '1jn': '1 John', '1john': '1 John', # Allow both if needed
    '2jn': '2 John', '2john': '2 John', '3jn': '3 John', '3john': '3 John', 'jude': 'Jude',
    'rev': 'Revelation of John' # Match canonical name
}

def normalize_book_name_from_txt(raw_name):
    """Map a raw book token from the TXT file to its canonical book name.

    The token is stripped, periods are removed and it is lower-cased. The
    result is looked up in BOOK_NAME_MAP, first with spaces removed
    ('1 sam' -> '1sam') and then unchanged.

    Args:
        raw_name: Book token such as "Gen", "1Sam" or "1 Sam.".

    Returns:
        The canonical name from BOOK_NAME_MAP when the token is mapped.
        Unmapped tokens are NOT rejected: the cleaned token is returned in
        title case (an empty string if nothing is left after cleaning).
        None when raw_name is empty or None.
    """
    if not raw_name:
        return None

    cleaned = raw_name.strip().replace('.', '').lower()
    lookup_key = cleaned.replace(' ', '')
    normalized = BOOK_NAME_MAP.get(lookup_key) or BOOK_NAME_MAP.get(cleaned)

    # NOTE: no warning is emitted for unmapped names, so a missing map key shows
    # up only as a non-canonical book name in the generated IDs.
    return normalized if normalized else cleaned.title()


# --- Reference String Parsing Helper ---
def parse_dot_reference(ref_str):
    """Split a dotted reference such as "Gen.1.1" or "1Sam.2.3" into its parts.

    Args:
        ref_str: Reference text in the form Book.Chapter.Verse; surrounding
            whitespace is ignored.

    Returns:
        A (normalized_book_name, chapter, verse) tuple with integer chapter
        and verse, or None when the input is empty, does not match the
        Book.Chapter.Verse pattern, or yields an empty book name.
    """
    if not ref_str:
        return None

    # Book part (optional leading 1-3, then word chars / spaces / dots),
    # followed by ".<chapter>.<verse>"
    found = re.match(r'^([1-3]?[\s\w\.]+)\.(\d+)\.(\d+)$', ref_str.strip(), re.IGNORECASE)
    if found is None:
        return None

    book_token, chapter_token, verse_token = found.groups()
    book = normalize_book_name_from_txt(book_token)
    if not book:
        return None

    try:
        chapter, verse = int(chapter_token), int(verse_token)
    except ValueError:
        # Number conversion failed
        return None
    return (book, chapter, verse)

# --- Verse ID builder ---
def format_id_string(book, chapter, verse):
    """Build the verse ID "<Book><chapter>v<verse>" with spaces removed from the book.

    Example: ("1 Samuel", 2, 3) -> "1Samuel2v3"; ("John", 3, 16) -> "John3v16".
    """
    compact_book = ''.join(book.split(' '))
    return f"{compact_book}{chapter}v{verse}"


print(f"Checking for input file: {INPUT_TXT_PATH}")
if not os.path.exists(INPUT_TXT_PATH):
     print(f"ERROR: Input file not found at '{os.path.abspath(INPUT_TXT_PATH)}'")
     print("Please ensure 'cross_references.txt' is in the correct location or update INPUT_TXT_PATH.")
else:
     print(f"Input file found: {os.path.abspath(INPUT_TXT_PATH)}")
     print("Helper functions defined.")
     print("Cell 1 Setup Complete. Proceed to Cell 2 for processing.")

# Store paths/helpers if needed across cells (less critical now but can be useful)
%store INPUT_TXT_PATH
%store OUTPUT_JSON_PATH
# Storing functions isn't standard, just ensure they are defined before use in Cell 2

Preprocessing Step 1 (MVP v3.0): Setup for processing cross_references.txt
Checking for input file: ./cross_references.txt
Input file found: C:\Users\joshu\OneDrive - Rick At Your Service Property Solutions\Joshs Hobbies\AppDevelopment\interactive-bible-connection-simulator\cross_references.txt
Helper functions defined.
Cell 1 Setup Complete. Proceed to Cell 2 for processing.
Stored 'INPUT_TXT_PATH' (str)
Stored 'OUTPUT_JSON_PATH' (str)


In [4]:
import csv
import os
import re # Re-import just in case kernel restarted

# --- Inputs from Cell 1 ---
# INPUT_TXT_PATH must already be defined when this cell runs. After a kernel
# restart, either re-run Cell 1 or uncomment the lines below to reload the
# stored values:
# %store -r INPUT_TXT_PATH
# %store -r OUTPUT_JSON_PATH

print("Preprocessing Step 2 (MVP v3.0): Reading TXT File and Formatting Data")

# Accumulators filled by the row-processing loop further down in this cell.
all_references = list()
processed_rows = skipped_rows = parse_errors = 0

# Re-define helpers just in case kernel was restarted
# (Copy of the Cell 1 helpers. This copy is the one in effect for the loop
# below, so any change to the map must be made here as well.)
BOOK_NAME_MAP = {
    'gen': 'Genesis', 'exod': 'Exodus', 'lev': 'Leviticus', 'num': 'Numbers', 'deut': 'Deuteronomy',
    'josh': 'Joshua', 'judg': 'Judges', 'ruth': 'Ruth', '1sam': '1 Samuel', '2sam': '2 Samuel',
    '1kgs': '1 Kings', '2kgs': '2 Kings', '1chr': '1 Chronicles', '2chr': '2 Chronicles', 'ezra': 'Ezra',
    'neh': 'Nehemiah',
    # 'esth' is the OSIS-style abbreviation ("Esth.1.1"); 'est' is kept as an
    # alternative spelling. Without 'esth' the lookup misses and the function
    # below silently falls back to the non-canonical name "Esth".
    'est': 'Esther', 'esth': 'Esther',
    'job': 'Job', 'ps': 'Psalms', 'prov': 'Proverbs',
    'eccl': 'Ecclesiastes', 'song': 'Song of Solomon',
    'isa': 'Isaiah', 'jer': 'Jeremiah', 'lam': 'Lamentations', 'ezek': 'Ezekiel', 'dan': 'Daniel',
    'hos': 'Hosea', 'joel': 'Joel', 'amos': 'Amos', 'obad': 'Obadiah', 'jonah': 'Jonah', 'mic': 'Micah',
    'nah': 'Nahum', 'hab': 'Habakkuk', 'zeph': 'Zephaniah', 'hag': 'Haggai', 'zech': 'Zechariah', 'mal': 'Malachi',
    'matt': 'Matthew', 'mark': 'Mark', 'luke': 'Luke', 'john': 'John', 'acts': 'Acts', 'rom': 'Romans',
    '1cor': '1 Corinthians', '2cor': '2 Corinthians', 'gal': 'Galatians', 'eph': 'Ephesians',
    'phil': 'Philippians', 'col': 'Colossians', '1thess': '1 Thessalonians', '2thess': '2 Thessalonians',
    '1tim': '1 Timothy', '2tim': '2 Timothy', 'titus': 'Titus', 'phlm': 'Philemon', 'heb': 'Hebrews', 'jas': 'James',
    '1pet': '1 Peter', '2pet': '2 Peter', '1jn': '1 John', '1john': '1 John', # Allow both if needed
    '2jn': '2 John', '2john': '2 John', '3jn': '3 John', '3john': '3 John', 'jude': 'Jude',
    'rev': 'Revelation of John' # Match canonical name
}
def normalize_book_name_from_txt(raw_name):
    """Map a raw book token from the TXT file to its canonical book name.

    The token is stripped, periods are removed and it is lower-cased. The
    result is looked up in BOOK_NAME_MAP, first with spaces removed and then
    unchanged.

    Returns:
        The canonical name when the token is mapped; otherwise the cleaned
        token in title case (unmapped names are NOT rejected). None when
        raw_name is empty or None.
    """
    if not raw_name:
        return None
    cleaned = raw_name.strip().replace('.', '').lower()
    lookup_key = cleaned.replace(' ', '')
    normalized = BOOK_NAME_MAP.get(lookup_key) or BOOK_NAME_MAP.get(cleaned)
    # Fallback to the title-cased cleaned name; no warning is emitted for it.
    return normalized if normalized else cleaned.title()

def parse_dot_reference(ref_str):
    """Parse "Book.Chapter.Verse" text into (normalized_book, chapter, verse).

    Returns None when the input is empty, does not match the pattern, or the
    normalized book name comes back empty.
    """
    if not ref_str:
        return None

    found = re.match(r'^([1-3]?[\s\w\.]+)\.(\d+)\.(\d+)$', ref_str.strip(), re.IGNORECASE)
    if found is None:
        return None

    book_token, chapter_token, verse_token = found.groups()
    book = normalize_book_name_from_txt(book_token)
    if not book:
        return None

    try:
        chapter, verse = int(chapter_token), int(verse_token)
    except ValueError:
        return None
    return (book, chapter, verse)

def format_id_string(book, chapter, verse):
    """Return "<Book><chapter>v<verse>" with every space stripped from the book name."""
    compact_book = ''.join(book.split(' '))
    return f"{compact_book}{chapter}v{verse}"
# --- End re-defined helpers ---


try:
    with open(INPUT_TXT_PATH, mode='r', newline='', encoding='utf-8') as infile:
        # Use csv.reader with tab delimiter
        reader = csv.reader(infile, delimiter='\t')

        # Skip header row
        header = next(reader)
        print(f"Skipped header: {header}")

        # Process data rows
        for i, row in enumerate(reader):
            if len(row) == 3: # Expecting 3 columns
                from_verse_str, to_verse_str, votes_str = row

                # Handle ranges in 'To Verse' - take start verse only
                # Split by '-', take the first part, strip whitespace
                target_ref_str = to_verse_str.split('-')[0].strip()

                # Parse 'From' and 'Target' (start of range)
                from_parsed = parse_dot_reference(from_verse_str)
                target_parsed = parse_dot_reference(target_ref_str)

                if from_parsed and target_parsed:
                    try:
                        votes = int(votes_str.strip())

                        # Format IDs
                        source_id = format_id_string(*from_parsed)
                        target_id = format_id_string(*target_parsed)

                        # Create JSON object (value is votes)
                        reference_obj = {
                            "source": source_id,
                            "target": target_id,
                            "value": votes
                        }
                        all_references.append(reference_obj)
                        processed_rows += 1

                    except ValueError:
                        # print(f"Warning: Skipping row {i+2} due to invalid votes value: '{votes_str}'")
                        skipped_rows += 1
                        parse_errors += 1
                else:
                    # print(f"Warning: Skipping row {i+2} due to invalid reference format: '{from_verse_str}' or '{target_ref_str}'")
                    skipped_rows += 1
                    if not from_parsed: parse_errors += 1
                    if not target_parsed: parse_errors += 1
            else:
                # print(f"Warning: Skipping row {i+2} due to unexpected number of columns: {len(row)}")
                skipped_rows += 1

except FileNotFoundError:
    print(f"ERROR: Input file not found at '{INPUT_TXT_PATH}'")
except Exception as e:
    print(f"An unexpected error occurred during processing: {e}")

print(f"\n--- Processing Summary ---")
print(f"Total rows processed (excluding header): {processed_rows + skipped_rows}")
print(f"Rows successfully converted to reference objects: {processed_rows}")
print(f"Rows skipped (due to format/value errors): {skipped_rows}")
print(f"  - Estimated individual parse errors: {parse_errors}") # Note: A row skip might involve multiple parse errors
print(f"Length of all_references list: {len(all_references)}")

print("\nCell 2 Processing Complete. Check summary. Proceed to Cell 3 for writing JSON.")

# Store results for next cell
%store all_references

Preprocessing Step 2 (MVP v3.0): Reading TXT File and Formatting Data
Skipped header: ['From Verse', 'To Verse', 'Votes', '#www.openbible.info CC-BY 2025-03-24']

--- Processing Summary ---
Total rows processed (excluding header): 344799
Rows successfully converted to reference objects: 344799
Rows skipped (due to format/value errors): 0
  - Estimated individual parse errors: 0
Length of all_references list: 344799

Cell 2 Processing Complete. Check summary. Proceed to Cell 3 for writing JSON.
Stored 'all_references' (list)


In [6]:
import json
import os

print("Preprocessing Step 3 (MVP v3.0): Writing Data to JSON file")

# --- Restore variables ---
%store -r all_references
%store -r OUTPUT_JSON_PATH # This should be '../data/references.json'

output_dir = os.path.dirname(OUTPUT_JSON_PATH)

try:
    # Ensure the output directory exists
    if output_dir and not os.path.exists(output_dir):
        print(f"Creating output directory: {output_dir}")
        os.makedirs(output_dir)

    # Write the data to the JSON file
    print(f"Writing {len(all_references)} references to {OUTPUT_JSON_PATH}...")
    with open(OUTPUT_JSON_PATH, 'w', encoding='utf-8') as f:
        # Use indent=None for smallest file size for production data
        json.dump(all_references, f, ensure_ascii=False, indent=None)

    print(f"Successfully wrote {os.path.abspath(OUTPUT_JSON_PATH)}")
    print("Preprocessing complete.")

except Exception as e:
    print(f"ERROR writing JSON file: {e}")

Preprocessing Step 3 (MVP v3.0): Writing Data to JSON file
no stored variable or alias #
no stored variable or alias This
no stored variable or alias should
no stored variable or alias be
no stored variable or alias '../data/references.json'
Creating output directory: ../data
Writing 344799 references to ../data/references.json...
Successfully wrote C:\Users\joshu\OneDrive - Rick At Your Service Property Solutions\Joshs Hobbies\AppDevelopment\data\references.json
Preprocessing complete.
