## Data cleaning

In [3]:
import re
import pandas as pd

def clean_english_text(text):
    """
    Clean English text by removing unwanted patterns and formatting.
    Transforming logic:
    - Replace lit "..." with (literally, ...)
    - Remove double quotes ""
    - Remove asterisks * and other weird characters
    - Keep back slashes \
    - Remove squared brackets []
    - Remove normal brackets (), unless ( has a space before it and ) has a space after it
    - Handle incomplete words with brackets (e.g., defe[ated -> defeated, Bū[du -> Būdu)
    - Handles all kinds of quotes of any encoding
    :param text: Input text string
    :return: Cleaned text string
    """
    # Handle NaN or None values
    if pd.isna(text) or text is None:
        return ""

    # Convert to string if not already
    text = str(text).strip()

    # If empty after strip, return empty string
    if not text:
        return ""

    # Step 1: Handle orphaned closing parentheses first (before lit pattern)
    # Pattern: word)-anything -> word-anything (for cases like Bīt)-Kapsi -> Bīt-Kapsi)
    text = re.sub(r'(\w+)\)(-[^\s]+)', r'\1\2', text)

    # Step 2: Handle lit "..." pattern - transform to (literally, content)

    # Step 1.5: Remove orphaned ')' inside words like 'pe)ple'
    text = re.sub(r'(\w+)\)(\w+)', r'\1\2', text)

# Define comprehensive quote characters including all Unicode quote variants

    # All possible quote characters (opening and closing)
    quote_chars = [
        # ASCII quotes
        '"', "'", '`',
        # Unicode quotes - Left/Right Double Quotation Marks
        '\u201C', '\u201D',  # " " (left/right double quotation marks)
        '\u201E', '\u201F',  # „ ‟ (double low-9 quotation mark, double high-reversed-9 quotation mark)
        # Unicode quotes - Left/Right Single Quotation Marks  
        '\u2018', '\u2019',  # ' ' (left/right single quotation marks)
        '\u201A', '\u201B',  # ‚ ‛ (single low-9 quotation mark, single high-reversed-9 quotation mark)
        # Other Unicode quotes
        '\u00AB', '\u00BB',  # « » (left/right-pointing double angle quotation marks)
        '\u2039', '\u203A',  # ‹ › (single left/right-pointing angle quotation marks)
        '\u2E42', '\u301D',  # ⹂ 〝 (double low-reversed-9 quotation mark, reversed double prime quotation mark)
        '\u301E', '\u301F',  # 〞 〟 (double prime quotation mark, low double prime quotation mark)
        # German quotes
        '\u201E', '\u201C',  # „ " (German style)
        # French quotes  
        '\u00AB', '\u00BB',  # « » (French style)
        # Additional quote-like characters
        '\u02DD', '\u02EE',  # ˝ ˮ (double acute accent, modifier letter double apostrophe)
        # CJK quotes
        '\u300C', '\u300D',  # 「 」 (left/right corner brackets)
        '\u300E', '\u300F',  # 『 』 (left/right white corner brackets)
    ]

    # Create a character class for all quote characters
    quote_class = '[' + ''.join(re.escape(char) for char in quote_chars) + ']'

    # Enhanced lit pattern matching function
    def lit_replacer(match):
        content = match.group(1)
        return f'(literally, {content})'

    # Create comprehensive regex pattern for lit quotes
    # This pattern looks for:
    # - "lit" followed by whitespace
    # - Any opening quote character
    # - Content (non-greedy match of any characters except quote characters)
    # - Any closing quote character
    lit_pattern = rf'lit\s+{quote_class}([^{quote_class[1:-1]}]*?){quote_class}'

    # Apply the lit pattern transformation
    text = re.sub(lit_pattern, lit_replacer, text)

    # Fallback: Handle cases where quotes might be mixed or malformed
    # This catches any remaining "lit" + quote patterns that might have been missed
    fallback_patterns = [
        # Handle specific problematic cases like chr(8220) and chr(8221)
        r'lit\s+[\u2000-\u206F\u2E00-\u2E7F"\'`""''‚„‹›«»]([^""''‚„‹›«»"\'`\u2000-\u206F\u2E00-\u2E7F]*?)[\u2000-\u206F\u2E00-\u2E7F"\'`""''‚„‹›«»]',
        # Ultra-broad fallback for any remaining quote-like characters
        r'lit\s+[\u0020-\u007E\u00A0-\u00FF\u2000-\u206F\u2E00-\u2E7F]*?([^a-zA-Z0-9\s\\/-]*?)([^""''‚„‹›«»"\'`]*?)[\u0020-\u007E\u00A0-\u00FF\u2000-\u206F\u2E00-\u2E7F]*?'
    ]

    for pattern in fallback_patterns:
        if re.search(pattern, text):
            text = re.sub(pattern, lit_replacer, text)
            break

    # Step 3: Handle incomplete words with square brackets
    text = re.sub(r'(\w+)\[(\w*)', r'\1\2', text)

    # Step 4: Handle incomplete words with parentheses  
    text = re.sub(r'(\w+)\((\w*)', r'\1\2', text)

    # Step 5: Remove remaining squared brackets and their content
    text = re.sub(r'\[[^\]]*\]', '', text)
    text = re.sub(r'[\[\]]', '', text)

    # Step 6: Handle normal brackets - keep only those with space before ( and space after )
    valid_brackets = []
    bracket_placeholder = "|||VALID_BRACKET_{}|||"

    # Find all valid bracket patterns: space + ( + content + ) + space
    valid_pattern = r'(?<=\s)\([^)]*\)(?=\s)'
    matches = list(re.finditer(valid_pattern, text))

    # Replace valid brackets with placeholders (in reverse order to maintain positions)
    for i, match in enumerate(reversed(matches)):
        placeholder = bracket_placeholder.format(len(matches) - 1 - i)
        valid_brackets.insert(0, match.group())
        text = text[:match.start()] + placeholder + text[match.end():]

    # Now remove all remaining brackets
    text = re.sub(r'\([^)]*\)', '', text)

    # Restore valid brackets
    for i, bracket_content in enumerate(valid_brackets):
        placeholder = bracket_placeholder.format(i)
        text = text.replace(placeholder, bracket_content)

    # Step 7: Remove ALL types of quotes comprehensively
    # Use the same comprehensive quote character class
    text = re.sub(quote_class, '', text)

    # Step 8: Remove asterisks and other weird characters (but keep backslashes)
    weird_chars = r'[*#$%^&+=<>{}|~`¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿×÷]'
    text = re.sub(weird_chars, '', text)

    # Step 9: Remove other potential weird Unicode characters but be conservative
    text = re.sub(r'[†‡•…‰′″‴‵‶‷‸‹›※‼‽⁇⁈⁉⁏⁐⁑]', '', text)

    # Step 10: Clean up multiple spaces
    text = re.sub(r'\s+', ' ', text)

    text = text.strip('"')         # remove any " at the very start/end
    # Step 11: Final cleanup - remove leading/trailing whitespace
    text = text.strip()

    return text


# Demo run of clean_english_text: corpus excerpts plus one sample per quote family
test_cases = [
    # Excerpts containing broken brackets and lit "..." glosses
    'At that time I made a pointed iron "arrow," in[scribed the mighty deeds of the god Aššur my lord on it and I set it up at the spring of the city Bīt-Ištar Upaš of the land Bīt)-Kapsi lit "son of Kapsi" assembled his people and ascended Mount Abirus I pursued him defe[ated him and carried off his booty',
    'which are without num[ber the city Atu the tribe Qabiʾi 5 the fortress of Labbanat Arameans on the banks of River(s the tribe Bū[du',
    "In my ninth palû the god Aššur my lord encouraged me and I marched against the lands Bīt-Kapsi Bīt-Sangi Bīt-Urzakki Media lit \"land of the Medes\" Bīt-Zualzaš Bīt-Matti and Tupliyaš I captured plund[ered destroyed devastated and burned with fire the cities Bīt-Ištar Kinkangi Kindigiasu Kingialkasiš Kubušḫati[diš 5 Upušu Aḫsipuna Girgirâ and Kimbazḫati together with cities in their environs",
    # One gloss per quote style
    'lit "standard quotes"',
    'lit "unicode left/right quotes"',
    'lit 「CJK quotes」',
    'lit «French quotes»',
    'lit ‹single angle quotes›',
    'pe)ple are testing',
    f'lit {chr(8220)}chr 8220 quotes{chr(8221)}',  # curly quotes built from code points
]

print("Testing Enhanced English Text Cleaner:")
print("=" * 80)

for case_no, raw_text in enumerate(test_cases, start=1):
    cleaned = clean_english_text(raw_text)
    print(f"Test {case_no}:")
    print(f"Input:  '{raw_text}'")
    print(f"Output: '{cleaned}'")
    print("-" * 80)

# Short inputs, each isolating one transformation
print("\nSpecific problem case tests:")
print("=" * 80)

problem_cases = [
    'The region of lit "son of Kapsi" was conquered',  # expected: The region of (literally, son of Kapsi) was conquered
    f'lit {chr(8220)}son of Kapsi{chr(8221)}',  # curly quotes built from code points
    'defe[ated',           # expected: defeated
    'River(s',             # expected: Rivers
    'Bū[du',               # expected: Būdu
    'num[ber',             # expected: number
    'in[scribed',          # expected: inscribed
    'Bīt)-Kapsi',          # expected: Bīt-Kapsi
]

for snippet in problem_cases:
    cleaned = clean_english_text(snippet)
    print(f"'{snippet}' -> '{cleaned}'")

# (input, substring that must appear in the cleaned output)
print("\nComprehensive quote character tests:")
print("=" * 40)

quote_test_cases = [
    ('lit "ASCII double"', '(literally, ASCII double)'),
    ("lit 'ASCII single'", '(literally, ASCII single)'),
    ('lit "Unicode left/right"', '(literally, Unicode left/right)'),
    ('lit ‚low-9 quotes‛', '(literally, low-9 quotes)'),
    ('lit «angle quotes»', '(literally, angle quotes)'),
    ('lit ‹single angles›', '(literally, single angles)'),
    (f'lit {chr(8220)}chr 8220{chr(8221)}', '(literally, chr 8220)'),
]

for sample, wanted in quote_test_cases:
    cleaned = clean_english_text(sample)
    if wanted in cleaned:
        mark = "✓"
    else:
        mark = "✗"
    print(f"{mark} '{sample}' -> '{cleaned}'")

Testing Enhanced English Text Cleaner:
Test 1:
Input:  'At that time I made a pointed iron "arrow," in[scribed the mighty deeds of the god Aššur my lord on it and I set it up at the spring of the city Bīt-Ištar Upaš of the land Bīt)-Kapsi lit "son of Kapsi" assembled his people and ascended Mount Abirus I pursued him defe[ated him and carried off his booty'
Output: 'At that time I made a pointed iron arrow, inscribed the mighty deeds of the god Aššur my lord on it and I set it up at the spring of the city Bīt-Ištar Upaš of the land Bīt-Kapsi (literally, son of Kapsi) assembled his people and ascended Mount Abirus I pursued him defeated him and carried off his booty'
--------------------------------------------------------------------------------
Test 2:
Input:  'which are without num[ber the city Atu the tribe Qabiʾi 5 the fortress of Labbanat Arameans on the banks of River(s the tribe Bū[du'
Output: 'which are without number the city Atu the tribe Qabiʾi 5 the fortress of Labbanat Ara

In [19]:
import pandas as pd
def process_csv_file(csv_filename):
    """
    Clean the 'english' column of a CSV file, printing before/after for each row.

    The file itself is never modified; the cleaned data is handed back to the
    caller, who decides whether and where to save it.

    :param csv_filename: Path to the CSV file (read as utf-8-sig)
    :return: DataFrame whose 'english' column holds the cleaned text, or None
             when the file is missing, lacks an 'english' column, or any other
             error is reported
    """
    try:
        # Read the CSV file
        df = pd.read_csv(csv_filename, encoding="utf-8-sig")

        # Check if 'english' column exists
        if 'english' not in df.columns:
            print(f"Error: 'english' column not found in {csv_filename}")
            print(f"Available columns: {list(df.columns)}")
            return None

        print(f"Processing {len(df)} rows from {csv_filename}")
        print("=" * 80)

        # Process each row
        for index, row in df.iterrows():
            original_text = row['english']
            cleaned_text = clean_english_text(original_text)

            # index is the default 0-based row label from read_csv
            print(f"Row {index + 1}:")
            print(f"BEFORE: {original_text}")
            print("-" * 50)
            print(f"AFTER:  {cleaned_text}")
            print("=" * 80)

            # Update the dataframe with cleaned text
            df.at[index, 'english'] = cleaned_text

        # Hand the cleaned data back instead of discarding it
        return df

    except FileNotFoundError:
        print(f"Error: File '{csv_filename}' not found.")
    except Exception as e:
        # Report-and-continue boundary for interactive use
        print(f"Error processing file: {str(e)}")
    return None


# Example usage
if __name__ == "__main__":
    # CSV file whose 'english' column is cleaned and printed row by row
    csv_file_path = "data_directories/rinap/data_files/data_2.csv"

    # Print the before/after report for that file
    process_csv_file(csv_file_path)

Processing 18 rows from data_directories/rinap/data_files/data_2.csv
Row 1:
BEFORE: Precious scion of Baltil Aššur beloved of the god(dess DN and Šē]rūa creation of the goddess Ninmena who for the dominion of the lands who grew up to be king governor the one who increases voluntary offerings for of emblems 5 powerful male light of all of his people lord of all rulers the one who overwhelms his foes valiant man the one who destroys enemies who cuts straight through interlocking mountains like a taut string and
--------------------------------------------------
AFTER:  Precious scion of Baltil Aššur beloved of the goddess DN and Šērūa creation of the goddess Ninmena who for the dominion of the lands who grew up to be king governor the one who increases voluntary offerings for of emblems 5 powerful male light of all of his people lord of all rulers the one who overwhelms his foes valiant man the one who destroys enemies who cuts straight through interlocking mountains like a taut string a

In [3]:
def clean_visible_text_akkadian(text: str) -> str:
    """
    Clean one piece of visible Akkadian text by running it through a fixed,
    order-dependent sequence of regex substitutions.

    Transforming logic, in the order applied:
    - text that starts with a single ASCII letter directly followed by "[" is
      discarded entirely
    - otherwise, runs of ASCII letters (optionally preceded by punctuation or
      followed by a hyphen) that are directly followed by a [...] group are
      removed together with that group, repeatedly until nothing changes
    - "something.[...]" and "something.x" tokens are removed, including
      half-bracketed words such as "⸢KUR⸣.[...]" / "⸢KUR⸣.x"
    - leading garbage ending in "]-" and a trailing "-[..." tail are removed
    - any remaining bracket characters [](){}<> are removed (content kept)
    - text consisting only of x, dots, hyphens and spaces is cleared
    - dots around a number become hyphens (.123. -> -123-)
    - a leading "x-" or trailing "-x" is removed
    - hyphens are removed unless a digit sits directly before or after them
    - text is lower-cased; asterisks, "ʾ", standalone x and x between dots
      are removed; whitespace runs collapse to one space
    - leading/trailing characters that are neither word characters nor dots
      are stripped
    - if no ASCII letter or digit is left, the result is ""
    - remaining dots become hyphens, half-bracket marks ⸢⸣⸤⸥ are removed, and
      one leading and one trailing hyphen are dropped

    :param text: String to clean; ``.strip()`` is called on it directly, so
                 None/NaN are not accepted
    :return: Cleaned string, or "" when nothing usable remains
    """
    import re

    # Start with basic strip
    text = text.strip()

    # If empty after strip, return empty string
    if not text:
        return ""

    # Remove "letters followed by a bracketed group" patterns, e.g.
    # "d[complex content with nested brackets]". Brackets may be unbalanced,
    # so several overlapping patterns are tried.
    if re.match(r'^[a-zA-Z]\[', text):
        # A single leading letter directly followed by "[": drop everything
        text = ""
    else:
        # Re-apply the removals until a full pass leaves the text unchanged,
        # because deleting one group can expose another match
        prev_text = ""
        while prev_text != text:
            prev_text = text
            # Letters, optional hyphen, then [...] allowing one level of nested [...]
            text = re.sub(r'[a-zA-Z]+-?\[(?:[^\[\]]|\[[^\]]*\])*\]', '', text)
            # Same, but up to the first "]" (covers an unclosed inner "[")
            text = re.sub(r'[a-zA-Z]+-?\[[^\]]*\]', '', text)
            # Variants that also swallow punctuation directly before the letters
            text = re.sub(r'[.,;:-]*[a-zA-Z]+\[(?:[^\[\]]|\[[^\]]*\])*\]', '', text)
            text = re.sub(r'[.,;:-]*[a-zA-Z]+\[[^\]]*\]', '', text)
        text = text.strip()

    # If we cleared everything, return empty
    if not text:
        return ""

    # Remove patterns like "something.[...]" including half-bracketed words like "⸢KUR⸣.[...]"
    text = re.sub(r'[⸢⸤]?[\w⸢⸣⸤⸥]+[⸣⸥]?\s*\.\[[^\]]*\]', '', text).strip()

    # Remove patterns like "something.x" including half-bracketed words like "⸢KUR⸣.x"
    text = re.sub(r'[⸢⸤]?[\w⸢⸣⸤⸥]+[⸣⸥]?\s*\.[xX]', '', text).strip()

    # Step 1: Remove leading garbage like "...]-" or "[x]-"
    # (greedy: everything from the start through the last "]-" run that has a
    # bracket character somewhere before it)
    text = re.sub(r"^.*[\[\](){}<>]+.*\]-+", "", text).strip()

    # Step 2: Strip trailing "-[...]" or similar (from the hyphen run to the end)
    text = re.sub(r"-+[\[\](){}<>].*$", "", text).strip()

    # Step 3 (disabled): removing bracketed content entirely; only the bracket
    # characters themselves are removed, in Step 4

    # Step 4: Remove any remaining standalone brackets, keeping their content
    text = re.sub(r"[\[\](){}<>]", "", text).strip()

    # Step 5: Remove pure garbage patterns (only x, dots, hyphens, spaces)
    text = re.sub(r"^[xX.\- ]+$", "", text).strip()

    # Step 6: Replace dots with hyphens around numbers (e.g., .123. -> -123-)
    text = re.sub(r"\.(\d+)\.", r"-\1-", text)

    # Drop a leading "x-" or trailing "-x" (lowercase x only: this runs before
    # the lower-casing in Step 8)
    text = re.sub(r"^x-|-x$", "", text)

    # Step 7: Remove hyphens that have no digit directly before or after them
    text = re.sub(r"(?<!\d)-(?!\d)", "", text)

    # Step 8: Convert to lowercase
    text = text.lower()

    # Step 9: Remove asterisks
    text = re.sub(r"[*]", "", text)

    # Step 10: Remove x alone or x between dots
    text = re.sub(r"\b[x]\b|\.+[x]\.+", "", text)

    # Step 11: Clean up multiple spaces and dots
    text = re.sub(r"\s+", " ", text)  # Multiple spaces to single space
    text = re.sub(r"\.{3,}", "...", text)  # Multiple dots to max 3

    # Step 11.5: Remove the "ʾ" character
    text = re.sub("ʾ", "", text)  # Remove specific unwanted character

    # Step 12: Remove leading/trailing characters that are neither word characters nor periods
    text = re.sub(r"^[^\w.]+|[^\w.]+$", "", text)

    # Step 13: If no ASCII letter or digit remains, clear the text
    if not re.search(r"[a-zA-Z0-9]", text):
        text = ""
    # Step 14: Swap the remaining periods with hyphens
    text = re.sub(r"\.", "-", text)

    # Step 15: Remove any leftover half-bracket marks
    text = re.sub(r"[⸢⸣⸤⸥]", "", text)  # Remove any leftover weird bracket characters

    # Step 16: Remove one leading and one trailing hyphen
    text = re.sub(r"^-|-$", "", text)

    return text.strip()

# Test the function with multiple examples
test_cases = [
    "[...].,.[...].,.[...].,.a-[...]",
    "in-[...]",
    "the-[missing text]",
    "word-[complex content]",
    "abc-[...]"
]

for test_input in test_cases:
    result = clean_visible_text_akkadian(test_input)
    print(f"Input: {test_input}")
    print(f"Output: '{result}'")
    print()


# Read the sample page inside a context manager so the file handle is closed
# as soon as the lines are in memory
with open("data_directories/suhu/unlinked_files/unlinked_data_page_1.txt", "r", encoding="utf-8-sig") as unlinked_page:
    lines = unlinked_page.readlines()

# Show each raw line next to its cleaned form
for i, line in enumerate(lines, 1):
    original = line.strip()
    cleaned = clean_visible_text_akkadian(original)

    print(f"\nLine {i}:")
    print(f"  BEFORE: '{original}'")
    print(f"  AFTER:  '{cleaned}'")


Input: [...].,.[...].,.[...].,.a-[...]
Output: ''

Input: in-[...]
Output: ''

Input: the-[missing text]
Output: ''

Input: word-[complex content]
Output: ''

Input: abc-[...]
Output: ''


Line 1:
  BEFORE: '3: [(...)]'
  AFTER:  '3: --'

Line 2:
  BEFORE: '4b: (...)]'
  AFTER:  '4b: --'

Line 3:
  BEFORE: '10b: ⸢4⸣.,.12.,.20.,.4'
  AFTER:  '10b: 4-,-12-,-20-,-4'

Line 4:
  BEFORE: '13: 45.,.9'
  AFTER:  '13: 45-,-9'

Line 5:
  BEFORE: '19b: [2'
  AFTER:  '19b: 2'

Line 6:
  BEFORE: '22: 1'
  AFTER:  '22: 1'

Line 7:
  BEFORE: '24b: 4.,.2'
  AFTER:  '24b: 4-,-2'

Line 8:
  BEFORE: '26: [x.,.x.,.x.,.x.,.x.,.x]'
  AFTER:  '26: -,,,,,'

Line 9:
  BEFORE: '29: [x.,.x.,.x.,.x.,.x.,.(x.,.x)].,.x.,.x.,.x.,.x.,.(x.,.x)]'
  AFTER:  '29: -,,,,,,,,,,,,'

Line 10:
  BEFORE: '32: 1.,.5.,.2.,.20.,.3'
  AFTER:  '32: 1-,-5-,-2-,-20-,-3'

Line 11:
  BEFORE: '34: 2'
  AFTER:  '34: 2'

Line 12:
  BEFORE: '35b: 1.,.7'
  AFTER:  '35b: 1-,-7'

Line 13:
  BEFORE: '36: 80'
  AFTER:  '36: 80'

Line 14:
  BEFOR

In [None]:
import os
import csv
import re
from pathlib import Path

def parse_and_clean_unlinked_data(text_content):
    """
    Turn the text of an unlinked-data file into a flat list of cleaned items.

    Each non-blank line is split at its first ":"; lines without a colon are
    ignored. The part after the colon is split on ".,." and every piece is
    passed through clean_visible_text_akkadian. Results keep file order.

    :param text_content: Full text of the unlinked-data file
    :return: List of cleaned strings (items may be empty strings)
    """
    cleaned_items = []
    for raw_line in text_content.strip().split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue
        _label, colon, remainder = stripped.partition(":")
        if not colon:
            # No "label: content" structure on this line
            continue
        for fragment in remainder.strip().split(".,."):
            cleaned_items.append(clean_visible_text_akkadian(fragment))
    return cleaned_items

def replace_data_markers(akkadian_text, cleaned_data, data_index):
    """
    Fill '<data>' markers in a text, left to right, with consecutive values.

    Markers are consumed strictly in order of appearance, one value each,
    starting at cleaned_data[data_index]. A value that is empty after
    stripping removes its marker, plus the single space directly after that
    marker if there is one. Markers left over once the values run out stay
    in the text unchanged.

    :param akkadian_text: Text containing zero or more '<data>' markers
    :param cleaned_data: Sequence of replacement strings
    :param data_index: Index of the first value in cleaned_data to use
    :return: Tuple (text with markers filled and outer whitespace stripped,
             index of the next unused value)
    """
    marker = '<data>'
    pieces = []
    cursor = 0
    current_index = data_index

    while current_index < len(cleaned_data):
        hit = akkadian_text.find(marker, cursor)
        if hit == -1:
            break
        pieces.append(akkadian_text[cursor:hit])
        cursor = hit + len(marker)

        replacement_text = cleaned_data[current_index].strip()
        if replacement_text:
            pieces.append(replacement_text)
        elif akkadian_text.startswith(' ', cursor):
            # Empty value: also swallow the space after THIS marker, so the
            # value-to-marker pairing of later markers is not disturbed
            cursor += 1
        current_index += 1

    pieces.append(akkadian_text[cursor:])
    return ''.join(pieces).strip(), current_index

def _find_unlinked_file(unlinked_dir, file_stem):
    """Return the unlinked-data path for a CSV stem, trying fallback names; the result may not exist."""
    primary = unlinked_dir / f"unlinked_data_page_{file_stem}.txt"
    if primary.exists():
        return primary

    candidates = []
    if '_' in file_stem:
        candidates.append(f"unlinked_data_page_{file_stem.split('_')[-1]}.txt")
    candidates.append(f"unlinked_data_{file_stem}.txt")
    candidates.append(f"unlinked_{file_stem}.txt")

    for name in candidates:
        if (unlinked_dir / name).exists():
            return unlinked_dir / name
    return primary


def _write_rows(output_file, fieldnames, rows):
    """Write dict rows to output_file as a utf-8-sig CSV with a header line."""
    with open(output_file, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def process_files(orig_path="data_directories/saao/"):
    """
    Process every CSV in <orig_path>/data_files and write the results to
    <orig_path>/processed/data_file_processed_<stem>.csv.

    For each CSV that has an 'akkadian' column:
    - if a matching file exists in <orig_path>/unlinked_files, its cleaned
      items replace the '<data>' markers in the 'akkadian' column, in order;
    - the 'english' column, when present, is cleaned with clean_english_text.
    CSVs without an 'akkadian' column are copied through unchanged.
    Errors while handling one CSV are printed and the run moves on.

    :param orig_path: Corpus folder containing 'data_files' and
                      'unlinked_files' (defaults to the saao corpus)
    :return: None
    """
    base_dir = Path(orig_path)
    data_dir = base_dir / 'data_files'
    unlinked_dir = base_dir / 'unlinked_files'
    processed_dir = base_dir / 'processed'

    if not data_dir.exists():
        print(f"Error: {data_dir} directory not found!")
        return
    if not unlinked_dir.exists():
        print(f"Error: {unlinked_dir} directory not found!")
        return

    # Create the output folder only once the inputs are known to exist, so a
    # wrong path yields the messages above instead of an exception or a stray
    # empty folder
    processed_dir.mkdir(exist_ok=True)

    csv_files = list(data_dir.glob('*.csv'))
    if not csv_files:
        print(f"No CSV files found in {data_dir}")
        return

    print(f"Found {len(csv_files)} CSV files to process")

    for csv_file in csv_files:
        print(f"\nProcessing: {csv_file.name}")
        file_stem = csv_file.stem
        unlinked_file = _find_unlinked_file(unlinked_dir, file_stem)
        output_file = processed_dir / f"data_file_processed_{file_stem}.csv"

        try:
            with open(csv_file, 'r', encoding='utf-8-sig', newline='') as f:
                reader = csv.DictReader(f)
                rows = list(reader)
                fieldnames = reader.fieldnames

            if 'akkadian' not in fieldnames:
                print(f"Warning: 'akkadian' column not found in {csv_file.name}")
                _write_rows(output_file, fieldnames, rows)
                continue

            # data_index walks through cleaned_data across all rows of this file
            data_index = 0
            cleaned_data = []

            if unlinked_file.exists():
                print(f"Found corresponding unlinked file: {unlinked_file.name}")
                with open(unlinked_file, 'r', encoding='utf-8-sig') as f:
                    unlinked_content = f.read()
                cleaned_data = parse_and_clean_unlinked_data(unlinked_content)
                print(f"Extracted {len(cleaned_data)} data items from unlinked file")
            else:
                print(f"No corresponding unlinked file found for {csv_file.name}")
                print("Applying English transformation only...")

            # Process all rows (always clean English; conditionally replace Akkadian <data>)
            for row in rows:
                if cleaned_data and '<data>' in row['akkadian']:
                    original_akkadian = row['akkadian']
                    count_before = original_akkadian.count('<data>')
                    start_idx = data_index
                    row['akkadian'], data_index = replace_data_markers(
                        row['akkadian'], cleaned_data, data_index
                    )
                    print(f"Replaced {data_index - start_idx} <data> in Akkadian (originally {count_before})")

                if 'english' in row:
                    original_english = row['english']
                    row['english'] = clean_english_text(row['english'])
                    if original_english != row['english']:
                        print(f"Old English '{original_english}'")
                        print(f"Cleaned English '{row['english']}'")
                else:
                    print("⚠️  'english' column not found in row — skipping English cleaning.")

            # Save processed CSV
            _write_rows(output_file, fieldnames, rows)
            print(f"Saved processed file: {output_file}")

        except Exception as e:
            # Per-file boundary: report the problem and keep going
            print(f"Error processing {csv_file.name}: {str(e)}")
            continue

    print(f"\nProcessing complete! Check the '{processed_dir}' directory for results.")

# Run the batch processing when this code is executed as the main program
if __name__ == "__main__":
    process_files()


## Combine all CSVs in the processed directories

In [6]:
import os
import pandas as pd

def combine_csvs_in_directory(directory):
    """
    Concatenate every '*.csv' file directly inside a directory.

    Files are read in sorted file-name order so the combined row order is
    the same on every run and platform (os.listdir order is arbitrary).

    :param directory: Path of the folder to scan (not recursive)
    :return: DataFrame with all rows and a fresh 0..n-1 index; an empty
             DataFrame when the folder is missing or holds no CSV files
    """
    # isdir also rejects a path that exists but is a regular file
    if not os.path.isdir(directory):
        print(f"❌ Directory does not exist: {directory}")
        return pd.DataFrame()

    csv_names = sorted(name for name in os.listdir(directory) if name.endswith('.csv'))
    if not csv_names:
        print(f"⚠️  No CSV files found in {directory}")
        return pd.DataFrame()

    dataframes = [pd.read_csv(os.path.join(directory, name)) for name in csv_names]
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"✅ Combined {len(dataframes)} files into DataFrame with {len(combined_df)} rows")
    return combined_df

# Root folder with one sub-folder per corpus, and the sub-path that holds
# each corpus' processed CSVs
data_dir = "data_directories/"
end_dir = "/processed"


def _processed_dir(corpus):
    """Return the processed-CSV folder of one corpus, e.g. 'data_directories/suhu/processed'."""
    return data_dir + corpus + end_dir


# One combined DataFrame per corpus
combined_df_suhu = combine_csvs_in_directory(_processed_dir('suhu'))
combined_df_saao = combine_csvs_in_directory(_processed_dir('saao'))
combined_df_rinap = combine_csvs_in_directory(_processed_dir('rinap'))
combined_df_ribo = combine_csvs_in_directory(_processed_dir('ribo'))
combined_df_raio = combine_csvs_in_directory(_processed_dir('raio'))

✅ Combined 27 files into DataFrame with 260 rows
✅ Combined 4976 files into DataFrame with 43479 rows
✅ Combined 989 files into DataFrame with 6137 rows
✅ Combined 398 files into DataFrame with 2639 rows
✅ Combined 1702 files into DataFrame with 8416 rows


In [7]:
# Write each corpus' combined frame next to its data, one file per corpus
for _corpus, _frame in (
    ("ribo", combined_df_ribo),
    ("suhu", combined_df_suhu),
    ("saao", combined_df_saao),
    ("rinap", combined_df_rinap),
    ("raio", combined_df_raio),
):
    _frame.to_csv(f"{data_dir}{_corpus}/combined_{_corpus}_processed.csv", index=False, encoding="utf-8")

In [8]:
# Stack the five corpus frames into one frame with a fresh 0..n-1 index
# (row order: suhu, saao, rinap, raio, ribo)
df_combined = pd.concat([combined_df_suhu,
                         combined_df_saao,
                         combined_df_rinap,
                         combined_df_raio,
                         combined_df_ribo],
                        ignore_index=True)

## Final checkup on the data

In [42]:
# Work on a copy so df_combined itself is not modified by later clean-up steps
df_combined_working = df_combined.copy()

In [43]:
# Drop rows whose 'english' text contains "translation" (case-insensitive);
# rows with a missing 'english' value count as non-matching and are kept
df_combined_working = df_combined_working[~df_combined_working['english'].str.contains("translation", case=False, na=False)]

In [44]:
# Show the rows that have no 'akkadian' value
df_combined_working[df_combined_working['akkadian'].isna()]

Unnamed: 0,line,akkadian,english,source
41,i 2',,his,https://oracc.museum.upenn.edu/suhu/Q006206?la...
182,r 4',,I/he caused,https://oracc.museum.upenn.edu/suhu/Q006210?la...
222,i 1',,I Ninurta-kudurrī-uṣur governor of the land of...,https://oracc.museum.upenn.edu/suhu/Q006214?la...
1588,r 3,,The fine,https://oracc.museum.upenn.edu/saao/P336188?la...
1800,e. ii 1,,Aramaic caption Dayyan-Kurbail,https://oracc.museum.upenn.edu/saao/P335279?la...
...,...,...,...,...
59016,1',,am I,https://oracc.museum.upenn.edu/ribo/Q005446?la...
59193,i 11,,the fifth day,https://oracc.museum.upenn.edu/ribo/Q006302?la...
59292,i 6,,king of justice king of Babylon,https://oracc.museum.upenn.edu/ribo/Q006241?la...
59310,iv 8,,Nebuchadnezzar,https://oracc.museum.upenn.edu/ribo/Q006241?la...


In [45]:
# Count rows that are exact duplicates of an earlier row (all columns equal)
df_combined_working.duplicated().sum()

np.int64(6)

In [51]:
# Solving the (next b) problem
import pandas as pd

def merge_consecutive_b_rows_anchor(df):
    """
    Merge runs of consecutive 'b' rows from the same source into their first row.

    A row is treated as a 'b' row when its 'line' value is a string that
    contains the character 'b'. Scanning top to bottom, every 'b' row starts a
    run that extends over the directly following 'b' rows sharing the same
    'source'. The first row of the run (the anchor) is kept; the others are
    folded into it and removed:

    - 'line': all line labels of the run joined with ', '.
    - 'source': left as it is on the anchor.
    - every other column: the non-missing, non-blank values of the run, each
      converted to a stripped string, de-duplicated in order of appearance and
      joined with ', ' (empty string if nothing is left).

    Runs of length one are left untouched.

    NOTE(review): the 'b' test is a plain substring check, so any line label
    containing a 'b' anywhere qualifies - confirm this matches the labels
    that actually occur in the data.

    :param df: DataFrame with at least the columns 'line' and 'source'
    :return: New DataFrame with merged rows and a fresh 0..n-1 index; the
             input frame is never modified (an empty input yields a copy)
    """
    if df.empty:
        # Return a copy so that callers never get their own object back.
        return df.copy()

    # Work with a copy to avoid modifying the original
    df = df.copy().reset_index(drop=True)
    total_rows = len(df)

    # Snapshot the two key columns once; the scan below only needs these.
    line_labels = df['line'].tolist()
    sources = df['source'].tolist()
    is_b_row = [isinstance(label, str) and 'b' in label for label in line_labels]
    value_columns = [col for col in df.columns if col not in ('line', 'source')]

    # Rows folded into an anchor are only recorded here and dropped in a single
    # pass at the end; dropping inside the loop would rebuild the whole frame
    # once per merged run.
    absorbed_rows = []

    i = 0
    while i < total_rows:
        if not is_b_row[i]:
            i += 1
            continue

        # Extend the run over the following 'b' rows from the same source.
        j = i + 1
        while j < total_rows and is_b_row[j] and sources[j] == sources[i]:
            j += 1

        if j - i > 1:
            df.at[i, 'line'] = ', '.join(line_labels[i:j])

            for col in value_columns:
                values = []
                for row_idx in range(i, j):
                    cell = df.at[row_idx, col]
                    if pd.notna(cell):
                        text = str(cell).strip()
                        if text and text not in values:  # skip blanks and repeats
                            values.append(text)
                df.at[i, col] = ', '.join(values) if values else ''

            absorbed_rows.extend(range(i + 1, j))

        # Row j ended the run (or is past the end); it is the next candidate anchor.
        i = j

    return df.drop(index=absorbed_rows).reset_index(drop=True)

# Collapse consecutive 'b' rows of the working frame with merge_consecutive_b_rows_anchor (defined above).

df_combined_working = merge_consecutive_b_rows_anchor(df_combined_working)

In [53]:
# Deduplicate on the normalised (stripped, lower-cased) Akkadian/English pair,
# keeping the first occurrence of each pair, then discard every row that still
# has a missing value in any column.
_dedup_keys = pd.DataFrame({
    'akkadian_clean': df_combined_working['akkadian'].str.strip().str.lower(),
    'english_clean': df_combined_working['english'].str.strip().str.lower(),
})
_is_repeat = _dedup_keys.duplicated()

df_combined_working = df_combined_working[~_is_repeat].dropna()

In [54]:
# Keep only pairs in which both sides have more than two whitespace-separated tokens.
_akkadian_tokens = df_combined_working['akkadian'].str.split().str.len()
_english_tokens = df_combined_working['english'].str.split().str.len()
df_combined_working = df_combined_working[(_akkadian_tokens > 2) & (_english_tokens > 2)]

## Saving the final results

In [55]:
# Save the full cleaned corpus plus one text file per language.
# The target directory is created here: within this notebook it is otherwise
# only created by the later train/val/test export cell, so on a fresh checkout
# these writes would fail.
import os

os.makedirs("data_directories/final_data", exist_ok=True)

# NOTE(review): Series.to_csv applies CSV quoting, so sentences containing a
# comma or a double quote are written wrapped in quotes in the .txt files -
# confirm that whatever reads these files expects that.
df_combined_working.to_csv("data_directories/final_data/full_data_processed.csv", index=False, encoding="utf-8")
df_combined_working['english'].to_csv('data_directories/final_data/english_sentences.txt', index=False, header=False, encoding="utf-8")
df_combined_working['akkadian'].to_csv('data_directories/final_data/akkadian_sentences.txt', index=False, header=False, encoding="utf-8")

In [56]:
# Shuffle the corpus once with a fixed seed and renumber the rows.
df_combined_working = df_combined_working.sample(frac=1, random_state=42).reset_index(drop=True)

# Rows 0-999 -> test, rows 1000-1999 -> validation, all remaining rows -> training.
test_data, val_data, train_data = (
    df_combined_working.iloc[:1000],
    df_combined_working.iloc[1000:2000],
    df_combined_working.iloc[2000:],
)

In [57]:
# Enhanced Data Leakage Detection + Automatic Cleanup
#
# Compares the training split against the validation and test splits (after
# stripping whitespace and lower-casing), reports how many distinct English
# sentences, Akkadian sentences and full pairs are shared, and builds
# `train_data_cleaned` without the overlapping training rows.
# `train_data_cleaned` is defined on every path, so the export cell that
# follows works whether or not any overlap was found.
# Only train-vs-val and train-vs-test are compared; val-vs-test is not checked.


def _normalized_text(series):
    """Return the series with surrounding whitespace removed and lower-cased."""
    return series.str.strip().str.lower()


# Check for overlapping English phrases
train_en_set = set(_normalized_text(train_data['english']))
val_en_set = set(_normalized_text(val_data['english']))
test_en_set = set(_normalized_text(test_data['english']))

# Check for overlapping Akkadian phrases
train_tr_set = set(_normalized_text(train_data['akkadian']))
val_tr_set = set(_normalized_text(val_data['akkadian']))
test_tr_set = set(_normalized_text(test_data['akkadian']))

# Check for overlapping complete pairs (English + Akkadian together)
train_pairs = set(zip(_normalized_text(train_data['english']),
                      _normalized_text(train_data['akkadian'])))
val_pairs = set(zip(_normalized_text(val_data['english']),
                    _normalized_text(val_data['akkadian'])))
test_pairs = set(zip(_normalized_text(test_data['english']),
                     _normalized_text(test_data['akkadian'])))

# Individual language intersections
val_en_overlap = train_en_set.intersection(val_en_set)
test_en_overlap = train_en_set.intersection(test_en_set)
val_tr_overlap = train_tr_set.intersection(val_tr_set)
test_tr_overlap = train_tr_set.intersection(test_tr_set)

# Complete pair intersections
val_pair_overlap = train_pairs.intersection(val_pairs)
test_pair_overlap = train_pairs.intersection(test_pairs)

# ORIGINAL Results Summary
print("=" * 60)
print("🔍 ORIGINAL DATA LEAKAGE DETECTION RESULTS")
print("=" * 60)

print(f"\n📝 INDIVIDUAL LANGUAGE OVERLAPS:")
print(f"   English    → val/train = {len(val_en_overlap):3d}, test/train = {len(test_en_overlap):3d}")
print(f"   Akkadian   → val/train = {len(val_tr_overlap):3d}, test/train = {len(test_tr_overlap):3d}")

print(f"\n🔗 COMPLETE PAIR OVERLAPS:")
print(f"   Both langs → val/train = {len(val_pair_overlap):3d}, test/train = {len(test_pair_overlap):3d}")

# Severity assessment: sum of the six overlap counts (zero means no overlap at all)
total_issues = len(val_en_overlap) + len(test_en_overlap) + len(val_tr_overlap) + len(test_tr_overlap) + len(val_pair_overlap) + len(test_pair_overlap)

if total_issues == 0:
    print(f"\n✅ STATUS: CLEAN - No data leakage detected!")
else:
    print(f"\n⚠  STATUS: OVERLAP DETECTED - Will clean training data...")

# =============================================================================
# AUTOMATIC CLEANUP: Remove overlapping rows from training data
# =============================================================================

if total_issues > 0:
    print("\n" + "=" * 60)
    print("🧹 CLEANING TRAINING DATA")
    print("=" * 60)

    original_train_size = len(train_data)

    # Create normalized columns for matching
    train_data_clean = train_data.copy()
    train_data_clean['english_norm'] = _normalized_text(train_data_clean['english'])
    train_data_clean['akkadian_norm'] = _normalized_text(train_data_clean['akkadian'])

    # Collect all overlapping items to remove
    all_overlapping_english = val_en_overlap.union(test_en_overlap)
    all_overlapping_akkadian = val_tr_overlap.union(test_tr_overlap)
    all_overlapping_pairs = val_pair_overlap.union(test_pair_overlap)

    # Create boolean masks for rows to remove
    english_mask = train_data_clean['english_norm'].isin(all_overlapping_english)
    akkadian_mask = train_data_clean['akkadian_norm'].isin(all_overlapping_akkadian)

    # For complete pairs, create a combined mask (stays all-False when there are no pairs)
    pair_mask = pd.Series(False, index=train_data_clean.index)
    for en, ak in all_overlapping_pairs:
        pair_condition = (train_data_clean['english_norm'] == en) & (train_data_clean['akkadian_norm'] == ak)
        pair_mask = pair_mask | pair_condition

    # Combine all masks (remove if ANY overlap detected)
    rows_to_remove = english_mask | akkadian_mask | pair_mask

    # Apply the cleanup and drop the helper columns again
    train_data_cleaned = train_data_clean[~rows_to_remove].drop(['english_norm', 'akkadian_norm'], axis=1)

    removed_count = original_train_size - len(train_data_cleaned)

    print(f"📊 Training data size: {original_train_size:,} → {len(train_data_cleaned):,}")
    print(f"🗑  Removed {removed_count:,} overlapping rows ({removed_count/original_train_size*100:.1f}%)")

    # Verify cleanup worked
    print(f"\n🔍 VERIFYING CLEANUP...")

    # Re-run detection on cleaned data
    train_en_clean = set(_normalized_text(train_data_cleaned['english']))
    train_tr_clean = set(_normalized_text(train_data_cleaned['akkadian']))
    train_pairs_clean = set(zip(_normalized_text(train_data_cleaned['english']),
                                _normalized_text(train_data_cleaned['akkadian'])))

    # Check for remaining overlaps
    val_en_clean = train_en_clean.intersection(val_en_set)
    test_en_clean = train_en_clean.intersection(test_en_set)
    val_tr_clean = train_tr_clean.intersection(val_tr_set)
    test_tr_clean = train_tr_clean.intersection(test_tr_set)
    val_pair_clean = train_pairs_clean.intersection(val_pairs)
    test_pair_clean = train_pairs_clean.intersection(test_pairs)

    remaining_issues = len(val_en_clean) + len(test_en_clean) + len(val_tr_clean) + len(test_tr_clean) + len(val_pair_clean) + len(test_pair_clean)

    print(f"\n📝 CLEANED OVERLAPS:")
    print(f"   English    → val/train = {len(val_en_clean):3d}, test/train = {len(test_en_clean):3d}")
    print(f"   Akkadian   → val/train = {len(val_tr_clean):3d}, test/train = {len(test_tr_clean):3d}")
    print(f"   Both langs → val/train = {len(val_pair_clean):3d}, test/train = {len(test_pair_clean):3d}")

    if remaining_issues == 0:
        print(f"\n✅ SUCCESS: All overlaps removed!")
        print(f"💾 Use 'train_data_cleaned' for training")
    else:
        print(f"\n⚠  WARNING: {remaining_issues} overlaps still remain")

else:
    # Nothing to remove: alias the untouched split so that `train_data_cleaned`
    # exists for the export step either way (same handling as the
    # cross-validation cell).
    train_data_cleaned = train_data
    print(f"\n💾 No cleanup needed - use original 'train_data'")


🔍 ORIGINAL DATA LEAKAGE DETECTION RESULTS

📝 INDIVIDUAL LANGUAGE OVERLAPS:
   English    → val/train =  76, test/train =  61
   Akkadian   → val/train =  37, test/train =  40

🔗 COMPLETE PAIR OVERLAPS:
   Both langs → val/train =   0, test/train =   0

⚠  STATUS: OVERLAP DETECTED - Will clean training data...

🧹 CLEANING TRAINING DATA
📊 Training data size: 32,690 → 32,260
🗑  Removed 430 overlapping rows (1.3%)

🔍 VERIFYING CLEANUP...

📝 CLEANED OVERLAPS:
   English    → val/train =   0, test/train =   0
   Akkadian   → val/train =   0, test/train =   0
   Both langs → val/train =   0, test/train =   0

✅ SUCCESS: All overlaps removed!
💾 Use 'train_data_cleaned' for training


In [58]:
# Export the splits as text files named "<language>_<split>.txt" inside
# output_dir (created first if it does not exist yet). The training files come
# from the leakage-cleaned training frame.
output_dir = "data_directories/final_data/"
os.makedirs(output_dir, exist_ok=True)

_split_frames = (
    ("train", train_data_cleaned),
    ("val", val_data),
    ("test", test_data),
)
for _split_name, _split_frame in _split_frames:
    for _language in ("english", "akkadian"):
        _split_frame[_language].to_csv(f"{output_dir}/{_language}_{_split_name}.txt", index=False, header=False, encoding='utf-8')

print("✅ Split complete: train, val, test saved to", output_dir)

✅ Split complete: train, val, test saved to data_directories/final_data/


### Cross validation saving

In [15]:
import os
import pandas as pd

# Write NUM_SPLITS train/val/test partitions of the cleaned corpus, one
# directory per partition. Each iteration reshuffles the complete frame with
# its own seed (the fold number), takes the first SPLIT_SIZE rows as test, the
# next SPLIT_SIZE rows as validation and the rest as training, and then removes
# from the training part every row that shares its normalised English or
# Akkadian text with a validation or test row.
df = df_combined_working.reset_index(drop=True)
assert len(df) >= 3000, "Dataset must have at least 3000 rows for this split logic."

NUM_SPLITS = 5
SPLIT_SIZE = 1000  # For test and val


def _norm(column):
    """Strip surrounding whitespace and lower-case a text column."""
    return column.str.strip().str.lower()


for fold in range(1, NUM_SPLITS + 1):
    # Fresh shuffle per fold, seeded by the fold number
    df_shuffled = df.sample(frac=1, random_state=fold).reset_index(drop=True)

    # Positional slices: test first, then validation, remainder is training
    test_data = df_shuffled.iloc[:SPLIT_SIZE].reset_index(drop=True)
    val_data = df_shuffled.iloc[SPLIT_SIZE:SPLIT_SIZE * 2].reset_index(drop=True)
    train_data = df_shuffled.iloc[SPLIT_SIZE * 2:].reset_index(drop=True)

    # === Data leakage detection ===
    train_en, train_ak = _norm(train_data['english']), _norm(train_data['akkadian'])
    val_en, val_ak = _norm(val_data['english']), _norm(val_data['akkadian'])
    test_en, test_ak = _norm(test_data['english']), _norm(test_data['akkadian'])

    # Texts shared between training and each held-out split, per language ...
    val_en_overlap = set(train_en) & set(val_en)
    test_en_overlap = set(train_en) & set(test_en)
    val_tr_overlap = set(train_ak) & set(val_ak)
    test_tr_overlap = set(train_ak) & set(test_ak)

    # ... and as complete (english, akkadian) pairs
    train_pairs = set(zip(train_en, train_ak))
    val_pair_overlap = train_pairs & set(zip(val_en, val_ak))
    test_pair_overlap = train_pairs & set(zip(test_en, test_ak))

    overlap_groups = (val_en_overlap, test_en_overlap,
                      val_tr_overlap, test_tr_overlap,
                      val_pair_overlap, test_pair_overlap)
    total_issues = sum(len(group) for group in overlap_groups)

    # === Cleanup: drop every training row involved in any overlap ===
    if total_issues == 0:
        train_data_cleaned = train_data
    else:
        leaked_english = val_en_overlap | test_en_overlap
        leaked_akkadian = val_tr_overlap | test_tr_overlap
        leaked_pairs = val_pair_overlap | test_pair_overlap

        rows_to_remove = train_en.isin(leaked_english) | train_ak.isin(leaked_akkadian)
        for en, ak in leaked_pairs:
            rows_to_remove = rows_to_remove | ((train_en == en) & (train_ak == ak))

        train_data_cleaned = train_data[~rows_to_remove]

    # === Save the splits ===
    split_dir = f"data_directories/final_data/split_{fold}"
    os.makedirs(split_dir, exist_ok=True)

    for split_name, frame in (("train", train_data_cleaned), ("val", val_data), ("test", test_data)):
        for language in ("english", "akkadian"):
            frame[language].to_csv(f"{split_dir}/{language}_{split_name}.txt", index=False, header=False, encoding='utf-8')

    print(f"✅ Fold {fold}: Train={len(train_data_cleaned)}, Val={len(val_data)}, Test={len(test_data)} saved to {split_dir}")


✅ Fold 1: Train=38007, Val=1000, Test=1000 saved to data_directories/final_data/split_1
✅ Fold 2: Train=38047, Val=1000, Test=1000 saved to data_directories/final_data/split_2
✅ Fold 3: Train=38070, Val=1000, Test=1000 saved to data_directories/final_data/split_3
✅ Fold 4: Train=38123, Val=1000, Test=1000 saved to data_directories/final_data/split_4
✅ Fold 5: Train=38006, Val=1000, Test=1000 saved to data_directories/final_data/split_5
