# Documentation
Currently, we extract the normalized form of the Akkadian words from the text; words that do not have a normalized form are extracted as well.

We should omit all the capital letters from the text, since they represent text that shouldn't be there.

In [8]:
from bs4 import BeautifulSoup
import requests
import urllib3
import urllib.parse
from time import time
import os
import re

start = time()  # wall-clock start; the elapsed time is printed after parsing
# Disable SSL warnings — only for dev/testing!
# (the HTTP requests in this file are sent with verify=False, which triggers InsecureRequestWarning)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def clean_visible_text(text):
    """Return a cleaned transliteration token, or None when it is unusable.

    The token is trimmed, a trailing "-[...]"-style tail is removed, and the
    result is rejected (None) when it is empty, consists only of placeholder
    characters (x, brackets, punctuation, dashes, whitespace), or still
    contains a bracketed span.
    """
    candidate = text.strip()

    # A damaged prefix such as "...]-" or "[x]-" invalidates the whole token.
    leading_damage = re.match(r"^.*[\[\](){}<>]+.*\]-+", candidate)
    if leading_damage is not None:
        return None

    # Cut everything from a dash run that is directly followed by a bracket.
    candidate = re.sub(r"-+[\[\](){}<>].*$", "", candidate).strip()

    if not candidate:
        return None

    # Each (matcher, pattern) pair describes one kind of leftover garbage.
    rejecters = (
        (re.fullmatch, r"[xX\[\](){}<>.\- ?]+"),   # only x / brackets / punctuation
        (re.search, r"[\[\](){}<>].*[\]\})>]"),    # bracketed content remains
        (re.fullmatch, r"[xX\-\s]+"),              # only x / dashes / whitespace
    )
    if any(matcher(pattern, candidate) for matcher, pattern in rejecters):
        return None

    return candidate


def get_first_normalized_form_from_link(sig_url, timeout=30):
    """Fetch a signature page and return its first plain-text normalized form.

    Args:
        sig_url: URL of the signature page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run.

    Returns:
        The stripped text of the first ``p.norms a.icountu`` link that has no
        nested ``<span>``, or None when the status is not 200, no such link
        has text, or any error occurs (the error is printed, not raised).
    """
    try:
        # verify=False: certificate checking is intentionally disabled here.
        response = requests.get(sig_url, verify=False, timeout=timeout)
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.text, "html.parser")

        # Look for "p.norms a.icountu" links that hold plain text (no spans)
        for a in soup.select("p.norms a.icountu"):
            if a.find("span") is None:  # not a sign group
                text = a.get_text(strip=True)
                if text:
                    return text

    except Exception as e:
        # Best effort: one failed lookup must not abort the whole parse.
        print(f"Error retrieving normalized form from {sig_url}: {e}")
    return None


def extract_project_from_sig(sig):
    """Return the project name embedded in a word signature.

    For a signature starting with "☣@", the result is the text after the first
    "@", cut at the next "@", then at the first "%", then at the first "/".
    Any other signature yields None.
    """
    if not sig.startswith("☣@"):
        return None

    after_marker = sig.partition("@")[2]
    # Equivalent to the segment between the first and second "@".
    full_path = after_marker.partition("@")[0].partition("%")[0]
    return full_path.partition("/")[0]


def extract_normalized_akkadian_text(cell, base_url):
    """Build the normalized Akkadian text for one transliteration cell.

    Every ``span.w`` whose class contains "akk" is resolved in this order:
      1. the normalized form fetched through its ``a.cbd`` link signature,
      2. the first non-empty "$"-separated form in its ``title`` attribute,
      3. otherwise the marker "<data>" is emitted and the raw text is kept.

    Returns:
        (text, unlinked) where text is the space-joined word list and
        unlinked holds the raw text of each word replaced by "<data>".
    """
    tokens = []
    unlinked_raw = []  # raw text behind each "<data>" marker, in order

    for span in cell.select("span.w[class*='akk']"):
        anchor = span.find("a", class_="cbd")

        # Uncertain ("?") words are dropped only when they carry no link.
        marker = span.select_one("sup")
        if not anchor and marker and "?" in marker.get_text(strip=True):
            continue

        resolved = None

        # First choice: look the word up through its signature link.
        if anchor and anchor.has_attr("data-wsig"):
            sig = anchor["data-wsig"]
            project = extract_project_from_sig(sig)
            if project:
                encoded_sig = urllib.parse.quote(sig, safe='')
                looked_up = get_first_normalized_form_from_link(
                    f"{base_url}/{project}?sig={encoded_sig}"
                )
                if looked_up:
                    resolved = looked_up

        # Second choice: the form recorded after the first "$" in the title.
        if resolved is None:
            title = span.get("title", "")
            if "$" in title:
                from_title = title.split("$")[1].strip()
                if from_title:
                    resolved = from_title

        if resolved is not None:
            tokens.append(resolved)
            continue

        # Last resort: placeholder in the text, raw (uncleaned) text kept aside.
        raw_text = span.get_text(strip=True)
        if raw_text:
            tokens.append("<data>")
            unlinked_raw.append(raw_text)

    return ' '.join(tokens), unlinked_raw


def extract_english_text(cell):
    """Return the stripped text of every ``span.w`` in cell, joined by single spaces."""
    pieces = []
    for token in cell.select("span.w"):
        pieces.append(token.get_text(strip=True))
    return ' '.join(pieces)


def save_unlinked_data_to_file(unlinked_data_store, filename="unlinked_data.txt"):
    """Write the unlinked fragments to filename, one "<line>: a.,.b.,.c" row per key.

    The file is overwritten (UTF-8); rows follow the mapping's iteration order.
    """
    separator = ".,."
    with open(filename, "w", encoding="utf-8") as sink:
        sink.writelines(
            f"{label}: {separator.join(fragments)}\n"
            for label, fragments in unlinked_data_store.items()
        )
    print(f"Unlinked data saved to {filename}")


def parse_oracc_html(html_text, base_url="https://oracc.museum.upenn.edu", filename="unlinked_data.txt"):
    """Parse an ORACC text page into aligned Akkadian/English entries.

    Rows of the first ``table.transliteration`` (or, failing that,
    ``table.composite``) are walked in order. A row with English text opens a
    new entry. Akkadian text from rows without English text is buffered and
    appended to the most recent entry when the next English row, or the end
    of the table, is reached.

    Args:
        html_text: HTML source of the page.
        base_url: Site root used to build signature lookup URLs.
        filename: Where the unlinked raw text is written (only when any exists).

    Returns:
        A list of dicts with keys "line", "akkadian" and "english". The list
        is empty when the page has neither expected table.
    """
    soup = BeautifulSoup(html_text, "html.parser")
    results = []
    accumulated_akkadian = ""
    accumulated_unlinked_data = []  # unlinked text buffered alongside accumulated_akkadian
    unlinked_data_store = {}  # entry line label -> list of unlinked raw texts

    table = soup.select_one("table.transliteration") or soup.select_one("table.composite")
    if table is None:
        print("No transliteration or composite rows found. Check table class.")
        # Previously execution continued and crashed on None.select(...).
        return results

    for row in table.select("tr"):
        line_number_tag = row.select_one("td.lnum .lnum")
        line_number = line_number_tag.get_text(strip=True) if line_number_tag else ""

        eng_cell = row.select_one("td.xtr")
        eng_label_tag = row.select_one("td.xtr span.xtr-label")
        eng_line_number = (
            eng_label_tag.get_text(strip=True).strip("()") if eng_label_tag else ""
        )

        eng_text = extract_english_text(eng_cell) if eng_cell else ""

        akk_cell = row.select_one("td.tlit")
        if akk_cell:
            akk_text, line_unlinked_data = extract_normalized_akkadian_text(akk_cell, base_url)
        else:
            akk_text, line_unlinked_data = "", []

        if not eng_text:
            # No translation on this row: buffer it for the current entry.
            accumulated_akkadian += " " + akk_text
            accumulated_unlinked_data.extend(line_unlinked_data)
            continue

        if results:
            # Flush the buffer into the previous entry before opening a new one.
            results[-1]["akkadian"] += " " + accumulated_akkadian.strip()
            if accumulated_unlinked_data:
                unlinked_data_store.setdefault(results[-1]["line"], []).extend(accumulated_unlinked_data)
            accumulated_akkadian = ""
            accumulated_unlinked_data = []

        entry = {
            "line": eng_line_number or line_number,
            "akkadian": akk_text,
            "english": eng_text,
        }
        results.append(entry)

        if line_unlinked_data:
            unlinked_data_store.setdefault(entry["line"], []).extend(line_unlinked_data)

    # Rows after the last English row still belong to the last entry.
    if accumulated_akkadian and results:
        results[-1]["akkadian"] += " " + accumulated_akkadian.strip()
        if accumulated_unlinked_data:
            unlinked_data_store.setdefault(results[-1]["line"], []).extend(accumulated_unlinked_data)

    if unlinked_data_store:
        save_unlinked_data_to_file(unlinked_data_store, filename)

    return results


def extract_oracc_translations_with_accumulation(html_path):
    """Read the saved page at html_path (UTF-8) and return parse_oracc_html's result for it."""
    with open(html_path, "r", encoding="utf-8") as source:
        markup = source.read()
    return parse_oracc_html(markup)


# Parse the saved page, report the elapsed time, then dump every entry
parsed = extract_oracc_translations_with_accumulation("ribo.html")
print("time taken:", time() - start)
for line in parsed:
    print(
        f"{line['line']}",
        f"Akkadian: {line['akkadian']}",
        f"English: {line['english']}",
        "---" * 15,
        sep="\n",
    )

KeyboardInterrupt: 

In [None]:
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
import time
import pandas as pd
import os
import shutil
from multiprocessing import Pool, cpu_count
from functools import partial
import concurrent.futures
from threading import Lock
from queue import Queue
import threading
import traceback
import logging
from datetime import datetime

# Import your parsing functions
from bs4 import BeautifulSoup
import requests
import urllib3
import urllib.parse
import re

# Global lock serializing directory reset operations (acquired by dir_checker)
dir_lock = Lock()

# Setup focused logging
def setup_logging(log_file="scraper_results.log"):
    """Configure and return this module's logger.

    INFO and above go to log_file (UTF-8); only WARNING and above go to the
    console. Handlers installed by an earlier call are removed and closed, so
    calling this once per website does not leak open log files.

    Args:
        log_file: Path of the log file to write.

    Returns:
        The configured ``logging.Logger`` for ``__name__``.
    """
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    # File handler for detailed logs
    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)

    # Console handler for important messages only
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.WARNING)  # Only warnings and errors to console
    console_handler.setFormatter(formatter)

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    # Detach AND close the old handlers; clearing the list alone would leave
    # the previous log file open.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    return logger

def dir_checker(dir_path):
    """Reset dir_path to an empty directory while holding dir_lock.

    WARNING: destructive — whatever already exists at dir_path is removed
    with shutil.rmtree before the directory is re-created.
    """
    with dir_lock:
        already_present = os.path.exists(dir_path)
        if already_present:
            shutil.rmtree(dir_path)
        os.makedirs(dir_path)

def get_chrome_driver():
    """Create and return a headless Chrome WebDriver configured for scraping."""
    # The visible top-level imports of this file never bind the name
    # `webdriver`; import it here so webdriver.Chrome below cannot raise NameError.
    from selenium import webdriver

    chrome_options = Options()
    for switch in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-images",
        "--disable-extensions",
        "--disable-plugins",
        "--disable-background-timer-throttling",
        "--disable-renderer-backgrounding",
        "--disable-backgrounding-occluded-windows",
        "--memory-pressure-off",
        "--max_old_space_size=4096",
        "--disable-blink-features=AutomationControlled",
    ):
        chrome_options.add_argument(switch)

    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.page_load_strategy = 'normal'

    return webdriver.Chrome(options=chrome_options)

def detect_page_content_type(driver):
    """Classify the currently loaded page by the tables it contains.

    Returns a dict with the table counts ('transliteration_tables',
    'composite_tables', 'total_tables'), a 'has_content' flag and a
    'page_type' label. If a driver call fails, the error is logged and the
    dict is returned as filled in so far.
    """
    content_info = {
        'transliteration_tables': 0,
        'composite_tables': 0,
        'total_tables': 0,
        'has_content': False,
        'page_type': 'unknown'
    }

    try:
        # (count key, locator strategy, locator, page type) in priority order
        probes = (
            ('transliteration_tables', By.CSS_SELECTOR, "table.transliteration", 'transliteration'),
            ('composite_tables', By.CSS_SELECTOR, "table.composite", 'composite'),
            ('total_tables', By.TAG_NAME, "table", 'generic_table'),
        )

        matched_types = []
        for count_key, strategy, locator, page_type in probes:
            hits = driver.find_elements(strategy, locator)
            content_info[count_key] = len(hits)
            if hits:
                matched_types.append(page_type)

        if matched_types:
            # The most specific table kind found wins.
            content_info['page_type'] = matched_types[0]
            content_info['has_content'] = True
        else:
            content_info['page_type'] = 'no_tables'
            content_info['has_content'] = False

            # No tables at all: fall back to generic content containers.
            containers = driver.find_elements(By.CSS_SELECTOR,
                                              "#content, .content, #main, .main, .transliteration, .composite")
            if containers:
                content_info['has_content'] = True
                content_info['page_type'] = 'content_container'

        return content_info

    except Exception as e:
        logger.error(f"Content detection error: {e}")
        return content_info

def wait_for_page_load(driver, timeout=10):
    """Wait until the page is loaded; return whether it has usable content.

    Waits up to timeout seconds for document.readyState to become
    "complete", classifies the page with detect_page_content_type, then waits
    up to 10 more seconds for the matching table element (a timeout there is
    ignored). Any other failure yields False.
    """
    try:
        ready_wait = WebDriverWait(driver, timeout)
        ready_wait.until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        content_info = detect_page_content_type(driver)

        # Element to wait for, keyed by the detected page type.
        expected_element = {
            'transliteration': (By.CSS_SELECTOR, "table.transliteration"),
            'composite': (By.CSS_SELECTOR, "table.composite"),
            'generic_table': (By.TAG_NAME, "table"),
        }
        locator = None
        if content_info['has_content']:
            locator = expected_element.get(content_info['page_type'])

        if locator is not None:
            try:
                WebDriverWait(driver, 10).until(EC.presence_of_element_located(locator))
            except TimeoutException:
                pass  # Continue anyway

        return content_info['has_content']

    except Exception:
        # Covers the readyState timeout as well as any driver error.
        return False

def validate_page_content(driver, html_content):
    """Return True when html_content has a transliteration or composite table.

    The driver argument is not used. When other tables exist but none has an
    expected class, a warning is logged. Any parsing error is logged and
    reported as False.
    """
    try:
        soup = BeautifulSoup(html_content, "html.parser")
        expected_table = soup.select_one("table.transliteration") or soup.select_one("table.composite")
        if expected_table:
            return True

        # Nothing parseable: note stray tables to help debugging.
        stray_tables = soup.select("table")
        if stray_tables:
            logger.warning(f"Found {len(stray_tables)} tables but none with expected classes")
        return False

    except Exception as e:
        logger.error(f"Content validation error: {e}")
        return False

def scrape_single_page_enhanced(page_info, parse_function, dir_output, website_name, max_retries=3):
    """Load, parse and save one page, retrying on failure.

    Each attempt starts a fresh Chrome driver, loads the page, validates it,
    runs parse_function on the page source and writes
    ``{dir_output}/data_{page_index}.csv``. When parsing fails, the HTML is
    saved as ``{dir_output}/error_page_{page_index}.html``.

    Args:
        page_info: (url, page_index) tuple.
        parse_function: Callable taking (html, filename=...) and returning a
            non-empty list of row dicts.
        dir_output: Directory receiving all output files.
        website_name: Accepted for call compatibility; not used here.
        max_retries: Maximum number of attempts.

    Returns:
        A dict with keys 'page_index', 'url', 'actual_url', 'results',
        'success', 'error' and 'attempts'. Failures are reported through
        'success' / 'error', not raised.
    """
    url, page_index = page_info

    def _current_url_or_none(active_driver):
        # A crashed session can raise on current_url; that must not mask the
        # error that is being reported.
        if not active_driver:
            return None
        try:
            return active_driver.current_url
        except Exception:
            return None

    for attempt in range(max_retries):
        driver = None
        try:
            print(f"Loading page {page_index}")

            driver = get_chrome_driver()
            driver.set_page_load_timeout(60)
            driver.implicitly_wait(10)

            # Navigate to page
            driver.get(url)

            # Wait for page load
            if not wait_for_page_load(driver):
                raise TimeoutException("Page failed to load within timeout")

            # Get page source and validate content
            html = driver.page_source
            if not validate_page_content(driver, html):
                raise ValueError("Page contains no parseable content")

            # Parse the HTML content
            try:
                page_results = parse_function(html, filename=f"{dir_output}/unlinked_data_page_{page_index}.txt")

                if not page_results:
                    raise ValueError("Parser returned no results")

                logger.info(f"SUCCESS - Page {page_index}: Parsed {len(page_results)} items")

            except Exception as parse_error:
                # Save HTML for debugging parse issues
                error_html_filename = f"{dir_output}/error_page_{page_index}.html"
                os.makedirs(dir_output, exist_ok=True)
                with open(error_html_filename, 'w', encoding='utf-8') as f:
                    f.write(html)

                logger.error(f"PARSE ERROR - Page {page_index}: {str(parse_error)} | HTML saved to {error_html_filename}")
                raise  # handled by the retry logic below

            # Save page data
            os.makedirs(dir_output, exist_ok=True)
            actual_url = driver.current_url
            df = pd.DataFrame(page_results)
            df['source'] = actual_url
            df.to_csv(f"{dir_output}/data_{page_index}.csv", index=False, encoding="utf-8-sig")

            return {
                'page_index': page_index,
                'url': url,
                'actual_url': actual_url,
                'results': page_results,
                'success': True,
                'error': None,
                'attempts': attempt + 1
            }

        except Exception as e:
            error_type = type(e).__name__
            error_msg = str(e)

            if attempt < max_retries - 1:
                logger.warning(f"RETRY - Page {page_index} attempt {attempt + 1}: {error_type} - {error_msg}")
                time.sleep(1)
            else:
                logger.error(f"FAILED - Page {page_index} after {max_retries} attempts: {error_type} - {error_msg}")

                # Detailed error info for the failed page
                error_details = {
                    'page_index': page_index,
                    'url': url,
                    'error': error_msg,
                    'error_type': error_type,
                    'attempts': max_retries
                }

                return {
                    'page_index': page_index,
                    'url': url,
                    'actual_url': _current_url_or_none(driver),
                    'results': [],
                    'success': False,
                    'error': error_details,
                    'attempts': max_retries
                }

        finally:
            if driver:
                try:
                    driver.quit()
                except Exception:
                    pass  # best-effort cleanup; the attempt's outcome is already decided

    # Only reachable when max_retries < 1: report a failure instead of None.
    return {
        'page_index': page_index,
        'url': url,
        'actual_url': None,
        'results': [],
        'success': False,
        'error': {
            'page_index': page_index,
            'url': url,
            'error': 'No attempts were made (max_retries < 1)',
            'error_type': 'ValueError',
            'attempts': 0
        },
        'attempts': 0
    }

def discover_all_pages_enhanced(start_url, max_discovery_pages=1e5, restart_interval=10):
    """Follow the "next" control from start_url and collect every page URL.

    Args:
        start_url: URL of the first page.
        max_discovery_pages: Upper bound on the number of pages visited.
        restart_interval: A fresh Chrome driver is started every this many pages.

    Returns:
        A list of (url, page_index) tuples, page_index starting at 1. Pages
        without parseable content are still included (a warning is logged).
        Discovery stops at the first error, which is logged, and returns what
        was collected so far.
    """
    print(f"Discovering pages from: {start_url}")
    logger.info(f"DISCOVERY START: {start_url}")

    page_urls = []
    page_index = 1
    current_url = start_url
    driver = None

    def _quit_quietly(active_driver):
        # Best-effort shutdown: a driver that fails to quit must not abort discovery.
        if active_driver:
            try:
                active_driver.quit()
            except Exception:
                pass

    try:
        while page_index <= max_discovery_pages:
            # Restart Chrome periodically
            if (page_index - 1) % restart_interval == 0:
                _quit_quietly(driver)
                driver = None  # never quit the same driver twice if the restart fails
                driver = get_chrome_driver()
                driver.set_page_load_timeout(60)

            try:
                print(f"Discovering page {page_index}")
                driver.get(current_url)

                # Block until the page has settled; the verdict itself comes
                # from validate_page_content below.
                wait_for_page_load(driver)
                actual_url = driver.current_url
                html_content = driver.page_source

                if not validate_page_content(driver, html_content):
                    logger.warning(f"Page {page_index} has no parseable content")
                # Recorded either way so the page shows up in the failure report.
                page_urls.append((actual_url, page_index))

                # Look for next button
                next_buttons = driver.find_elements(By.CSS_SELECTOR, "img#p4ItemNext")
                if not next_buttons:
                    next_buttons = driver.find_elements(By.ID, "p4ItemNext")

                if not next_buttons or not next_buttons[0].is_enabled():
                    logger.info(f"Discovery complete: Found {len(page_urls)} total pages")
                    break

                # Click next (JavaScript first, native click as fallback)
                old_url = driver.current_url
                try:
                    driver.execute_script("arguments[0].click();", next_buttons[0])
                except Exception:
                    try:
                        next_buttons[0].click()
                    except Exception:
                        logger.error(f"Cannot click next button at page {page_index}")
                        break

                # Wait up to 15 seconds for the URL to change
                start_time = time.time()
                while driver.current_url == old_url and time.time() - start_time < 15:
                    time.sleep(0.25)

                if driver.current_url == old_url:
                    logger.info("Discovery end: No more pages (URL unchanged)")
                    break

                current_url = driver.current_url
                page_index += 1

            except Exception as e:
                logger.error(f"Discovery error at page {page_index}: {str(e)}")
                break
    finally:
        # Runs on KeyboardInterrupt and failed restarts too, so no Chrome
        # process is left behind.
        _quit_quietly(driver)

    print(f"Found {len(page_urls)} pages")
    logger.info(f"Discovery result: {len(page_urls)} pages found")
    return page_urls

def scrape_website_parallel_pages(start_url, dir_output, parse_function, max_workers=None):
    """Scrape one website: discover its pages, then parse them in parallel.

    WARNING: dir_output is wiped and re-created (see dir_checker) before
    scraping starts. The module-level ``logger`` is rebound to a logger
    writing to ``{dir_output}/scraper_results.log``.

    Args:
        start_url: First page of the website.
        dir_output: Output directory for CSVs, logs and error pages.
        parse_function: Parser forwarded to scrape_single_page_enhanced.
        max_workers: Thread count; defaults to min(cpu_count(), 4).

    Returns:
        A summary dict with keys 'dir_output', 'total_pages',
        'successful_pages', 'failed_pages', 'total_items', 'results',
        'success', 'processing_time' and 'error_details'.
    """
    global logger

    if max_workers is None:
        max_workers = min(cpu_count(), 4)

    # Create output directory
    dir_checker(dir_output)

    # Setup logging for this website
    logger = setup_logging(os.path.join(dir_output, "scraper_results.log"))

    start_time = time.time()
    logger.info(f"Scraping start: {dir_output} with {max_workers} workers")

    # Step 1: Discover all pages
    page_urls = discover_all_pages_enhanced(start_url)

    if not page_urls:
        logger.error(f"No pages found: {start_url}")
        return {
            'dir_output': dir_output,
            'total_pages': 0,
            'successful_pages': 0,
            'failed_pages': 0,
            'total_items': 0,
            'results': [],
            'success': False,
            # Same keys as the normal result, so callers can read them
            # without special-casing empty sites.
            'processing_time': time.time() - start_time,
            'error_details': []
        }

    print(f"Processing {len(page_urls)} pages with {max_workers} workers")
    logger.info(f"Processing {len(page_urls)} pages with {max_workers} workers")

    # Step 2: Process pages in parallel
    page_results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        scrape_func = partial(scrape_single_page_enhanced,
                              parse_function=parse_function,
                              dir_output=dir_output,
                              website_name=dir_output)

        future_to_page = {executor.submit(scrape_func, page_info): page_info for page_info in page_urls}

        for future in concurrent.futures.as_completed(future_to_page):
            page_info = future_to_page[future]
            try:
                page_results.append(future.result())
            except Exception as exc:
                # The worker itself raised: record it as a failed page.
                logger.error(f"Executor error: Page {page_info[1]} - {str(exc)}")
                page_results.append({
                    'page_index': page_info[1],
                    'url': page_info[0],
                    'actual_url': None,
                    'results': [],
                    'success': False,
                    'error': {'error': str(exc), 'error_type': type(exc).__name__},
                    'attempts': 0
                })

    # Combine results and create summary
    all_results = []
    error_pages = []
    successful_pages = 0

    for result in page_results:
        if result['success']:
            all_results.extend(result['results'])
            successful_pages += 1
        else:
            error_pages.append({
                'page_index': result['page_index'],
                'url': result['url'],
                'actual_url': result.get('actual_url'),
                'error': result['error'],
                'attempts': result.get('attempts', 0)
            })

    # Save detailed error log
    if error_pages:
        error_df = pd.DataFrame(error_pages)
        error_df.to_csv(f"{dir_output}/failed_pages.csv", index=False, encoding="utf-8-sig")
        logger.error(f"Failed pages: {len(error_pages)} pages failed (see failed_pages.csv)")

    # Save combined results
    if all_results:
        combined_df = pd.DataFrame(all_results)
        combined_df.to_csv(f"{dir_output}/combined_data.csv", index=False, encoding="utf-8-sig")

    total_time = time.time() - start_time
    failed_pages = len(page_results) - successful_pages

    # Final summary
    print(f"Completed {dir_output}: {successful_pages}/{len(page_urls)} pages successful")
    logger.info(f"Scraping complete: {dir_output}")
    logger.info(f"Results - Total: {len(page_urls)}, Successful: {successful_pages}, Failed: {failed_pages}, Items: {len(all_results)}, Time: {total_time:.2f}s")

    return {
        'dir_output': dir_output,
        'total_pages': len(page_urls),
        'successful_pages': successful_pages,
        'failed_pages': failed_pages,
        'total_items': len(all_results),
        'results': all_results,
        'success': successful_pages > 0,
        'processing_time': total_time,
        'error_details': error_pages
    }

def scrape_multiple_websites_sequential(url_dir_pairs, parse_function, max_workers_per_site=None):
    """Scrape several websites one after another.

    Pages within each website are processed in parallel by
    scrape_website_parallel_pages.

    Args:
        url_dir_pairs: Sequence of (start_url, output_directory) pairs.
        parse_function: Parser forwarded to every website scrape.
        max_workers_per_site: Threads per website; defaults to
            min(cpu_count(), 4).

    Returns:
        The list of per-website summary dicts, in input order.
    """
    if max_workers_per_site is None:
        max_workers_per_site = min(cpu_count(), 4)

    all_website_results = []
    website_count = len(url_dir_pairs)

    for i, (url, dir_output) in enumerate(url_dir_pairs, 1):
        print(f"\nProcessing website {i}/{website_count}: {dir_output}")
        all_website_results.append(
            scrape_website_parallel_pages(url, dir_output, parse_function, max_workers_per_site)
        )

    return all_website_results

def main(parse_oracc_html):
    """Scrape every configured ORACC project and return the per-site summaries.

    Args:
        parse_oracc_html: Parser callable applied to each page's HTML.

    Returns:
        The list returned by scrape_multiple_websites_sequential.
    """
    # Start page of each project
    url_suhu = "https://oracc.museum.upenn.edu/suhu/Q006211"
    url_saao = "https://oracc.museum.upenn.edu/saao/P224485"
    url_rinap = "https://oracc.museum.upenn.edu/rinap/Q006333?lang=en"
    url_riao = "https://oracc.museum.upenn.edu/riao/Q005738"
    url_ribo = "https://oracc.museum.upenn.edu/ribo/Q006263"

    # (start url, output directory) pairs; each directory is named after its project
    url_dir_pairs = [
        (url_saao, "saao"),
        (url_suhu, "suhu"),
        (url_rinap, "rinap"),
        (url_riao, "riao"),  # was misspelled "raio"
        (url_ribo, "ribo"),
    ]

    # Run the scraping process
    results = scrape_multiple_websites_sequential(
        url_dir_pairs,
        parse_oracc_html,
        max_workers_per_site=8
    )

    return results

# Script entry point: scrape every project configured in main().
if __name__ == "__main__":
    # parse_oracc_html is defined earlier in this file; if the parser is moved
    # to its own module, import it here instead:
    # from your_parsing_module import parse_oracc_html
    results = main(parse_oracc_html)

## File Splitter Tool

In [None]:
## File Splitter Tool
import os
import re
import shutil
from pathlib import Path


def split_files_by_type(source_dir, dry_run=True):
    """
    Sort scraper output files into five sub-directories by file name.

    Files directly inside source_dir are matched against the known output
    names (data CSVs, unlinked-data text files, the failed-pages CSV, the
    scraper log, saved error pages) and moved into data_files/,
    unlinked_files/, failed_files/, log_files/ and error_files/. Files that
    match nothing stay where they are, and an existing target is never
    overwritten. Progress is printed throughout.

    Args:
        source_dir: Path to directory containing mixed files
        dry_run: If True, only show what would be moved without actually doing it
    """
    root = Path(source_dir)

    if not root.exists():
        print(f"❌ Directory {source_dir} does not exist!")
        return

    rule = "─" * 60
    print(f"🔍 Scanning directory: {root}")
    print(f"🔄 Dry run mode: {'ON' if dry_run else 'OFF'}")
    print(rule)

    # file type -> (file name pattern, destination directory)
    layout = {
        'data': (re.compile(r'^data_(\d+)\.csv$'), root / 'data_files'),
        'unlinked': (re.compile(r'^unlinked_data_page_(\d+)\.txt$'), root / 'unlinked_files'),
        'failed': (re.compile(r'^failed_pages\.csv$'), root / 'failed_files'),
        'log': (re.compile(r'^scraper_results\.log$'), root / 'log_files'),
        'error': (re.compile(r'^error_page_(\d+)\.html$'), root / 'error_files'),
    }

    # Classify every regular file by the first pattern it matches.
    grouped = {kind: [] for kind in layout}
    leftovers = []
    for candidate in root.iterdir():
        if not candidate.is_file():
            continue
        kind = next(
            (name for name, (pattern, _) in layout.items() if pattern.match(candidate.name)),
            None,
        )
        if kind is None:
            leftovers.append(candidate)
        else:
            grouped[kind].append(candidate)

    print("📋 Files found:")
    for kind, members in grouped.items():
        print(f"   {kind}: {len(members)} files")
    if leftovers:
        print(f"   unmatched: {len(leftovers)} files")

    print("\n📁 Target directories:")
    for kind, (_, destination) in layout.items():
        print(f"   {kind} → {destination.name}/")

    if not dry_run:
        print(f"\n📂 Creating directories...")
        for _, destination in layout.values():
            destination.mkdir(exist_ok=True)
            print(f"✅ Created: {destination}")

    print(f"\n🔄 File operations:")
    print(rule)

    planned = 0
    for kind, members in grouped.items():
        if not members:
            continue

        destination = layout[kind][1]
        print(f"\n📂 {kind.upper()} files → {destination.name}/")

        for member in members:
            landing = destination / member.name
            print(f"   {member.name} → {destination.name}/{member.name}")
            planned += 1

            if dry_run:
                continue

            try:
                if landing.exists():
                    print(f"   ⚠️  Warning: {landing} already exists, skipping")
                    continue

                shutil.move(str(member), str(landing))
                print(f"   ✅ Moved successfully")

            except Exception as e:
                print(f"   ❌ Error: {e}")

    if leftovers:
        print(f"\n❓ UNMATCHED files (will stay in original directory):")
        for stray in leftovers:
            print(f"   {stray.name}")

    print(f"\n📊 Summary:")
    print(f"   Total files to move: {planned}")
    print(f"   Unmatched files: {len(leftovers)}")

    if dry_run:
        print(f"\n💡 This was a dry run. To actually move files, set dry_run=False")
    else:
        print(f"\n🎉 File splitting completed!")


# Example usage
if __name__ == "__main__":
    # Directory whose files will be split
    source_directory = "ribo"  # Change this to your actual path

    print("🚀 File Splitter Tool")
    print("=" * 60)

    # Optional preview: uncomment the two lines below to list what would be
    # moved without touching any file.
    # print("1️⃣  DRY RUN - Showing what would be moved:")
    # split_files_by_type(source_directory, dry_run=True)

    # WARNING: the call below is live (dry_run=False) and actually moves
    # your files. Comment it out if you only want the preview.

    print("\n" + "=" * 60)
    print("2️⃣  ACTUAL SPLITTING - This will move your files:")
    split_files_by_type(source_directory, dry_run=False)