# try extractor

In [1]:
from src.news_extractor import PDFTextExtractor, NewsPDFExtractor
from rich import print

news_file = "data/tages-news-2111.pdf"

# Pull the raw text of every page so we can eyeball what the extractor sees.
extractor = PDFTextExtractor(news_file)
pages = extractor.extract_text()

# NOTE(review): `print` here is rich's print (imported above), which presumably
# interprets [bracketed] text as console markup -- confirm previews that
# contain square brackets render as expected.
for i, p in enumerate(pages):
    if p is not None:
        # Page entry present: report its stripped length and a short repr
        # preview (first 200 characters) so whitespace/control chars are visible.
        text = p.strip()
        print(f"Page {i + 1}: length={len(text)} | preview={repr(text[:200])}")
        continue
    # Entry is None: flag the page explicitly instead of skipping it.
    print(f"Page {i + 1}: <None>")

In [3]:
if __name__ == "__main__":
    # Run the table-aware extractor on the sample PDF and show the first table.
    extractor = NewsPDFExtractor("data/tages-news-2111.pdf")
    news_data = extractor.extract()
    # The extraction result is indexed by "tables" (a missing key still raises
    # KeyError, since that would indicate an unexpected result shape).
    tables = news_data["tables"]
    if tables:
        print(tables[0])
    else:
        # Previously `tables[0]` raised IndexError when nothing was extracted.
        print("No tables found in the PDF.")

In [None]:
def _cell_contains(value, needle: str, case_sensitive: bool) -> bool:
    """Return True if ``str(value)`` contains ``needle`` as a literal substring.

    ``needle`` must already be lower-cased when ``case_sensitive`` is False.
    Values whose ``str()`` raises are treated as non-matching.
    """
    try:
        text = str(value)
    except Exception:
        # Best effort: an object with a broken __str__ simply cannot match.
        return False
    if not case_sensitive:
        text = text.lower()
    return needle in text


def _table_cells(tbl, pd):
    """Return a flat list of the cells of ``tbl``, or None if its shape is unknown.

    Recognised shapes are a pandas DataFrame (only when ``pd`` is the imported
    pandas module) and a list/tuple whose items are either rows (list/tuple of
    cells) or single cells.
    """
    if pd is not None and isinstance(tbl, pd.DataFrame):
        try:
            # Stringify the values only; index and column labels are not cells.
            return tbl.astype(str).to_numpy().ravel().tolist()
        except Exception:
            # Best effort: if pandas cannot stringify the frame, let the caller
            # fall back to matching against the frame's string form.
            return None
    if isinstance(tbl, (list, tuple)):
        cells = []
        for row in tbl:
            if isinstance(row, (list, tuple)):
                cells.extend(row)
            else:
                # A bare item at row level is treated as a single cell.
                cells.append(row)
        return cells
    return None


def find_tables_containing(news_data: dict, search: str, case_sensitive: bool = False) -> list:
    """Return ``(index, table)`` pairs for tables with a cell containing ``search``.

    ``search`` is always matched as a literal substring (never as a regular
    expression) against ``str(cell)`` of every cell.

    Supported table shapes:
      * pandas ``DataFrame`` -- cell values only; index/column labels are ignored.
      * list/tuple of rows (lists/tuples) or of single cells.
      * any other object -- matched against ``str(table)`` as a last resort.

    Args:
        news_data: Mapping with a ``"tables"`` entry holding an iterable of
            tables. A falsy ``news_data`` or a missing/``None`` ``"tables"``
            entry yields an empty result.
        search: Text to look for.
        case_sensitive: When False (default) both sides are lower-cased
            before comparison.

    Returns:
        List of ``(index, table)`` tuples in the original table order, where
        ``index`` is the position of the table within ``news_data["tables"]``.
    """
    if not news_data:
        return []
    tables = news_data.get("tables") or []

    # pandas is optional: without it only list-like tables and the string
    # fallback are available.
    try:
        import pandas as pd
    except ImportError:
        pd = None

    needle = search if case_sensitive else search.lower()
    matches = []
    for i, tbl in enumerate(tables):
        cells = _table_cells(tbl, pd)
        if cells is None:
            # Unknown table-like object: treat its string form as one cell.
            cells = [tbl]
        if any(_cell_contains(cell, needle, case_sensitive) for cell in cells):
            matches.append((i, tbl))
    return matches


# Example usage in the notebook (run after you have `news_data` in scope):
# search_str = "Berlin"
# matches = find_tables_containing(news_data, search_str)
# print(f"Found {len(matches)} matching table(s).")
# if matches:
#     idx, table = matches[0]
#     print("First match index:", idx)
#     try:
#         import pandas as pd
#         if pd is not None and isinstance(table, pd.DataFrame):
#             display(table.head())
#         else:
#             for row in (table[:5] if isinstance(table, (list, tuple)) else [table]):
#                 print(row)
#     except Exception:
#         print(table)
