In [None]:
import docx
print(docx.__file__)


### CONVERT DOC TO DOCX!!!


In [None]:
!chmod +x LibreOffice_7.6.4_Linux_x86-64.AppImage
!./LibreOffice_7.6.4_Linux_x86-64.AppImage --headless --convert-to docx your_file.doc


In [None]:
!module load libreoffice



### CONVERT DOC TO DOCX


In [None]:
###CONVERT DOC TO DOCX!!!

import os
import subprocess

def force_quit_libreoffice():
    """Kill every process whose command line matches ``soffice`` (via ``pkill -f``).

    Prints whether ``pkill`` exited with status 0 or with a non-zero status.
    """
    # `pkill` exits non-zero when it matched nothing, so the exit status alone
    # decides which message is printed.
    outcome = subprocess.run(["pkill", "-f", "soffice"])
    if outcome.returncode == 0:
        print("Force quit LibreOffice successfully.")
    else:
        print("No LibreOffice processes were found to quit.")

def convert_doc_to_docx(input_path, output_path):
    """Convert a single ``.doc`` file to ``.docx`` with ``unoconv``.

    Parameters:
    - input_path: path of the source ``.doc`` file.
    - output_path: path the ``.docx`` file should be written to.

    Returns:
    - bool: True when ``unoconv`` exited with status 0, False on a non-zero
      exit status, a timeout, or any other error while launching the command.
      (Earlier versions returned None; callers that ignore the result are
      unaffected.)

    LibreOffice is force-quit after every attempt, successful or not.
    """
    print(f"Starting conversion of {input_path} to {output_path}...")
    succeeded = False

    try:
        # Run the conversion command and capture output and errors
        result = subprocess.run(
            ['unoconv', '-v', '-f', 'docx', '-o', output_path, input_path],
            stdout=subprocess.PIPE,  # Capture standard output
            stderr=subprocess.PIPE,  # Capture standard error
            timeout=600  # Timeout after 10 minutes (adjust as needed)
        )

        # Check if the command was successful (exit code 0)
        if result.returncode != 0:
            # errors='replace' keeps the real failure visible even when the
            # tool emits bytes that are not valid UTF-8.
            print(f"Error converting {input_path}: {result.stderr.decode(errors='replace')}")
        else:
            print(f"Successfully converted {input_path} to {output_path}")
            succeeded = True

    except subprocess.TimeoutExpired:
        print(f"Conversion of {input_path} timed out after 10 minutes")
    except Exception as e:
        print(f"An error occurred while converting {input_path}: {e}")

    # Force quit LibreOffice to free up resources after each conversion
    force_quit_libreoffice()
    return succeeded

def convert_all_docs_in_dir(directory_path):
    """Convert every ``.doc`` file directly inside ``directory_path`` to ``.docx``.

    The converted file is written next to the source with the same stem. The
    original ``.doc`` is deleted only when a non-empty ``.docx`` exists after
    the conversion attempt; previously it was removed unconditionally, which
    destroyed the source whenever the conversion failed or timed out.

    Parameters:
    - directory_path: directory to scan (not recursive).
    """
    for filename in os.listdir(directory_path):
        if not filename.endswith('.doc'):
            continue

        input_file = os.path.join(directory_path, filename)
        output_file = os.path.join(directory_path, f"{os.path.splitext(filename)[0]}.docx")

        # Convert the file and save it with the new .docx extension
        convert_doc_to_docx(input_file, output_file)

        # Verify the result on disk rather than trusting the converter, so the
        # source is never deleted without a usable replacement.
        if os.path.isfile(output_file) and os.path.getsize(output_file) > 0:
            os.remove(input_file)
            print(f"Converted {filename} to {os.path.basename(output_file)}")
        else:
            print(f"Conversion of {filename} produced no output; keeping the original file")

# Example usage:
# NOTE: this runs as soon as the cell executes. It converts every `.doc` file
# found directly inside the directory below and may delete the original `.doc`
# files, so point it at a copy of the data if the sources must be kept.
doc_directory ='/home/liorkob/M.Sc/thesis/data/drugs/drugs doc'
convert_all_docs_in_dir(doc_directory)


### UTILS


In [None]:
# pip install stanza
# import stanza
# stanza.download('he') 


In [None]:
import stanza
# nlp = stanza.Pipeline('he', use_gpu=False)
nlp = stanza.Pipeline('he', processors='tokenize', use_gpu=True)

import re
from docx.text.paragraph import Paragraph
from docx import Document

from docx.table import _Cell, Table
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl

import sys
print(sys.executable)

# Modify property of Paragraph.text to include hyperlink text
# NOTE: this monkey-patches the Paragraph class for the whole session. The
# replacement property defines only a getter, so assigning to `paragraph.text`
# raises AttributeError once this line has run. The lambda defers the lookup of
# get_paragraph_text, which is defined further down in this cell.
Paragraph.text = property(lambda self: get_paragraph_text(self))

def get_paragraph_text(paragraph) -> str:
    """
    Extract text from paragraph, including hyperlink text.

    Walks the direct children of ``paragraph._p`` in document order. For each
    ``w:r`` child the text of the matching entry in ``paragraph.runs`` is used;
    for each ``w:hyperlink`` child the text of its ``w:r`` children is appended.
    All other children are ignored.

    Parameters:
    - paragraph: object exposing ``_p`` (iterable of XML elements) and ``runs``.

    Returns:
    - str: the concatenated text.
    """
    def get_xml_tag(element):
        # Fall back to the raw tag when it carries no "{namespace}" part
        # instead of failing on a None match.
        match = re.match("{.*}(.*)", element.tag)
        local_name = match.group(1) if match else element.tag
        return "%s:%s" % (element.prefix, local_name)

    # Fetch the run list once: reading `paragraph.runs` inside the loop
    # re-evaluates the property for every child.
    runs = paragraph.runs
    pieces = []
    run_count = 0
    for child in paragraph._p:
        # Comments / processing instructions expose a non-string tag; skip them.
        if not isinstance(child.tag, str):
            continue
        tag = get_xml_tag(child)
        if tag == "w:r":
            pieces.append(runs[run_count].text)
            run_count += 1
        elif tag == "w:hyperlink":
            for sub_child in child:
                if isinstance(sub_child.tag, str) and get_xml_tag(sub_child) == "w:r":
                    pieces.append(sub_child.text)
    return "".join(pieces)


def is_paragraph_bold(block) -> bool:
    """Return True when the block's style has a font whose ``bold`` flag is truthy."""
    style = block.style
    font = style.font if style else None
    return bool(font and font.bold)

def is_block_bold(block) -> bool:
    """Return True when the block's *style* marks it as bold.

    A block qualifies when its style name is exactly one of the heading-like
    names listed below (including the Hebrew "כותרת"), or when the style's
    font has a truthy ``bold`` flag. Individual runs are not inspected.
    """
    style = block.style
    if not style:
        return False

    # Heading-like style names count as bold regardless of the font flag.
    if style.name in ("כותרת", "Heading", "Title"):
        return True

    font = style.font
    return bool(font and font.bold)
def is_run_bold(run) -> bool:
    """
    Report the first explicitly set bold flag of a run.

    Checked in order: ``run.bold``, ``run.font.bold``, then the complex-script
    flag ``run.font.cs_bold``. The first value that is not None is returned;
    False is returned when none of them is set.
    """
    direct = run.bold
    if direct is not None:
        return direct

    font = run.font
    if font:
        for attribute in ("bold", "cs_bold"):
            flag = getattr(font, attribute)
            if flag is not None:
                return flag
    return False

def is_block_styled(block) -> bool:
    """
    Heuristically decide whether a paragraph-like block looks like a styled heading.

    Rules implemented below:
    - A block without a non-empty ``runs`` attribute, or whose runs hold no text, is not styled.
    - A block with fewer than 4 whitespace-separated words is always treated as styled,
      whatever its formatting.
    - Otherwise only "meaningful" runs are inspected (non-blank and containing at least one
      alphanumeric character); a leading prefix run (e.g., 'א.', '1.', 'א)', '1)') is ignored.
    - If the combined text ends with ':', '.' or ',', the remaining runs must be uniformly
      bold or uniformly underlined; otherwise each remaining run may be either bold or
      underlined.

    Returns:
        True if the block counts as styled under these rules, otherwise False.
    """
    if hasattr(block, "runs") and block.runs:
        # Combine text from all meaningful runs
        combined_text = " ".join(run.text.strip() for run in block.runs if run.text.strip()).strip()
        
        # Handle empty text
        if not combined_text:
            return False
        
        # Check word count
        word_count = len(combined_text.split())
        if word_count < 4:
            # print(combined_text)
            return True  # Short blocks (fewer than 4 words) count as styled without any formatting check


        # Identify meaningful runs: Ignore runs that are empty or contain only spaces/non-alphanumeric characters
        meaningful_runs = [run for run in block.runs if run.text.strip() and any(c.isalnum() for c in run.text)]

        if not meaningful_runs:
            return False

        # Check if the first run is a prefix (e.g., "א.", "1.", "א)", "1)")
        first_run_text = meaningful_runs[0].text.strip()
        is_prefix = bool(re.match(r'^[\u0590-\u05FF]\.|^[\u0590-\u05FF]\)|^\d+\.|^\d+\)', first_run_text))

        # Allow the first run to differ in style if it's a valid prefix
        # NOTE: if the prefix is the only meaningful run, runs_to_check is empty and the
        # all(...) calls below evaluate to True.
        runs_to_check = meaningful_runs[1:] if is_prefix else meaningful_runs

        # Check if all remaining runs are styled as bold or underlined
        # NOTE: the `run.text in [":", ".", ","]` exemption cannot trigger here, because
        # punctuation-only runs were already dropped from meaningful_runs (no alphanumerics).
        all_bold =is_block_bold(block) or all(is_run_bold(run) or run.text in [":", ".", ","] for run in runs_to_check)
        all_underlined = all(run.underline is True or run.text in [":", ".", ","] for run in runs_to_check)

        # Text ending in punctuation: require uniform styling (all bold OR all underlined).
        if combined_text[-1] in [":", ".", ","]:
            return all_bold or all_underlined
        else:
            # No trailing punctuation: a mix is accepted, each run only needs to be bold or underlined.
            return is_block_bold(block) or all(is_run_bold(run) or run.underline is True for run in runs_to_check)

    return False



def iterate_block_items(parent):
    """
    Yield the paragraphs of a document or table cell in document order.

    Tables are not yielded themselves: each one is expanded row by row and cell
    by cell, and the paragraphs inside every cell are yielded recursively.
    A parent that is neither document-like (``element.body``) nor cell-like
    (``_tc``) produces a printed warning and yields nothing.

    NOTE(review): merged table cells may be reported once per spanned column by
    ``row.cells``, in which case their paragraphs would repeat — confirm.
    """
    if hasattr(parent, "element") and hasattr(parent.element, "body"):
        container = parent.element.body
    elif hasattr(parent, "_tc"):
        container = parent._tc
    else:
        print(f"Unsupported parent type: {type(parent)}")
        return

    for node in container.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
            continue
        if not isinstance(node, CT_Tbl):
            continue
        # Flatten the table into its cells, then recurse into each cell.
        cells = (cell for row in Table(node, parent).rows for cell in row.cells)
        for cell in cells:
            yield from iterate_block_items(cell)

def extract_part_after_number_or_hebrew_letter(sentence: str) -> str:
    """
    Strip a leading enumeration marker from a sentence.

    When the sentence starts with a run of digits and/or Hebrew letters
    followed by a dot, the (stripped) remainder of that line is returned;
    otherwise the sentence is returned unchanged.
    """
    found = re.match(r'^(?:[0-9\u05D0-\u05EA]+)\.\s*(.*)', sentence)
    if found is None:
        return sentence
    return found.group(1).strip()

def count_patterns_in_block(block) -> int:
    """
    Count occurrences of "digits followed by a dot" or "dot followed by digits"
    in ``block.text``.
    """
    numbering = r'\s*(?:\.\d+|\d+\.)'
    return sum(1 for _ in re.finditer(numbering, block.text))

def count_consecutive_blocks_starting_with_number(blocks) -> int:
    """
    Sum the numbering patterns found across a sequence of blocks.

    Blocks are scanned in order. If a block's text contains 'הנאשם' the
    function returns 1 immediately. Otherwise the block's pattern count (see
    count_patterns_in_block) is added to the total, and scanning stops after
    the first block whose text contains 'חקיקה שאוזכרה'.
    """
    total = 0
    for block in blocks:
        text = block.text
        if 'הנאשם' in text:
            return 1
        total += count_patterns_in_block(block)
        if 'חקיקה שאוזכרה' in text:
            break
    return total

def extract_name_after_word(text: str, word: str) -> str:
    """
    Extract the Hebrew text that follows the first occurrence of ``word``.

    After ``word`` an optional comma and any whitespace are skipped; the result
    is the following run of Hebrew characters, whitespace, apostrophes,
    parentheses and hyphens (trailing whitespace is not stripped).

    Parameters:
    - text: text to search.
    - word: literal marker to look for. It is escaped before being placed in
      the regex, so markers containing characters such as ``.`` or ``(``
      match literally and cannot shift the capture group.

    Returns:
    - str: the captured text, or '' when ``word`` is not followed by any
      matching character.
    """
    pattern = re.compile(fr'{re.escape(word)}(?:,)?\s*([\u0590-\u05FF\s\'\(\)-]+)')
    match = pattern.search(text)
    return match.group(1) if match else ''

def extract_violations(text: str) -> list:
    """
    Find statute references in the text.

    A reference starts with a section marker (סעיף / סעיפים / ס' / סע') and a
    number, and runs up to and including the law it is attached to
    (ב/ל + חוק/פקודת, followed by an optional law name). Each match is
    returned with surrounding whitespace removed, in order of appearance.
    """
    violation_re = re.compile(
        r"(?:סעיף|סעיפים|ס'|סע')\s*\d+\s*(?:\([\s\S]*?\))?.*?(?=\s*(?:ב|ל)(?:חוק|פקודת))\s*(?:ב|ל)(?:חוק|פקודת)\s*ה?(?:עונשין|כניסה לישראל|סמים\s+המסוכנים|\w+)?"
    )
    # The pattern has no capturing groups, so group(0) is what findall returned.
    return [found.group(0).strip() for found in violation_re.finditer(text)]

### RENAME DOCX TO VERDICT NUMBER


In [None]:
import re
import pandas as pd

# Hebrew legal case-type acronyms, written without quotation marks or dots; used to build the citation regex
acronyms = [
    "אב", "אבע", "אימוצ", "אמצ", "אפ", "אפח", "את", "אתפ", "באפ", "באש", "בבנ", "בגצ", "בדא", "בדמ",
    "בדמש", "בהנ", "בהע", "בהש", "בידמ", "בידע", "בל", "בלמ", "במ", "בעא", "בעח", "בעמ", "בעק", "בפ",
    "בפמ", "בפת", "בצא", "בצהמ", "בק", "בקמ", "בקשה", "ברמ", "ברע", "ברעפ", "ברש", "בש", "בשא",
    "בשגצ", "בשהת", "בשז", "בשמ", "בשע", "בשפ", "בתת", "גזז", "גמר", "גפ", "דבע", "דח", "דט", "דיונ",
    "דמ", "דמר", "דמש", "דנ", "דנא", "דנגצ", "דנמ", "דנפ", "הד", "הדפ", "הוצלפ", "הט", "הכ", "המ",
    "המד", "הממ", "המע", "המש", "הנ", "הסת", "הע", "העז", "הפ", "הפב", "הפמ", "הצמ", "הש", "השא",
    "השגצ", "השפ", "השר", "הת", "וחק", "וע", "ושמ", "ושק", "ושר", "זי", "חא", "חבר", "חד", "חדא",
    "חדלפ", "חדלת", "חדמ", "חדפ", "חהע", "חי", "חנ", "חסמ", "חעמ", "חעק", "חש", "יוש", "ייתא", "ימא",
    "יס", "כצ", "מ", "מא", "מבכ", "מבס", "מונופולינ", "מזג", "מח", "מחוז", "מחע", "מט", "מטכל", "מי",
    "מיב", "מכ", "ממ", "מס", "מסט", "מעי", "מעת", "מקמ", "מרכז", "מת", "נ", "נב", "נבא", "נמ", "נמב",
    "נעד", "נער", "סבא", "סע", "סעש", "סק", "סקכ", "ע", "עא", "עאח", "עאפ", "עב", "עבאפ", "עבז", "עבח",
    "עבי", "עבל", "עבמצ", "עבעח", "עבפ", "עבר", "עבשהת", "עגר", "עדי", "עדמ", "עהג", "עהס", "עהפ",
    "עו", "עורפ", "עז", "עח", "עחא", "עחדלפ", "עחדפ", "עחדת", "עחהס", "עחע", "עחק", "עחר", "עכב",
    "על", "עלא", "עלבש", "עלח", "עלע", "עמ", "עמא", "עמה", "עמז", "עמח", "עמי", "עמלע", "עממ", "עמנ",
    "עמפ", "עמצ", "עמק", "עמרמ", "עמש", "עמשמ", "עמת", "ענ", "ענא", "ענמ", "ענמא", "ענמש", "ענפ",
    "עסא", "עסק", "עע", "עעא", "עעמ", "עער", "עעתא", "עפ", "עפא", "עפג", "עפהג", "עפמ", "עפמק",
    "עפנ", "עפס", "עפספ", "עפע", "עפר", "עפת", "עצמ", "עק", "עקג", "עקמ", "עקנ", "עקפ", "ער", "ערא",
    "ערגצ", "ערמ", "ערעור", "ערפ", "ערר", "עש", "עשא", "עשמ", "עשר", "עשת", "עשתש", "עת", "עתא",
    "עתמ", "עתפב", "עתצ", "פא", "פה", "פל", "פלא", "פמ", "פמר", "פעמ", "פקח", "פר", "פרק", "פשז",
    "פשר", "פת", "צא", "צבנ", "צה", "צו", "צח", "צמ", "קג", "קפ", "רחדפ", "רמש", "רע", "רעא", "רעב",
    "רעבס", "רעו", "רעמ", "רעס", "רעפ", "רעפא", "רעצ", "רער", "רערצ", "רעש", "רעתא", "רצפ", "רתק",
    "ש", "שבד", "שמ", "שמי", "שנא", "שע", "שעמ", "שק", "שש", "תא", "תאדמ", "תאח", "תאמ", "תאק", "תב",
    "תבכ", "תבע", "תג", "תגא", "תד", "תדא", "תהג", "תהנ", "תהס", "תוב", "תוח", "תח", "תחפ", "תחת",
    "תט", "תי", "תכ", "תלא", "תלב", "תלהמ", "תלפ", "תלתמ", "תמ", "תמהח", "תממ", "תמק", "תמר",
    "תמש", "תנג", "תנז", "תע", "תעא", "תעז", "תפ", "תפב", "תפח", "תפחע", "תפכ", "תפמ", "תפע",
    "תפק", "תצ", "תק", "תקח", "תקמ", "תרמ", "תת", "תתח", "תתע", "תתעא", "תתק"
]

def create_acronym_variants(acronyms):
    """Build a regex alternation covering the written forms of each acronym.

    Acronyms of a single letter are skipped. For every other acronym, and —
    when it starts with ב, ו or ה — also for the acronym without that first
    letter (if still longer than one letter), three alternatives are emitted:

    1. a quote (", ' or ״) or a dot before the last letter,
    2. a dot between every pair of letters,
    3. the bare letters.

    Duplicate alternatives are dropped, keeping the first occurrence, so the
    resulting regex matches exactly what it did before while being shorter.

    Parameters:
    - acronyms: iterable of acronym strings (expected to contain no regex
      metacharacters; they are inserted unescaped).

    Returns:
    - str: the alternatives joined with ``|`` (no enclosing group), or '' when
      nothing qualifies.
    """
    acronym_variants = []
    for a in acronyms:
        if len(a) <= 1:
            continue

        # The acronym itself, plus the form without a leading ב / ו / ה.
        candidates = [a]
        if a.startswith('ב') or a.startswith('ו') or a.startswith('ה'):
            candidates.append(a[1:])

        for acr in candidates:
            if len(acr) <= 1:
                continue
            # Standard quote/dot before last letter
            quoted = rf"{acr[:-1]}[\"'״]{acr[-1]}"
            with_dot = rf"{acr[:-1]}\.{acr[-1]}"
            acronym_variants.append(f"(?:{quoted}|{with_dot})")

            # Dot-separated variant (raw string: '\.' is not a valid str escape)
            acronym_variants.append(r'\.'.join(acr))
            # Bare acronym without any punctuation
            acronym_variants.append(acr)

    # dict.fromkeys removes repeats while preserving first-seen order, which is
    # the order regex alternation tries them in.
    return '|'.join(dict.fromkeys(acronym_variants))
        
# Alternation of every acronym spelling; it contains only non-capturing groups,
# so it adds no capture groups to citation_pattern below.
acronym_pattern = create_acronym_variants(acronyms)

# Ensure the numbers follow the correct format
# NOTE: this is a plain string fragment meant to be embedded in a re.VERBOSE
# pattern. The third alternative is a subset of the first one (which already
# accepts '-' as a separator), so it never contributes a match of its own.
number_pattern = r'''
    (?:
        \d{1,6}[-/]\d{2}[-/]\d{2}  # Format: 31067-11-11
        | \d{1,6}[-/]\d{1,6}         # Format: 895/09
        | \d{1,6}-\d{2}-\d{2}        # Format: 31067-11-11 (hyphenated)
    )
'''
# Capture groups of the assembled pattern, in order:
#   1 = optional Hebrew prefix letter, 2 = acronym, 3 = parenthesised court
#   location including the parentheses, 4 = the text inside them, 5 = case number.
# NOTE: despite the in-pattern remark, the separator before the case number is
# optional: every part of `\s*[-/]?\s*` may match the empty string.
citation_pattern = fr'''
    (?<!\w)                      # Ensure no letter before
    ([א-ת]?)                     # Optional single Hebrew prefix letter (but no isolated matches)
    ({acronym_pattern})           # Captures acronym (short & long)
    \.?                          # Optional dot after acronym
    \s*                          # Optional spaces
    (\((.*?)\))?                  # Optional court location in parentheses
    \s*[-/]?\s*                  # Required space or separator before case number
    ({number_pattern})            # Captures case number formats
    (?!\w)                       # Ensure no letter after
'''.strip()

# Compile regex with verbose flag for readability
citation_regex = re.compile(citation_pattern, re.VERBOSE)


In [None]:
import os
import re
import unicodedata
import pandas as pd
from docx import Document
import re

def clean_hebrew_verdict_text(text: str) -> str:
    """Tidy text gathered from the top of a verdict document.

    Steps, in order: drop repeated '<<'-separated segments (first occurrence
    wins), remove angle brackets, remove backslashes (keeping escaped
    apostrophes as plain apostrophes), collapse a doubled 'נגד', and squeeze
    runs of whitespace into single spaces.
    """
    # De-duplicate the '<<'-delimited segments while preserving their order.
    seen = set()
    unique_segments = []
    for segment in text.split('<<'):
        if segment not in seen:
            seen.add(segment)
            unique_segments.append(segment)
    result = '<<'.join(unique_segments)

    # Angle-bracket clean-up, applied strictly in this order.
    bracket_rules = (
        (r'[<]{2,}', '<'),   # collapse runs of '<'
        (r'[>]{2,}', '>'),   # collapse runs of '>'
        (r'<\s*>', ''),      # drop empty bracket pairs
        (r'[<>]', ''),       # drop any bracket that is left
    )
    for pattern, replacement in bracket_rules:
        result = re.sub(pattern, replacement, result)

    # Escape characters: keep the apostrophe, discard every other backslash.
    result = result.replace("\\'", "'").replace('\\', '')

    # A doubled "versus" becomes a single one.
    result = re.sub(r'נגד\s+נגד', 'נגד', result)

    return re.sub(r'\s{2,}', ' ', result).strip()

# --- Normalize & regex helper functions ---

def normalize_text(text):
    """Apply NFKC normalization, turn no-break spaces into plain spaces, remove
    right-to-left marks (U+200F) and strip surrounding whitespace."""
    replacements = {0x00A0: " ", 0x200F: None}
    normalized = unicodedata.normalize("NFKC", text)
    return normalized.translate(replacements).strip()

def normalize_case_name(case_name):
    """Make a case name usable as a file name: swap '/' for the division slash
    '∕', collapse whitespace runs to one space and trim both ends."""
    without_slashes = case_name.replace('/', "∕")
    single_spaced = re.sub(r'\s+', ' ', without_slashes)
    return single_spaced.strip()

# --- Citation extraction ---
def extract_citation_from_docx(docx_path):
    """Extract the case citation from the header of a ``.docx`` file.

    Only the header of the first section is searched. An earlier revision also
    collected and cleaned the first non-empty body blocks, but that text was
    never searched, so the unused scan has been removed.

    Parameters:
    - docx_path: path of the document to open.

    Returns:
    - str: the citation assembled from the regex groups (prefix, acronym,
      court location, case number) joined by spaces, or
    - None: when the header contains no citation, or when the result is just
      'על' followed by a number.
    """
    doc = Document(docx_path)
    head = "".join(paragraph.text for paragraph in doc.sections[0].header.paragraphs)

    match = citation_regex.search(head)
    if not match:
        return None

    # Unmatched optional groups are None and are dropped before joining.
    citation = " ".join(map(str, filter(pd.notna, match.groups()))).strip()
    # Drop a leading ב / ו / ר letter from the assembled citation.
    if citation and citation[0] in "בוור":
        citation = citation[1:].lstrip()
    if re.match(r"^על \d+$", citation):
        return None
    # The court location is captured twice (with and without parentheses);
    # fold "(X) X" back into "(X)".
    citation = re.sub(r"\((.*?)\)\s+\1", r"(\1)", citation)
    return citation

# --- CSV file ---
# csv_path = "/home/liorkob/M.Sc/thesis/data/drugs/similarity_gt_drugs.csv"
# csv_path_2 = "/home/liorkob/M.Sc/thesis/data/drugs/processed_verdicts_with_gpt.csv"

# df = pd.read_csv(csv_path)
# df_2 = pd.read_csv(csv_path_2)

# --- File renaming ---
# Directory whose .docx files are renamed in place to their case citation.
docx_dir = '/home/liorkob/M.Sc/thesis/data/5k/docx'

for filename in os.listdir(docx_dir):
    if not filename.endswith('.docx'):
        continue

    file_path = os.path.join(docx_dir, filename)
    citation_name = extract_citation_from_docx(file_path)

    if not citation_name:
        print(f'No citation found in "{filename}"')
        continue

    new_filename = normalize_case_name(citation_name) + '.docx'
    if new_filename == filename:
        # Already carries its citation as a name; nothing to do.
        continue

    new_file_path = os.path.join(docx_dir, new_filename)

    # os.rename can silently replace an existing file, so two documents that
    # resolve to the same citation would destroy one another. Keep both.
    if os.path.exists(new_file_path):
        print(f'Skipped "{filename}": "{new_filename}" already exists')
        continue

    # Rename file
    os.rename(file_path, new_file_path)
    print(f'Renamed "{filename}" → "{new_filename}"')

# --- Save updated CSV ---
# df.to_csv(csv_path, index=False)
# print("CSV updated.")


### split: verdict and appeals


In [None]:
import os

# Path to your folder with renamed .docx files
docx_dir = '/home/liorkob/M.Sc/thesis/data/5k/docx'
verdict_dir = os.path.join(docx_dir, 'verdict')
appeals_dir = os.path.join(docx_dir, 'appeals')

# Make sure both destination folders exist before anything is moved.
for target_dir in (verdict_dir, appeals_dir):
    os.makedirs(target_dir, exist_ok=True)

# The first letter of the (already renamed) file decides where it goes:
# 'ת' marks a criminal-case verdict, 'ע' marks an appeal.
destination_by_letter = {'ת': verdict_dir, 'ע': appeals_dir}

for filename in os.listdir(docx_dir):
    if not filename.endswith('.docx'):
        continue

    # A file with this name in either destination means it was handled before.
    already_moved = any(
        os.path.exists(os.path.join(target_dir, filename))
        for target_dir in (verdict_dir, appeals_dir)
    )
    if already_moved:
        continue

    first_letter = filename[0]
    target_dir = destination_by_letter.get(first_letter)
    if target_dir is None:
        print(f'Skipping "{filename}": unknown type')
        continue

    src_path = os.path.join(docx_dir, filename)
    dst_path = os.path.join(target_dir, filename)
    os.rename(src_path, dst_path)
    print(f'Moved "{filename}" to "{dst_path}"')


In [None]:
# from docx import Document

# doc = Document("/home/liorkob/M.Sc/thesis/data/5k/docx/22003050-C08.docx")
# header = doc.sections[0].header
# for paragraph in header.paragraphs:
#     print(paragraph.text)


### Extract verdict from appeals

In [None]:

import os
import re
import zipfile
import pandas as pd
import unicodedata
from openai import OpenAI
from docx import Document
from bs4 import BeautifulSoup

# SECURITY: an OpenAI API key used to be hard-coded on this line. A key that has
# been committed to a file must be treated as leaked: revoke it and issue a new
# one. The key is now read from the environment only (e.g. `export
# OPENAI_API_KEY=...` before starting the notebook).
_openai_api_key = os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running this cell."
    )

client = OpenAI(api_key=_openai_api_key)
# Listing the models doubles as an early check that the key is accepted.
models = client.models.list()
# Citation patterns
# citation_patterns = {
#     'ת"פ': r'ת"פ\s*(\d+[-/]\d+)',
#     'תפ"ח': r'תפ"ח\s*(\d+[-/]\d+)',
#     'ע"פ': r'ע"פ\s*(\d+[-/]\d+)',
#     'בתי"פ': r'בתי\.פ\.\s*(\d+[-/]\d+)',
#     'תי"פ': r'תי\.פ\.\s*(\d+[-/]\d+)',
#     'ת\.פ\.': r'ת\.פ\.\s*(\d+[-/]\d+)',
#     'בת\.פ\.': r'בת\.פ\.\s*(\d+[-/]\d+)',
#     'תיק': r'תיק\s*(\d+[-/]\d+)',
# }

# # Normalize Hebrew text
# def normalize_text(text):
#     """Normalize spaces and special characters in Hebrew text."""
#     return unicodedata.normalize("NFKC", text).replace("\u00A0", " ").strip()

# # Normalize case names
# def normalize_case_name(case_name):
#     """Normalize case names by removing extra spaces and fixing slashes."""
#     return re.sub(r'\s+', ' ', case_name.replace('/', "∕")).strip()

# Extract hyperlinks from DOCX
def getLinkedText(soup):
    """Collect hyperlink texts from a parsed WordprocessingML part.

    Two kinds of links are gathered:
    - ``hyperlink`` elements: stored as ``{"id": <r:id>, "text": ...}``; the
      URL is resolved later from the relationships part. Elements without an
      ``r:id`` attribute are skipped.
    - ``HYPERLINK`` field codes (``instrText``): stored as
      ``{"id": None, "href": <url or None>, "text": ...}``, where the text is
      gathered from the sibling runs that follow the instruction, up to the
      run holding the field's ``end`` marker.

    Parameters:
    - soup: BeautifulSoup object of the XML part.

    Returns:
    - list[dict]: the collected link records, element links first.
    """
    links = []
    for tag in soup.find_all("hyperlink"):
        try:
            links.append({"id": tag["r:id"], "text": tag.text})
        except KeyError:
            # No relationship id on this hyperlink element: nothing to resolve.
            pass

    for tag in soup.find_all("instrText"):
        if "HYPERLINK" not in tag.text:
            continue

        parts = tag.text.split('"')
        if len(parts) > 1:  # Ensure the URL exists before accessing index 1
            url = parts[1]
        else:
            print(f"⚠️ Warning: No valid URL found in HYPERLINK tag: {tag.text}")
            url = None  # Assign None if URL is missing

        text_parts = []
        sibling = tag.parent.next_sibling
        while sibling is not None:
            # Bare strings between runs are not elements; on them `.find` would
            # be the str method, so only real tags are inspected.
            if getattr(sibling, "name", None) is not None:
                maybe_text = sibling.find("t")
                if maybe_text is not None and maybe_text.text.strip() != "":
                    text_parts.append(maybe_text.text.strip())
                # find() takes a tag name, not a CSS selector: the former
                # "fldChar[w:fldCharType]" lookup never matched, so the field
                # end was missed and all following runs leaked into the text.
                maybe_end = sibling.find("fldChar")
                if maybe_end is not None and maybe_end.get("w:fldCharType") == "end":
                    break
            sibling = sibling.next_sibling

        links.append({"id": None, "href": url, "text": "".join(text_parts)})
    return links
def getURLs(soup, links):
    """Fill in the ``href`` of links that only carry a relationship id.

    Parameters:
    - soup: parsed relationships part; its ``Relationship`` elements provide
      ``Id`` and ``Target`` attributes.
    - links: list of link dicts as produced by getLinkedText. Updated in place.

    Returns:
    - the same ``links`` list. Links that already have an ``href`` key are left
      alone; links whose id has no matching relationship get no ``href`` key.
    """
    targets = None
    for link in links:
        if "href" in link:
            continue
        if targets is None:
            # Build the Id -> Target table once instead of rescanning every
            # relationship for every link. Later duplicates win, matching the
            # old scan; relationships without an Id are ignored.
            targets = {
                rel.get("Id"): rel.get("Target")
                for rel in soup.find_all("Relationship")
                if rel.get("Id") is not None
            }
        if link["id"] in targets:
            link["href"] = targets[link["id"]]
    return links

import zipfile

def extract_hyperlinks(docx_path):
    """
    Extracts hyperlinks from a .docx file and returns a dictionary
    where the linked text is mapped to its corresponding URL.

    Links are read from the main document part and, when present, from the
    footnotes part. URLs referenced by relationship id are resolved through the
    matching ``.rels`` part; when that part is missing the links are still
    returned, with ``None`` as URL for the unresolved ones.

    Parameters:
    - docx_path: path of the .docx file.

    Returns:
    - dict: ``{linked_text: url_or_None}``; if the same text occurs more than
      once the last occurrence wins. Empty when the file is not a valid ZIP
      archive or has no ``word/document.xml``.
    """
    # Open the .docx file as a zip archive
    try:
        archive = zipfile.ZipFile(docx_path, "r")
    except zipfile.BadZipFile:
        print(f"❌ Error: Cannot open {docx_path} (Bad ZIP format)")
        return {}

    # The context manager guarantees the archive handle is closed on every path.
    with archive:
        # Extract main document XML
        try:
            file_data = archive.read("word/document.xml")
        except KeyError:
            print(f"⚠️ Warning: No document.xml found in {docx_path}")
            return {}
        links = getLinkedText(BeautifulSoup(file_data, "xml"))

        # Extract hyperlink relationships from _rels/document.xml.rels
        try:
            url_data = archive.read("word/_rels/document.xml.rels")
        except KeyError:
            print(f"⚠️ Warning: No _rels/document.xml.rels found in {docx_path}")
        else:
            links = getURLs(BeautifulSoup(url_data, "xml"), links)

        # Extract footnotes (if available)
        try:
            footnote_data = archive.read("word/footnotes.xml")
        except KeyError:
            footnote_links = []  # No footnotes found, continue
        else:
            footnote_links = getLinkedText(BeautifulSoup(footnote_data, "xml"))
            # Footnote links are kept even when their relationships part is
            # absent; previously a missing .rels discarded all of them.
            try:
                footnote_url_data = archive.read("word/_rels/footnotes.xml.rels")
            except KeyError:
                pass
            else:
                footnote_links = getURLs(BeautifulSoup(footnote_url_data, "xml"), footnote_links)

    # Convert extracted links to a dictionary: {linked_text: URL}
    return {link["text"]: link.get("href", None) for link in links + footnote_links}

# GPT-based verdict extraction
def extract_verdict_with_gpt(text):
    """Ask the chat model for the verdict an appeal was filed against.

    Parameters:
    - text: opening text of the appeal document.

    Returns:
    - str: the model's answer with surrounding whitespace removed. The prompt
      asks for a bare case reference or the literal "No verdict found"; that
      same literal is returned when the response carries no text content.
    """
    prompt = f"""
Given the text of a legal appeal document, identify and extract **only** the referenced verdict that the appeal was filed against. 

- The verdict typically appears in sentences mentioning 'ערעור על' (appeal on) followed by a reference to a previous court decision.
- **Return only the case reference** (e.g., ת"פ 53715-12-15) without any additional text.
- **If multiple verdicts are mentioned, return only the first valid one.**
- If no referenced verdict is found, return "No verdict found".

**Input Text:**
{text}

**Extracted verdict name (only the case reference):**
"""

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an AI trained to extract legal references accurately."},
            {"role": "user", "content": prompt}
        ]
    )

    content = response.choices[0].message.content
    if content is None:
        # The message content is optional in the API response; calling
        # .strip() on None would raise AttributeError.
        return "No verdict found"
    return content.strip()

def extract_verdict_from_appeal(docx_path):
    """Find the citation of the verdict an appeal document refers to.

    The opening of the document is accumulated block by block; collection stops
    after 100 blocks or once the text contains "בשם המערער" / "בשם המערערת".
    The citation regex is tried on that text first; when it finds nothing the
    text is handed to the GPT-based extractor instead.

    Returns:
    - str: the citation, or whatever the GPT fallback returns.
    """
    stop_markers = ("בשם המערער", "בשם המערערת")
    max_blocks = 100

    document = Document(docx_path)
    first_rows = ""
    for seen, block in enumerate(iterate_block_items(document)):
        # The markers are tested on the accumulated text, before adding the
        # current block.
        if seen == max_blocks or any(marker in first_rows for marker in stop_markers):
            break
        first_rows += normalize_text(block.text) + " "

    match = citation_regex.search(first_rows)
    if match is None:
        # Fallback: use GPT if no citation found
        print("Fallback to GPT for:", docx_path)
        return extract_verdict_with_gpt(first_rows)

    present_groups = [str(group) for group in match.groups() if pd.notna(group)]
    citation = " ".join(present_groups).strip()
    if citation and citation[0] in "בוור":
        citation = citation[1:].lstrip()
    # Collapse the court location that was captured twice: "(X) X" -> "(X)".
    return re.sub(r"\((.*?)\)\s+\1", r"(\1)", citation)

# Processing all DOCX files
def process_docx_files(docx_dir, output_csv):
    """Build the appeal → verdict table for every ``.docx`` file in a folder.

    For each document the referenced verdict is extracted, and the URL of the
    hyperlink whose text equals that verdict is looked up ('' when there is no
    such link). One row per file — ``verdict``, ``appeal`` (file name without
    extension) and ``url`` — is written to ``output_csv`` as UTF-8 with BOM.
    """
    rows = []
    docx_names = (name for name in os.listdir(docx_dir) if name.endswith('.docx'))

    for filename in docx_names:
        appeal_case, _extension = os.path.splitext(filename)
        file_path = os.path.join(docx_dir, filename)
        print(f"Processing: {appeal_case}")

        verdict = extract_verdict_from_appeal(file_path)
        # Hyperlinks are keyed by their visible text, so the verdict string
        # itself is the lookup key.
        url = extract_hyperlinks(file_path).get(verdict, "")

        print("Extracted verdict:", verdict)
        print("Extracted url:", url)

        rows.append({"verdict": verdict, "appeal": appeal_case, "url": url})

    pd.DataFrame(rows).to_csv(output_csv, index=False, encoding="utf-8-sig")
    print(f"✅ Results saved to {output_csv}")

# Input folder of appeal .docx files and the CSV the results are written to.
# NOTE: the CSV lives inside the input folder; it is ignored on later runs because
# process_docx_files only picks up names ending in '.docx'.
docx_dir = f'/home/liorkob/thesis/lcp/data/docx_citations'
output_csv = f"/home/liorkob/thesis/lcp/data/docx_citations/verdict_appeal.csv"

# Runs the whole extraction as soon as this cell executes (documents without a
# regex match are sent to the GPT fallback).
process_docx_files(docx_dir, output_csv)


### PRE-PROCESS

In [None]:
###PRE-PROCESS
from tqdm import tqdm

import os
import re
import pandas as pd
import stanza
# from docx import Document
import logging

"""
This script processes `.docx` verdict files, extracting text from them, identifying and classifying specific sections of the document. It saves the results into CSV files with the extracted text, sections, and metadata for further analysis.

### Key Functionalities:

1. **Text Extraction and Preprocessing**:
   - The script iterates through paragraphs in `.docx` files, using custom functions from `utils.py` to identify specific sections based on formatting (e.g., bold text).
   - The extracted sections are stored in a dictionary along with the corresponding full sentences from the document.

2. **Part Identification**:
   - It processes bolded blocks of text as distinct "parts" or sections (e.g., titles or key sections) and appends them to a list.
   - For each sentence, the script associates it with both the most recent part (stored as `part_single`) and a concatenation of all previous parts (stored as `part_concatenated`).

3. **NLP Processing**:
   - The Hebrew Stanza NLP pipeline is used to split the text into sentences, which are then stored in the output alongside the associated document sections.
   - The script also applies filters to skip short paragraphs and unwanted patterns (e.g., references to certain case types).

4. **Error Handling and Logging**:
   - The script uses Python’s `logging` module to provide informative logs, including handling errors if a document can't be opened or processed.
   - It catches and logs any exceptions during the processing of files.

5. **CSV Output**:
   - For each `.docx` file, the extracted data (including text, section titles, and concatenated sections) is saved to a CSV file.

6. **Recursive Directory Processing**:
   - The script recursively processes `.docx` files in a specified root directory (`selenium_downloads\מרב גרינברג`), saving the results for each file in a corresponding output directory (`outputs\merav_grinberg_preproccsed`).

### Main Functions:

- **doc_to_csv(doc_path: str, result_path: str)**:
   - Processes a single `.docx` file, extracting text and metadata.
   - Saves the results to a CSV file if a result path is provided.

- **run()**:
   - Iterates through all `.docx` files in the root directory.
   - For each file, it calls `doc_to_csv` and saves the resulting DataFrame as a CSV.

### Usage:
The script is executed via the `run()` function, which processes all files in the specified directory. It logs the status and outputs CSV files containing preprocessed data for each document.
"""
# Compiled matcher for case numbers such as 31067-11-11 or 895/09.
# NOTE: this rebinds the name `number_pattern`, which an earlier cell uses as a
# plain string fragment when assembling the citation regex.
number_pattern = re.compile(r'''
    (?:
        \d{1,6}[-/]\d{2}[-/]\d{2}  # Format: 31067-11-11
        | \d{1,6}[-/]\d{1,6}         # Format: 895/09
        | \d{1,6}-\d{2}-\d{2}        # Format: 31067-11-11 (hyphenated)
    )
''', re.VERBOSE)

def should_split_sentence(sentence: str) -> bool:
    """
    Tell whether a sentence may be split further.

    Returns False when the sentence contains a case number (anything
    `number_pattern` matches), so citations are kept in one piece; True otherwise.
    """
    contains_case_number = number_pattern.search(sentence) is not None
    return not contains_case_number

def validate_docx(file_path):
    """Check that a file can be opened as a Word document.

    Prints the outcome and returns True when ``Document(file_path)`` succeeds,
    False when it raises any exception.
    """
    is_valid = False
    try:
        Document(file_path)  # opened only to see whether parsing succeeds
        print("The file is valid.")
        is_valid = True
    except Exception as error:
        print(f"Error validating document: {error}")
    return is_valid
    
def _join_unquoted_sentences(sentence_texts, part):
    """Join sentences with single spaces, dropping quoted passages and repeats of the title.

    Parameters:
    - sentence_texts: iterable of sentence strings belonging to one paragraph.
    - part (str): the current section title; a sentence equal to it is skipped.

    Returns:
    - str: the remaining sentences joined by a single space ('' if none remain).
    """
    kept = []
    in_quote = False
    for raw in sentence_texts:
        text = raw.strip()
        opens = text.startswith('"')
        # A lone '"' cannot both open and close a quotation, so it only counts as an opener.
        closes = len(text) > 1 and text.endswith(('"', '".'))

        if opens or closes or in_quote:
            # Sentences that open, close, or sit inside a quotation are all dropped.
            # A sentence that both opens and closes a quotation is self-contained,
            # so it must not leave the quote state switched on for what follows.
            if closes:
                in_quote = False
            elif opens:
                in_quote = True
            continue

        # Skip a sentence that merely repeats the section title.
        if text == part:
            continue

        kept.append(text)

    return " ".join(kept)


def docToCsv(doc_path: str = None):
    """
    Convert a DOCX document into a DataFrame of text rows tagged with their section title.

    Parameters:
    - doc_path (str): Path to the DOCX document. Despite the None default, a real
      path is required: the file name is derived from it and the document is opened.

    Steps:
    1. Open the document and walk its blocks via `iterate_block_items`.
    2. A styled block (`is_block_styled`) with fewer than 10 space-separated words
       is treated as a section title; a leading enumerator such as "3." or a single
       Hebrew letter followed by "." / ")" is stripped from it.
    3. Any other block is content. Its text is passed through
       `extract_part_after_number_or_hebrew_letter`.
    4. Content that is short (fewer than 10 words) or contains a citation is kept
       as-is. Otherwise it is split into sentences with the module-level `nlp`
       pipeline (Stanza, per the original description), quoted sentences are
       dropped, and the rest is re-joined.
    5. Non-blank content is recorded together with the most recent section title.

    Returns:
    - DataFrame: columns 'verdict' (file name without extension, repeated on every
      row), 'text' and 'part'.
    """
    # Leading enumerator of a title: digits or one Hebrew letter, followed by '.' or ')'.
    title_prefix = re.compile(r'^(?:\d+[.)]|[\u0590-\u05FF][.)])')

    verdict_name = os.path.splitext(os.path.basename(doc_path))[0]
    doc = Document(doc_path)
    part = 'nothing'  # label for content that appears before the first title
    texts = []
    parts = []

    for block in iterate_block_items(doc):
        if is_block_styled(block) and len(block.text.split(' ')) < 10:
            # Section title: remember it (without its enumerator) and emit no row.
            prefix = title_prefix.match(block.text)
            if prefix:
                part = block.text[prefix.end():].strip()
            else:
                # No enumerator to strip; keep the title text unchanged.
                part = block.text
            continue

        extracted_part_text = extract_part_after_number_or_hebrew_letter(block.text)

        if len(extracted_part_text.split()) < 10 or not should_split_sentence(extracted_part_text):
            # Keep the paragraph whole when it is short or contains a citation.
            text = extracted_part_text
        else:
            # Only citation-free paragraphs reach the tokenizer, so no per-sentence
            # citation merging is needed; the sentences are simply re-joined.
            sentences = nlp(extracted_part_text).sentences
            text = _join_unquoted_sentences((s.text for s in sentences), part)

        if text.strip():  # Avoid empty rows
            texts.append(text)
            parts.append(part)

    return pd.DataFrame({'verdict': verdict_name, 'text': texts, 'part': parts})


import os
import logging

def run(root_directory="/home/liorkob/M.Sc/thesis/data/5k/docx/verdict",
        output_dir="/home/liorkob/M.Sc/thesis/data/5k/verdict_csv"):
    """
    Convert every `.docx` file under a directory tree to a CSV file.

    Parameters:
    - root_directory (str): Directory walked recursively for `.docx` files.
    - output_dir (str): Directory receiving one `<file stem>.csv` per document;
      created if missing.

    Files whose CSV already exists are skipped. A failure on one file is logged
    with its traceback and does not stop the run.
    """
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    os.makedirs(output_dir, exist_ok=True)

    for root, _, files in os.walk(root_directory):
        logging.info("Processing directory: %s", root)
        for file in tqdm(files, desc=f"Processing files in {root}"):
            if not file.lower().endswith('.docx'):
                continue

            input_path = os.path.join(root, file)
            # NOTE: outputs are flattened into output_dir, so two documents with the
            # same file name in different sub-directories map to the same CSV and the
            # second one is skipped by the existence check below.
            output_path = os.path.join(output_dir, f"{os.path.splitext(file)[0]}.csv")

            if os.path.exists(output_path):
                logging.info("Output already exists, skipping: %s", output_path)
                continue

            try:
                df = docToCsv(doc_path=input_path)
                df.to_csv(output_path, index=False)
            except Exception as e:
                # Best-effort batch: record the failure (with traceback) and move on.
                logging.exception("Error processing %s: %s", file, e)
            else:
                logging.info("Processed and saved: %s", output_path)


if __name__ == "__main__":
    run()


In [None]:
import pandas as pd
import os

import pandas as pd
import os

def verify_verdict_parts_from_csv(output_directory, required_parts):
    """
    Report which verdict CSV files contain none of the required section titles.

    Parameters:
    - output_directory (str): Directory holding the CSV files (other files are ignored).
    - required_parts (list): Strings to look for; a title matches when it contains
      one of them as a substring.

    Output:
    - Prints the distinct values of the 'part' column for each verdict.
    - Prints the verdicts in which no title matched, or a message saying that
      every verdict has at least one match.
    """
    unmatched = []  # (verdict name, its parts) for verdicts without any required part

    csv_names = (name for name in os.listdir(output_directory) if name.endswith(".csv"))
    for csv_name in csv_names:
        frame = pd.read_csv(os.path.join(output_directory, csv_name))
        verdict_name = os.path.splitext(csv_name)[0]

        print(f"Verifying Verdict: {verdict_name}")

        # Distinct, non-missing titles, coerced to strings so substring tests are safe
        verdict_parts = frame['part'].dropna().astype(str).unique()

        print("  Parts in the verdict:")
        for part in verdict_parts:
            print(f"    - {part}")

        has_required = any(
            required in part
            for required in required_parts
            for part in verdict_parts
        )
        if not has_required:
            unmatched.append((verdict_name, verdict_parts))

        print("-" * 40)

    if not unmatched:
        print("All verdicts have at least one matching part.")
        return

    print("Verdicts with no matching parts:")
    for verdict, parts in unmatched:
        print(f"  - {verdict}")
        print(f"parts: {parts}")

# Directory containing the output CSV files to verify
output_directory = "/home/liorkob/M.Sc/thesis/data/5k/appeals_csv"

# Section titles to look for (matched as substrings of each 'part' value).
# One entry per line with trailing commas: a missing comma between two adjacent
# string literals silently concatenates them into a single entry.
required_parts = [
    "אחידות בענישה",
    "מתחם הענישה",
    "מתחם ענישה",
    "דיון",
    "ענישה נהוגה",
    "הענישה הנוהגת",
    "ענישה נוהגת",
    "מתחם העונש",
    "מתחם עונש",
    "מדיניות הענישה",
    "והכרעה",
    "ההרשעה",
    "מדיניות הענישה הנהוגה",
]
# Alternative set, for checking the indictment/background sections instead:
# required_parts=["הכרעת הדין", "אישום" ,"רקע" ,"כללי" ,"כתב אישום","כתב האישום"]

# Run the verification
verify_verdict_parts_from_csv(output_directory, required_parts)


### Check for multiple defendants

In [None]:
import os
import pandas as pd
from tqdm import tqdm
from docx import Document

import re

# Directory holding the verdict DOCX files to scan
docx_directory = "/home/liorkob/M.Sc/thesis/data/5k/docx/verdict"


# Counters: files whose opening text mentions plural defendants / a single defendant
mention_count = 0
mention_1_count = 0

# Substrings that mark the start of the verdict body; scanning of a document's
# opening text stops at the first block containing any of them.
START_PARTS = [
    "עובדותם", "כללי", "כתב האישום", "האישום", "אישום", "רקע", "גזר", "דין", "פסק","מבוא","הרשעת" ,"בעניינו","עבירות","הורשע","עובדות","השתלשלות", "ג ז ר",  "ד י ן"
]

# Compiled once rather than rebuilt for every block; entries are escaped so they
# are always matched literally.
START_PATTERN = re.compile('|'.join(map(re.escape, START_PARTS)), flags=re.IGNORECASE)

# "הנאשמים:" contains "נאשמים:" (and likewise for the singular form), so the
# un-prefixed spellings cover both variants.
PLURAL_MARKERS = ("נאשמים:", "נאשמים :")
SINGULAR_MARKER = "נאשם"

# Only DOCX files directly inside the directory are examined
file_list = [f for f in os.listdir(docx_directory) if f.lower().endswith(".docx")]

# Process each DOCX file
for filename in tqdm(file_list, desc="Checking for 'הנאשמים' in DOCX files"):
    try:
        file_path = os.path.join(docx_directory, filename)
        doc = Document(file_path)
        first_rows = ""
        i = 0  # number of non-empty body blocks collected so far

        # Start with the page header of the first section; paragraphs are
        # space-separated (like the body blocks below) so words do not fuse.
        header = doc.sections[0].header
        for paragraph in header.paragraphs:
            first_rows += paragraph.text + " "

        # Then collect body blocks until 25 non-empty ones were seen or a
        # start-of-body marker appears (that block itself is still included).
        for block in iterate_block_items(doc):
            if i == 25:
                break
            first_rows += block.text + " "

            if START_PATTERN.search(block.text):
                break

            if block.text != "":
                i += 1

        if any(marker in first_rows for marker in PLURAL_MARKERS):
            # Plural "defendants:" label found
            mention_count += 1
            print(f"\n📌 Found in: {filename}")
            print(first_rows)

        elif SINGULAR_MARKER in first_rows:
            # Singular "defendant" found; counted without printing
            mention_1_count += 1

        else:
            print("NOTHING FOUND")
            print(first_rows)

    except Exception as e:
        # Best-effort batch scan: report the failing file and continue with the next one
        print(f"❌ Error processing {filename}: {e}")



In [None]:
# Summary of the scan: plural-defendant count first, then single-defendant count
summary_lines = [
    f"\n🔍 Total files containing 'הנאשמים' before start part: {mention_count}",
    f"\n🔍 Total files containing 'הנאשם' before start part: {mention_1_count}",
]
for summary_line in summary_lines:
    print(summary_line)
