### extract citation no API

In [None]:
import docx
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from pathlib import Path

required_parts = [
    "מתחמי ענישה", "אחידות בענישה", "מתחם הענישה", "מתחם ענישה", "דיון",
    "ענישה נהוגה", "הענישה הנוהגת", "ענישה נוהגת", "מתחם העונש", "מתחם עונש",
    "מדיניות הענישה", "והכרעה", "ההרשעה", "מדיניות הענישה הנהוגה"
]
citation_patterns = {
    'ע"פ': r'ע"פ (\d+/\d+)',
    'עפ"ג': r'עפ"ג (\d+/\d+)',
    'ת״פ': r'ת״פ (\d+[-/]\d+[-/]\d+)',
    'עפ״ג': r'עפ״ג (\d+/\d+)',
    'רע״פ': r'רע״פ (\d+/\d+)',
    'תפ"ח': r'תפ"ח\s*(\d+[-/]\d+[-/]\d+)',
}

# Load the trained model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "/home/liorkob/best_model.pt"  # Path to your saved model
tokenizer = BertTokenizer.from_pretrained('avichr/heBERT')
model = BertForSequenceClassification.from_pretrained('avichr/heBERT', num_labels=2)
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()


def extract_citations(para_text):
    """
    Extracts all citations and their full references from the text.
    Returns a list of tuples: (citation_type, full_citation).
    """
    citations = []

    for citation_type, pattern in citation_patterns.items():
        matches = re.findall(pattern, para_text)  # Find all matches for the pattern
        for match in matches:
            full_citation = f"{citation_type} {match}"  # Construct full citation
            citations.append((citation_type, full_citation))

    return citations  # List of (citation_type, full_citation)

def filter_csv_relevant_parts(csv_data):
    """Extracts the first occurrence of a required part in the CSV and all subsequent rows."""
    start_index = None
    for idx, row in csv_data.iterrows():
        if any(req_part in str(row.get("part", "")) for req_part in required_parts):
            start_index = idx
            break
    return csv_data.iloc[start_index:] if start_index is not None else pd.DataFrame(columns=csv_data.columns)
import re

def process_and_tag(docx_path: str, csv_path: str, output_path: str):
    """Process a .docx document and its corresponding CSV to check citations and tag with predictions."""
    try:
        # Load the document and CSV
        doc = docx.Document(docx_path)
        csv_data = pd.read_csv(csv_path)
        csv_data = filter_csv_relevant_parts(csv_data)

        results = []

        # Iterate through paragraphs
        for i, paragraph in enumerate(doc.paragraphs):
            para_text = paragraph.text.strip()
            if not para_text:
                continue  # Skip empty paragraphs

            found_citations = extract_citations(para_text)

            if not found_citations:
                continue  # No citations found, move to the next paragraph

            for found_citation, full_citation in found_citations:

                is_relevant = False
                matching_part = None

                # Check if the citation is in relevant parts
                for _, row in csv_data.iterrows():
                    part_text = row.get("text", "")
                    if any(req_part in row.get("part", "") for req_part in required_parts) and part_text in para_text:
                        is_relevant = True
                        matching_part = row["part"]
                        break  # Stop searching once a match is found

                if is_relevant:
                    # Tag the paragraph using the model
                    encoding = tokenizer(para_text, truncation=True, padding=True, max_length=128, return_tensors="pt")
                    encoding = {key: val.to(device) for key, val in encoding.items()}
                    with torch.no_grad():
                        output = model(**encoding)
                        prediction = torch.argmax(output.logits, dim=-1).item()

                    # Append only when is_relevant = True
                    results.append({
                        'paragraph_number': i,
                        'context_text': para_text,
                        'citation': full_citation,
                        'part': matching_part,
                        'predicted_label': prediction, 
                    })

                    print(f"Tagged citation: Paragraph {i}, Part: {matching_part}, Prediction: {prediction}")
                    print(f"Text: {para_text}\n")

        # Convert results to a DataFrame
        results_df = pd.DataFrame(results)


        # Save results
        results_df.to_csv(output_path, index=False, encoding='utf-8-sig')
        print(f"Tagged citations saved to: {output_path}")

    except Exception as e:
        print(f"Error processing {docx_path}: {e}")

if __name__ == "__main__":
    for year in [2018,2019,2020]:
        docx_directory = Path(f'/home/liorkob/thesis/lcp/data/docx_{year}')
        csv_directory = Path(f'/home/liorkob/thesis/lcp/data/docx_csv_{year}')
        output_directory = Path(f'/home/liorkob/thesis/lcp/data/tag_citations_csv_{year}')

        output_directory.mkdir(parents=True, exist_ok=True)

        for file_path in docx_directory.glob("*.docx"):
            try:
                new_file_path = file_path.stem
                print(f"Processing {new_file_path}")

                csv_file = csv_directory / f"{new_file_path}.csv"
                if file_path.exists() and csv_file.exists():
                    output_file = output_directory / f"{file_path.stem}.csv"
                    process_and_tag(str(file_path), str(csv_file), str(output_file))

                else:
                    if not file_path.exists():
                        print(f"Document file not found: {file_path}")
                    if not csv_file.exists():
                        print(f"CSV file not found for: {csv_file}")

            except Exception as e:
                print(f"Error processing {file_path.name}: {e}")




### Merge all results to one csv

In [None]:
import pandas as pd
from pathlib import Path
import docx

def merge_results(csv_directory: str, output_csv: str):
    csv_directory = Path(csv_directory)
    all_data = []
    
    # Iterate over CSV files
    for file_path in csv_directory.glob("*.csv"):
        try:
            if file_path.stat().st_size == 0:  # Check if file is empty
                print(f"Skipping empty file: {file_path.name}")
                continue
            
            df = pd.read_csv(file_path)
            if df.empty:  # Check if the file is empty even after reading
                print(f"Skipping empty DataFrame: {file_path.name}")
                continue

            df["source_file"] = file_path.name  # Add filename column
            all_data.append(df)
        except Exception as e:
            print(f"Error reading {file_path.name}: {e}")
    
    if all_data:
        merged_df = pd.concat(all_data, ignore_index=True)
        merged_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
        print(f"Merged CSV saved to: {output_csv}")
    else:
        print("No valid CSV files found.")


if __name__ == "__main__":
    for year in [2018,2019,2020]:
        csv_directory = f"/home/liorkob/thesis/lcp/data/tag_citations_csv_{year}"
        output_csv = f"{csv_directory}/merged_results_{year}.csv"
        
        merge_results(csv_directory, output_csv)


### extract citation WITH API 

In [None]:
import re
import pandas as pd

def clean_leading_prefix(citation):
    match = re.match(r'^([לבוה])\s*([א-ת"]+)', citation)
    if not match:
        return citation
    prefix = match.group(1)
    maybe_acronym = match.group(2)
    
    # בונה את הצירוף המלא כולל הקידומת
    full = prefix + maybe_acronym

    # מנרמל (מסיר גרשיים) בשביל להשוות לרשימת ראשי התיבות
    def normalize(text):
        return text.replace('"', '').replace("״", "").replace("'", "").replace("׳", "")

    norm_maybe = normalize(maybe_acronym)
    norm_full = normalize(full)

    # תנאי הסרה: החלק שאחרי הקידומת מוכר, אבל הצירוף כולו לא מוכר
    if norm_maybe in acronyms and norm_full not in acronyms:
        return citation[len(prefix):].lstrip()

    return citation

# List of legal acronyms (same as yours)
acronyms = [
    "אב", "אבע", "אימוצ", "אמצ", "אפ", "אפח", "את", "אתפ", "באפ", "באש", "בבנ", "בגצ", "בדא", "בדמ",
    "בדמש", "בהנ", "בהע", "בהש", "בידמ", "בידע", "בל", "בלמ", "במ", "בעא", "בעח", "בעמ", "בעק", "בפ",
    "בפמ", "בפת", "בצא", "בצהמ", "בק", "בקמ", "בקשה", "ברמ", "ברע", "ברש", "בש", "בשא",
    "בשגצ", "בשהת", "בשז", "בשמ", "בשע", "בשפ", "בתת", "גזז", "גמר", "גפ", "דבע", "דח", "דט", "דיונ",
    "דמ", "דמר", "דמש", "דנ", "דנא", "דנגצ", "דנמ", "דנפ", "הד", "הדפ", "הוצלפ", "הט", "הכ", "המ",
    "המד", "הממ", "המע", "המש", "הנ", "הסת", "הע", "העז", "הפ", "הפב", "הפמ", "הצמ", "הש", "השא",
    "השגצ", "השפ", "השר", "הת", "וחק", "וע", "ושמ", "ושק", "ושר", "זי", "חא", "חבר", "חד", "חדא",
    "חדלפ", "חדלת", "חדמ", "חדפ", "חהע", "חי", "חנ", "חסמ", "חעמ", "חעק", "חש", "יוש", "ייתא", "ימא",
    "יס", "כצ", "מ", "מא", "מבכ", "מבס", "מונופולינ", "מזג", "מח", "מחוז", "מחע", "מט", "מטכל", "מי",
    "מיב", "מכ", "ממ", "מס", "מסט", "מעי", "מעת", "מקמ", "מרכז", "מת", "נ", "נב", "נבא", "נמ", "נמב",
    "נעד", "נער", "סבא", "סע", "סעש", "סק", "סקכ", "ע", "עא", "עאח", "עאפ", "עב", "עבאפ", "עבז", "עבח",
    "עבי", "עבל", "עבמצ", "עבעח", "עבפ", "עבר", "עבשהת", "עגר", "עדי", "עדמ", "עהג", "עהס", "עהפ",
    "עו", "עורפ", "עז", "עח", "עחא", "עחדלפ", "עחדפ", "עחדת", "עחהס", "עחע", "עחק", "עחר", "עכב",
    "על", "עלא", "עלבש", "עלח", "עלע", "עמ", "עמא", "עמה", "עמז", "עמח", "עמי", "עמלע", "עממ", "עמנ",
    "עמפ", "עמצ", "עמק", "עמרמ", "עמש", "עמשמ", "עמת", "ענ", "ענא", "ענמ", "ענמא", "ענמש", "ענפ",
    "עסא", "עסק", "עע", "עעא", "עעמ", "עער", "עעתא", "עפ", "עפא", "עפג", "עפהג", "עפמ", "עפמק",
    "עפנ", "עפס", "עפספ", "עפע", "עפר", "עפת", "עצמ", "עק", "עקג", "עקמ", "עקנ", "עקפ", "ער", "ערא",
    "ערגצ", "ערמ", "ערעור", "ערפ", "ערר", "עש", "עשא", "עשמ", "עשר", "עשת", "עשתש", "עת", "עתא",
    "עתמ", "עתפב", "עתצ", "פא", "פה", "פל", "פלא", "פמ", "פמר", "פעמ", "פקח", "פר", "פרק", "פשז",
    "פשר", "פת", "צא", "צבנ", "צה", "צו", "צח", "צמ", "קג", "קפ", "רחדפ", "רמש", "רע", "רעא", "רעב",
    "רעבס", "רעו", "רעמ", "רעס", "רעפ", "רעפא", "רעצ", "רער", "רערצ", "רעש", "רעתא", "רצפ", "רתק",
    "ש", "שבד", "שמ", "שמי", "שנא", "שע", "שעמ", "שק", "שש", "תא", "תאדמ", "תאח", "תאמ", "תאק", "תב",
    "תבכ", "תבע", "תג", "תגא", "תד", "תדא", "תהג", "תהנ", "תהס", "תוב", "תוח", "תח", "תחפ", "תחת",
    "תט", "תי", "תכ", "תלא", "תלב", "תלהמ", "תלפ", "תלתמ", "תמ", "תמהח", "תממ", "תמק", "תמר",
    "תמש", "תנג", "תנז", "תע", "תעא", "תעז", "תפ", "תפב", "תפח", "תפחע", "תפכ", "תפמ", "תפע",
    "תפק", "תצ", "תק", "תקח", "תקמ", "תרמ", "תת", "תתח", "תתע", "תתעא", "תתק"
]

def create_acronym_variants(acronyms):
    acronym_variants = []
    for a in acronyms:
        if len(a) > 1:
            # Case 1: Original acronym with quotes/dots before last letter
            base_acronym = a
            if a.startswith('ב') or a.startswith('ו') or a.startswith('ה'):
                # Also add variant without the prefix letter
                base_acronym = a[1:]
            
            # For each acronym (both with and without prefix)
            for acr in [a, base_acronym]:
                if len(acr) > 1:
                    # Standard quote/dot before last letter
                    quoted = rf"{acr[:-1]}[\"'״]{acr[-1]}"
                    with_dot = rf"{acr[:-1]}\.{acr[-1]}"
                    acronym_variants.append(f"(?:{quoted}|{with_dot})")
                    
                    # Add dot-separated variant
                    dots_between = '\.'.join(list(acr))
                    acronym_variants.append(dots_between)
    
    return '|'.join(acronym_variants)
        
acronym_pattern = create_acronym_variants(acronyms)

# Ensure the numbers follow the correct format
number_pattern = r'''
    (?:
        \d{1,6}[-/]\d{2}[-/]\d{2}  # Format: 31067-11-11
        | \d{1,6}[-/]\d{1,6}         # Format: 895/09
        | \d{1,6}-\d{2}-\d{2}        # Format: 31067-11-11 (hyphenated)
    )
'''
citation_pattern = fr'''
    (?<!\w)                      # Ensure no letter before
    ([א-ת]?)                     # Optional single Hebrew prefix letter (but no isolated matches)
    ({acronym_pattern})           # Captures acronym (short & long)
    \.?                          # Optional dot after acronym
    \s*                          # Optional spaces
    (\((.*?)\))?                  # Optional court location in parentheses
    \s*[-/]?\s*                  # Required space or separator before case number
    ({number_pattern})            # Captures case number formats
    (?!\w)                       # Ensure no letter after
'''.strip()

# Compile regex with verbose flag for readability
citation_regex = re.compile(citation_pattern, re.VERBOSE)


def extract_citations_from_csv(csv_data):
    citations = []
    text_column = csv_data["text"].astype(str)  # Convert to string to avoid NaN issues
    pd.set_option("display.max_colwidth", None)  # Ensure full text is displayed
    # print("\n".join(text_column))  # Print each row as a full text
    # for i, text in enumerate(text_column, 1):
    #     print(f"{i}. {text}")

    matches = text_column.str.extractall(citation_regex)  # Extract structured matches
    # print("Extracted Matches:")
    # print(matches)

    # print("Extracted DataFrame:", matches)  # Debugging step
    
    for _, row in matches.iterrows():
        # Build the citation string, joining all valid elements
        citation = " ".join(map(str, filter(pd.notna, row))).strip()

        # Clean up extra spaces
        citation = re.sub(r"\s{2,}", " ", citation)


        # # Remove invalid extra words (e.g., "על 12")
        if re.match(r"^על \d+$", citation):  
            print('Skip invalid cases like "על 12')
            continue  # Skip invalid cases like "על 12"

        # Fix duplicated court locations, e.g., "(מחוזי מרכז) מחוזי מרכז" → "(מחוזי מרכז)"
        citation = re.sub(r"\((.*?)\)\s+\1", r"(\1)", citation)
        citation=clean_leading_prefix(citation)
        # Add the cleaned citation to the list
        citations.append(citation)
    
    # Return citations as a list, even if some are empty or missing optional groups
    return citations if citations else []




In [None]:
import os
import gc
import torch
import pandas as pd
import docx
import re
from transformers import AutoTokenizer, BertTokenizer, BertForSequenceClassification
from pathlib import Path
from openai import OpenAI
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
os.environ["OPENAI_API_KEY"] = "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" 
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Define required sections and citation patterns
required_parts = [
    "מתחמי ענישה", "אחידות בענישה", "מתחם הענישה", "מתחם ענישה", "דיון",
    "ענישה נהוגה", "הענישה הנוהגת", "ענישה נוהגת", "מתחם העונש", "מתחם עונש",
    "מדיניות הענישה", "והכרעה", "ההרשעה", "מדיניות הענישה הנהוגה"
]


# Load the trained BERT model and tokenizer
model_path = "/home/liorkob/classifier_relvant_citation_model.pt" 
tokenizer_bert = BertTokenizer.from_pretrained('avichr/heBERT')
model_bert = BertForSequenceClassification.from_pretrained('avichr/heBERT', num_labels=2)
model_bert.load_state_dict(torch.load(model_path, map_location=device))
model_bert.to(device)
model_bert.eval()


def split_preserving_structure(text):
    paragraphs = re.split(r'(?<=\d\.)\s', text)  # Split after numbers followed by a period
    return [para.strip() for para in paragraphs if para.strip()]

def query_gpt(text,citation):
    """
    Queries gpt-4.1-mini to extract and segment legal citations.
    """
    prompt = f"""
    Given the following legal text:

    {text}

    Your task is to extract **only** the part of the text that directly relates to the citation "{citation}".
    
    **Extraction Rules:**
    - **Do not modify any wording.** Keep the original phrasing exactly as it appears in the provided document.
    - **Do not summarize or rephrase.**
    - **Return only the relevant portion**, not the full text.
    - **Handle grouped citations carefully:**
        - If the citation appears in a list following "ראו למשל ..." or similar, include the preceding explanation that applies to all citations.
        - Do not include other citations from the list—return only the text relevant to "{citation}".
    - **Handle case explanations properly:**
        - If the citation is explained in a specific section (e.g., "בע"פ 9373/10 ותד נ' מדינת ישראל..."), extract the **entire explanation** of the case.
        - Do not remove any important context about the court ruling.
    - Do **not** extract only "(רע"פ 2718/04)" without the legal principle it supports.


    Only return the extracted text. Do not include unrelated content or formatting.
    """
    print(f"🧠 Sending to GPT for extraction...")

    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini", 
            messages=[
                {"role": "system", "content": "You are an AI trained to extract and structure legal citations."},
                {"role": "user", "content": prompt}
            ]
        )

        processed_text = response.choices[0].message.content
        return processed_text

    except Exception as e:
        print(f"🚨 GPT API error: {e}")
        return [text]  # Return original text in case of failure
    
def filter_csv_relevant_parts(csv_data):
    """
    Extracts the first occurrence of a required part in the CSV and all subsequent rows.
    """
    start_index = None

    # Find the first row containing a required part
    for idx, row in csv_data.iterrows():
        if any(req_part in str(row.get("part", "")) for req_part in required_parts):
            start_index = idx
            break

    # If a match is found, return only relevant rows
    if start_index is not None:
        return csv_data.iloc[start_index:]
    else:
        # print("NO required parts in data")
        # print("parts in data:")
        # print(csv_data["part"].unique())
        return pd.DataFrame(columns=csv_data.columns)  # Return an empty DataFrame if no matches found



# Function to find all occurrences of a citation in the document
def find_all_occurrences(doc, citation):
    indices = []
    for i, paragraph in enumerate(doc.paragraphs):
        if citation in paragraph.text:
            indices.append(i)  # Store all occurrences of the citation
    return indices

# Function to get relevant context for each occurrence of the citation
def get_context_paragraphs(doc, index, citation):
    context_text = []

    # Search for the closest non-empty previous paragraph
    prev_index = index - 1
    while prev_index >= 0 and not doc.paragraphs[prev_index].text.strip():
        prev_index -= 1  # Move backwards until finding text

    if prev_index >= 0:
        context_text.append(doc.paragraphs[prev_index].text.strip())

    # Get the current paragraph (must exist, but check if empty)
    curr_text = doc.paragraphs[index].text.strip()
    if curr_text:
        context_text.append(curr_text)
    else:
        print(f"⚠️ Warning: Empty paragraph for citation {citation} at index {index}. Skipping occurrence.")
        return None  # Skip this occurrence if the current paragraph is empty

    # Search for the closest non-empty next paragraph
    next_index = index + 1
    while next_index < len(doc.paragraphs) and not doc.paragraphs[next_index].text.strip():
        next_index += 1  # Move forward until finding text

    if next_index < len(doc.paragraphs):
        context_text.append(doc.paragraphs[next_index].text.strip())

    # Ensure we have at least one non-empty paragraph
    if not context_text:
        print(f"⚠️ Warning: No valid text found for citation {citation} at index {index}. Skipping occurrence.")
        return None

    return "\n".join(context_text).strip()
def normalize_case_name_2(name):
    if pd.isna(name):
        return ""
    name = str(name)
    name = re.sub(r"\(.*?\)", "", name)
    name = re.sub(r"[∕/\\]", "-", name)
    name = re.sub(r"\s+", " ", name)
    name = name.strip().lower().replace(" ", "_")
    return name


# Function to process and tag document paragraphs
def process_and_tag_with_split(docx_path: str, csv_path: str, output_path: str):
    """
    Process a .docx document and its corresponding CSV, find relevant paragraphs with context, 
    extract relevant text using GPT, tag with BERT, and store results.
    """
    doc = docx.Document(docx_path)
    csv_data = pd.read_csv(csv_path)
    filtered_csv_data = filter_csv_relevant_parts(csv_data)
    if filtered_csv_data.empty:
        # print("⚠️ Skipping file — no relevant parts found.")
        return

    citations = extract_citations_from_csv(filtered_csv_data)
    results = []
    if len(citations) > 30:
        print(f"TOO MANY CITATIONS IN CSV Found {len(citations)}")
        print(docx_path)


        return
    print(f"🔍 Found {len(citations)} citations in CSV")

    for citation in citations:
        citation_indices = find_all_occurrences(doc, citation)  # Find all occurrences

        # Collect all contexts where the citation appears
        merged_contexts = []
        for index in citation_indices:
            full_context = get_context_paragraphs(doc, index, citation)
            if full_context:
                merged_contexts.append(full_context)

        # If no valid contexts found, skip this citation
        if not merged_contexts:
            continue  

        # Merge all valid contexts into one, ensuring uniqueness
        final_context = "\n".join(set(merged_contexts)).strip()  # Remove duplicates
        # print(citation)
        # print(final_context)

        # Ask GPT to extract the relevant part
        extracted_text = query_gpt(final_context, citation)

        # Tag the extracted text with BERT
        encoding = tokenizer_bert(extracted_text, truncation=True, padding=True, max_length=128, return_tensors="pt")
        encoding = {key: val.to(device) for key, val in encoding.items()}
        with torch.no_grad():
            output = model_bert(**encoding)
            prediction = torch.argmax(output.logits, dim=-1).item()


        citation=normalize_case_name_2(citation)    
        # Store only one result per citation
        result = {
            'citation': citation,
            'context_text': final_context,
            'extracted_text': extracted_text,
            'predicted_label': prediction
        }
        results.append(result)

    # Save to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_path, index=False, encoding="utf-8")
    print(f"Processed document saved to: {output_path}")

if __name__ == "__main__":

    docx_directory = Path('/home/liorkob/M.Sc/thesis/data/drugs_3k/docx/verdict')
    csv_directory = Path('/home/liorkob/M.Sc/thesis/data/drugs_3k/verdict_csv')
    output_directory = Path('/home/liorkob/M.Sc/thesis/data/drugs_3k/gpt/verdicts_tagged_citations')
    output_directory.mkdir(parents=True, exist_ok=True)

    # Stats counters
    total_files = 0
    processed_files = 0
    skipped_empty_or_missing = 0
    missing_csv = 0
    files_with_citations = 0
    total_citations = 0
    total_tagged_as_1 = 0

    all_files = list(docx_directory.glob("*.docx"))
    print(f"🗂 Total DOCX files found: {len(all_files)}")

    for file_path in tqdm(all_files, desc="Processing DOCX files"):
        total_files += 1
        new_file_path = file_path.stem
        csv_file = csv_directory / f"{new_file_path}.csv"
        output_file = output_directory / f"{file_path.stem}.csv"

        if not csv_file.exists():
            print(f"CSV file not found for: {csv_file}")
            missing_csv += 1
            continue

        if output_file.exists() and output_file.stat().st_size > 0:
            try:
                df_existing = pd.read_csv(output_file)
                num_citations = len(df_existing)
                num_tagged_1 = (df_existing["predicted_label"] == 1).sum()

                total_citations += num_citations
                total_tagged_as_1 += num_tagged_1
                files_with_citations += 1
                continue
            except Exception as e:
                print(f"⚠️ Error reading {output_file.name}: {e}")
                skipped_empty_or_missing += 1
                continue

        if output_file.exists() and output_file.stat().st_size == 0:
            skipped_empty_or_missing += 1
            continue

        if not output_file.exists():
            process_and_tag_with_split(str(file_path), str(csv_file), str(output_file))
            if output_file.exists() and output_file.stat().st_size > 0:
                try:
                    df_new = pd.read_csv(output_file)
                    num_citations = len(df_new)
                    num_tagged_1 = (df_new["predicted_label"] == 1).sum()
                    total_citations += num_citations
                    total_tagged_as_1 += num_tagged_1
                    files_with_citations += 1
                    processed_files += 1
                except Exception as e:
                    print(f"⚠️ Failed to read newly written output: {output_file.name}")
                    skipped_empty_or_missing += 1

    # Averages
    avg_citations_per_file = total_citations / files_with_citations if files_with_citations else 0
    avg_tagged_1_per_file = total_tagged_as_1 / files_with_citations if files_with_citations else 0

    print("\n===== 📊 Processing Summary =====")
    print(f"Total DOCX files:               {total_files}")
    print(f"Processed files:                {files_with_citations}  # output file exists and is not empty")
    print(f"Skipped (already processed):    {total_files - files_with_citations}  # output missing or empty")
    print(f"Missing CSV files:              {missing_csv}")
    print(f"Files with citation data:       {files_with_citations}")
    print(f"Total citations:                {total_citations}")
    print(f"Total tagged as 1:              {total_tagged_as_1}")
    print(f"Avg citations per file:         {avg_citations_per_file:.2f}")
    print(f"Avg tagged=1 per file:          {avg_tagged_1_per_file:.2f}")


### process_part_only_citations

In [2]:
import pandas as pd
import re
from pathlib import Path

# === תבנית בסיסית לציטוט משפטי (רק ת"פ לפי הבקשה שלך) ===
citation_regex = re.compile(r'(?<!\w)(ת"פ\s*\d{1,6}[-/]\d{2})(?!\w)')

# === רשימת חלקים רלוונטיים לסינון ראשוני ===
required_parts = [
    "מתחמי ענישה", "אחידות בענישה", "מתחם הענישה", "מתחם ענישה", "דיון",
    "ענישה נהוגה", "הענישה הנוהגת", "ענישה נוהגת", "מתחם העונש", "מתחם עונש",
    "מדיניות הענישה", "והכרעה", "ההרשעה", "מדיניות הענישה הנהוגה"
]

# === נתיב לתיקיית הקבצים ===
csv_dir = Path("/home/liorkob/M.Sc/thesis/data/drugs_3k/verdict_csv")

# === סטטיסטיקות ===
total_files = 0
files_with_required_part = 0
total_citations_found = 0
example_rows = []

# === פונקציה לסינון החלק הרלוונטי ולקטיפת ציטוטים מתוך part ===
def filter_and_find_citations(csv_data):
    start_index = None
    for idx, row in csv_data.iterrows():
        part_val = str(row.get("part", ""))
        if any(req in part_val for req in required_parts):
            start_index = idx
            break

    if start_index is not None:
        filtered = csv_data.iloc[start_index:]
        citations = []
        for row in filtered.itertuples():
            part_text = str(getattr(row, "part", ""))
            matches = citation_regex.findall(part_text)
            if matches:
                for match in matches:
                    citations.append((match, part_text.strip(), getattr(row, "text", "").strip()))
        return filtered, citations
    return pd.DataFrame(columns=csv_data.columns), []

# === מעבר על כל הקבצים ===
for file_path in csv_dir.glob("*.csv"):
    total_files += 1
    try:
        df = pd.read_csv(file_path)
        if 'part' not in df.columns or 'text' not in df.columns:
            continue

        filtered_df, citations = filter_and_find_citations(df)

        if not filtered_df.empty:
            files_with_required_part += 1
            total_citations_found += len(citations)
            if citations:
                for citation, part, text in citations[:2]:  # שמור עד 2 דוגמאות לקובץ
                    example_rows.append({
                        "file": file_path.name,
                        "citation": citation,
                        "part": part,
                        "text": text
                    })

    except Exception as e:
        print(f"⚠️ שגיאה בקובץ {file_path.name}: {e}")

# === הדפסה מסכמת ===
print(f"\n===== 📊 סטטיסטיקה כללית =====")
print(f"סה\"כ קבצים שנבדקו:               {total_files}")
print(f"קבצים עם חלק רלוונטי:            {files_with_required_part}")
print(f"סה\"כ ציטוטים שנמצאו ב-part:      {total_citations_found}")
print(f"🔍 מספר דוגמאות מוצגות:           {len(example_rows)}")

print("\n===== 🧾 דוגמאות =====")
for row in example_rows[:5]:
    print(f"\n📁 קובץ: {row['file']}")
    print(f"📌 ציטוט: {row['citation']}")
    print(f"📂 part: {row['part']}")
    print(f"📝 text: {row['text']}")



===== 📊 סטטיסטיקה כללית =====
סה"כ קבצים שנבדקו:               3046
קבצים עם חלק רלוונטי:            2491
סה"כ ציטוטים שנמצאו ב-part:      163
🔍 מספר דוגמאות מוצגות:           28

===== 🧾 דוגמאות =====

📁 קובץ: תפ_41970-12-21.csv
📌 ציטוט: ת"פ 19611-03
📂 part: ת"פ 19611-03-21 עוסק בעבירת החזקת סמים שלא לצריכה עצמית.
📝 text: נגע הסמים פוגע בציבור קשות. קטינים, צעירים ובוגרים נחשפים אליו, לעתים באקראי, מתנסים בשימוש בסם ובמקרים רבים מתמכרים לו. השימוש בסם פוגע לא רק במשתמשים בו ובבריאותם, אלא גם במעגלים הקרובים והרחוקים מהם, בשל ההשלכות הפוגעניות של השימוש בו- ההתמכרות, העבריינות הנלוות לשימוש במקרים רבים, והשפעות שליליות נוספות.

📁 קובץ: תפ_41970-12-21.csv
📌 ציטוט: ת"פ 19611-03
📂 part: ת"פ 19611-03-21 עוסק בעבירת החזקת סמים שלא לצריכה עצמית.
📝 text: עבירות ההפצה והסחר בסמים, הן אלה המביאות להתפשטות הנגע ולהיותו זמין לכל, ומכאן החומרה שיש בהן. ככל שהעבריין ממקום גבוה יותר בשרשרת הפצת הסם, חלקו בהפצתו גדול יותר, ובהתאמה- עונשו יהא חמור יותר.

📁 קובץ: תפ_43665-08-21.csv
📌 ציטוט: ת"פ 1324-0

### Print results

### get URLS from docx

In [None]:
from docx import Document
import pandas as pd
import re
import os
from docx import Document
import pandas as pd
import re
import os
from docx.oxml.ns import qn
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from bs4 import BeautifulSoup

def normalize_case_name(case_name):
    """Normalize case names by removing extra spaces and fixing slashes."""
    case_name = case_name.replace('״', '"')
    return re.sub(r'\s+', ' ', case_name.replace('∕', '/')).strip()


# def normalize_citation(citation):
#     """Normalize citation by removing prefixes and standardizing format."""
#     if not citation:
#         return None
#     # Standardize quotes
#     citation = citation.replace('״', '"').replace('״', '"').replace('״', '"')
#     # Remove extra spaces
#     citation = re.sub(r'\s+', ' ', citation).strip()
#     # Remove common prefixes, including רע"פ
#     citation = re.sub(r'^(ע"?פ|ת"?פ|עפ"?ג|רע"?פ)\s+', '', citation)
#     return citation


def extract_citations(text):
    """Extracts legal citations from a single text string."""
    matches = citation_regex.findall(text)
    citations = []
    for match in matches:
        citation = " ".join(filter(None, match)).strip()
        citation = re.sub(r"\s{2,}", " ", citation)
        citation = re.sub(r"\((.*?)\)\s+\1", r"(\1)", citation)
        if not re.match(r"^על \d+$", citation):
            citations.append(citation)
    return citations[0] if citations else None


def getLinkedText(soup):
    links = []
    for tag in soup.find_all("hyperlink"):
        try:
            links.append({"id": tag["r:id"], "text": tag.text})
        except KeyError:
            pass

    for tag in soup.find_all("instrText"):
        if "HYPERLINK" in tag.text:
            parts = tag.text.split('"')
            if len(parts) > 1:  # Ensure the URL exists before accessing index 1
                url = parts[1]
            else:
                print(f"⚠️ Warning: No valid URL found in HYPERLINK tag: {tag.text}")
                url = None  # Assign None if URL is missing

            temp = tag.parent.next_sibling
            text = ""

            while temp is not None:
                maybe_text = temp.find("t")
                if maybe_text is not None and maybe_text.text.strip() != "":
                    text += maybe_text.text.strip()
                maybe_end = temp.find("fldChar[w:fldCharType]")
                if maybe_end is not None and maybe_end["w:fldCharType"] == "end":
                    break
                temp = temp.next_sibling

            links.append({"id": None, "href": url, "text": text})
    return links
def getURLs(soup, links):
    for link in links:
        if "href" not in link:
            for rel in soup.find_all("Relationship"):
                if rel["Id"] == link["id"]:
                    link["href"] = rel["Target"]
    return links

import zipfile

def extract_hyperlinks(docx_path):
    """
    Extracts hyperlinks from a .docx file and returns a dictionary 
    where the linked text is mapped to its corresponding URL.
    """
    # Open the .docx file as a zip archive
    try:
        archive = zipfile.ZipFile(docx_path, "r")
    except zipfile.BadZipFile:
        print(f"❌ Error: Cannot open {docx_path} (Bad ZIP format)")
        return {}

    # Extract main document XML
    try:
        file_data = archive.read("word/document.xml")
        doc_soup = BeautifulSoup(file_data, "xml")
        linked_text = getLinkedText(doc_soup)
    except KeyError:
        print(f"⚠️ Warning: No document.xml found in {docx_path}")
        return {}

    # Extract hyperlink relationships from _rels/document.xml.rels
    try:
        url_data = archive.read("word/_rels/document.xml.rels")
        url_soup = BeautifulSoup(url_data, "xml")
        links_with_urls = getURLs(url_soup, linked_text)
    except KeyError:
        print(f"⚠️ Warning: No _rels/document.xml.rels found in {docx_path}")
        links_with_urls = linked_text

    # Extract footnotes (if available)
    try:
        footnote_data = archive.read("word/footnotes.xml")
        footnote_soup = BeautifulSoup(footnote_data, "xml")
        footnote_links = getLinkedText(footnote_soup)

        footnote_url_data = archive.read("word/_rels/footnotes.xml.rels")
        footnote_url_soup = BeautifulSoup(footnote_url_data, "xml")
        footnote_links_with_urls = getURLs(footnote_url_soup, footnote_links)

        # Merge footnote links
        links_with_urls += footnote_links_with_urls
    except KeyError:
        pass  # No footnotes found, continue

    # Convert extracted links to a dictionary: {linked_text: URL}
    return {link["text"]: link.get("href", None) for link in links_with_urls}


import pandas as pd
from pathlib import Path

def update_csv_with_links(csv_path, doc_path):
    csv_path = Path(csv_path)  # Convert to Path object if not already
    
    # **Check if CSV is empty before reading**
    if not csv_path.exists() or csv_path.stat().st_size == 0:  
        print(f"Skipping empty or missing file: {csv_path.name}")
        return
    
    try:
        df = pd.read_csv(csv_path, encoding="utf-8", on_bad_lines="skip")
        # **Check if the DataFrame is empty after loading**
        if df.empty:
            print(f"Skipping empty DataFrame: {csv_path.name}")
            return
        # Skip if file already updated with links
        if "link" in df.columns and df["link"].notna().sum() > 0:
            print(f"⏭️ Skipping {csv_path.name} — already has links.")
            return

        # Normalize extracted citations
        df["extracted_citation"] = df["context_text"].apply(
            lambda text: normalize_case_name(cit) if (pd.notna(text) and (cit := extract_citations(text))) else None
        )
        
        # Normalize citation_links keys
        citation_links = extract_hyperlinks(doc_path)
        normalized_citation_links = {normalize_case_name(k): v for k, v in citation_links.items()}
        
        # Assign URLs to citations
        df["link"] = df["extracted_citation"].apply(
            lambda text: normalized_citation_links.get(text, None) if pd.notna(text) else None
        )
        # Print only the rows that got a link
        # linked_rows = df[df["link"].notna()]
        # if not linked_rows.empty:
        #     print(f"🔗 {len(linked_rows)} rows updated with links in: {csv_path.name}")
        #     print(linked_rows[["extracted_citation", "link"]].head(5).to_string(index=False))  # Show first 5

        df.to_csv(csv_path, index=False)
        print(f"Updated CSV saved to: {csv_path}")

    except pd.errors.EmptyDataError:
        print(f"Skipping {csv_path.name}: CSV file is empty or unreadable.")
        return
    except UnicodeDecodeError:
        print(f"⚠️ Skipping (encoding error): {csv_path}")
        return


    

def find_matching_docx(csv_name, docx_directory):
    normalized_csv_name = normalize_case_name(csv_name.replace('.csv', '.docx'))
    for root, _, files in os.walk(docx_directory):
        for file in files:
            if file.endswith(".docx") and normalize_case_name(file) == normalized_csv_name:
                return os.path.join(root, file)
    print(f"❌ No match found for {csv_name}")

    return None

from tqdm import tqdm

def process_all_csvs(citations_dir, docx_directory):
    # Collect all CSV files first
    all_csv_files = []
    for root, _, files in os.walk(citations_dir):
        for file in files:
            if file.endswith(".csv"):
                all_csv_files.append(os.path.join(root, file))

    # Iterate with tqdm progress bar
    for csv_path in tqdm(all_csv_files, desc="🔍 Processing CSVs"):
        file = os.path.basename(csv_path)
        docx_path = find_matching_docx(file, docx_directory)

        if docx_path:
            update_csv_with_links(csv_path, docx_path)
        else:
            print(f"❌ No matching DOCX found for: {file}")

docx_dir = f"/home/liorkob/M.Sc/thesis/data/5k/docx/verdict"
citations_dir = f"/home/liorkob/M.Sc/thesis/data/5k/gpt/verdict_tagged_citations"
process_all_csvs(citations_dir, docx_dir)


### Genrate smilar pairs 

In [None]:
import pandas as pd
import os
from glob import glob
import re
import difflib
def normalize_case_name(name):
    if pd.isna(name):
        return ""
    name = str(name)
    name = name.replace('"', '').replace("״", '').replace("'", "")
    name = re.sub(r"\(.*?\)", "", name)
    name = re.sub(r"[∕/\\]", "-", name)
    name = re.sub(r"\s+", " ", name)
    name = name.strip().lower().replace(" ", "_")
    return name

# def normalize_case_name(name):
#     if pd.isna(name):
#         return ""
#     name = str(name)
#     name = re.sub(r"\(.*?\)", "", name)
#     name = re.sub(r"[∕/\\]", "-", name)
#     name = re.sub(r"\s+", " ", name)
#     name = name.strip().lower().replace(" ", "_")
#     return name

def get_only_numbers(name):
    return ''.join(filter(str.isdigit, name))

# Load and normalize all verdicts
df1 = pd.read_csv("/home/liorkob/M.Sc/thesis/data/drugs_3k/gpt/processed_verdicts_with_gpt.csv")
# df2 = pd.read_csv("/home/liorkob/M.Sc/thesis/data/5k/gpt/processed_appeals_with_gpt_2.csv")
# combined_df = pd.concat([df1, df2], ignore_index=True)

# Map normalized verdicts to their extracted_gpt_facts
verdict_to_facts = {
    normalize_case_name(row["verdict"]): row["extracted_gpt_facts"]
    for _, row in df1.iterrows()
    if pd.notna(row["verdict"]) and pd.notna(row["extracted_gpt_facts"])
}

# Define directories
citations_dirs = [
    "/home/liorkob/M.Sc/thesis/data/drugs_3k/gpt/verdicts_tagged_citations"
#     ,"/home/liorkob/M.Sc/thesis/data/5k/gpt/appeals_tagged_citations"
]

records = []
missing_verdicts = set()
all_verdicts = list(verdict_to_facts.keys())

# Loop
for citations_dir in citations_dirs:
    for file_path in glob(os.path.join(citations_dir, "*.csv")):
        try:
            df_cite = pd.read_csv(file_path)
            source_verdict = normalize_case_name(os.path.basename(file_path).replace(".csv", ""))

            if source_verdict not in verdict_to_facts:
                continue

            df_cite = df_cite[df_cite["predicted_label"] == 1]

            for cited in df_cite["citation"].dropna():
                cited_norm = normalize_case_name(cited)

                if cited_norm in verdict_to_facts:
                    # Exact match
                    print(f"✅ Exact match:")
                    print(f"   Source (raw): {os.path.basename(file_path).replace('.csv', '')} → Cited (raw): {cited}")
                    print(f"   Source (normalized): {source_verdict} → Cited (normalized): {cited_norm}")

                    log = []
                    if not verdict_to_facts.get(source_verdict):
                        log.append("missing facts_a")
                    if not verdict_to_facts.get(cited_norm):
                        log.append("missing facts_b")

                    records.append({
                        "verdict_a": source_verdict,
                        "verdict_b": cited_norm,
                        "gpt_facts_a": verdict_to_facts.get(source_verdict),
                        "gpt_facts_b": verdict_to_facts.get(cited_norm),
                        "log": "; ".join(log) if log else "ok"
                    })

                else:
                    # Try smart match
                    matches = difflib.get_close_matches(cited_norm, all_verdicts, n=1, cutoff=0.8)
                    if matches:
                        match = matches[0]
                        if cited_norm.split("_")[0] == match.split("_")[0]:  # same type
                            num_cited = get_only_numbers(cited_norm)
                            num_match = get_only_numbers(match)

                            min_len = min(len(num_cited), len(num_match))
                            if min_len >= 4 and num_cited[:min_len] == num_match[:min_len]:
                                # Smart match found, ask user
                                print(f"🧐 Smart match suggestion:", flush=True)
                                print(f"   Citation: {cited_norm}", flush=True)
                                print(f"   Closest:  {match}", flush=True)
                                answer = input("👉 Accept this match? (y/n): ").strip().lower()

                                if answer == "y":
                                    print(f"✅ Smart accepted match:")
                                    print(f"   Source (raw): {os.path.basename(file_path).replace('.csv', '')} → Cited (raw): {cited}")
                                    print(f"   Source (normalized): {source_verdict} → Closest (normalized): {match}")

                                    log = []
                                    if not verdict_to_facts.get(source_verdict):
                                        log.append("missing facts_a")
                                    if not verdict_to_facts.get(match):
                                        log.append("missing facts_b")

                                    records.append({
                                        "verdict_a": source_verdict,
                                        "verdict_b": match,
                                        "gpt_facts_a": verdict_to_facts.get(source_verdict),
                                        "gpt_facts_b": verdict_to_facts.get(match),
                                        "log": "; ".join(log) if log else "smart match"
                                    })
                                    continue

                    # No match accepted → mark as missing
                    # print(f"❌ Could not find match for: {cited_norm}")
                    missing_verdicts.add(cited_norm)
                    records.append({
                        "verdict_a": source_verdict,
                        "verdict_b": cited_norm,
                        "gpt_facts_a": verdict_to_facts.get(source_verdict),
                        "gpt_facts_b": None,
                        "log": "missing cited verdict"
                    })

        except Exception as e:
            continue

# Remove duplicates based on verdict pairs
valid_pairs_df = pd.DataFrame(records)
valid_pairs_df = valid_pairs_df.drop_duplicates(subset=["verdict_a", "verdict_b"])

# Save
valid_pairs_df.to_csv("/home/liorkob/M.Sc/thesis/data/drugs_3k/gpt/valid_pairs_with_log.csv", index=False, encoding="utf-8-sig")

# Save unique missing verdicts not present in verdict_to_facts
filtered_missing = [v for v in sorted(missing_verdicts) if v not in verdict_to_facts]
missing_df = pd.DataFrame(filtered_missing, columns=["missing_verdict"])
missing_df = missing_df.drop_duplicates()
missing_df.to_csv("/home/liorkob/M.Sc/thesis/data/drugs_3k/gpt/missing_verdicts.csv", index=False, encoding="utf-8-sig")

print(f"\n✅ Total valid pairs collected (including missing): {len(valid_pairs_df)}")
print(f"⚠️ Total unmatched citations: {len(missing_verdicts)}")


In [None]:
for citation in df_cite["citation"].dropna():
    cited_norm = normalize_case_name(citation)
    if cited_norm not in verdict_to_facts:
        print("❌ No match for:", cited_norm)
        close = difflib.get_close_matches(cited_norm, verdict_to_facts.keys(), n=1, cutoff=0.8)
        print("🔍 Closest match:", close)


In [None]:
import pandas as pd
import re

def normalize_case_name(name):
    if pd.isna(name):
        return ""
    name = str(name)
    name = re.sub(r"\(.*?\)", "", name)
    name = re.sub(r"[∕/\\]", "-", name)
    name = re.sub(r"\s+", " ", name)
    name = name.strip().lower().replace(" ", "_")
    return name
# Load data
valid_pairs_df = pd.read_csv("/home/liorkob/M.Sc/thesis/data/5k/valid_pairs_with_log_2.csv")
df1 = pd.read_csv("/home/liorkob/M.Sc/thesis/data/5k/gpt/processed_verdicts_with_gpt.csv")
df2 = pd.read_csv("/home/liorkob/M.Sc/thesis/data/5k/gpt/processed_appeals_with_gpt.csv")

# Normalize verdict names
verdicts = set(df1["verdict"].dropna().apply(normalize_case_name))
appeals = set(df2["verdict"].dropna().apply(normalize_case_name))

def get_type(name):
    if name in appeals:
        return "appeal"
    elif name in verdicts:
        return "verdict"
    else:
        return "unknown"

# Normalize all verdicts before checking type
valid_pairs_df["norm_a"] = valid_pairs_df["verdict_a"].apply(normalize_case_name)
valid_pairs_df["norm_b"] = valid_pairs_df["verdict_b"].apply(normalize_case_name)
valid_pairs_df["type_a"] = valid_pairs_df["norm_a"].apply(get_type)
valid_pairs_df["type_b"] = valid_pairs_df["norm_b"].apply(get_type)

# Count each category
appeal_to_verdict = ((valid_pairs_df["type_a"] == "appeal") & (valid_pairs_df["type_b"] == "verdict")).sum()
verdict_to_verdict = ((valid_pairs_df["type_a"] == "verdict") & (valid_pairs_df["type_b"] == "verdict")).sum()
appeal_to_appeal = ((valid_pairs_df["type_a"] == "appeal") & (valid_pairs_df["type_b"] == "appeal")).sum()
verdict_to_appeal = ((valid_pairs_df["type_a"] == "verdict") & (valid_pairs_df["type_b"] == "appeal")).sum()

# Debug unknowns
unknown_a = (valid_pairs_df["type_a"] == "unknown").sum()
unknown_b = (valid_pairs_df["type_b"] == "unknown").sum()

# Print results
print(f"📚 Appeal → Verdict pairs: {appeal_to_verdict}")
print(f"📚 Verdict → Verdict pairs: {verdict_to_verdict}")
print(f"📚 Appeal → Appeal pairs: {appeal_to_appeal}")
print(f"📚 Verdict → Appeal pairs: {verdict_to_appeal}")
print(f"❓ Unknown type in verdict_a: {unknown_a}")
print(f"❓ Unknown type in verdict_b: {unknown_b}")


In [None]:
# from pathlib import Path
# import pandas as pd
# import re

# def reextract_full_citation_if_prefix_stripped_single_file(csv_path):
#     print(f"🔁 Fixing citations in: {csv_path.name}")
#     try:
#         df = pd.read_csv(csv_path)
#         if not {"citation", "extracted_text"}.issubset(df.columns):
#             print(f"⚠️ Skipping {csv_path.name} — missing required columns")
#             return

#         new_citations = []

#         for citation, text in zip(df["citation"], df["extracted_text"]):
#             old_matches = []

#             # OLD extraction simulation
#             for m in citation_regex.finditer(str(text)):
#                 row = m.groups()
#                 old_cit = " ".join(filter(None, row)).strip()
#                 old_cit = re.sub(r"\s{2,}", " ", old_cit)
#                 old_cit = re.sub(r"\((.*?)\)\s+\1", r"(\1)", old_cit)
#                 old_cit_stripped = re.sub(r"^[בוור]\"", "\"", old_cit)
#                 old_cit_stripped = re.sub(r"^[בוור] ", "", old_cit_stripped)

#                 if old_cit_stripped == citation:
#                     old_matches.append((old_cit, citation))

#             # NEW extraction
#             fixed_citation = citation
#             for m in citation_regex.finditer(str(text)):
#                 row = m.groups()
#                 full_cit = " ".join(filter(None, row)).strip()
#                 full_cit = re.sub(r"\s{2,}", " ", full_cit)
#                 full_cit = re.sub(r"\((.*?)\)\s+\1", r"(\1)", full_cit)
#                 full_cit = clean_leading_prefix(full_cit)

#                 if any(stripped == citation for stripped_full, stripped in old_matches if stripped_full == full_cit):
#                     fixed_citation = full_cit
#                     if citation != full_cit:
#                         print(f"✅ Updating citation: OLD='{citation}' → NEW='{full_cit}'")
#                     break

#             new_citations.append(fixed_citation)

#         df["citation"] = new_citations
#         df.to_csv(csv_path, index=False, encoding="utf-8")
#         print("✅ File saved")

#     except Exception as e:
#         print(f"❌ Error processing {csv_path.name}: {e}")

# # ==== Run ====
# # target_file = Path("/home/liorkob/M.Sc/thesis/data/5k/gpt/verdict_tagged_citations/ת\"פ 29454-01-13.csv")
# # reextract_full_citation_if_prefix_stripped_single_file(target_file)
# csv_dir = Path("/home/liorkob/M.Sc/thesis/data/5k/gpt/appeals_tagged_citations")
# csv_files = list(Path(csv_dir).glob("*.csv"))
# print(f"🔁 Fixing citations using updated logic in {len(csv_files)} files...")

# for csv_file in tqdm(csv_files, desc="Fixing citation prefixes"):
#     reextract_full_citation_if_prefix_stripped_single_file(csv_file)
