### Extract citations (no API)

In [None]:
import docx
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from pathlib import Path

# Section-heading keywords (Hebrew). A CSV row whose "part" value contains any
# of these substrings is treated as belonging to a relevant section.
required_parts = [
    "מתחמי ענישה", "אחידות בענישה", "מתחם הענישה", "מתחם ענישה", "דיון",
    "ענישה נהוגה", "הענישה הנוהגת", "ענישה נוהגת", "מתחם העונש", "מתחם עונש",
    "מדיניות הענישה", "והכרעה", "ההרשעה", "מדיניות הענישה הנהוגה"
]
# Regexes keyed by citation prefix; each one captures the case number that
# follows the prefix. Note that the keys mix the ASCII double quote (") and the
# Hebrew gershayim (״): every pattern matches only the exact quote character it
# is written with.
citation_patterns = {
    'ע"פ': r'ע"פ (\d+/\d+)',
    'עפ"ג': r'עפ"ג (\d+/\d+)',
    'ת״פ': r'ת״פ (\d+[-/]\d+[-/]\d+)',
    'עפ״ג': r'עפ״ג (\d+/\d+)',
    'רע״פ': r'רע״פ (\d+/\d+)',
    'תפ"ח': r'תפ"ח\s*(\d+[-/]\d+[-/]\d+)',
}

# Build the heBERT architecture with a 2-label classification head, then
# overwrite its weights with the fine-tuned checkpoint stored at model_path.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = "/home/liorkob/best_model.pt"  # Fine-tuned state_dict checkpoint
tokenizer = BertTokenizer.from_pretrained('avichr/heBERT')
model = BertForSequenceClassification.from_pretrained('avichr/heBERT', num_labels=2)
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()  # Inference only: switch the module to evaluation mode


def extract_citations(para_text):
    """
    Collect every citation found in a paragraph.

    Each regex in ``citation_patterns`` is applied in dictionary order, and
    every captured case number is prefixed with its citation type, so results
    are grouped by pattern rather than by position in the text.
    Returns a list of tuples: (citation_type, full_citation).
    """
    return [
        (kind, f"{kind} {case_number}")
        for kind, regex in citation_patterns.items()
        for case_number in re.findall(regex, para_text)
    ]

def filter_csv_relevant_parts(csv_data, parts=None):
    """Return the rows from the first relevant section onward.

    Scans the "part" column top to bottom and returns the slice starting at the
    first row whose value contains one of the required section keywords.

    Args:
        csv_data: DataFrame with (ideally) a "part" column.
        parts: Optional iterable of keyword substrings to look for. Defaults to
            the module-level ``required_parts``.

    Returns:
        The tail of ``csv_data`` beginning at the first matching row, or an
        empty DataFrame with the same columns when nothing matches (including
        when the "part" column is missing).
    """
    wanted = required_parts if parts is None else parts
    if "part" in csv_data.columns:
        # Track the row *position*, not the index label: iloc is positional, so
        # using the label from iterrows() breaks on any non-default index.
        for position, part in enumerate(csv_data["part"]):
            label = str(part)  # NaN becomes "nan", which matches no keyword
            if any(req_part in label for req_part in wanted):
                return csv_data.iloc[position:]
    return pd.DataFrame(columns=csv_data.columns)
import re

def process_and_tag(docx_path: str, csv_path: str, output_path: str):
    """Tag citation-bearing paragraphs of one verdict and write them to a CSV.

    A paragraph is kept when it contains at least one citation (per
    ``extract_citations``) and contains the "text" of some CSV row whose "part"
    is one of the required sections. Each kept paragraph is classified once
    with the BERT model, and one output row is written per citation in it.

    Args:
        docx_path: Path of the .docx verdict.
        csv_path: Path of the CSV holding the verdict's "part"/"text" rows.
        output_path: Destination CSV (written with a UTF-8 BOM).

    Any exception is reported on stdout and swallowed so that a batch run
    continues with the next document.
    """
    output_columns = ['paragraph_number', 'paragraph_text', 'citation', 'part', 'predicted_label']
    try:
        # Load the document and CSV
        doc = docx.Document(docx_path)
        csv_data = filter_csv_relevant_parts(pd.read_csv(csv_path))

        # Pre-compute the (part, text) pairs that can make a paragraph relevant.
        # Empty cells are read by pandas as NaN (a float); using `in` on them
        # raises TypeError, which previously aborted the whole document.
        candidates = []
        for _, row in csv_data.iterrows():
            part = row.get("part", "")
            part_text = row.get("text", "")
            if not isinstance(part, str) or not isinstance(part_text, str) or not part_text:
                continue
            if any(req_part in part for req_part in required_parts):
                candidates.append((part, part_text))

        results = []

        for i, paragraph in enumerate(doc.paragraphs):
            para_text = paragraph.text.strip()
            if not para_text:
                continue  # Skip empty paragraphs

            found_citations = extract_citations(para_text)
            if not found_citations:
                continue  # No citations found, move to the next paragraph

            # Relevance depends only on the paragraph, so decide it once
            # instead of once per citation.
            matching_part = next(
                (part for part, part_text in candidates if part_text in para_text),
                None,
            )
            if matching_part is None:
                continue

            # Classify the paragraph once; all its citations share the label.
            encoding = tokenizer(para_text, truncation=True, padding=True, max_length=128, return_tensors="pt")
            encoding = {key: val.to(device) for key, val in encoding.items()}
            with torch.no_grad():
                output = model(**encoding)
                prediction = torch.argmax(output.logits, dim=-1).item()

            for _, full_citation in found_citations:
                results.append({
                    'paragraph_number': i,
                    'paragraph_text': para_text,
                    'citation': full_citation,
                    'part': matching_part,
                    'predicted_label': prediction,
                })

                print(f"Tagged citation: Paragraph {i}, Part: {matching_part}, Prediction: {prediction}")
                print(f"Text: {para_text}\n")

        # Explicit columns keep a header row even when nothing was tagged, so
        # downstream readers get an empty frame instead of an EmptyDataError.
        results_df = pd.DataFrame(results, columns=output_columns)

        # Save results
        results_df.to_csv(output_path, index=False, encoding='utf-8-sig')
        print(f"Tagged citations saved to: {output_path}")

    except Exception as e:
        # Deliberate best-effort boundary: report and move on to the next file.
        print(f"Error processing {docx_path}: {e}")

if __name__ == "__main__":
    # Batch driver: for every year, tag each .docx that has a same-named CSV.
    for year in (2018, 2019, 2020):
        docx_directory = Path(f'/home/liorkob/thesis/lcp/data/docx_{year}')
        csv_directory = Path(f'/home/liorkob/thesis/lcp/data/docx_csv_{year}')
        output_directory = Path(f'/home/liorkob/thesis/lcp/data/tag_citations_csv_{year}')

        output_directory.mkdir(parents=True, exist_ok=True)

        for file_path in docx_directory.glob("*.docx"):
            try:
                new_file_path = file_path.stem
                print(f"Processing {new_file_path}")

                csv_file = csv_directory / f"{new_file_path}.csv"
                docx_found = file_path.exists()
                csv_found = csv_file.exists()

                # Report whichever input is missing, then skip the pair.
                if not docx_found:
                    print(f"Document file not found: {file_path}")
                if not csv_found:
                    print(f"CSV file not found for: {csv_file}")

                if docx_found and csv_found:
                    output_file = output_directory / f"{file_path.stem}.csv"
                    process_and_tag(str(file_path), str(csv_file), str(output_file))

            except Exception as e:
                # One bad file must not stop the rest of the batch.
                print(f"Error processing {file_path.name}: {e}")




### Merge all results into one CSV

In [None]:
import pandas as pd
from pathlib import Path
import docx

def merge_results(csv_directory: str, output_csv: str):
    """Concatenate all per-document CSVs in a directory into one CSV.

    Each input frame gets a "source_file" column holding its file name. Files
    that are zero bytes, contain no rows, or cannot be parsed are skipped with
    a message.

    Args:
        csv_directory: Directory containing the per-document ``*.csv`` files.
        output_csv: Path of the merged CSV to write (UTF-8 with BOM).
    """
    csv_directory = Path(csv_directory)
    output_path = Path(output_csv).resolve()
    all_data = []

    # Sorted so the merged row order is reproducible between runs.
    for file_path in sorted(csv_directory.glob("*.csv")):
        # The merged file may live in the same directory; never merge a
        # previous run's output back into itself (that duplicates every row).
        if file_path.resolve() == output_path:
            continue
        try:
            if file_path.stat().st_size == 0:  # Check if file is empty
                print(f"Skipping empty file: {file_path.name}")
                continue

            df = pd.read_csv(file_path)
            if df.empty:  # Header-only file: nothing to merge
                print(f"Skipping empty DataFrame: {file_path.name}")
                continue

            df["source_file"] = file_path.name  # Add filename column
            all_data.append(df)
        except Exception as e:
            # Best effort: an unreadable file must not abort the merge.
            print(f"Error reading {file_path.name}: {e}")

    if all_data:
        merged_df = pd.concat(all_data, ignore_index=True)
        merged_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
        print(f"Merged CSV saved to: {output_csv}")
    else:
        print("No valid CSV files found.")


if __name__ == "__main__":
    # Build one merged CSV per year, written next to that year's tagged files.
    for year in (2018, 2019, 2020):
        csv_directory = f"/home/liorkob/thesis/lcp/data/tag_citations_csv_{year}"
        output_csv = f"{csv_directory}/merged_results_{year}.csv"
        merge_results(csv_directory, output_csv)


### Extract citations with API (v1)

In [None]:
import os
import gc
import torch
import pandas as pd
import docx
import re
from transformers import AutoTokenizer, BertTokenizer, BertForSequenceClassification
from pathlib import Path
from openai import OpenAI
import os

# SECURITY: the API key must be supplied through the environment and never be
# committed to source. A key that was previously hard-coded here has to be
# treated as leaked and revoked in the OpenAI dashboard.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running this cell."
    )

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# Connectivity check: list the available models and confirm gpt-4o exists.
models = client.models.list()
print([m.id for m in models])
model_info = client.models.retrieve("gpt-4o")
print(model_info)

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Section-heading keywords (Hebrew). A CSV row whose "part" value contains any
# of these substrings is treated as belonging to a relevant section.
required_parts = [
    "מתחמי ענישה", "אחידות בענישה", "מתחם הענישה", "מתחם ענישה", "דיון",
    "ענישה נהוגה", "הענישה הנוהגת", "ענישה נוהגת", "מתחם העונש", "מתחם עונש",
    "מדיניות הענישה", "והכרעה", "ההרשעה", "מדיניות הענישה הנהוגה"
]
# Citation prefixes, in both ASCII-quote and gershayim spellings; used as plain
# substrings to count and detect citations in a paragraph.
citation_patterns = ['ע"פ', 'ת"פ', 'עפ"ג', 'ע״פ', 'ת״פ', 'עפ״ג']

# Check for CUDA availability
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the heBERT architecture with a 2-label head, then overwrite its
# weights with the fine-tuned checkpoint stored at model_path.
model_path = "/home/liorkob/best_model.pt"  # Fine-tuned state_dict checkpoint
tokenizer_bert = BertTokenizer.from_pretrained('avichr/heBERT')
model_bert = BertForSequenceClassification.from_pretrained('avichr/heBERT', num_labels=2)
model_bert.load_state_dict(torch.load(model_path, map_location=device))
model_bert.to(device)
model_bert.eval()  # Inference only


def split_preserving_structure(text):
    """Split a numbered list ("1. ... 2. ...") into its items.

    The text is cut at whitespace that is immediately followed by a list
    marker (digits, a period, then whitespace), so every marker stays attached
    to the start of its own item. Splitting *after* the marker, as before,
    detached "1." from its text and glued "2." to the end of the first item; it
    also split after dates such as "1.1.2020.".

    Returns:
        The stripped, non-empty items in their original order. Text without
        any marker is returned as a single item; empty text gives ``[]``.
    """
    paragraphs = re.split(r'\s+(?=\d+\.\s)', text)
    return [para.strip() for para in paragraphs if para.strip()]

def query_gpt(text):
    """
    Ask GPT-4o to re-segment a paragraph so that each segment holds one
    citation (or one group of related citations) together with its context.

    The Hebrew prompt instructs the model to keep the wording and the order
    unchanged and to emit a numbered list of segments. The reply is split with
    split_preserving_structure and de-duplicated while preserving order.

    Returns a list of strings. If the API call raises, or the reply yields no
    segments, the list contains only the original ``text``.
    """
    prompt = (
        "הטקסט הבא מכיל מספר ציטוטים משפטיים, שהם הפניות להחלטות של בתי משפט והם נכתבים בפורמט הבא: "
        "סוג ההליך (ע\"פ, עפ\"ג, ת\"פ וכו'), מספר התיק, שמות הצדדים, ותאריך ההחלטה בסוגריים. "
        "לדוגמה: ע\"פ 4173/07 פלוני נ' מדינת ישראל (2007).\n\n"

        " **הנחיות קריטיות:**\n"
        " **אין לערוך, לשנות או להשמיט שום חלק מהטקסט המקורי** – כל התוכן חייב להופיע כפי שהוא.\n"
        " **יש לפצל לפסקאות לפי הציטוטים המשפטיים**, כך שכל פסקה תכיל ציטוט עם ההקשר המתאים.\n"
        " **אם מספר ציטוטים מתייחסים לאותו מקרה יש להשאירם יחד באותה פסקה**.\n"
        " **הפסקאות חייבות להופיע בסדר המקורי שלהן** – אין לערבב או להזיז חלקים בטקסט.\n"
        " **אין ליצור פסקאות שאין בהן ציטוט משפטי**.\n\n"

        "### 🔍 דוגמאות לפיצול נכון ושגוי:\n\n"

        "❌ **פיצול שגוי (לא נכון):**\n"
        "1. ע\"פ 1234/20 מדינת ישראל נ' כהן (2020)\n"
        "2. טקסט כללי בלי ציטוט – אין לאפשר זאת.\n\n"

        "✅ **פיצול נכון (כן נכון):**\n"
        "1. ע\"פ 5678/15 לוי נ' מדינת ישראל (2015) - במקרה זה נקבע כי...\n"
        "2. ע\"פ 9876/18 כהן נ' מדינת ישראל (2018) - בנסיבות דומות, הוחלט כי...\n\n"

        "✅ **כאשר כל הציטוטים שייכים לאותו הקשר - יש להשאירם יחד:**\n"
        "1. \"על דרך הכלל, בית משפט זה נדרש לצערי לא אחת לאירועים מעין אלה של פתרון סכסוכים בדרכי אלימות, "
        "ולא תתכן מחלוקת כי יש להטיל עונשים משמעותיים, על פי רוב מאחורי סורג ובריח, כדי לעקור תופעות אלה מהשורש. "
        "אין מקום לסובלנות כלפי יד קלה על ההדק או קת סכין או במקל חובלים. ועל כך נאמר יש מקום שגם בית המשפט יתרום את חלקו "
        "למלחמה נגד האלימות. לעיתים יש תחושה שכל אמרה לא נכונה או התנהגות שבעיני אחר סוטה מן השורה, ולו במקצת, "
        "מהווה הצדקה עבור הפוגע וסביבתו לפגוע באמצעות נשק קר ביחיד ובסביבתו\" "
        "(עוד לעניין זה ראו ע\"פ 4173/07 פלוני נ' מדינת ישראל (2007); ע\"פ 8991/10 מכבי נ' מדינת ישראל (2011); "
        "ע\"פ 7360/13 טאהא נ' מדינת ישראל (2014).)\n\n"

        "⚠️ **חשוב מאוד:**\n"
        "✔ **כל הטקסט חייב להופיע כפי שהוא, ללא שינוי, עריכה או השמטה.**\n"
        "✔ **כל פסקה חייבת להכיל ציטוט משפטי ולשמור על ההקשר המקורי שלה.**\n"
        "✔ **אם יש ציטוטים הקשורים זה לזה, יש להשאירם יחד.**\n"
        "✔ **אין לשנות את סדר הפסקאות ואין ליצור טקסטים חדשים.**\n\n"

        f"Text: {text}\n"
        "Processed Segments:"
    )

    try:
        response = client.chat.completions.create(
            model="gpt-4o",  # Unpinned alias (previously the dated snapshot "gpt-4o-2024-11-20")
            messages=[
                {"role": "system", "content": "You are an AI trained to extract and structure legal citations."},
                {"role": "user", "content": prompt}
            ]
        )

        processed_text = response.choices[0].message.content

        # Debugging: print the raw model reply
        print("RAW GPT OUTPUT:\n", processed_text)

        # Break the reply into segments at its numbered-list markers
        split_paragraphs = split_preserving_structure(processed_text)

        # Drop repeated segments; dict.fromkeys keeps first-seen order
        split_paragraphs = list(dict.fromkeys(split_paragraphs))

        # Debugging: print the final segments
        print("PROCESSED PARAGRAPHS:\n", split_paragraphs)

        # Fall back to the untouched paragraph if no segment survived
        return split_paragraphs if split_paragraphs else [text]

    except Exception as e:
        print(f"🚨 GPT API error: {e}")
        return [text]  # Best effort: keep the original paragraph on any failure
def filter_csv_relevant_parts(csv_data, parts=None):
    """
    Return the rows from the first relevant section onward.

    Scans the "part" column top to bottom and returns the slice starting at the
    first row whose value contains one of the required section keywords.

    Args:
        csv_data: DataFrame with (ideally) a "part" column.
        parts: Optional iterable of keyword substrings to look for. Defaults to
            the module-level ``required_parts``.

    Returns:
        The tail of ``csv_data`` beginning at the first matching row, or an
        empty DataFrame with the same columns when nothing matches (including
        when the "part" column is missing).
    """
    wanted = required_parts if parts is None else parts
    if "part" in csv_data.columns:
        # Track the row *position*, not the index label: iloc is positional, so
        # using the label from iterrows() breaks on any non-default index.
        for position, part in enumerate(csv_data["part"]):
            label = str(part)  # NaN becomes "nan", which matches no keyword
            if any(req_part in label for req_part in wanted):
                return csv_data.iloc[position:]
    return pd.DataFrame(columns=csv_data.columns)  # No relevant section found

def enforce_citation_splitting(split_paragraphs):
    """
    Split any paragraph that still contains several bracket-dated citations.

    A citation here is an acronym (ע"פ / עפ"ג / ת"פ, quote optional for the
    two-letter forms), a case number, and a trailing ``[m.yy]``-style date.
    Paragraphs with fewer than two such citations are returned stripped and
    otherwise unchanged. Paragraphs with two or more are cut at the start of
    each citation, so every segment is one full citation plus the text that
    follows it; text before the first citation stays with the first segment.

    The previous implementation used ``findall``/``split`` on a pattern with a
    capturing group, which returns only the acronym: the case number, parties
    and date were silently dropped from the output.

    Returns:
        A flat list of stripped segments in their original order.
    """
    refined = []
    citation_pattern = re.compile(r'(ע"?פ|עפ"ג|ת"?פ) \d+[-/]?\d{2,5} .*?\[\d{1,2}\.\d{2,4}\]')

    for para in split_paragraphs:
        matches = list(citation_pattern.finditer(para))

        if len(matches) > 1:
            # Cut points are the citation starts; the first cut is moved to 0
            # so that leading text is not lost.
            starts = [match.start() for match in matches]
            starts[0] = 0
            bounds = starts + [len(para)]
            for begin, end in zip(bounds, bounds[1:]):
                segment = para[begin:end].strip()
                if segment:
                    refined.append(segment)
        else:
            refined.append(para.strip())

    return refined

def process_and_tag_with_split(docx_path: str, csv_path: str, output_path: str):
    """
    Tag relevant citation paragraphs of one verdict, splitting multi-citation
    paragraphs with GPT when the classifier labels them 1.

    A paragraph is processed when it contains the "text" of some CSV row whose
    "part" is one of the required sections and it contains at least one
    citation prefix. It is classified once with BERT; if the label is 1 and it
    holds more than one citation prefix it is segmented via ``query_gpt`` and
    ``enforce_citation_splitting``. Segments without a citation prefix are
    dropped. One output row is written per remaining segment.

    Args:
        docx_path: Path of the .docx verdict.
        csv_path: Path of the CSV holding the verdict's "part"/"text" rows.
        output_path: Destination CSV (written with a UTF-8 BOM).
    """
    output_columns = ['paragraph_number', 'original_paragraph', 'paragraph_text', 'part', 'predicted_label']

    doc = docx.Document(docx_path)
    filtered_csv_data = filter_csv_relevant_parts(pd.read_csv(csv_path))

    # Pre-compute the (part, text) pairs that can make a paragraph relevant.
    # Empty cells are read by pandas as NaN (a float); using `in` on them
    # raises TypeError, so they are skipped here.
    candidates = []
    for _, row in filtered_csv_data.iterrows():
        part = row.get("part", "")
        part_text = row.get("text", "")
        if not isinstance(part, str) or not isinstance(part_text, str) or not part_text:
            continue
        if any(req_part in part for req_part in required_parts):
            candidates.append((part, part_text))

    results = []
    for i, paragraph in enumerate(doc.paragraphs):
        para_text = paragraph.text.strip()
        if not para_text:
            continue  # Skip empty paragraphs

        matching_part = next(
            (part for part, part_text in candidates if part_text in para_text),
            None,
        )
        if matching_part is None:
            continue

        # Count citation-prefix occurrences in the paragraph
        citation_count = sum(para_text.count(pattern) for pattern in citation_patterns)
        if citation_count == 0:
            # Such a paragraph can never yield a row; skip the model call.
            continue

        # Tag the paragraph using the model
        encoding = tokenizer_bert(para_text, truncation=True, padding=True, max_length=128, return_tensors="pt")
        encoding = {key: val.to(device) for key, val in encoding.items()}
        with torch.no_grad():
            output = model_bert(**encoding)
            prediction = torch.argmax(output.logits, dim=-1).item()

        # Only paragraphs labelled 1 that hold several citations are split.
        if prediction == 1 and citation_count > 1:
            split_paragraphs = enforce_citation_splitting(query_gpt(para_text))
        else:
            split_paragraphs = [para_text]

        for split_text in split_paragraphs:
            # Keep only segments that still contain a citation prefix
            if not any(pattern in split_text for pattern in citation_patterns):
                continue

            results.append({
                'paragraph_number': i,
                # Always the source paragraph; previously this was None for
                # multi-citation paragraphs labelled 0.
                'original_paragraph': para_text,
                'paragraph_text': split_text,
                'part': matching_part,
                'predicted_label': prediction
            })

    # Explicit columns keep a header row even when nothing was tagged.
    results_df = pd.DataFrame(results, columns=output_columns)
    results_df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"Tagged citations saved to: {output_path}")

if __name__ == "__main__":
    # Batch driver for 2019: tag each .docx that has a same-named CSV.
    docx_directory = Path('/home/liorkob/thesis/lcp/data/docx_2019')
    csv_directory = Path('/home/liorkob/thesis/lcp/data/docx_csv_2019')
    output_directory = Path('/home/liorkob/thesis/lcp/data/tag_citations_csv_2019')

    output_directory.mkdir(parents=True, exist_ok=True)

    for file_path in docx_directory.glob("*.docx"):
        new_file_path = file_path.stem
        print(f"Processing {new_file_path}")

        csv_file = csv_directory / f"{new_file_path}.csv"
        docx_found = file_path.exists()
        csv_found = csv_file.exists()

        # Report whichever input is missing, then skip the pair.
        if not docx_found:
            print(f"Document file not found: {file_path}")
        if not csv_found:
            print(f"CSV file not found for: {csv_file}")

        if docx_found and csv_found:
            output_file = output_directory / f"{file_path.stem}.csv"
            process_and_tag_with_split(str(file_path), str(csv_file), str(output_file))


### Extract citations with API (v2)

In [1]:
import re
import pandas as pd

# Hebrew legal proceeding-type acronyms, written as bare letters without any
# quote or dot punctuation. create_acronym_variants() expands each entry into
# its punctuated spellings; single-letter entries are ignored by that function.
acronyms = [
    "אב", "אבע", "אימוצ", "אמצ", "אפ", "אפח", "את", "אתפ", "באפ", "באש", "בבנ", "בגצ", "בדא", "בדמ",
    "בדמש", "בהנ", "בהע", "בהש", "בידמ", "בידע", "בל", "בלמ", "במ", "בעא", "בעח", "בעמ", "בעק", "בפ",
    "בפמ", "בפת", "בצא", "בצהמ", "בק", "בקמ", "בקשה", "ברמ", "ברע", "ברעפ", "ברש", "בש", "בשא",
    "בשגצ", "בשהת", "בשז", "בשמ", "בשע", "בשפ", "בתת", "גזז", "גמר", "גפ", "דבע", "דח", "דט", "דיונ",
    "דמ", "דמר", "דמש", "דנ", "דנא", "דנגצ", "דנמ", "דנפ", "הד", "הדפ", "הוצלפ", "הט", "הכ", "המ",
    "המד", "הממ", "המע", "המש", "הנ", "הסת", "הע", "העז", "הפ", "הפב", "הפמ", "הצמ", "הש", "השא",
    "השגצ", "השפ", "השר", "הת", "וחק", "וע", "ושמ", "ושק", "ושר", "זי", "חא", "חבר", "חד", "חדא",
    "חדלפ", "חדלת", "חדמ", "חדפ", "חהע", "חי", "חנ", "חסמ", "חעמ", "חעק", "חש", "יוש", "ייתא", "ימא",
    "יס", "כצ", "מ", "מא", "מבכ", "מבס", "מונופולינ", "מזג", "מח", "מחוז", "מחע", "מט", "מטכל", "מי",
    "מיב", "מכ", "ממ", "מס", "מסט", "מעי", "מעת", "מקמ", "מרכז", "מת", "נ", "נב", "נבא", "נמ", "נמב",
    "נעד", "נער", "סבא", "סע", "סעש", "סק", "סקכ", "ע", "עא", "עאח", "עאפ", "עב", "עבאפ", "עבז", "עבח",
    "עבי", "עבל", "עבמצ", "עבעח", "עבפ", "עבר", "עבשהת", "עגר", "עדי", "עדמ", "עהג", "עהס", "עהפ",
    "עו", "עורפ", "עז", "עח", "עחא", "עחדלפ", "עחדפ", "עחדת", "עחהס", "עחע", "עחק", "עחר", "עכב",
    "על", "עלא", "עלבש", "עלח", "עלע", "עמ", "עמא", "עמה", "עמז", "עמח", "עמי", "עמלע", "עממ", "עמנ",
    "עמפ", "עמצ", "עמק", "עמרמ", "עמש", "עמשמ", "עמת", "ענ", "ענא", "ענמ", "ענמא", "ענמש", "ענפ",
    "עסא", "עסק", "עע", "עעא", "עעמ", "עער", "עעתא", "עפ", "עפא", "עפג", "עפהג", "עפמ", "עפמק",
    "עפנ", "עפס", "עפספ", "עפע", "עפר", "עפת", "עצמ", "עק", "עקג", "עקמ", "עקנ", "עקפ", "ער", "ערא",
    "ערגצ", "ערמ", "ערעור", "ערפ", "ערר", "עש", "עשא", "עשמ", "עשר", "עשת", "עשתש", "עת", "עתא",
    "עתמ", "עתפב", "עתצ", "פא", "פה", "פל", "פלא", "פמ", "פמר", "פעמ", "פקח", "פר", "פרק", "פשז",
    "פשר", "פת", "צא", "צבנ", "צה", "צו", "צח", "צמ", "קג", "קפ", "רחדפ", "רמש", "רע", "רעא", "רעב",
    "רעבס", "רעו", "רעמ", "רעס", "רעפ", "רעפא", "רעצ", "רער", "רערצ", "רעש", "רעתא", "רצפ", "רתק",
    "ש", "שבד", "שמ", "שמי", "שנא", "שע", "שעמ", "שק", "שש", "תא", "תאדמ", "תאח", "תאמ", "תאק", "תב",
    "תבכ", "תבע", "תג", "תגא", "תד", "תדא", "תהג", "תהנ", "תהס", "תוב", "תוח", "תח", "תחפ", "תחת",
    "תט", "תי", "תכ", "תלא", "תלב", "תלהמ", "תלפ", "תלתמ", "תמ", "תמהח", "תממ", "תמק", "תמר",
    "תמש", "תנג", "תנז", "תע", "תעא", "תעז", "תפ", "תפב", "תפח", "תפחע", "תפכ", "תפמ", "תפע",
    "תפק", "תצ", "תק", "תקח", "תקמ", "תרמ", "תת", "תתח", "תתע", "תתעא", "תתק"
]

def create_acronym_variants(acronyms):
    """Build a regex alternation matching the punctuated spellings of acronyms.

    For every acronym longer than one letter, and additionally for the same
    acronym with a leading ב / ו / ה removed (when what remains is longer than
    one letter), two alternatives are produced:

    * a quote (", ' or ״) or a dot placed before the last letter;
    * a dot between every pair of letters.

    Alternatives are emitted once each, in first-seen order. (Previously every
    acronym without one of those prefixes was expanded twice, doubling the
    size of the alternation without changing what it matches.)

    Args:
        acronyms: Iterable of bare-letter acronym strings.

    Returns:
        The alternatives joined with ``|``; an empty string for no input.
    """
    variants = {}  # dict used as an insertion-ordered set
    for a in acronyms:
        if len(a) <= 1:
            continue

        forms = [a]
        if a[0] in "בוה":
            # Also accept the acronym without its prefix letter
            forms.append(a[1:])

        for acr in forms:
            if len(acr) <= 1:
                continue
            head, last = acr[:-1], acr[-1]

            # Quote or dot before the last letter
            quoted = rf"{head}[\"'״]{last}"
            with_dot = rf"{head}\.{last}"
            variants[f"(?:{quoted}|{with_dot})"] = None

            # Dot between every letter; raw string avoids the invalid "\."
            # escape of a normal string literal.
            variants[r"\.".join(acr)] = None

    return '|'.join(variants)
        
# Alternation of every punctuated acronym spelling (non-capturing groups only).
acronym_pattern = create_acronym_variants(acronyms)

# Case-number formats. NOTE(review): the third alternative is already covered
# by the first one (which accepts "-" as well as "/"), so it can never be the
# alternative that matches.
number_pattern = r'''
    (?:
        \d{1,6}[-/]\d{2}[-/]\d{2}  # Format: 31067-11-11
        | \d{1,6}[-/]\d{1,6}         # Format: 895/09
        | \d{1,6}-\d{2}-\d{2}        # Format: 31067-11-11 (hyphenated)
    )
'''
# Full citation pattern (verbose mode). Capturing groups, in order:
#   1: optional single Hebrew prefix letter
#   2: the acronym
#   3: optional court location including its parentheses
#   4: the court location without parentheses
#   5: the case number
# NOTE(review): the in-pattern remark "Required space or separator" does not
# match the expression next to it — every part of `\s*[-/]?\s*` is optional.
citation_pattern = fr'''
    (?<!\w)                      # Ensure no letter before
    ([א-ת]?)                     # Optional single Hebrew prefix letter (but no isolated matches)
    ({acronym_pattern})           # Captures acronym (short & long)
    \.?                          # Optional dot after acronym
    \s*                          # Optional spaces
    (\((.*?)\))?                  # Optional court location in parentheses
    \s*[-/]?\s*                  # Required space or separator before case number
    ({number_pattern})            # Captures case number formats
    (?!\w)                       # Ensure no letter after
'''.strip()

# re.VERBOSE makes the engine ignore the whitespace and "#" remarks above
citation_regex = re.compile(citation_pattern, re.VERBOSE)

# Smoke test: prefixed acronym with a court location in parentheses
test_text = 'בעפ"ג (ב"ש) 31067-11-11'
match = citation_regex.search(test_text)
if match:
    print(f"Matched: {match.group()}")
else:
    print("No match")

# Smoke tests: gershayim, ASCII-quote and dotted spellings, with and without
# a prefix letter
test_cases = [
    "בת״פ 18402-08-13",
    "תפ״א 12345-01-22",
    "ת״פ 12345-01-22",
    "ע.פ. 567/22",
"ע״פ 567/22", 
'בת.פ. 56255-02-12',
'עפ"ג 6074/93'
]
# Print the full match for every test case that matches
for text in test_cases:
    match = re.search(citation_regex, text)

    if match:
        print(f"match.group(): {match.group()}")


import re
import pandas as pd
def extract_citations_from_csv(csv_data):
    """Extract normalized, unique citation strings from the CSV "text" column.

    Every ``citation_regex`` match is rebuilt by joining its captured groups
    with single spaces, then cleaned: a leading standalone ב / ו / ר prefix is
    removed, results of the form "על <digits>" are discarded, and a court
    location repeated after its parenthesized form is collapsed.

    Args:
        csv_data: DataFrame with a "text" column.

    Returns:
        Citation strings in first-seen order, without duplicates, so callers
        do not repeat per-citation work for the same citation. Empty list when
        nothing matches.
    """
    text_column = csv_data["text"].astype(str)  # Convert to string to avoid NaN issues

    # Debug output: every row of text that is being scanned
    for i, text in enumerate(text_column, 1):
        print(f"{i}. {text}")

    matches = text_column.str.extractall(citation_regex)  # One row per match, one column per group

    # Show untruncated columns for the debug dump only; option_context restores
    # the previous setting instead of changing pandas globally.
    with pd.option_context("display.max_colwidth", None):
        print("Extracted Matches:")
        print(matches)
        print("Extracted DataFrame:", matches)

    citations = []
    seen = set()
    for _, row in matches.iterrows():
        # Build the citation string from all groups that participated
        citation = " ".join(map(str, filter(pd.notna, row))).strip()

        # Clean up extra spaces
        citation = re.sub(r"\s{2,}", " ", citation)

        # Remove a standalone leading prefix letter such as "ב", "ו", "ר"
        citation = re.sub(r"^\b[בוור]\b\s*", "", citation)

        # Skip invalid cases like "על 12"
        if re.match(r"^על \d+$", citation):
            continue

        # Fix duplicated court locations, e.g., "(מחוזי מרכז) מחוזי מרכז" → "(מחוזי מרכז)"
        citation = re.sub(r"\((.*?)\)\s+\1", r"(\1)", citation)

        if citation in seen:
            continue
        seen.add(citation)
        citations.append(citation)

    return citations




Matched: בעפ"ג (ב"ש) 31067-11-11
match.group(): בת״פ 18402-08-13
match.group(): תפ״א 12345-01-22
match.group(): ת״פ 12345-01-22
match.group(): ע.פ. 567/22
match.group(): ע״פ 567/22
match.group(): בת.פ. 56255-02-12
match.group(): עפ"ג 6074/93


In [None]:
import os
import gc
import torch
import pandas as pd
import docx
import re
from transformers import AutoTokenizer, BertTokenizer, BertForSequenceClassification
from pathlib import Path
from openai import OpenAI
import os

# SECURITY: the API key must be supplied through the environment and never be
# committed to source. A key that was previously hard-coded here has to be
# treated as leaked and revoked in the OpenAI dashboard.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running this cell."
    )

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Section-heading keywords (Hebrew). A CSV row whose "part" value contains any
# of these substrings is treated as belonging to a relevant section.
required_parts = [
    "מתחמי ענישה", "אחידות בענישה", "מתחם הענישה", "מתחם ענישה", "דיון",
    "ענישה נהוגה", "הענישה הנוהגת", "ענישה נוהגת", "מתחם העונש", "מתחם עונש",
    "מדיניות הענישה", "והכרעה", "ההרשעה", "מדיניות הענישה הנהוגה"
]


# Check for CUDA availability
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the heBERT architecture with a 2-label head, then overwrite its
# weights with the fine-tuned checkpoint stored at model_path.
model_path = "/home/liorkob/classifier_relvant_citation_model.pt" 
tokenizer_bert = BertTokenizer.from_pretrained('avichr/heBERT')
model_bert = BertForSequenceClassification.from_pretrained('avichr/heBERT', num_labels=2)
model_bert.load_state_dict(torch.load(model_path, map_location=device))
model_bert.to(device)
model_bert.eval()  # Inference only


def split_preserving_structure(text):
    """Split a numbered list ("1. ... 2. ...") into its items.

    The text is cut at whitespace that is immediately followed by a list
    marker (digits, a period, then whitespace), so every marker stays attached
    to the start of its own item. Splitting *after* the marker, as before,
    detached "1." from its text and glued "2." to the end of the first item; it
    also split after dates such as "1.1.2020.".

    Returns:
        The stripped, non-empty items in their original order. Text without
        any marker is returned as a single item; empty text gives ``[]``.
    """
    paragraphs = re.split(r'\s+(?=\d+\.\s)', text)
    return [para.strip() for para in paragraphs if para.strip()]

def query_gpt(text, citation):
    """
    Ask GPT-4o for the verbatim portion of ``text`` that relates to ``citation``.

    Args:
        text: Context paragraphs in which the citation appears.
        citation: The citation string whose supporting text is wanted.

    Returns:
        The model's reply as a string. If the API call raises, or the reply has
        no content, the original ``text`` is returned instead, so the result is
        always a string (the failure path used to return a one-element list,
        which then ended up in the tokenizer input and the output CSV).
    """
    prompt = f"""
    Given the following legal text:

    {text}

    Your task is to extract **only** the part of the text that directly relates to the citation "{citation}".
    
    **Extraction Rules:**
    - **Do not modify any wording.** Keep the original phrasing exactly as it appears in the provided document.
    - **Do not summarize or rephrase.**
    - **Return only the relevant portion**, not the full text.
    - **Handle grouped citations carefully:**
        - If the citation appears in a list following "ראו למשל ..." or similar, include the preceding explanation that applies to all citations.
        - Do not include other citations from the list—return only the text relevant to "{citation}".
    - **Handle case explanations properly:**
        - If the citation is explained in a specific section (e.g., "בע"פ 9373/10 ותד נ' מדינת ישראל..."), extract the **entire explanation** of the case.
        - Do not remove any important context about the court ruling.
    - Do **not** extract only "(רע"פ 2718/04)" without the legal principle it supports.


    Only return the extracted text. Do not include unrelated content or formatting.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4o", 
            messages=[
                {"role": "system", "content": "You are an AI trained to extract and structure legal citations."},
                {"role": "user", "content": prompt}
            ]
        )
        processed_text = response.choices[0].message.content
    except Exception as e:
        # Best effort: report the failure and keep the pipeline going.
        print(f"🚨 GPT API error: {e}")
        return text  # Same type as the success path

    # A reply without content would break the tokenizer downstream.
    if processed_text is None:
        return text
    return processed_text
def filter_csv_relevant_parts(csv_data, parts=None):
    """
    Return the rows from the first relevant section onward.

    Scans the "part" column top to bottom and returns the slice starting at the
    first row whose value contains one of the required section keywords.

    Args:
        csv_data: DataFrame with (ideally) a "part" column.
        parts: Optional iterable of keyword substrings to look for. Defaults to
            the module-level ``required_parts``.

    Returns:
        The tail of ``csv_data`` beginning at the first matching row, or an
        empty DataFrame with the same columns when nothing matches (including
        when the "part" column is missing).
    """
    wanted = required_parts if parts is None else parts
    if "part" in csv_data.columns:
        # Track the row *position*, not the index label: iloc is positional, so
        # using the label from iterrows() breaks on any non-default index.
        for position, part in enumerate(csv_data["part"]):
            label = str(part)  # NaN becomes "nan", which matches no keyword
            if any(req_part in label for req_part in wanted):
                return csv_data.iloc[position:]
    return pd.DataFrame(columns=csv_data.columns)  # No relevant section found



def find_all_occurrences(doc, citation):
    """Return the indices of all paragraphs whose text contains ``citation``.

    Matching is a plain substring test against each paragraph's ``text``;
    indices are positions in ``doc.paragraphs`` in ascending order.
    """
    return [
        position
        for position, para in enumerate(doc.paragraphs)
        if citation in para.text
    ]

def get_context_paragraphs(doc, index, citation):
    """Return a paragraph together with its nearest non-empty neighbours.

    Args:
        doc: Object exposing ``paragraphs``, a sequence of items with ``text``.
        index: Position of the paragraph that contains the citation.
        citation: The citation; used only in the warning message.

    Returns:
        The closest non-empty previous paragraph (if any), the paragraph at
        ``index`` and the closest non-empty next paragraph (if any), each
        stripped and joined with newlines. ``None`` (after printing a warning)
        when the paragraph at ``index`` is blank.
    """
    # Read the list once: with python-docx, `paragraphs` is a property that
    # builds a new list on every access, so reading it inside the scan loops
    # made each call quadratic in document length.
    paragraphs = doc.paragraphs

    curr_text = paragraphs[index].text.strip()
    if not curr_text:
        print(f"⚠️ Warning: Empty paragraph for citation {citation} at index {index}. Skipping occurrence.")
        return None  # Skip this occurrence if the current paragraph is empty

    context_text = []

    # Closest non-empty paragraph before the citation
    for prev_index in range(index - 1, -1, -1):
        prev_text = paragraphs[prev_index].text.strip()
        if prev_text:
            context_text.append(prev_text)
            break

    context_text.append(curr_text)

    # Closest non-empty paragraph after the citation
    for next_index in range(index + 1, len(paragraphs)):
        next_text = paragraphs[next_index].text.strip()
        if next_text:
            context_text.append(next_text)
            break

    return "\n".join(context_text).strip()


def process_and_tag_with_split(docx_path: str, csv_path: str, output_path: str):
    """
    Tag every citation of one verdict and write the results to a CSV.

    Steps, per citation returned by ``extract_citations_from_csv`` on the
    filtered CSV rows:
      1. collect the surrounding context of every paragraph in the .docx that
         contains the citation,
      2. merge the distinct contexts into one text,
      3. pass it to ``query_gpt`` to obtain the relevant excerpt,
      4. classify the excerpt with ``model_bert`` and keep the argmax label.

    ``filter_csv_relevant_parts``, ``extract_citations_from_csv``,
    ``query_gpt``, ``tokenizer_bert``, ``model_bert`` and ``device`` are
    expected to be defined elsewhere in the notebook.

    Args:
        docx_path: Path of the .docx verdict.
        csv_path: Path of the CSV that accompanies the verdict.
        output_path: Destination CSV; one row per citation with the columns
            ``citation``, ``context_text``, ``extracted_text`` and
            ``predicted_label``.
    """
    doc = docx.Document(docx_path)
    csv_data = pd.read_csv(csv_path)
    filtered_csv_data = filter_csv_relevant_parts(csv_data)

    citations = extract_citations_from_csv(filtered_csv_data)
    results = []

    for citation in citations:
        citation_indices = find_all_occurrences(doc, citation)

        # Collect the context of every occurrence; empty occurrences yield None.
        merged_contexts = []
        for index in citation_indices:
            full_context = get_context_paragraphs(doc, index, citation)
            if full_context:
                merged_contexts.append(full_context)

        # Nothing usable for this citation.
        if not merged_contexts:
            continue

        # Drop duplicate contexts while keeping document order. A plain set()
        # would join them in arbitrary order, so the prompt and the stored
        # context could differ from run to run.
        final_context = "\n".join(dict.fromkeys(merged_contexts)).strip()
        print(citation)

        # Ask GPT to extract the relevant part
        extracted_text = query_gpt(final_context, citation)

        # Tag the extracted text with BERT
        encoding = tokenizer_bert(extracted_text, truncation=True, padding=True, max_length=128, return_tensors="pt")
        encoding = {key: val.to(device) for key, val in encoding.items()}
        with torch.no_grad():
            output = model_bert(**encoding)
            prediction = torch.argmax(output.logits, dim=-1).item()

        # Exactly one result row per citation.
        results.append({
            'citation': citation,
            'context_text': final_context,
            'extracted_text': extracted_text,
            'predicted_label': prediction
        })

    # Save to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_path, index=False, encoding="utf-8")
    print(f"Processed document saved to: {output_path}")

if __name__ == "__main__":
    # Input verdicts, their per-document CSVs, and the folder for tagged output.
    docx_directory = Path('/home/liorkob/M.Sc/thesis/data/drugs/docx')
    csv_directory = Path('/home/liorkob/M.Sc/thesis/data/drugs/docx_csv')
    output_directory = Path('/home/liorkob/M.Sc/thesis/data/drugs/tag_citations')

    output_directory.mkdir(parents=True, exist_ok=True)

    for file_path in docx_directory.glob("*.docx"):
        new_file_path = file_path.stem
        print(f"Processing {new_file_path}")

        csv_file = csv_directory / f"{new_file_path}.csv"

        docx_missing = not file_path.exists()
        csv_missing = not csv_file.exists()
        if docx_missing or csv_missing:
            # Report whichever input is absent and move on to the next verdict.
            if docx_missing:
                print(f"Document file not found: {file_path}")
            if csv_missing:
                print(f"CSV file not found for: {csv_file}")
            continue

        # Results already on disk are never recomputed.
        output_file = output_directory / f"{file_path.stem}.csv"
        if not output_file.exists():
            process_and_tag_with_split(str(file_path), str(csv_file), str(output_file))


2025-04-08 11:06:04.086039: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744099564.937118 2889771 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744099565.205763 2889771 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744099569.116398 2889771 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744099569.116480 2889771 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744099569.116485 2889771 computation_placer.cc:177] computation placer alr

Processing ת"פ 08∕207
Processing ת"פ 34607-03-17
Processing תפ"ק 19437-12-16
Processing ת"פ 16759-07-17
Processing ת"פ 58269-02-19
Processing ת"פ 50607-03-19
Processing ת"פ 41247-05-19
Processing ת"פ 45102-06-19
Processing ת"פ 18218-11-19
Processing ת"פ 6047-11-19
Processing ת"פ 65763-12-19
Processing ת"פ 48152-02-20
Processing ת"פ 28746-05-20
Processing ת"פ 57459-06-20
Processing ת"פ 12380-06-20
Processing ת"פ 8440-07-20
Processing ת"פ 24302-08-20
Processing ת"פ 43084-09-20
Processing תפ"ק 29591-10-20
Processing ת"פ 39690-02-21
Processing ת"פ 40908-02-21
Processing ת"פ 10393-03-21
Processing ת"פ 56850-03-21
Processing תפ"ק 7171-03-21
Processing ת"פ 3526-04-21
Processing ת"פ 986-06-21
Processing ת"פ 36546-08-21
Processing ת"פ 42879-09-21
Processing ת"פ 30884-10-21
Processing ת"פ 47780-11-21
Processing ת"פ 9851-11-21
Processing ת"פ 16014-01-22
Processing ת"פ 54272-09-14
Processing ת"פ 18588-11-10
Processing ת"פ 2892-07-10
Processing ת"פ 56235-06-13
Processing ת"פ 44226-07-14
Processing 

: 

### Print results

### get URLS from docx

In [None]:
from docx import Document
import pandas as pd
import re
import os
from docx import Document
import pandas as pd
import re
import os
from docx.oxml.ns import qn
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from bs4 import BeautifulSoup

def normalize_case_name(case_name):
    """Return ``case_name`` with division slashes turned into ``/`` and whitespace collapsed.

    Runs of whitespace become a single space and the result is trimmed at
    both ends.
    """
    with_plain_slash = case_name.replace('∕', '/')
    single_spaced = re.sub(r'\s+', ' ', with_plain_slash)
    return single_spaced.strip()


# Quote characters that should all compare equal to the ASCII double quote:
# Hebrew gershayim and the typographic left/right double quotation marks.
_CITATION_QUOTE_TABLE = str.maketrans({
    '\u05f4': '"',
    '\u201c': '"',
    '\u201d': '"',
})


def normalize_citation(citation):
    """Reduce a citation to a canonical form suitable for dictionary lookups.

    The citation has its quote variants mapped to ``"``, whitespace runs
    collapsed to one space, and a leading court prefix (ע"פ, ת"פ, עפ"ג, רע"פ,
    with or without the quote) removed together with the spaces after it.

    Args:
        citation: Raw citation text, or a falsy value.

    Returns:
        The normalized citation string, or ``None`` when ``citation`` is
        empty or ``None``.
    """
    if not citation:
        return None
    # One pass over the string handles every quote variant; the previous
    # chained ``replace`` calls all targeted the same character.
    citation = citation.translate(_CITATION_QUOTE_TABLE)
    # Remove extra spaces
    citation = re.sub(r'\s+', ' ', citation).strip()
    # Remove common prefixes, including רע"פ
    citation = re.sub(r'^(ע"?פ|ת"?פ|עפ"?ג|רע"?פ)\s+', '', citation)
    return citation


# citation_patterns = {
#     'ע"פ': r'ע"פ (\d+/\d+)',
#     'עפ"ג': r'עפ"ג (\d+/\d+)',
#     'ת״פ': r'ת״פ (\d+[-/]\d+[-/]\d+)',
#     'עפ״ג': r'עפ״ג (\d+/\d+)',
#     'רע״פ': r'רע״פ (\d+/\d+)',
#     'תפ"ח': r'תפ"ח\s*(\d+[-/]\d+[-/]\d+)',
# }

# def extract_citations(text):
#     """Extracts citations from the paragraph_text column based on predefined patterns."""
#     matches = []
#     for label, pattern in citation_patterns.items():
#         found = re.findall(pattern, text)
#         matches.extend([f"{label} {m}" for m in found])
#     return matches[0] if matches else None

def extract_citations(text):
    """Return the first usable legal citation found in ``text``, or ``None``.

    Candidates come from ``citation_regex.findall`` (the pattern is defined
    elsewhere in the notebook). Each candidate is assembled from its non-empty
    groups and cleaned up; a candidate of the form ``על <digits>`` is rejected.
    """
    for groups in citation_regex.findall(text):
        candidate = " ".join(filter(None, groups)).strip()
        # Collapse internal runs of spaces left by empty groups.
        candidate = re.sub(r"\s{2,}", " ", candidate)
        # Drop a stand-alone one-letter prefix at the start.
        candidate = re.sub(r"^\b[בוור]\b\s*", "", candidate)
        # Fold "(X) X" into "(X)" when the bracketed text is repeated.
        candidate = re.sub(r"\((.*?)\)\s+\1", r"(\1)", candidate)
        if re.match(r"^על \d+$", candidate):
            continue
        return candidate
    return None


def getLinkedText(soup):
    """Collect the hyperlinks present in one parsed WordprocessingML part.

    Two link styles are recognized:

    * ``hyperlink`` elements that carry an ``r:id`` attribute. These are
      returned as ``{"id": <r:id>, "text": <link text>}``; the URL has to be
      resolved afterwards from the part's relationships.
    * ``HYPERLINK`` field codes inside ``instrText`` elements. These are
      returned as ``{"id": None, "href": <url or None>, "text": <link text>}``,
      where the text is gathered from the following sibling runs up to the
      run that holds the field's ``end`` marker.

    Args:
        soup: BeautifulSoup tree of the XML part.

    Returns:
        List of link dictionaries in the order they were found.
    """
    links = []

    for tag in soup.find_all("hyperlink"):
        rel_id = tag.get("r:id")
        if rel_id is None:
            # No relationship id on this hyperlink, so there is nothing to
            # resolve later; skip it.
            continue
        links.append({"id": rel_id, "text": tag.text})

    for tag in soup.find_all("instrText"):
        if "HYPERLINK" not in tag.text:
            continue

        parts = tag.text.split('"')
        if len(parts) > 1:  # Ensure the URL exists before accessing index 1
            url = parts[1]
        else:
            print(f"⚠️ Warning: No valid URL found in HYPERLINK tag: {tag.text}")
            url = None  # Assign None if URL is missing

        pieces = []
        node = tag.parent.next_sibling
        while node is not None:
            # Text nodes between elements are str subclasses; they hold no
            # child tags, so only element siblings are inspected.
            if not isinstance(node, str):
                text_tag = node.find("t")
                if text_tag is not None and text_tag.text.strip() != "":
                    pieces.append(text_tag.text.strip())

                # find()/find_all() match on tag name, not on CSS selectors,
                # so look the fldChar elements up by name and test the
                # attribute explicitly. Stop at the field's "end" marker so
                # text that follows the link is not swallowed.
                if any(
                    marker.get("w:fldCharType") == "end"
                    for marker in node.find_all("fldChar")
                ):
                    break
            node = node.next_sibling

        links.append({"id": None, "href": url, "text": "".join(pieces)})
    return links
def getURLs(soup, links):
    """Fill in the ``href`` of every link that only carries a relationship id.

    Args:
        soup: Parsed relationships part; its ``Relationship`` elements are
            read for their ``Id`` and ``Target`` attributes.
        links: Link dictionaries; entries that already have an ``href`` key
            are left untouched.

    Returns:
        The same ``links`` list, updated in place. Links whose id has no
        matching relationship stay without an ``href`` key.
    """
    # Index the relationships once instead of rescanning them for every link.
    # When an Id repeats, the last Target wins.
    targets = {}
    for rel in soup.find_all("Relationship"):
        rel_id = rel.get("Id")
        if rel_id is not None:
            targets[rel_id] = rel.get("Target")

    for link in links:
        if "href" not in link and link["id"] in targets:
            link["href"] = targets[link["id"]]
    return links

import zipfile

def extract_hyperlinks(docx_path):
    """
    Extracts hyperlinks from a .docx file and returns a dictionary
    where the linked text is mapped to its corresponding URL.

    Links are read from the main document part and, when present, from the
    footnotes part. A link whose URL could not be determined maps to ``None``.
    When the same text occurs more than once, the last occurrence wins.

    Args:
        docx_path: Path of the .docx file.

    Returns:
        ``{linked_text: url_or_None}``; an empty dict when the file is not a
        valid ZIP archive or has no ``word/document.xml``.
    """
    try:
        archive = zipfile.ZipFile(docx_path, "r")
    except zipfile.BadZipFile:
        print(f"❌ Error: Cannot open {docx_path} (Bad ZIP format)")
        return {}

    # The context manager guarantees the archive handle is released on every
    # exit path.
    with archive:
        # Main document part. Only the archive lookup is guarded, so an
        # unrelated KeyError is not reported as a missing part.
        try:
            file_data = archive.read("word/document.xml")
        except KeyError:
            print(f"⚠️ Warning: No document.xml found in {docx_path}")
            return {}
        links_with_urls = getLinkedText(BeautifulSoup(file_data, "xml"))

        # Relationship ids of the main part -> URLs.
        try:
            url_data = archive.read("word/_rels/document.xml.rels")
        except KeyError:
            print(f"⚠️ Warning: No _rels/document.xml.rels found in {docx_path}")
        else:
            links_with_urls = getURLs(BeautifulSoup(url_data, "xml"), links_with_urls)

        # Footnotes are optional.
        try:
            footnote_data = archive.read("word/footnotes.xml")
        except KeyError:
            footnote_links = []
        else:
            footnote_links = getLinkedText(BeautifulSoup(footnote_data, "xml"))
            try:
                footnote_url_data = archive.read("word/_rels/footnotes.xml.rels")
            except KeyError:
                # No relationships part: field-code links already carry their
                # URL, so keep what was found instead of dropping it.
                pass
            else:
                footnote_links = getURLs(BeautifulSoup(footnote_url_data, "xml"), footnote_links)

        links_with_urls = links_with_urls + footnote_links

    # Convert extracted links to a dictionary: {linked_text: URL}
    return {link["text"]: link.get("href", None) for link in links_with_urls}


import pandas as pd
from pathlib import Path

def update_csv_with_links(csv_path, doc_path):
    """Add ``extracted_citation`` and ``link`` columns to a CSV, in place.

    For every row, the first citation found in ``paragraph_text`` is
    normalized and stored in ``extracted_citation``. Hyperlinks are then read
    from ``doc_path``, their link texts normalized the same way, and the URL
    whose normalized text equals the row's citation is stored in ``link``.

    The file is skipped (with a printed message, nothing written) when it is
    missing, zero bytes, unparsable as CSV, has no rows, or has no
    ``paragraph_text`` column.

    Args:
        csv_path: CSV file to update; ``str`` or ``Path``.
        doc_path: The .docx file the hyperlinks are taken from.
    """
    csv_path = Path(csv_path)  # Convert to Path object if not already

    if not csv_path.exists() or csv_path.stat().st_size == 0:
        print(f"Skipping empty or missing file: {csv_path.name}")
        return

    # Only the read can raise EmptyDataError, so only the read is guarded.
    try:
        df = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        print(f"Skipping {csv_path.name}: CSV file is empty or unreadable.")
        return

    if df.empty:
        print(f"Skipping empty DataFrame: {csv_path.name}")
        return

    # Without the source column there is nothing to extract from; skip the
    # file rather than abort the whole batch with a KeyError.
    if "paragraph_text" not in df.columns:
        print(f"Skipping {csv_path.name}: no 'paragraph_text' column.")
        return

    # Normalize extracted citations
    df["extracted_citation"] = df["paragraph_text"].apply(
        lambda text: normalize_citation(extract_citations(text)) if pd.notna(text) else None
    )

    # Normalize the link texts. Entries without a usable key or URL are left
    # out so they cannot overwrite a valid URL that shares the same key.
    normalized_citation_links = {}
    for link_text, url in extract_hyperlinks(doc_path).items():
        key = normalize_citation(link_text)
        if key is None or url is None:
            continue
        normalized_citation_links[key] = url

    # Assign URLs to citations
    df["link"] = df["extracted_citation"].apply(
        lambda text: normalized_citation_links.get(text, None) if pd.notna(text) else None
    )

    df.to_csv(csv_path, index=False)
    print(f"Updated CSV saved to: {csv_path}")

    

def find_matching_docx(csv_name, docx_directory):
    """Locate the .docx under ``docx_directory`` that corresponds to ``csv_name``.

    The CSV name has ``.csv`` swapped for ``.docx`` and both sides are compared
    after ``normalize_case_name``. The tree is walked with ``os.walk`` and the
    first hit is returned as a full path; ``None`` when nothing matches.
    """
    wanted = normalize_case_name(csv_name.replace('.csv', '.docx'))
    for folder, _subdirs, names in os.walk(docx_directory):
        hit = next(
            (
                name
                for name in names
                if name.endswith(".docx") and normalize_case_name(name) == wanted
            ),
            None,
        )
        if hit is not None:
            return os.path.join(folder, hit)
    return None

def process_all_csvs(csv_directory, docx_directory):
    """Run ``update_csv_with_links`` on every CSV found under ``csv_directory``.

    Each CSV is paired with its document through ``find_matching_docx``; a
    message is printed for CSVs that have no matching .docx.
    """
    for folder, _subdirs, names in os.walk(csv_directory):
        for name in names:
            if not name.endswith(".csv"):
                continue
            # Build the CSV path first, then look for its document.
            csv_path = os.path.join(folder, name)
            matched_docx = find_matching_docx(name, docx_directory)
            if not matched_docx:
                print(f"No matching DOCX found for: {name}")
                continue
            update_csv_with_links(csv_path, matched_docx)

# Attach hyperlink URLs to every citation CSV under the tag_citations folder.
# NOTE(review): the second argument of process_all_csvs is the directory that
# is searched for .docx files, yet it is given the "docx_csv" folder here —
# confirm the .docx sources really live there.
docx_csv_dir = f"/home/liorkob/M.Sc/thesis/data/drugs/docx_csv"
citations_dir = f"/home/liorkob/M.Sc/thesis/data/drugs/tag_citations"
process_all_csvs(citations_dir, docx_csv_dir)


In [None]:
import os
import pandas as pd

# Folder whose CSV files are scanned by verify_updated_files_tag_1 below.
UPDATED_CSV_DIR = "/home/liorkob/M.Sc/thesis/data/drugs/tag_citations"

def verify_updated_files_tag_1(csv_dir=None):
    """Print the citations labelled 1 that have no link, file by file.

    Every ``.csv`` under the directory is read; rows with
    ``predicted_label == 1``, a non-null ``extracted_citation`` and a null
    ``link`` are reported together with the first 200 characters of their
    ``paragraph_text`` (blank when that value is missing).

    Files that cannot be parsed or that lack one of the three filter columns
    are skipped with a message instead of stopping the scan.

    Args:
        csv_dir: Directory to scan. Defaults to the module-level
            ``UPDATED_CSV_DIR`` when omitted.
    """
    if csv_dir is None:
        csv_dir = UPDATED_CSV_DIR

    required_columns = ("predicted_label", "extracted_citation", "link")

    for root, _, files in os.walk(csv_dir):
        for file in files:
            if not file.endswith(".csv"):
                continue

            csv_path = os.path.join(root, file)
            try:
                df = pd.read_csv(csv_path)
            except pd.errors.EmptyDataError:
                print(f"Skipping {file}: CSV file is empty or unreadable.")
                continue

            absent = [col for col in required_columns if col not in df.columns]
            if absent:
                print(f"Skipping {file}: missing columns {absent}")
                continue

            # Filter rows with tag = 1 and missing links
            missing_links = df[(df["predicted_label"] == 1) & (df["extracted_citation"].notna()) & (df["link"].isna())]
            if missing_links.empty:
                continue

            print(f"\n🔍 Missing links in file: {file}")
            for _, row in missing_links.iterrows():
                print(f"- Citation: {row['extracted_citation']}")
                # A missing paragraph is read back as NaN (a float), which
                # cannot be sliced; show an empty excerpt in that case.
                paragraph = row.get("paragraph_text")
                excerpt = paragraph[:200] if isinstance(paragraph, str) else ""
                print(f"  Paragraph: {excerpt}...")  # Show first 200 chars
                print("-" * 50)

# Report citations labelled 1 that still have no link, across UPDATED_CSV_DIR.
verify_updated_files_tag_1()
