### UTILS


In [4]:
# pip install stanza
# import stanza
# stanza.download('he') 


In [5]:
import stanza
# nlp = stanza.Pipeline('he', use_gpu=False)
nlp = stanza.Pipeline('he', processors='tokenize', use_gpu=True)

import re
from docx.text.paragraph import Paragraph
from docx import Document

from docx.table import _Cell, Table
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl

import sys
print(sys.executable)

# Modify property of Paragraph.text to include hyperlink text
Paragraph.text = property(lambda self: get_paragraph_text(self))

def get_paragraph_text(paragraph) -> str:
    """
    Extract text from paragraph, including hyperlink text.
    """
    def get_xml_tag(element):
        return "%s:%s" % (element.prefix, re.match("{.*}(.*)", element.tag).group(1))

    text_content = ''
    run_count = 0
    for child in paragraph._p:
        tag = get_xml_tag(child)
        if tag == "w:r":
            text_content += paragraph.runs[run_count].text
            run_count += 1
        if tag == "w:hyperlink":
            for sub_child in child:
                if get_xml_tag(sub_child) == "w:r":
                    text_content += sub_child.text
    return text_content


def is_paragraph_bold(block) -> bool:
    if block.style and block.style.font:
        if block.style.font.bold:  # Check if bold is part of the style
            return True
    return False

def is_block_bold(block) -> bool:
    # Check if the paragraph style indicates a bold style (e.g., "כותרת")
    if block.style and block.style.name in ["כותרת", "Heading", "Title"]:  
        return True

    # Check if the style font is bold
    if block.style and block.style.font and block.style.font.bold:
        return True

    # # Check if any run is bold
    # if block.runs:
    #     for run in block.runs:
    #         if run.bold or (run.font and run.font.bold):
    #             return True
    return False
def is_run_bold(run) -> bool:
    """
    Check if a run is bold, including inherited and complex script (cs_bold) styles.
    """
    if run.bold is not None:
        return run.bold
    if run.font and run.font.bold is not None:
        return run.font.bold
    if run.font and run.font.cs_bold is not None:
        return run.font.cs_bold  # Check for complex script bold
    return False

def is_block_styled(block) -> bool:
    """
    Check if the entire block/paragraph text is fully bold or fully underlined,
    while handling:
    - Allow the first run to differ in style if it is a prefix (e.g., 'א.', '1.', 'א)', '1)').
    - Skip empty or non-alphanumeric runs.
    - Allow trailing punctuation with different styling.
    """
    if hasattr(block, "runs") and block.runs:
        # Combine text from all meaningful runs
        combined_text = " ".join(run.text.strip() for run in block.runs if run.text.strip()).strip()
        
        # Handle empty text
        if not combined_text:
            return False
        
        # Check word count
        word_count = len(combined_text.split())
        if word_count < 4:
            # print(combined_text)
            return True  # Return True if there are fewer than 3 words


        # Identify meaningful runs: Ignore runs that are empty or contain only spaces/non-alphanumeric characters
        meaningful_runs = [run for run in block.runs if run.text.strip() and any(c.isalnum() for c in run.text)]

        if not meaningful_runs:
            return False

        # Check if the first run is a prefix (e.g., "א.", "1.", "א)", "1)")
        first_run_text = meaningful_runs[0].text.strip()
        is_prefix = bool(re.match(r'^[\u0590-\u05FF]\.|^[\u0590-\u05FF]\)|^\d+\.|^\d+\)', first_run_text))

        # Allow the first run to differ in style if it's a valid prefix
        runs_to_check = meaningful_runs[1:] if is_prefix else meaningful_runs

        # Check if all remaining runs are styled as bold or underlined
        all_bold =is_block_bold(block) or all(is_run_bold(run) or run.text in [":", ".", ","] for run in runs_to_check)
        all_underlined = all(run.underline is True or run.text in [":", ".", ","] for run in runs_to_check)

        # Allow for trailing punctuation to differ in style
        if combined_text[-1] in [":", ".", ","]:
            return all_bold or all_underlined
        else:
            return is_block_bold(block) or all(is_run_bold(run) or run.underline is True for run in runs_to_check)

    return False



def iterate_block_items(parent):
    """
    Iterate over paragraphs and tables in a document or cell.
    """
    if hasattr(parent, "element") and hasattr(parent.element, "body"):
        parent_element = parent.element.body
    elif hasattr(parent, "_tc"):
        parent_element = parent._tc
    else:
        print(f"Unsupported parent type: {type(parent)}")
        return

    for child in parent_element.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            table = Table(child, parent)
            for row in table.rows:
                for cell in row.cells:
                    yield from iterate_block_items(cell)

def extract_part_after_number_or_hebrew_letter(sentence: str) -> str:
    """
    Extract text following a pattern of number or Hebrew letter.
    """
    pattern = r'^(?:[0-9\u05D0-\u05EA]+)\.\s*(.*)'
    match = re.search(pattern, sentence)
    return match.group(1).strip() if match else sentence

def count_patterns_in_block(block) -> int:
    """
    Count the number-dot or dot-number patterns in a block.
    """
    pattern = r'\s*(?:\.\d+|\d+\.)'
    return len(re.findall(pattern, block.text))

def count_consecutive_blocks_starting_with_number(blocks) -> int:
    """
    Count consecutive blocks starting with a number or Hebrew letter.
    """
    count = 0
    for block in blocks:
        if 'הנאשם' in block.text:
            return 1
        count += count_patterns_in_block(block)
        if 'חקיקה שאוזכרה' in block.text:
            break
    return count

def extract_name_after_word(text: str, word: str) -> str:
    """
    Extract the words following a given word up to the end of the sentence.
    """
    pattern = re.compile(fr'{word}(?:,)?\s*([\u0590-\u05FF\s\'\(\)-]+)')
    match = pattern.search(text)
    return match.group(1) if match else ''

def extract_violations(text: str) -> list:
    """
    Extract violations from the text based on a pre-defined pattern.
    """

    matches = re.findall(r"(?:סעיף|סעיפים|ס'|סע')\s*\d+\s*(?:\([\s\S]*?\))?.*?(?=\s*(?:ב|ל)(?:חוק|פקודת))\s*(?:ב|ל)(?:חוק|פקודת)\s*ה?(?:עונשין|כניסה לישראל|סמים\s+המסוכנים|\w+)?", text)
    # matches = re.findall(r"(?:סעיף|סעיפים|ס'|סע')\s*\d+\s*(?:\([\s\S]*?\))?.*?(?=\s*(?:ב|ל)(?:חוק|פקודת))\s*(?:ב|ל)(?:חוק|פקודת)\s*ה?(?:עונשין|כניסה לישראל|סמים\s+המסוכנים|[^\[]+)?", text)

    matches = [match.strip() for match in matches]
    return matches

2025-05-27 10:15:31 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-05-27 10:15:32 INFO: Downloaded file to /home/liorkob/stanza_resources/resources.json
2025-05-27 10:15:32 INFO: Loading these models for language: he (Hebrew):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

2025-05-27 10:15:32 INFO: Using device: cuda
2025-05-27 10:15:32 INFO: Loading: tokenize
2025-05-27 10:15:57 INFO: Loading: mwt
2025-05-27 10:15:57 INFO: Done loading processors!


/home/liorkob/.conda/envs/new_env/bin/python


### metadata

In [6]:
import os
import re
import unicodedata
import pandas as pd
from docx import Document
import re
from openai import OpenAI

# ========== API Setup ==========
os.environ["OPENAI_API_KEY"] = "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"  
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def normalize_text(text):
    text = unicodedata.normalize("NFKC", text)
    return text.replace("\u00A0", " ").replace("\u200f", "").strip()
def clean_hebrew_verdict_text(text: str) -> str:
    # Remove duplicate phrases (e.g. repeated 'בית משפט השלום בקריות')
    parts = list(dict.fromkeys(text.split('<<')))
    cleaned = '<<'.join(parts)

    # Remove nested/multiple angle brackets
    cleaned = re.sub(r'[<]{2,}', '<', cleaned)
    cleaned = re.sub(r'[>]{2,}', '>', cleaned)

    # Remove empty brackets or stray symbols
    cleaned = re.sub(r'<\s*>', '', cleaned)
    cleaned = re.sub(r'[<>]', '', cleaned)

    # Remove escape characters
    cleaned = cleaned.replace("\\'", "'").replace('\\', '')

    # Replace double "נגד נגד" with single
    cleaned = re.sub(r'נגד\s+נגד', 'נגד', cleaned)

    # Normalize spaces
    cleaned = re.sub(r'\s{2,}', ' ', cleaned).strip()

    return cleaned

terms= ["בפני","כבוד","לפני","השופט","השופטת","שופט","שופטת","שופטים"]
def extract_judge(docx_path):
    doc = Document(docx_path)
    first_rows = ""
    i = 0
    header = doc.sections[0].header
    head=""
    for paragraph in header.paragraphs:
        head+=paragraph.text

    first_rows+=head

    for block in iterate_block_items(doc):
        if i == 15:
            break
        first_rows += normalize_text(block.text) + " "
    
        if normalize_text(block.text) != "":
            i += 1
    first_rows=clean_hebrew_verdict_text(first_rows)
    # print(first_rows)
    if not any(term in first_rows for term in terms):
        print("\n")
        print(first_rows)

    prompt = (
        "מתוך הקטע הבא, מה שמו של השופט או השופטת? "
        "ענה רק בשם הפרטי ושם המשפחה של השופט/ת, בלי כותרות או קידומות. "
        "לדוגמה: דוד כהן\n\n"
        f"{first_rows}"
    )
    response = client.chat.completions.create(
        model="gpt-4.1-mini", 
        messages=[
            {"role": "system", "content": "You are an AI trained to extract factual allegations from legal texts, ensuring no interpretation or rewording."},
            {"role": "user", "content": prompt}
        ]
    )

    return response.choices[0].message.content.strip()





import os
import csv
from tqdm import tqdm
from docx import Document

# Your existing extract_judge function must be defined above this block

output_rows = []
docx_dir = '/home/liorkob/M.Sc/thesis/data/drugs_3k/docx/verdict'
docx_files = [f for f in os.listdir(docx_dir) if f.endswith('.docx')]

for filename in tqdm(docx_files, desc="📄 Extracting judges"):
    file_path = os.path.join(docx_dir, filename)
    judge_name = extract_judge(file_path)
    print(judge_name)
    output_rows.append({"file name": filename, "judge": judge_name})

# Save to CSV
with open("judges_extracted.csv", "w", newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=["file name", "judge"])
    writer.writeheader()
    writer.writerows(output_rows)


📄 Extracting judges:   0%|          | 1/3045 [00:00<50:32,  1.00it/s]

דניאל פיש


📄 Extracting judges:   0%|          | 2/3045 [00:01<38:57,  1.30it/s]

רונית בש


📄 Extracting judges:   0%|          | 3/3045 [00:02<36:32,  1.39it/s]

אבי לוי


📄 Extracting judges:   0%|          | 4/3045 [00:02<35:40,  1.42it/s]

תמר שרון נתנאל


📄 Extracting judges:   0%|          | 5/3045 [00:03<38:18,  1.32it/s]

אליהו ביתן


📄 Extracting judges:   0%|          | 6/3045 [00:04<43:39,  1.16it/s]

דנה מרשק מרום


📄 Extracting judges:   0%|          | 7/3045 [00:05<43:31,  1.16it/s]

אבי לוי


📄 Extracting judges:   0%|          | 8/3045 [00:06<45:29,  1.11it/s]

עמית גורפינקל


📄 Extracting judges:   0%|          | 9/3045 [00:07<42:29,  1.19it/s]

ג'ורג' קרא


📄 Extracting judges:   0%|          | 10/3045 [00:08<41:52,  1.21it/s]

עמית כהן


📄 Extracting judges:   0%|          | 11/3045 [00:08<37:46,  1.34it/s]

רות לורך


📄 Extracting judges:   0%|          | 12/3045 [00:09<37:43,  1.34it/s]

אבי לוי


📄 Extracting judges:   0%|          | 13/3045 [00:10<38:25,  1.32it/s]

דניאל טפרברג


📄 Extracting judges:   0%|          | 14/3045 [00:12<56:51,  1.13s/it]

אמיר טובי


📄 Extracting judges:   0%|          | 15/3045 [00:12<49:13,  1.03it/s]

אהרון משניות


📄 Extracting judges:   1%|          | 16/3045 [00:13<45:29,  1.11it/s]

גיליה רביד


📄 Extracting judges:   1%|          | 17/3045 [00:14<42:12,  1.20it/s]

אמיר טובי


📄 Extracting judges:   1%|          | 18/3045 [00:15<43:22,  1.16it/s]

מרדכי לוי


📄 Extracting judges:   1%|          | 19/3045 [00:15<39:44,  1.27it/s]

ירון לוי


📄 Extracting judges:   1%|          | 20/3045 [00:16<37:21,  1.35it/s]

דוד רוזן


📄 Extracting judges:   1%|          | 21/3045 [00:17<37:00,  1.36it/s]

דוד רוזן


📄 Extracting judges:   1%|          | 22/3045 [00:18<38:53,  1.30it/s]

צבי גורפינקל


📄 Extracting judges:   1%|          | 22/3045 [00:18<41:55,  1.20it/s]


KeyboardInterrupt: 

In [7]:
import pandas as pd
judges_df = pd.read_csv("judges_extracted.csv")  # columns: file name, judge
judges_df["verdict_id"] = judges_df["file name"].str.replace(".docx", "", regex=False)

def normalize_citation(name):
    if pd.isna(name):
        return ""
    name = str(name)
    name = re.sub(r"\(.*?\)", "", name)
    name = re.sub(r"[∕/\\]", "-", name)
    name = re.sub(r"\s+", " ", name)
    name = name.strip().lower().replace(" ", "_")
    return name

import os
from collections import defaultdict

citations_by_judge = defaultdict(list)
csv_dir = "/home/liorkob/M.Sc/thesis/data/drugs_3k/gpt/verdicts_tagged_citations"  # איפה שקבצי הציטוטים

for _, row in judges_df.iterrows():
    judge = row["judge"]
    verdict_id = row["verdict_id"]
    csv_path = os.path.join(csv_dir, f"{verdict_id}.csv")
    
    if not os.path.exists(csv_path):
        continue

    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        print(f"⚠️ Skipped {verdict_id}.csv due to error: {e}")
        continue

    if df.empty or "predicted_label" not in df.columns or "citation" not in df.columns:
        continue

    cited = df[df["predicted_label"] == 1]["citation"].dropna().map(normalize_citation)
    if not cited.empty:
        citations_by_judge[judge].extend(cited)


import matplotlib.pyplot as plt
from collections import Counter

for judge, citations in citations_by_judge.items():
    counter = Counter(citations)
    if not counter:
        continue
    top = counter.most_common(10)
    labels, values = zip(*top)



⚠️ Skipped תפ_26792-04-16.csv due to error: No columns to parse from file
⚠️ Skipped תפ_17316-10-16.csv due to error: No columns to parse from file
⚠️ Skipped תפ_55925-11-17.csv due to error: No columns to parse from file
⚠️ Skipped תפ_13236-03-18.csv due to error: No columns to parse from file
⚠️ Skipped תפ_39573-10-18.csv due to error: No columns to parse from file
⚠️ Skipped תפ_7626-04-20.csv due to error: No columns to parse from file
⚠️ Skipped תפ_43456-12-20.csv due to error: No columns to parse from file
⚠️ Skipped תפ_11879-01-21.csv due to error: No columns to parse from file
⚠️ Skipped תפ_6574-09-21.csv due to error: No columns to parse from file
⚠️ Skipped תפ_48194-05-22.csv due to error: No columns to parse from file
⚠️ Skipped תפ_73351-01-23.csv due to error: No columns to parse from file
⚠️ Skipped תפ_58193-03-23.csv due to error: No columns to parse from file
⚠️ Skipped תפ_13504-12-09.csv due to error: No columns to parse from file
⚠️ Skipped תפ_3781-11-11.csv due to erro

In [8]:
plt.figure(figsize=(10, 6), constrained_layout=True)
labels = [label[:25] + "..." if len(label) > 25 else label for label in labels]
plt.barh(labels, values)
plt.xlabel("Number of citations")
plt.title(f"Top Cited Verdicts by Judge: {judge}", fontsize=12)
plt.gca().invert_yaxis()
plt.savefig(f"judge_{judge.replace(' ', '_')}_citations.png")
plt.close()


In [12]:
import os
import pandas as pd
from collections import defaultdict

# טען את טבלת השופטים
judges_df = pd.read_csv("judges_extracted.csv")
judges_df["verdict_id"] = judges_df["file name"].str.replace(".docx", "", regex=False)

csv_dir = "/home/liorkob/M.Sc/thesis/data/drugs_3k/gpt/verdicts_tagged_citations"  # איפה שקבצי הציטוטים

# צבור נתונים לכל שופט
stats = defaultdict(lambda: {"cases": 0, "citations": 0})

for _, row in judges_df.iterrows():
    judge = row["judge"]
    verdict_id = row["verdict_id"]
    csv_path = os.path.join(csv_dir, f"{verdict_id}.csv")

    if not os.path.exists(csv_path):
        continue

    try:
        df = pd.read_csv(csv_path)
    except:
        continue

    if df.empty or "predicted_label" not in df.columns:
        continue

    citation_count = df["predicted_label"].fillna(0).astype(int).sum()
    stats[judge]["cases"] += 1
    stats[judge]["citations"] += citation_count


In [13]:
print("🧑‍⚖️ Judge Stats:")
print(f"{'Judge':<30} {'Cases':<10} {'Citations':<10}")
print("-" * 50)
for judge, data in stats.items():
    print(f"{judge:<30} {data['cases']:<10} {data['citations']:<10}")


🧑‍⚖️ Judge Stats:
Judge                          Cases      Citations 
--------------------------------------------------
דניאל פיש                      4          2         
רונית בש                       12         87        
אבי לוי                        16         63        
תמר שרון נתנאל                 7          18        
עמית כהן                       2          22        
רות לורך                       3          15        
דניאל טפרברג                   2          3         
אמיר טובי                      12         28        
מרדכי לוי                      4          6         
ירון לוי                       6          35        
דוד רוזן                       3          13        
דנה מרשק מרום                  15         91        
אברהם אליקים                   5          20        
אברהם רובין                    8          46        
בני שגיא                       47         355       
אליהו ביתן                     17         113       
מיכל ברק נבו                  

In [22]:
# המרה ל-DataFrame
stats_df = pd.DataFrame([
    {"judge": judge, "cases": data["cases"], "citations": data["citations"]}
    for judge, data in stats.items()
])

# סינון ל-15 שופטים עם הכי הרבה ציטוטים (או תיקים)
top_judges = stats_df.sort_values(by=["citations", "cases"], ascending=False).head(15)
from collections import Counter
import re


all_top_citations = []

for judge in top_judges["judge"]:
    verdict_ids = judges_df[judges_df["judge"] == judge]["file name"].str.replace(".docx", "", regex=False)

    for verdict_id in verdict_ids:
        csv_path = os.path.join(csv_dir, f"{verdict_id}.csv")
        if not os.path.exists(csv_path):
            continue

        try:
            df = pd.read_csv(csv_path)
        except:
            continue

        if df.empty or "predicted_label" not in df.columns or "citation" not in df.columns:
            continue

        cited = df[df["predicted_label"] == 1]["citation"].dropna().map(normalize_citation)
        all_top_citations.extend(cited)
top_cited = Counter(all_top_citations).most_common(5)

print("🏆 Top 5 Most Cited Verdicts among Top 15 Judges:")
print(f"{'Citation':<20} {'Count':<10}")
print("-" * 35)
for cit, count in top_cited:
    print(f"{cit:<20} {count:<10}")


🏆 Top 5 Most Cited Verdicts among Top 15 Judges:
Citation             Count     
-----------------------------------
ע"פ_7319-08-12       30        
ע"פ_5807-17          27        
עפ"ג_31347-08-14     23        
ת"פ_54706-01-13      22        
עפ"ג_28110-10-15     22        


In [23]:
print("🧑‍⚖️ Top Cited Verdict per Judge:")
print(f"{'Judge':<30} {'Top Citation':<20} {'Count':<10}")
print("-" * 65)

for judge in top_judges["judge"]:
    verdict_ids = judges_df[judges_df["judge"] == judge]["file name"].str.replace(".docx", "", regex=False)
    judge_citations = []

    for verdict_id in verdict_ids:
        csv_path = os.path.join(csv_dir, f"{verdict_id}.csv")
        if not os.path.exists(csv_path):
            continue

        try:
            df = pd.read_csv(csv_path)
        except:
            continue

        if df.empty or "predicted_label" not in df.columns or "citation" not in df.columns:
            continue

        cited = df[df["predicted_label"] == 1]["citation"].dropna().map(normalize_citation)
        judge_citations.extend(cited)

    if judge_citations:
        top_cit, count = Counter(judge_citations).most_common(1)[0]
        print(f"{judge:<30} {top_cit:<20} {count:<10}")
    else:
        print(f"{judge:<30} {'-':<20} {'0':<10}")


🧑‍⚖️ Top Cited Verdict per Judge:
Judge                          Top Citation         Count     
-----------------------------------------------------------------
עמי קובו                       ע"פ_8988-16          9         
מרב גרינברג                    ע"פ_5807-17          12        
רות שפילברג כהן                ת"פ_39589-07-13      14        
בני שגיא                       ת"פ_6090-10-19       8         
נועה חקלאי                     ת"פ_16926-04-16      22        
דוד שאול גבאי ריכטר            עפ"ג_13953-09-19     14        
אפרת פינק                      ת"פ_40639-10-17      7         
יוסי טורס                      עפ"ג_62171-05-17     17        
ארנון איתן                     ת"פ_726-01-14        12        
שוש שטרית                      רע"פ_1830-16         9         
אילה אורן                      רע"פ_3059-21         6         
סימי פלג קימלוב                רע"פ_1273-08         9         
יובל ליבדרו                    ע"פ_4659-12          4         
חנה מרים לומפ     

In [24]:
print("🧑‍⚖️ Top Cited Verdict per Judge + Percentage:")
print(f"{'Judge':<30} {'Top Citation':<20} {'Count':<10} {'% of All':<10}")
print("-" * 80)

for judge in top_judges["judge"]:
# for judge in judges_df["judge"].dropna().unique():

    verdict_ids = judges_df[judges_df["judge"] == judge]["file name"].str.replace(".docx", "", regex=False)
    judge_citations = []

    for verdict_id in verdict_ids:
        csv_path = os.path.join(csv_dir, f"{verdict_id}.csv")
        if not os.path.exists(csv_path):
            continue

        try:
            df = pd.read_csv(csv_path)
        except:
            continue

        if df.empty or "predicted_label" not in df.columns or "citation" not in df.columns:
            continue

        cited = df[df["predicted_label"] == 1]["citation"].dropna().map(normalize_citation)
        judge_citations.extend(cited)

    if judge_citations:
        counter = Counter(judge_citations)
        top_cit, count = counter.most_common(1)[0]
        total = sum(counter.values())
        percent = (count / total) * 100
        print(f"{judge:<30} {top_cit:<20} {count:<10} {percent:>6.1f}%")
    else:
        print(f"{judge:<30} {'-':<20} {'0':<10} {'0.0%':<10}")


🧑‍⚖️ Top Cited Verdict per Judge + Percentage:
Judge                          Top Citation         Count      % of All  
--------------------------------------------------------------------------------
עמי קובו                       ע"פ_8988-16          9             1.2%
מרב גרינברג                    ע"פ_5807-17          12            3.2%
רות שפילברג כהן                ת"פ_39589-07-13      14            3.9%
בני שגיא                       ת"פ_6090-10-19       8             2.3%
נועה חקלאי                     ת"פ_16926-04-16      22            6.3%
דוד שאול גבאי ריכטר            עפ"ג_13953-09-19     14            4.2%
אפרת פינק                      ת"פ_40639-10-17      7             2.2%
יוסי טורס                      עפ"ג_62171-05-17     17            5.6%
ארנון איתן                     ת"פ_726-01-14        12            4.8%
שוש שטרית                      רע"פ_1830-16         9             4.1%
אילה אורן                      רע"פ_3059-21         6             3.1%
סימי פלג קימלוב  

In [25]:
from collections import Counter
import re

def normalize_citation(c):
    c = c.replace("ת\"פ", "").replace(" ", "")
    c = re.sub(r'[^0-9\-_/]', '', c)
    return c.strip("_-/")

print("📊 Top 5 Cited Verdicts per Top Judge (with %):\n")

for judge in top_judges["judge"]:
    verdict_ids = judges_df[judges_df["judge"] == judge]["file name"].str.replace(".docx", "", regex=False)
    judge_citations = []

    for verdict_id in verdict_ids:
        csv_path = os.path.join(csv_dir, f"{verdict_id}.csv")
        if not os.path.exists(csv_path):
            continue

        try:
            df = pd.read_csv(csv_path)
        except:
            continue

        if df.empty or "predicted_label" not in df.columns or "citation" not in df.columns:
            continue

        cited = df[df["predicted_label"] == 1]["citation"].dropna().map(normalize_citation)
        judge_citations.extend(cited)

    print(f"👨‍⚖️ Judge: {judge}")
    if not judge_citations:
        print("   (No citations found)\n")
        continue

    counter = Counter(judge_citations)
    total = sum(counter.values())

    for cit, count in counter.most_common(5):
        percent = (count / total) * 100
        print(f"   • {cit:<20} {count:<5} ({percent:.1f}%)")
    print()


📊 Top 5 Cited Verdicts per Top Judge (with %):

👨‍⚖️ Judge: עמי קובו
   • 8988-16              9     (1.2%)
   • 5093-17              8     (1.0%)
   • 30876-03-17          7     (0.9%)
   • 4008-11              6     (0.8%)
   • 5813-14              6     (0.8%)

👨‍⚖️ Judge: מרב גרינברג
   • 5807-17              12    (3.2%)
   • 126-22               11    (3.0%)
   • 871-20               6     (1.6%)
   • 2596-18              6     (1.6%)
   • 2518-16              5     (1.3%)

👨‍⚖️ Judge: רות שפילברג כהן
   • 39589-07-13          14    (3.9%)
   • 31724-08-12          13    (3.7%)
   • 31347-08-14          13    (3.7%)
   • 48125-05-11          13    (3.7%)
   • 54706-01-13          13    (3.7%)

👨‍⚖️ Judge: בני שגיא
   • 6090-10-19           8     (2.3%)
   • 8820-14              7     (2.0%)
   • 2279-15              6     (1.7%)
   • 4592-15              6     (1.7%)
   • 1313-14              6     (1.7%)

👨‍⚖️ Judge: נועה חקלאי
   • 16926-04-16          22    (6.3%)
   • 6920-11

In [28]:
from collections import defaultdict, Counter
import os
import pandas as pd
import re

def normalize_citation(c):
    c = c.replace("ת\"פ", "").replace(" ", "")
    c = re.sub(r'[^0-9\-_/]', '', c)
    return c.strip("_-/")

print("📊 Top 5 Most Cited Verdicts (by unique verdicts) Per Judge:\n")
for judge in top_judges["judge"]:

# for judge in judges_df["judge"].dropna().unique():
    verdict_ids = judges_df[judges_df["judge"] == judge]["file name"].str.replace(".docx", "", regex=False)

    # ציטוטים ייחודיים לכל תיק שהשופט כתב
    citation_to_verdicts = defaultdict(set)
    all_unique_citations = set()

    for verdict_id in verdict_ids:
        csv_path = os.path.join(csv_dir, f"{verdict_id}.csv")
        if not os.path.exists(csv_path):
            continue

        try:
            df = pd.read_csv(csv_path)
        except:
            continue

        if df.empty or "predicted_label" not in df.columns or "citation" not in df.columns:
            continue

        cited = df[df["predicted_label"] == 1]["citation"].dropna().map(normalize_citation)
        unique_citations = set(cited)

        for cit in unique_citations:
            citation_to_verdicts[cit].add(verdict_id)
        all_unique_citations.update(unique_citations)

    if not citation_to_verdicts:
        continue

    counter = {cit: len(vids) for cit, vids in citation_to_verdicts.items()}
    top5 = Counter(counter).most_common(5)
    total = len(all_unique_citations)

    print(f"👨‍⚖️ Judge: {judge}")
    for cit, count in top5:
        percent = (count / total) * 100
        print(f"   • {cit:<20} in {count:<3} verdicts ({percent:.1f}%)")
    print()


📊 Top 5 Most Cited Verdicts (by unique verdicts) Per Judge:

👨‍⚖️ Judge: עמי קובו
   • 8988-16              in 8   verdicts (1.9%)
   • 5093-17              in 7   verdicts (1.7%)
   • 56230-11-15          in 6   verdicts (1.4%)
   • 4008-11              in 6   verdicts (1.4%)
   • 5813-14              in 6   verdicts (1.4%)

👨‍⚖️ Judge: מרב גרינברג
   • 5807-17              in 7   verdicts (3.6%)
   • 871-20               in 6   verdicts (3.0%)
   • 3398-22              in 5   verdicts (2.5%)
   • 25458-11-18          in 5   verdicts (2.5%)
   • 6161-16              in 5   verdicts (2.5%)

👨‍⚖️ Judge: רות שפילברג כהן
   • 39589-07-13          in 14  verdicts (7.9%)
   • 31347-08-14          in 13  verdicts (7.3%)
   • 48125-05-11          in 13  verdicts (7.3%)
   • 31724-08-12          in 13  verdicts (7.3%)
   • 21605-07-13          in 13  verdicts (7.3%)

👨‍⚖️ Judge: בני שגיא
   • 6090-10-19           in 8   verdicts (4.1%)
   • 2279-15              in 6   verdicts (3.1%)
   • 1313

### verdict citation in db existance

In [None]:
import os
import pandas as pd
import re

# === פונקציית נירמול ===
def normalize_case_name_2(name):
    if pd.isna(name):
        return ""
    name = str(name)
    name = re.sub(r"\(.*?\)", "", name)
    name = re.sub(r"[∕/\\]", "-", name)
    name = re.sub(r"\s+", " ", name)
    name = name.strip().lower().replace(" ", "_")
    return name

# === הנתיב לתיקייה ===
csv_dir = "/home/liorkob/M.Sc/thesis/data/drugs_3k/gpt/verdicts_tagged_citations"

# שמות קבצים לאחר נירמול
existing_files = {
    normalize_case_name_2(os.path.splitext(f)[0])
    for f in os.listdir(csv_dir)
    if f.endswith('.csv')
}

# שמות ציטוטים עם predicted_label == 1
cited_files = set()

for filename in os.listdir(csv_dir):
    if not filename.endswith('.csv'):
        continue
    filepath = os.path.join(csv_dir, filename)
    try:
        df = pd.read_csv(filepath)
        cited_with_label_1 = df[df['predicted_label'] == 1]['citation']
        normalized_citations = cited_with_label_1.dropna().apply(normalize_case_name_2)
        cited_files.update(normalized_citations)
    except Exception as e:
        print(f"⚠️ בעיה בקריאת {filename}: {e}")

# הצלבה בין ציטוטים לבין קבצים קיימים
existing_cited_files = cited_files.intersection(existing_files)

# הדפסת תוצאות
print(f"🔎 מספר תיקים שמצוטטים עם predicted_label == 1: {len(cited_files)}")
print(f"✅ מתוכם קיימים בתיקייה: {len(existing_cited_files)}")
print("📄 רשימת תיקים קיימים:")
for fname in sorted(existing_cited_files):
    print(fname)


⚠️ בעיה בקריאת תפ_41730-12-19.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_53452-10-21.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_11145-01-14.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_35907-11-14.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_66195-07-19.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_19425-03-21.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_45232-07-18.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_46823-08-19.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_48194-05-22.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_17162-05-21.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_26792-04-16.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_43840-06-22.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_26436-05-12.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_54047-01-13.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_6894-03-23.csv: No columns to parse from file
⚠️ בעיה בקריאת תפ_60575-07

In [9]:
print("📂 קבצים קיימים לאחר נירמול:")
for f in sorted(existing_files)[:10]:
    print(f)
print("📑 ציטוטים מנורמלים מתוך predicted_label==1:")
for f in sorted(list(cited_files))[:10]:
    print(f)
print("🔍 השוואה בין מקור למנורמל:")
df_sample = df[df['predicted_label'] == 1].dropna(subset=['citation']).head(10)
for raw in df_sample['citation']:
    norm = normalize_case_name_2(raw)
    print(f"{raw}  -->  {norm}")


📂 קבצים קיימים לאחר נירמול:
תפ_10003-05-16
תפ_10029-03-21
תפ_10054-09-19
תפ_10086-06-18
תפ_10095-08-22
תפ_10096-02-22
תפ_10108-11-21
תפ_10111-08-17
תפ_10140-10-19
תפ_10152-09-17
📑 ציטוטים מנורמלים מתוך predicted_label==1:
ב"פ_31971-05-13
ב"ש_5273-11-22
בש"פ_10638-08
דנ"פ_10402-07
מ"י_,_שם_דובר_באדם_שנהג_בפסילה,_עצר_בכביש_ותקף_קטין_באגרופים_ונהג_בבריונות_בכביש._הנאשם_נדון_לשנת_מאסר_על_סמך_מתחם_שבין_מספר_חודשי_מאסר_ועד_24_חודשים._מנגד,_לא_מעט_תיקי_הכאת_קטינים_מסתיימים_הענישה_מקילה_מטעמי_שיקום_בעונשי_של"ץ_ועבודות_שירות._בהקשר_זה_מפנה_לעפ"ג_5662-06-10
מ"י_._ביחס_להחזקת_סם_שלא_לצריכה_עצמית_קשה_למצוא_בפסיקת_בית_המשפט_העליון_התייחסות_לנסיבות_דומות._יחד_עם_זאת_ניתן_לגזור_לקולא_מהמקרים_הבאים:_ברע"פ_3262-19_מסעוד_נ'_מ"י_אושר_מתחם_שבין_10_ל-30_חודשי_מאסר_ועונש_של_חמישה_5_מאסר_בפועל_לאדם_שהחזיק_בדירתו_למען_אחר_כ-300_גרם_קנביס_וחשיש_וכן_83_גרם_קוקאין;_ברע"פ_8237-15_בן_זקן_נ'_מ"י_אושר_מתחם_שבין_של"ץ_ומאסר_על-תנאי_ועד_24_חודשים_בנסיבות_של_גידול_וייצור_קנביס_במשקל_8.4_ק"ג._אושר_עונש_של_7_חודשי_מאסר_בפ

In [17]:
import os
import pandas as pd
import re

import os
import pandas as pd

# נתיב לתיקייה עם הקבצים
csv_dir = "/home/liorkob/M.Sc/thesis/data/drugs_3k/gpt/verdicts_tagged_citations"

# מילות מפתח שמעידות על המשך טקסט לא שייך
suspicious_keywords = [
    "הורשע", "הוגש", "צירף", "במסגרת", "גזר", "נדון", "תמורת",
    "בית המשפט", "הנאשם", "תקופה", "שנים", "עבירות", "עבר פלילי", "נסיבות"
]

# תנאי לזיהוי ציטוט בעייתי
def is_suspicious(citation):
    if len(citation.split()) > 7:
        return True
    for kw in suspicious_keywords:
        if kw in citation:
            return True
    return False

# סריקה והדפסה
for filename in os.listdir(csv_dir):
    if not filename.endswith(".csv"):
        continue
    path = os.path.join(csv_dir, filename)
    try:
        df = pd.read_csv(path)
        if 'citation' not in df.columns or 'context_text' not in df.columns:
            continue
        df = df[df['predicted_label'] == 1].dropna(subset=['citation', 'context_text'])

        for _, row in df.iterrows():
            citation = str(row['citation']).strip()
            context_text = str(row['context_text']).strip()
            if is_suspicious(citation):
                print(f"\n📁 {filename}:\n  🧷 {citation}\n  📜 context_text: {context_text}\n")
    except Exception as e:
        continue
        print(f"⚠️ שגיאה בקובץ {filename}: {e}")



📁 תפ_50853-02-18.csv:
  🧷 מ"י_._ברע"פ_7996-12_יוסף_נ'_מ"י_נקבע_מתחם_שבין_7_ל-18_חודשי_מאסר_בנסיבות_של_ארבעה_מקרי_סחר_בחשיש_ובקוקאין_._הנאשם_נדון_ל-21_חודשי_מאסר_אך_גם_בגין_מעשים_נוספים;_ברע"פ_1370-17_בן_נעים_נ'_מ"י_נדון_הנאשם_ל-16_חודשי_מאסר_וחילוט_על_בסיס_מתחם_שבין_12_ל-24_חודשי_מאסר_בריבוי_מקרים_של_סחר_בחשיש_לרבות_סחר_בפלטה._שם_דובר_גם_בסחר_של_למעלה_מק"ג_חשיש,_ולכן_המקרה_חמור_ממקרנו._ברע"פ_2139-16_אברמס_נ'_מ"י_אושר_מתחם_ענישה_שבין_8_ל-27_חודשים_ביחס_לריבוי_מקרים_של_סחר_בחשיש_בכמויות_גדולות_מאוד_ושם_התיק_הסתיים_ב-8_חודשי_מאסר_בפועל._בעפ"ג_10715-02-16
  📜 context_text: אשר לנסיבות ביצוע העבירה – בכל האישומים מדובר בעבירות מתוכננות שבוצעו בתחכום. באישום הראשון ניסה הנאשם להערים על המשטרה בעת שנתפס, ויש בכך מממד של חומרה. בנוסף, בכל האישומים ביצע הנאשם את מעשיו בצוותא בשיטה מתוחכמת באמצעות שימוש באפליקציה שהפכה לייעודית בתחום הסחר. הוא מכר סמים ממניעים כלכליים לכל דיכפין, ולכן מידת הנזק רבה וכך גם הנזק הפוטנציאלי אם היה ממשיך במעשיו ולא נעצר. סוג הסם "הקל" וכמותו, הן נסיבות לקולא.
אשר ל