### UTILS


In [4]:
# pip install stanza
# import stanza
# stanza.download('he') 


In [5]:
import stanza
# nlp = stanza.Pipeline('he', use_gpu=False)
nlp = stanza.Pipeline('he', processors='tokenize', use_gpu=True)

import re
from docx.text.paragraph import Paragraph
from docx import Document

from docx.table import _Cell, Table
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl

import sys
print(sys.executable)

# Modify property of Paragraph.text to include hyperlink text
Paragraph.text = property(lambda self: get_paragraph_text(self))

def get_paragraph_text(paragraph) -> str:
    """
    Extract text from paragraph, including hyperlink text.
    """
    def get_xml_tag(element):
        return "%s:%s" % (element.prefix, re.match("{.*}(.*)", element.tag).group(1))

    text_content = ''
    run_count = 0
    for child in paragraph._p:
        tag = get_xml_tag(child)
        if tag == "w:r":
            text_content += paragraph.runs[run_count].text
            run_count += 1
        if tag == "w:hyperlink":
            for sub_child in child:
                if get_xml_tag(sub_child) == "w:r":
                    text_content += sub_child.text
    return text_content


def is_paragraph_bold(block) -> bool:
    if block.style and block.style.font:
        if block.style.font.bold:  # Check if bold is part of the style
            return True
    return False

def is_block_bold(block) -> bool:
    # Check if the paragraph style indicates a bold style (e.g., "╫Ы╫Х╫к╫и╫к")
    if block.style and block.style.name in ["╫Ы╫Х╫к╫и╫к", "Heading", "Title"]:  
        return True

    # Check if the style font is bold
    if block.style and block.style.font and block.style.font.bold:
        return True

    # # Check if any run is bold
    # if block.runs:
    #     for run in block.runs:
    #         if run.bold or (run.font and run.font.bold):
    #             return True
    return False
def is_run_bold(run) -> bool:
    """
    Check if a run is bold, including inherited and complex script (cs_bold) styles.
    """
    if run.bold is not None:
        return run.bold
    if run.font and run.font.bold is not None:
        return run.font.bold
    if run.font and run.font.cs_bold is not None:
        return run.font.cs_bold  # Check for complex script bold
    return False

def is_block_styled(block) -> bool:
    """
    Check if the entire block/paragraph text is fully bold or fully underlined,
    while handling:
    - Allow the first run to differ in style if it is a prefix (e.g., '╫Р.', '1.', '╫Р)', '1)').
    - Skip empty or non-alphanumeric runs.
    - Allow trailing punctuation with different styling.
    """
    if hasattr(block, "runs") and block.runs:
        # Combine text from all meaningful runs
        combined_text = " ".join(run.text.strip() for run in block.runs if run.text.strip()).strip()
        
        # Handle empty text
        if not combined_text:
            return False
        
        # Check word count
        word_count = len(combined_text.split())
        if word_count < 4:
            # print(combined_text)
            return True  # Return True if there are fewer than 3 words


        # Identify meaningful runs: Ignore runs that are empty or contain only spaces/non-alphanumeric characters
        meaningful_runs = [run for run in block.runs if run.text.strip() and any(c.isalnum() for c in run.text)]

        if not meaningful_runs:
            return False

        # Check if the first run is a prefix (e.g., "╫Р.", "1.", "╫Р)", "1)")
        first_run_text = meaningful_runs[0].text.strip()
        is_prefix = bool(re.match(r'^[\u0590-\u05FF]\.|^[\u0590-\u05FF]\)|^\d+\.|^\d+\)', first_run_text))

        # Allow the first run to differ in style if it's a valid prefix
        runs_to_check = meaningful_runs[1:] if is_prefix else meaningful_runs

        # Check if all remaining runs are styled as bold or underlined
        all_bold =is_block_bold(block) or all(is_run_bold(run) or run.text in [":", ".", ","] for run in runs_to_check)
        all_underlined = all(run.underline is True or run.text in [":", ".", ","] for run in runs_to_check)

        # Allow for trailing punctuation to differ in style
        if combined_text[-1] in [":", ".", ","]:
            return all_bold or all_underlined
        else:
            return is_block_bold(block) or all(is_run_bold(run) or run.underline is True for run in runs_to_check)

    return False



def iterate_block_items(parent):
    """
    Iterate over paragraphs and tables in a document or cell.
    """
    if hasattr(parent, "element") and hasattr(parent.element, "body"):
        parent_element = parent.element.body
    elif hasattr(parent, "_tc"):
        parent_element = parent._tc
    else:
        print(f"Unsupported parent type: {type(parent)}")
        return

    for child in parent_element.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            table = Table(child, parent)
            for row in table.rows:
                for cell in row.cells:
                    yield from iterate_block_items(cell)

def extract_part_after_number_or_hebrew_letter(sentence: str) -> str:
    """
    Extract text following a pattern of number or Hebrew letter.
    """
    pattern = r'^(?:[0-9\u05D0-\u05EA]+)\.\s*(.*)'
    match = re.search(pattern, sentence)
    return match.group(1).strip() if match else sentence

def count_patterns_in_block(block) -> int:
    """
    Count the number-dot or dot-number patterns in a block.
    """
    pattern = r'\s*(?:\.\d+|\d+\.)'
    return len(re.findall(pattern, block.text))

def count_consecutive_blocks_starting_with_number(blocks) -> int:
    """
    Count consecutive blocks starting with a number or Hebrew letter.
    """
    count = 0
    for block in blocks:
        if '╫Ф╫а╫Р╫й╫Э' in block.text:
            return 1
        count += count_patterns_in_block(block)
        if '╫Ч╫з╫Щ╫з╫Ф ╫й╫Р╫Х╫Ц╫Ы╫и╫Ф' in block.text:
            break
    return count

def extract_name_after_word(text: str, word: str) -> str:
    """
    Extract the words following a given word up to the end of the sentence.
    """
    pattern = re.compile(fr'{word}(?:,)?\s*([\u0590-\u05FF\s\'\(\)-]+)')
    match = pattern.search(text)
    return match.group(1) if match else ''

def extract_violations(text: str) -> list:
    """
    Extract violations from the text based on a pre-defined pattern.
    """

    matches = re.findall(r"(?:╫б╫в╫Щ╫г|╫б╫в╫Щ╫д╫Щ╫Э|╫б'|╫б╫в')\s*\d+\s*(?:\([\s\S]*?\))?.*?(?=\s*(?:╫С|╫Ь)(?:╫Ч╫Х╫з|╫д╫з╫Х╫У╫к))\s*(?:╫С|╫Ь)(?:╫Ч╫Х╫з|╫д╫з╫Х╫У╫к)\s*╫Ф?(?:╫в╫Х╫а╫й╫Щ╫Я|╫Ы╫а╫Щ╫б╫Ф ╫Ь╫Щ╫й╫и╫Р╫Ь|╫б╫Ю╫Щ╫Э\s+╫Ф╫Ю╫б╫Х╫Ы╫а╫Щ╫Э|\w+)?", text)
    # matches = re.findall(r"(?:╫б╫в╫Щ╫г|╫б╫в╫Щ╫д╫Щ╫Э|╫б'|╫б╫в')\s*\d+\s*(?:\([\s\S]*?\))?.*?(?=\s*(?:╫С|╫Ь)(?:╫Ч╫Х╫з|╫д╫з╫Х╫У╫к))\s*(?:╫С|╫Ь)(?:╫Ч╫Х╫з|╫д╫з╫Х╫У╫к)\s*╫Ф?(?:╫в╫Х╫а╫й╫Щ╫Я|╫Ы╫а╫Щ╫б╫Ф ╫Ь╫Щ╫й╫и╫Р╫Ь|╫б╫Ю╫Щ╫Э\s+╫Ф╫Ю╫б╫Х╫Ы╫а╫Щ╫Э|[^\[]+)?", text)

    matches = [match.strip() for match in matches]
    return matches

2025-05-27 10:15:31 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  тАж

2025-05-27 10:15:32 INFO: Downloaded file to /home/liorkob/stanza_resources/resources.json
2025-05-27 10:15:32 INFO: Loading these models for language: he (Hebrew):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |

2025-05-27 10:15:32 INFO: Using device: cuda
2025-05-27 10:15:32 INFO: Loading: tokenize
2025-05-27 10:15:57 INFO: Loading: mwt
2025-05-27 10:15:57 INFO: Done loading processors!


/home/liorkob/.conda/envs/new_env/bin/python


### metadata

In [6]:
import os
import re
import unicodedata
import pandas as pd
from docx import Document
import re
from openai import OpenAI

# ========== API Setup ==========
os.environ["OPENAI_API_KEY"] = "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"  
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def normalize_text(text):
    text = unicodedata.normalize("NFKC", text)
    return text.replace("\u00A0", " ").replace("\u200f", "").strip()
def clean_hebrew_verdict_text(text: str) -> str:
    # Remove duplicate phrases (e.g. repeated '╫С╫Щ╫к ╫Ю╫й╫д╫Ш ╫Ф╫й╫Ь╫Х╫Э ╫С╫з╫и╫Щ╫Х╫к')
    parts = list(dict.fromkeys(text.split('<<')))
    cleaned = '<<'.join(parts)

    # Remove nested/multiple angle brackets
    cleaned = re.sub(r'[<]{2,}', '<', cleaned)
    cleaned = re.sub(r'[>]{2,}', '>', cleaned)

    # Remove empty brackets or stray symbols
    cleaned = re.sub(r'<\s*>', '', cleaned)
    cleaned = re.sub(r'[<>]', '', cleaned)

    # Remove escape characters
    cleaned = cleaned.replace("\\'", "'").replace('\\', '')

    # Replace double "╫а╫Т╫У ╫а╫Т╫У" with single
    cleaned = re.sub(r'╫а╫Т╫У\s+╫а╫Т╫У', '╫а╫Т╫У', cleaned)

    # Normalize spaces
    cleaned = re.sub(r'\s{2,}', ' ', cleaned).strip()

    return cleaned

terms= ["╫С╫д╫а╫Щ","╫Ы╫С╫Х╫У","╫Ь╫д╫а╫Щ","╫Ф╫й╫Х╫д╫Ш","╫Ф╫й╫Х╫д╫Ш╫к","╫й╫Х╫д╫Ш","╫й╫Х╫д╫Ш╫к","╫й╫Х╫д╫Ш╫Щ╫Э"]
def extract_judge(docx_path):
    doc = Document(docx_path)
    first_rows = ""
    i = 0
    header = doc.sections[0].header
    head=""
    for paragraph in header.paragraphs:
        head+=paragraph.text

    first_rows+=head

    for block in iterate_block_items(doc):
        if i == 15:
            break
        first_rows += normalize_text(block.text) + " "
    
        if normalize_text(block.text) != "":
            i += 1
    first_rows=clean_hebrew_verdict_text(first_rows)
    # print(first_rows)
    if not any(term in first_rows for term in terms):
        print("\n")
        print(first_rows)

    prompt = (
        "╫Ю╫к╫Х╫Ъ ╫Ф╫з╫Ш╫в ╫Ф╫С╫Р, ╫Ю╫Ф ╫й╫Ю╫Х ╫й╫Ь ╫Ф╫й╫Х╫д╫Ш ╫Р╫Х ╫Ф╫й╫Х╫д╫Ш╫к? "
        "╫в╫а╫Ф ╫и╫з ╫С╫й╫Э ╫Ф╫д╫и╫Ш╫Щ ╫Х╫й╫Э ╫Ф╫Ю╫й╫д╫Ч╫Ф ╫й╫Ь ╫Ф╫й╫Х╫д╫Ш/╫к, ╫С╫Ь╫Щ ╫Ы╫Х╫к╫и╫Х╫к ╫Р╫Х ╫з╫Щ╫У╫Х╫Ю╫Х╫к. "
        "╫Ь╫У╫Х╫Т╫Ю╫Ф: ╫У╫Х╫У ╫Ы╫Ф╫Я\n\n"
        f"{first_rows}"
    )
    response = client.chat.completions.create(
        model="gpt-4.1-mini", 
        messages=[
            {"role": "system", "content": "You are an AI trained to extract factual allegations from legal texts, ensuring no interpretation or rewording."},
            {"role": "user", "content": prompt}
        ]
    )

    return response.choices[0].message.content.strip()





import os
import csv
from tqdm import tqdm
from docx import Document

# Your existing extract_judge function must be defined above this block

output_rows = []
docx_dir = '/home/liorkob/M.Sc/thesis/data/drugs_3k/docx/verdict'
docx_files = [f for f in os.listdir(docx_dir) if f.endswith('.docx')]

for filename in tqdm(docx_files, desc="ЁЯУД Extracting judges"):
    file_path = os.path.join(docx_dir, filename)
    judge_name = extract_judge(file_path)
    print(judge_name)
    output_rows.append({"file name": filename, "judge": judge_name})

# Save to CSV
with open("judges_extracted.csv", "w", newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=["file name", "judge"])
    writer.writeheader()
    writer.writerows(output_rows)


ЁЯУД Extracting judges:   0%|          | 1/3045 [00:00<50:32,  1.00it/s]

╫У╫а╫Щ╫Р╫Ь ╫д╫Щ╫й


ЁЯУД Extracting judges:   0%|          | 2/3045 [00:01<38:57,  1.30it/s]

╫и╫Х╫а╫Щ╫к ╫С╫й


ЁЯУД Extracting judges:   0%|          | 3/3045 [00:02<36:32,  1.39it/s]

╫Р╫С╫Щ ╫Ь╫Х╫Щ


ЁЯУД Extracting judges:   0%|          | 4/3045 [00:02<35:40,  1.42it/s]

╫к╫Ю╫и ╫й╫и╫Х╫Я ╫а╫к╫а╫Р╫Ь


ЁЯУД Extracting judges:   0%|          | 5/3045 [00:03<38:18,  1.32it/s]

╫Р╫Ь╫Щ╫Ф╫Х ╫С╫Щ╫к╫Я


ЁЯУД Extracting judges:   0%|          | 6/3045 [00:04<43:39,  1.16it/s]

╫У╫а╫Ф ╫Ю╫и╫й╫з ╫Ю╫и╫Х╫Э


ЁЯУД Extracting judges:   0%|          | 7/3045 [00:05<43:31,  1.16it/s]

╫Р╫С╫Щ ╫Ь╫Х╫Щ


ЁЯУД Extracting judges:   0%|          | 8/3045 [00:06<45:29,  1.11it/s]

╫в╫Ю╫Щ╫к ╫Т╫Х╫и╫д╫Щ╫а╫з╫Ь


ЁЯУД Extracting judges:   0%|          | 9/3045 [00:07<42:29,  1.19it/s]

╫Т'╫Х╫и╫Т' ╫з╫и╫Р


ЁЯУД Extracting judges:   0%|          | 10/3045 [00:08<41:52,  1.21it/s]

╫в╫Ю╫Щ╫к ╫Ы╫Ф╫Я


ЁЯУД Extracting judges:   0%|          | 11/3045 [00:08<37:46,  1.34it/s]

╫и╫Х╫к ╫Ь╫Х╫и╫Ъ


ЁЯУД Extracting judges:   0%|          | 12/3045 [00:09<37:43,  1.34it/s]

╫Р╫С╫Щ ╫Ь╫Х╫Щ


ЁЯУД Extracting judges:   0%|          | 13/3045 [00:10<38:25,  1.32it/s]

╫У╫а╫Щ╫Р╫Ь ╫Ш╫д╫и╫С╫и╫Т


ЁЯУД Extracting judges:   0%|          | 14/3045 [00:12<56:51,  1.13s/it]

╫Р╫Ю╫Щ╫и ╫Ш╫Х╫С╫Щ


ЁЯУД Extracting judges:   0%|          | 15/3045 [00:12<49:13,  1.03it/s]

╫Р╫Ф╫и╫Х╫Я ╫Ю╫й╫а╫Щ╫Х╫к


ЁЯУД Extracting judges:   1%|          | 16/3045 [00:13<45:29,  1.11it/s]

╫Т╫Щ╫Ь╫Щ╫Ф ╫и╫С╫Щ╫У


ЁЯУД Extracting judges:   1%|          | 17/3045 [00:14<42:12,  1.20it/s]

╫Р╫Ю╫Щ╫и ╫Ш╫Х╫С╫Щ


ЁЯУД Extracting judges:   1%|          | 18/3045 [00:15<43:22,  1.16it/s]

╫Ю╫и╫У╫Ы╫Щ ╫Ь╫Х╫Щ


ЁЯУД Extracting judges:   1%|          | 19/3045 [00:15<39:44,  1.27it/s]

╫Щ╫и╫Х╫Я ╫Ь╫Х╫Щ


ЁЯУД Extracting judges:   1%|          | 20/3045 [00:16<37:21,  1.35it/s]

╫У╫Х╫У ╫и╫Х╫Ц╫Я


ЁЯУД Extracting judges:   1%|          | 21/3045 [00:17<37:00,  1.36it/s]

╫У╫Х╫У ╫и╫Х╫Ц╫Я


ЁЯУД Extracting judges:   1%|          | 22/3045 [00:18<38:53,  1.30it/s]

╫ж╫С╫Щ ╫Т╫Х╫и╫д╫Щ╫а╫з╫Ь


ЁЯУД Extracting judges:   1%|          | 22/3045 [00:18<41:55,  1.20it/s]


KeyboardInterrupt: 

In [7]:
import pandas as pd
judges_df = pd.read_csv("judges_extracted.csv")  # columns: file name, judge
judges_df["verdict_id"] = judges_df["file name"].str.replace(".docx", "", regex=False)

def normalize_citation(name):
    if pd.isna(name):
        return ""
    name = str(name)
    name = re.sub(r"\(.*?\)", "", name)
    name = re.sub(r"[тИХ/\\]", "-", name)
    name = re.sub(r"\s+", " ", name)
    name = name.strip().lower().replace(" ", "_")
    return name

import os
from collections import defaultdict

citations_by_judge = defaultdict(list)
csv_dir = "/home/liorkob/M.Sc/thesis/data/drugs_3k/gpt/verdicts_tagged_citations"  # ╫Р╫Щ╫д╫Ф ╫й╫з╫С╫ж╫Щ ╫Ф╫ж╫Щ╫Ш╫Х╫Ш╫Щ╫Э

for _, row in judges_df.iterrows():
    judge = row["judge"]
    verdict_id = row["verdict_id"]
    csv_path = os.path.join(csv_dir, f"{verdict_id}.csv")
    
    if not os.path.exists(csv_path):
        continue

    try:
        df = pd.read_csv(csv_path)
    except Exception as e:
        print(f"тЪая╕П Skipped {verdict_id}.csv due to error: {e}")
        continue

    if df.empty or "predicted_label" not in df.columns or "citation" not in df.columns:
        continue

    cited = df[df["predicted_label"] == 1]["citation"].dropna().map(normalize_citation)
    if not cited.empty:
        citations_by_judge[judge].extend(cited)


import matplotlib.pyplot as plt
from collections import Counter

for judge, citations in citations_by_judge.items():
    counter = Counter(citations)
    if not counter:
        continue
    top = counter.most_common(10)
    labels, values = zip(*top)



тЪая╕П Skipped ╫к╫д_26792-04-16.csv due to error: No columns to parse from file
тЪая╕П Skipped ╫к╫д_17316-10-16.csv due to error: No columns to parse from file
тЪая╕П Skipped ╫к╫д_55925-11-17.csv due to error: No columns to parse from file
тЪая╕П Skipped ╫к╫д_13236-03-18.csv due to error: No columns to parse from file
тЪая╕П Skipped ╫к╫д_39573-10-18.csv due to error: No columns to parse from file
тЪая╕П Skipped ╫к╫д_7626-04-20.csv due to error: No columns to parse from file
тЪая╕П Skipped ╫к╫д_43456-12-20.csv due to error: No columns to parse from file
тЪая╕П Skipped ╫к╫д_11879-01-21.csv due to error: No columns to parse from file
тЪая╕П Skipped ╫к╫д_6574-09-21.csv due to error: No columns to parse from file
тЪая╕П Skipped ╫к╫д_48194-05-22.csv due to error: No columns to parse from file
тЪая╕П Skipped ╫к╫д_73351-01-23.csv due to error: No columns to parse from file
тЪая╕П Skipped ╫к╫д_58193-03-23.csv due to error: No columns to parse from file
тЪая╕П Skipped ╫к╫д_13504-12-09.csv due to

In [8]:
plt.figure(figsize=(10, 6), constrained_layout=True)
labels = [label[:25] + "..." if len(label) > 25 else label for label in labels]
plt.barh(labels, values)
plt.xlabel("Number of citations")
plt.title(f"Top Cited Verdicts by Judge: {judge}", fontsize=12)
plt.gca().invert_yaxis()
plt.savefig(f"judge_{judge.replace(' ', '_')}_citations.png")
plt.close()


In [12]:
import os
import pandas as pd
from collections import defaultdict

# ╫Ш╫в╫Я ╫Р╫к ╫Ш╫С╫Ь╫к ╫Ф╫й╫Х╫д╫Ш╫Щ╫Э
judges_df = pd.read_csv("judges_extracted.csv")
judges_df["verdict_id"] = judges_df["file name"].str.replace(".docx", "", regex=False)

csv_dir = "/home/liorkob/M.Sc/thesis/data/drugs_3k/gpt/verdicts_tagged_citations"  # ╫Р╫Щ╫д╫Ф ╫й╫з╫С╫ж╫Щ ╫Ф╫ж╫Щ╫Ш╫Х╫Ш╫Щ╫Э

# ╫ж╫С╫Х╫и ╫а╫к╫Х╫а╫Щ╫Э ╫Ь╫Ы╫Ь ╫й╫Х╫д╫Ш
stats = defaultdict(lambda: {"cases": 0, "citations": 0})

for _, row in judges_df.iterrows():
    judge = row["judge"]
    verdict_id = row["verdict_id"]
    csv_path = os.path.join(csv_dir, f"{verdict_id}.csv")

    if not os.path.exists(csv_path):
        continue

    try:
        df = pd.read_csv(csv_path)
    except:
        continue

    if df.empty or "predicted_label" not in df.columns:
        continue

    citation_count = df["predicted_label"].fillna(0).astype(int).sum()
    stats[judge]["cases"] += 1
    stats[judge]["citations"] += citation_count


In [13]:
print("ЁЯзСтАНтЪЦя╕П Judge Stats:")
print(f"{'Judge':<30} {'Cases':<10} {'Citations':<10}")
print("-" * 50)
for judge, data in stats.items():
    print(f"{judge:<30} {data['cases']:<10} {data['citations']:<10}")


ЁЯзСтАНтЪЦя╕П Judge Stats:
Judge                          Cases      Citations 
--------------------------------------------------
╫У╫а╫Щ╫Р╫Ь ╫д╫Щ╫й                      4          2         
╫и╫Х╫а╫Щ╫к ╫С╫й                       12         87        
╫Р╫С╫Щ ╫Ь╫Х╫Щ                        16         63        
╫к╫Ю╫и ╫й╫и╫Х╫Я ╫а╫к╫а╫Р╫Ь                 7          18        
╫в╫Ю╫Щ╫к ╫Ы╫Ф╫Я                       2          22        
╫и╫Х╫к ╫Ь╫Х╫и╫Ъ                       3          15        
╫У╫а╫Щ╫Р╫Ь ╫Ш╫д╫и╫С╫и╫Т                   2          3         
╫Р╫Ю╫Щ╫и ╫Ш╫Х╫С╫Щ                      12         28        
╫Ю╫и╫У╫Ы╫Щ ╫Ь╫Х╫Щ                      4          6         
╫Щ╫и╫Х╫Я ╫Ь╫Х╫Щ                       6          35        
╫У╫Х╫У ╫и╫Х╫Ц╫Я                       3          13        
╫У╫а╫Ф ╫Ю╫и╫й╫з ╫Ю╫и╫Х╫Э                  15         91        
╫Р╫С╫и╫Ф╫Э ╫Р╫Ь╫Щ╫з╫Щ╫Э                   5          20        
╫Р╫С╫и╫Ф╫Э ╫и╫Х╫С╫Щ╫Я                    8          46        
╫С╫а╫Щ 

In [22]:
# ╫Ф╫Ю╫и╫Ф ╫Ь-DataFrame
stats_df = pd.DataFrame([
    {"judge": judge, "cases": data["cases"], "citations": data["citations"]}
    for judge, data in stats.items()
])

# ╫б╫Щ╫а╫Х╫Я ╫Ь-15 ╫й╫Х╫д╫Ш╫Щ╫Э ╫в╫Э ╫Ф╫Ы╫Щ ╫Ф╫и╫С╫Ф ╫ж╫Щ╫Ш╫Х╫Ш╫Щ╫Э (╫Р╫Х ╫к╫Щ╫з╫Щ╫Э)
top_judges = stats_df.sort_values(by=["citations", "cases"], ascending=False).head(15)
from collections import Counter
import re


all_top_citations = []

for judge in top_judges["judge"]:
    verdict_ids = judges_df[judges_df["judge"] == judge]["file name"].str.replace(".docx", "", regex=False)

    for verdict_id in verdict_ids:
        csv_path = os.path.join(csv_dir, f"{verdict_id}.csv")
        if not os.path.exists(csv_path):
            continue

        try:
            df = pd.read_csv(csv_path)
        except:
            continue

        if df.empty or "predicted_label" not in df.columns or "citation" not in df.columns:
            continue

        cited = df[df["predicted_label"] == 1]["citation"].dropna().map(normalize_citation)
        all_top_citations.extend(cited)
top_cited = Counter(all_top_citations).most_common(5)

print("ЁЯПЖ Top 5 Most Cited Verdicts among Top 15 Judges:")
print(f"{'Citation':<20} {'Count':<10}")
print("-" * 35)
for cit, count in top_cited:
    print(f"{cit:<20} {count:<10}")


ЁЯПЖ Top 5 Most Cited Verdicts among Top 15 Judges:
Citation             Count     
-----------------------------------
╫в"╫д_7319-08-12       30        
╫в"╫д_5807-17          27        
╫в╫д"╫Т_31347-08-14     23        
╫к"╫д_54706-01-13      22        
╫в╫д"╫Т_28110-10-15     22        


In [23]:
print("ЁЯзСтАНтЪЦя╕П Top Cited Verdict per Judge:")
print(f"{'Judge':<30} {'Top Citation':<20} {'Count':<10}")
print("-" * 65)

for judge in top_judges["judge"]:
    verdict_ids = judges_df[judges_df["judge"] == judge]["file name"].str.replace(".docx", "", regex=False)
    judge_citations = []

    for verdict_id in verdict_ids:
        csv_path = os.path.join(csv_dir, f"{verdict_id}.csv")
        if not os.path.exists(csv_path):
            continue

        try:
            df = pd.read_csv(csv_path)
        except:
            continue

        if df.empty or "predicted_label" not in df.columns or "citation" not in df.columns:
            continue

        cited = df[df["predicted_label"] == 1]["citation"].dropna().map(normalize_citation)
        judge_citations.extend(cited)

    if judge_citations:
        top_cit, count = Counter(judge_citations).most_common(1)[0]
        print(f"{judge:<30} {top_cit:<20} {count:<10}")
    else:
        print(f"{judge:<30} {'-':<20} {'0':<10}")


ЁЯзСтАНтЪЦя╕П Top Cited Verdict per Judge:
Judge                          Top Citation         Count     
-----------------------------------------------------------------
╫в╫Ю╫Щ ╫з╫Х╫С╫Х                       ╫в"╫д_8988-16          9         
╫Ю╫и╫С ╫Т╫и╫Щ╫а╫С╫и╫Т                    ╫в"╫д_5807-17          12        
╫и╫Х╫к ╫й╫д╫Щ╫Ь╫С╫и╫Т ╫Ы╫Ф╫Я                ╫к"╫д_39589-07-13      14        
╫С╫а╫Щ ╫й╫Т╫Щ╫Р                       ╫к"╫д_6090-10-19       8         
╫а╫Х╫в╫Ф ╫Ч╫з╫Ь╫Р╫Щ                     ╫к"╫д_16926-04-16      22        
╫У╫Х╫У ╫й╫Р╫Х╫Ь ╫Т╫С╫Р╫Щ ╫и╫Щ╫Ы╫Ш╫и            ╫в╫д"╫Т_13953-09-19     14        
╫Р╫д╫и╫к ╫д╫Щ╫а╫з                      ╫к"╫д_40639-10-17      7         
╫Щ╫Х╫б╫Щ ╫Ш╫Х╫и╫б                      ╫в╫д"╫Т_62171-05-17     17        
╫Р╫и╫а╫Х╫Я ╫Р╫Щ╫к╫Я                     ╫к"╫д_726-01-14        12        
╫й╫Х╫й ╫й╫Ш╫и╫Щ╫к                      ╫и╫в"╫д_1830-16         9         
╫Р╫Щ╫Ь╫Ф ╫Р╫Х╫и╫Я                      ╫и╫в"╫д_3059-21         6         
╫б╫Щ╫Ю

In [24]:
print("ЁЯзСтАНтЪЦя╕П Top Cited Verdict per Judge + Percentage:")
print(f"{'Judge':<30} {'Top Citation':<20} {'Count':<10} {'% of All':<10}")
print("-" * 80)

for judge in top_judges["judge"]:
# for judge in judges_df["judge"].dropna().unique():

    verdict_ids = judges_df[judges_df["judge"] == judge]["file name"].str.replace(".docx", "", regex=False)
    judge_citations = []

    for verdict_id in verdict_ids:
        csv_path = os.path.join(csv_dir, f"{verdict_id}.csv")
        if not os.path.exists(csv_path):
            continue

        try:
            df = pd.read_csv(csv_path)
        except:
            continue

        if df.empty or "predicted_label" not in df.columns or "citation" not in df.columns:
            continue

        cited = df[df["predicted_label"] == 1]["citation"].dropna().map(normalize_citation)
        judge_citations.extend(cited)

    if judge_citations:
        counter = Counter(judge_citations)
        top_cit, count = counter.most_common(1)[0]
        total = sum(counter.values())
        percent = (count / total) * 100
        print(f"{judge:<30} {top_cit:<20} {count:<10} {percent:>6.1f}%")
    else:
        print(f"{judge:<30} {'-':<20} {'0':<10} {'0.0%':<10}")


ЁЯзСтАНтЪЦя╕П Top Cited Verdict per Judge + Percentage:
Judge                          Top Citation         Count      % of All  
--------------------------------------------------------------------------------
╫в╫Ю╫Щ ╫з╫Х╫С╫Х                       ╫в"╫д_8988-16          9             1.2%
╫Ю╫и╫С ╫Т╫и╫Щ╫а╫С╫и╫Т                    ╫в"╫д_5807-17          12            3.2%
╫и╫Х╫к ╫й╫д╫Щ╫Ь╫С╫и╫Т ╫Ы╫Ф╫Я                ╫к"╫д_39589-07-13      14            3.9%
╫С╫а╫Щ ╫й╫Т╫Щ╫Р                       ╫к"╫д_6090-10-19       8             2.3%
╫а╫Х╫в╫Ф ╫Ч╫з╫Ь╫Р╫Щ                     ╫к"╫д_16926-04-16      22            6.3%
╫У╫Х╫У ╫й╫Р╫Х╫Ь ╫Т╫С╫Р╫Щ ╫и╫Щ╫Ы╫Ш╫и            ╫в╫д"╫Т_13953-09-19     14            4.2%
╫Р╫д╫и╫к ╫д╫Щ╫а╫з                      ╫к"╫д_40639-10-17      7             2.2%
╫Щ╫Х╫б╫Щ ╫Ш╫Х╫и╫б                      ╫в╫д"╫Т_62171-05-17     17            5.6%
╫Р╫и╫а╫Х╫Я ╫Р╫Щ╫к╫Я                     ╫к"╫д_726-01-14        12            4.8%
╫й╫Х╫й ╫й╫Ш╫и╫Щ╫к                      ╫и╫в

In [25]:
from collections import Counter
import re

def normalize_citation(c):
    c = c.replace("╫к\"╫д", "").replace(" ", "")
    c = re.sub(r'[^0-9\-_/]', '', c)
    return c.strip("_-/")

print("ЁЯУК Top 5 Cited Verdicts per Top Judge (with %):\n")

for judge in top_judges["judge"]:
    verdict_ids = judges_df[judges_df["judge"] == judge]["file name"].str.replace(".docx", "", regex=False)
    judge_citations = []

    for verdict_id in verdict_ids:
        csv_path = os.path.join(csv_dir, f"{verdict_id}.csv")
        if not os.path.exists(csv_path):
            continue

        try:
            df = pd.read_csv(csv_path)
        except:
            continue

        if df.empty or "predicted_label" not in df.columns or "citation" not in df.columns:
            continue

        cited = df[df["predicted_label"] == 1]["citation"].dropna().map(normalize_citation)
        judge_citations.extend(cited)

    print(f"ЁЯСитАНтЪЦя╕П Judge: {judge}")
    if not judge_citations:
        print("   (No citations found)\n")
        continue

    counter = Counter(judge_citations)
    total = sum(counter.values())

    for cit, count in counter.most_common(5):
        percent = (count / total) * 100
        print(f"   тАв {cit:<20} {count:<5} ({percent:.1f}%)")
    print()


ЁЯУК Top 5 Cited Verdicts per Top Judge (with %):

ЁЯСитАНтЪЦя╕П Judge: ╫в╫Ю╫Щ ╫з╫Х╫С╫Х
   тАв 8988-16              9     (1.2%)
   тАв 5093-17              8     (1.0%)
   тАв 30876-03-17          7     (0.9%)
   тАв 4008-11              6     (0.8%)
   тАв 5813-14              6     (0.8%)

ЁЯСитАНтЪЦя╕П Judge: ╫Ю╫и╫С ╫Т╫и╫Щ╫а╫С╫и╫Т
   тАв 5807-17              12    (3.2%)
   тАв 126-22               11    (3.0%)
   тАв 871-20               6     (1.6%)
   тАв 2596-18              6     (1.6%)
   тАв 2518-16              5     (1.3%)

ЁЯСитАНтЪЦя╕П Judge: ╫и╫Х╫к ╫й╫д╫Щ╫Ь╫С╫и╫Т ╫Ы╫Ф╫Я
   тАв 39589-07-13          14    (3.9%)
   тАв 31724-08-12          13    (3.7%)
   тАв 31347-08-14          13    (3.7%)
   тАв 48125-05-11          13    (3.7%)
   тАв 54706-01-13          13    (3.7%)

ЁЯСитАНтЪЦя╕П Judge: ╫С╫а╫Щ ╫й╫Т╫Щ╫Р
   тАв 6090-10-19           8     (2.3%)
   тАв 8820-14              7     (2.0%)
   тАв 2279-15              6     (1.7%)
   тАв 4592-15              6     (1.7%)


In [28]:
from collections import defaultdict, Counter
import os
import pandas as pd
import re

def normalize_citation(c):
    c = c.replace("╫к\"╫д", "").replace(" ", "")
    c = re.sub(r'[^0-9\-_/]', '', c)
    return c.strip("_-/")

print("ЁЯУК Top 5 Most Cited Verdicts (by unique verdicts) Per Judge:\n")
for judge in top_judges["judge"]:

# for judge in judges_df["judge"].dropna().unique():
    verdict_ids = judges_df[judges_df["judge"] == judge]["file name"].str.replace(".docx", "", regex=False)

    # ╫ж╫Щ╫Ш╫Х╫Ш╫Щ╫Э ╫Щ╫Щ╫Ч╫Х╫У╫Щ╫Щ╫Э ╫Ь╫Ы╫Ь ╫к╫Щ╫з ╫й╫Ф╫й╫Х╫д╫Ш ╫Ы╫к╫С
    citation_to_verdicts = defaultdict(set)
    all_unique_citations = set()

    for verdict_id in verdict_ids:
        csv_path = os.path.join(csv_dir, f"{verdict_id}.csv")
        if not os.path.exists(csv_path):
            continue

        try:
            df = pd.read_csv(csv_path)
        except:
            continue

        if df.empty or "predicted_label" not in df.columns or "citation" not in df.columns:
            continue

        cited = df[df["predicted_label"] == 1]["citation"].dropna().map(normalize_citation)
        unique_citations = set(cited)

        for cit in unique_citations:
            citation_to_verdicts[cit].add(verdict_id)
        all_unique_citations.update(unique_citations)

    if not citation_to_verdicts:
        continue

    counter = {cit: len(vids) for cit, vids in citation_to_verdicts.items()}
    top5 = Counter(counter).most_common(5)
    total = len(all_unique_citations)

    print(f"ЁЯСитАНтЪЦя╕П Judge: {judge}")
    for cit, count in top5:
        percent = (count / total) * 100
        print(f"   тАв {cit:<20} in {count:<3} verdicts ({percent:.1f}%)")
    print()


ЁЯУК Top 5 Most Cited Verdicts (by unique verdicts) Per Judge:

ЁЯСитАНтЪЦя╕П Judge: ╫в╫Ю╫Щ ╫з╫Х╫С╫Х
   тАв 8988-16              in 8   verdicts (1.9%)
   тАв 5093-17              in 7   verdicts (1.7%)
   тАв 56230-11-15          in 6   verdicts (1.4%)
   тАв 4008-11              in 6   verdicts (1.4%)
   тАв 5813-14              in 6   verdicts (1.4%)

ЁЯСитАНтЪЦя╕П Judge: ╫Ю╫и╫С ╫Т╫и╫Щ╫а╫С╫и╫Т
   тАв 5807-17              in 7   verdicts (3.6%)
   тАв 871-20               in 6   verdicts (3.0%)
   тАв 3398-22              in 5   verdicts (2.5%)
   тАв 25458-11-18          in 5   verdicts (2.5%)
   тАв 6161-16              in 5   verdicts (2.5%)

ЁЯСитАНтЪЦя╕П Judge: ╫и╫Х╫к ╫й╫д╫Щ╫Ь╫С╫и╫Т ╫Ы╫Ф╫Я
   тАв 39589-07-13          in 14  verdicts (7.9%)
   тАв 31347-08-14          in 13  verdicts (7.3%)
   тАв 48125-05-11          in 13  verdicts (7.3%)
   тАв 31724-08-12          in 13  verdicts (7.3%)
   тАв 21605-07-13          in 13  verdicts (7.3%)

ЁЯСитАНтЪЦя╕П Judge: ╫С╫а╫Щ ╫й╫Т╫Щ╫Р
 

### verdict citation in db existance

In [None]:
import os
import pandas as pd
import re

# === ╫д╫Х╫а╫з╫ж╫Щ╫Щ╫к ╫а╫Щ╫и╫Ю╫Х╫Ь ===
def normalize_case_name_2(name):
    if pd.isna(name):
        return ""
    name = str(name)
    name = re.sub(r"\(.*?\)", "", name)
    name = re.sub(r"[тИХ/\\]", "-", name)
    name = re.sub(r"\s+", " ", name)
    name = name.strip().lower().replace(" ", "_")
    return name

# === ╫Ф╫а╫к╫Щ╫С ╫Ь╫к╫Щ╫з╫Щ╫Щ╫Ф ===
csv_dir = "/home/liorkob/M.Sc/thesis/data/drugs_3k/gpt/verdicts_tagged_citations"

# ╫й╫Ю╫Х╫к ╫з╫С╫ж╫Щ╫Э ╫Ь╫Р╫Ч╫и ╫а╫Щ╫и╫Ю╫Х╫Ь
existing_files = {
    normalize_case_name_2(os.path.splitext(f)[0])
    for f in os.listdir(csv_dir)
    if f.endswith('.csv')
}

# ╫й╫Ю╫Х╫к ╫ж╫Щ╫Ш╫Х╫Ш╫Щ╫Э ╫в╫Э predicted_label == 1
cited_files = set()

for filename in os.listdir(csv_dir):
    if not filename.endswith('.csv'):
        continue
    filepath = os.path.join(csv_dir, filename)
    try:
        df = pd.read_csv(filepath)
        cited_with_label_1 = df[df['predicted_label'] == 1]['citation']
        normalized_citations = cited_with_label_1.dropna().apply(normalize_case_name_2)
        cited_files.update(normalized_citations)
    except Exception as e:
        print(f"тЪая╕П ╫С╫в╫Щ╫Ф ╫С╫з╫и╫Щ╫Р╫к {filename}: {e}")

# ╫Ф╫ж╫Ь╫С╫Ф ╫С╫Щ╫Я ╫ж╫Щ╫Ш╫Х╫Ш╫Щ╫Э ╫Ь╫С╫Щ╫Я ╫з╫С╫ж╫Щ╫Э ╫з╫Щ╫Щ╫Ю╫Щ╫Э
existing_cited_files = cited_files.intersection(existing_files)

# ╫Ф╫У╫д╫б╫к ╫к╫Х╫ж╫Р╫Х╫к
print(f"ЁЯФО ╫Ю╫б╫д╫и ╫к╫Щ╫з╫Щ╫Э ╫й╫Ю╫ж╫Х╫Ш╫Ш╫Щ╫Э ╫в╫Э predicted_label == 1: {len(cited_files)}")
print(f"тЬЕ ╫Ю╫к╫Х╫Ы╫Э ╫з╫Щ╫Щ╫Ю╫Щ╫Э ╫С╫к╫Щ╫з╫Щ╫Щ╫Ф: {len(existing_cited_files)}")
print("ЁЯУД ╫и╫й╫Щ╫Ю╫к ╫к╫Щ╫з╫Щ╫Э ╫з╫Щ╫Щ╫Ю╫Щ╫Э:")
for fname in sorted(existing_cited_files):
    print(fname)


тЪая╕П ╫С╫в╫Щ╫Ф ╫С╫з╫и╫Щ╫Р╫к ╫к╫д_41730-12-19.csv: No columns to parse from file
тЪая╕П ╫С╫в╫Щ╫Ф ╫С╫з╫и╫Щ╫Р╫к ╫к╫д_53452-10-21.csv: No columns to parse from file
тЪая╕П ╫С╫в╫Щ╫Ф ╫С╫з╫и╫Щ╫Р╫к ╫к╫д_11145-01-14.csv: No columns to parse from file
тЪая╕П ╫С╫в╫Щ╫Ф ╫С╫з╫и╫Щ╫Р╫к ╫к╫д_35907-11-14.csv: No columns to parse from file
тЪая╕П ╫С╫в╫Щ╫Ф ╫С╫з╫и╫Щ╫Р╫к ╫к╫д_66195-07-19.csv: No columns to parse from file
тЪая╕П ╫С╫в╫Щ╫Ф ╫С╫з╫и╫Щ╫Р╫к ╫к╫д_19425-03-21.csv: No columns to parse from file
тЪая╕П ╫С╫в╫Щ╫Ф ╫С╫з╫и╫Щ╫Р╫к ╫к╫д_45232-07-18.csv: No columns to parse from file
тЪая╕П ╫С╫в╫Щ╫Ф ╫С╫з╫и╫Щ╫Р╫к ╫к╫д_46823-08-19.csv: No columns to parse from file
тЪая╕П ╫С╫в╫Щ╫Ф ╫С╫з╫и╫Щ╫Р╫к ╫к╫д_48194-05-22.csv: No columns to parse from file
тЪая╕П ╫С╫в╫Щ╫Ф ╫С╫з╫и╫Щ╫Р╫к ╫к╫д_17162-05-21.csv: No columns to parse from file
тЪая╕П ╫С╫в╫Щ╫Ф ╫С╫з╫и╫Щ╫Р╫к ╫к╫д_26792-04-16.csv: No columns to parse from file
тЪая╕П ╫С╫в╫Щ╫Ф ╫С╫з╫и╫Щ╫Р╫к ╫к╫д_43840-06-22.csv: No columns to parse from file
тЪая╕П ╫С╫в╫Щ╫Ф ╫С╫з╫и╫Щ╫Р╫к

In [9]:
print("ЁЯУВ ╫з╫С╫ж╫Щ╫Э ╫з╫Щ╫Щ╫Ю╫Щ╫Э ╫Ь╫Р╫Ч╫и ╫а╫Щ╫и╫Ю╫Х╫Ь:")
for f in sorted(existing_files)[:10]:
    print(f)
print("ЁЯУС ╫ж╫Щ╫Ш╫Х╫Ш╫Щ╫Э ╫Ю╫а╫Х╫и╫Ю╫Ь╫Щ╫Э ╫Ю╫к╫Х╫Ъ predicted_label==1:")
for f in sorted(list(cited_files))[:10]:
    print(f)
print("ЁЯФН ╫Ф╫й╫Х╫Х╫Р╫Ф ╫С╫Щ╫Я ╫Ю╫з╫Х╫и ╫Ь╫Ю╫а╫Х╫и╫Ю╫Ь:")
df_sample = df[df['predicted_label'] == 1].dropna(subset=['citation']).head(10)
for raw in df_sample['citation']:
    norm = normalize_case_name_2(raw)
    print(f"{raw}  -->  {norm}")


ЁЯУВ ╫з╫С╫ж╫Щ╫Э ╫з╫Щ╫Щ╫Ю╫Щ╫Э ╫Ь╫Р╫Ч╫и ╫а╫Щ╫и╫Ю╫Х╫Ь:
╫к╫д_10003-05-16
╫к╫д_10029-03-21
╫к╫д_10054-09-19
╫к╫д_10086-06-18
╫к╫д_10095-08-22
╫к╫д_10096-02-22
╫к╫д_10108-11-21
╫к╫д_10111-08-17
╫к╫д_10140-10-19
╫к╫д_10152-09-17
ЁЯУС ╫ж╫Щ╫Ш╫Х╫Ш╫Щ╫Э ╫Ю╫а╫Х╫и╫Ю╫Ь╫Щ╫Э ╫Ю╫к╫Х╫Ъ predicted_label==1:
╫С"╫д_31971-05-13
╫С"╫й_5273-11-22
╫С╫й"╫д_10638-08
╫У╫а"╫д_10402-07
╫Ю"╫Щ_,_╫й╫Э_╫У╫Х╫С╫и_╫С╫Р╫У╫Э_╫й╫а╫Ф╫Т_╫С╫д╫б╫Щ╫Ь╫Ф,_╫в╫ж╫и_╫С╫Ы╫С╫Щ╫й_╫Х╫к╫з╫г_╫з╫Ш╫Щ╫Я_╫С╫Р╫Т╫и╫Х╫д╫Щ╫Э_╫Х╫а╫Ф╫Т_╫С╫С╫и╫Щ╫Х╫а╫Х╫к_╫С╫Ы╫С╫Щ╫й._╫Ф╫а╫Р╫й╫Э_╫а╫У╫Х╫Я_╫Ь╫й╫а╫к_╫Ю╫Р╫б╫и_╫в╫Ь_╫б╫Ю╫Ъ_╫Ю╫к╫Ч╫Э_╫й╫С╫Щ╫Я_╫Ю╫б╫д╫и_╫Ч╫Х╫У╫й╫Щ_╫Ю╫Р╫б╫и_╫Х╫в╫У_24_╫Ч╫Х╫У╫й╫Щ╫Э._╫Ю╫а╫Т╫У,_╫Ь╫Р_╫Ю╫в╫Ш_╫к╫Щ╫з╫Щ_╫Ф╫Ы╫Р╫к_╫з╫Ш╫Щ╫а╫Щ╫Э_╫Ю╫б╫к╫Щ╫Щ╫Ю╫Щ╫Э_╫Ф╫в╫а╫Щ╫й╫Ф_╫Ю╫з╫Щ╫Ь╫Ф_╫Ю╫Ш╫в╫Ю╫Щ_╫й╫Щ╫з╫Х╫Э_╫С╫в╫Х╫а╫й╫Щ_╫й╫Ь"╫е_╫Х╫в╫С╫Х╫У╫Х╫к_╫й╫Щ╫и╫Х╫к._╫С╫Ф╫з╫й╫и_╫Ц╫Ф_╫Ю╫д╫а╫Ф_╫Ь╫в╫д"╫Т_5662-06-10
╫Ю"╫Щ_._╫С╫Щ╫Ч╫б_╫Ь╫Ф╫Ч╫Ц╫з╫к_╫б╫Э_╫й╫Ь╫Р_╫Ь╫ж╫и╫Щ╫Ы╫Ф_╫в╫ж╫Ю╫Щ╫к_╫з╫й╫Ф_╫Ь╫Ю╫ж╫Х╫Р_╫С╫д╫б╫Щ╫з╫к_╫С╫Щ╫к_╫Ф╫Ю╫й╫д╫Ш_╫Ф╫в╫Ь╫Щ╫Х╫Я_╫Ф╫к╫Щ╫Щ╫Ч╫б╫Х╫к_╫Ь╫а╫б╫Щ╫С╫Х╫к_╫У╫Х╫Ю╫

In [17]:
import os
import pandas as pd
import re

import os
import pandas as pd

# ╫а╫к╫Щ╫С ╫Ь╫к╫Щ╫з╫Щ╫Щ╫Ф ╫в╫Э ╫Ф╫з╫С╫ж╫Щ╫Э
csv_dir = "/home/liorkob/M.Sc/thesis/data/drugs_3k/gpt/verdicts_tagged_citations"

# ╫Ю╫Щ╫Ь╫Х╫к ╫Ю╫д╫к╫Ч ╫й╫Ю╫в╫Щ╫У╫Х╫к ╫в╫Ь ╫Ф╫Ю╫й╫Ъ ╫Ш╫з╫б╫Ш ╫Ь╫Р ╫й╫Щ╫Щ╫Ъ
suspicious_keywords = [
    "╫Ф╫Х╫и╫й╫в", "╫Ф╫Х╫Т╫й", "╫ж╫Щ╫и╫г", "╫С╫Ю╫б╫Т╫и╫к", "╫Т╫Ц╫и", "╫а╫У╫Х╫Я", "╫к╫Ю╫Х╫и╫к",
    "╫С╫Щ╫к ╫Ф╫Ю╫й╫д╫Ш", "╫Ф╫а╫Р╫й╫Э", "╫к╫з╫Х╫д╫Ф", "╫й╫а╫Щ╫Э", "╫в╫С╫Щ╫и╫Х╫к", "╫в╫С╫и ╫д╫Ь╫Щ╫Ь╫Щ", "╫а╫б╫Щ╫С╫Х╫к"
]

# ╫к╫а╫Р╫Щ ╫Ь╫Ц╫Щ╫Ф╫Х╫Щ ╫ж╫Щ╫Ш╫Х╫Ш ╫С╫в╫Щ╫Щ╫к╫Щ
def is_suspicious(citation):
    if len(citation.split()) > 7:
        return True
    for kw in suspicious_keywords:
        if kw in citation:
            return True
    return False

# ╫б╫и╫Щ╫з╫Ф ╫Х╫Ф╫У╫д╫б╫Ф
for filename in os.listdir(csv_dir):
    if not filename.endswith(".csv"):
        continue
    path = os.path.join(csv_dir, filename)
    try:
        df = pd.read_csv(path)
        if 'citation' not in df.columns or 'context_text' not in df.columns:
            continue
        df = df[df['predicted_label'] == 1].dropna(subset=['citation', 'context_text'])

        for _, row in df.iterrows():
            citation = str(row['citation']).strip()
            context_text = str(row['context_text']).strip()
            if is_suspicious(citation):
                print(f"\nЁЯУБ {filename}:\n  ЁЯз╖ {citation}\n  ЁЯУЬ context_text: {context_text}\n")
    except Exception as e:
        continue
        print(f"тЪая╕П ╫й╫Т╫Щ╫Р╫Ф ╫С╫з╫Х╫С╫е {filename}: {e}")



ЁЯУБ ╫к╫д_50853-02-18.csv:
  ЁЯз╖ ╫Ю"╫Щ_._╫С╫и╫в"╫д_7996-12_╫Щ╫Х╫б╫г_╫а'_╫Ю"╫Щ_╫а╫з╫С╫в_╫Ю╫к╫Ч╫Э_╫й╫С╫Щ╫Я_7_╫Ь-18_╫Ч╫Х╫У╫й╫Щ_╫Ю╫Р╫б╫и_╫С╫а╫б╫Щ╫С╫Х╫к_╫й╫Ь_╫Р╫и╫С╫в╫Ф_╫Ю╫з╫и╫Щ_╫б╫Ч╫и_╫С╫Ч╫й╫Щ╫й_╫Х╫С╫з╫Х╫з╫Р╫Щ╫Я_._╫Ф╫а╫Р╫й╫Э_╫а╫У╫Х╫Я_╫Ь-21_╫Ч╫Х╫У╫й╫Щ_╫Ю╫Р╫б╫и_╫Р╫Ъ_╫Т╫Э_╫С╫Т╫Щ╫Я_╫Ю╫в╫й╫Щ╫Э_╫а╫Х╫б╫д╫Щ╫Э;_╫С╫и╫в"╫д_1370-17_╫С╫Я_╫а╫в╫Щ╫Э_╫а'_╫Ю"╫Щ_╫а╫У╫Х╫Я_╫Ф╫а╫Р╫й╫Э_╫Ь-16_╫Ч╫Х╫У╫й╫Щ_╫Ю╫Р╫б╫и_╫Х╫Ч╫Щ╫Ь╫Х╫Ш_╫в╫Ь_╫С╫б╫Щ╫б_╫Ю╫к╫Ч╫Э_╫й╫С╫Щ╫Я_12_╫Ь-24_╫Ч╫Х╫У╫й╫Щ_╫Ю╫Р╫б╫и_╫С╫и╫Щ╫С╫Х╫Щ_╫Ю╫з╫и╫Щ╫Э_╫й╫Ь_╫б╫Ч╫и_╫С╫Ч╫й╫Щ╫й_╫Ь╫и╫С╫Х╫к_╫б╫Ч╫и_╫С╫д╫Ь╫Ш╫Ф._╫й╫Э_╫У╫Х╫С╫и_╫Т╫Э_╫С╫б╫Ч╫и_╫й╫Ь_╫Ь╫Ю╫в╫Ь╫Ф_╫Ю╫з"╫Т_╫Ч╫й╫Щ╫й,_╫Х╫Ь╫Ы╫Я_╫Ф╫Ю╫з╫и╫Ф_╫Ч╫Ю╫Х╫и_╫Ю╫Ю╫з╫и╫а╫Х._╫С╫и╫в"╫д_2139-16_╫Р╫С╫и╫Ю╫б_╫а'_╫Ю"╫Щ_╫Р╫Х╫й╫и_╫Ю╫к╫Ч╫Э_╫в╫а╫Щ╫й╫Ф_╫й╫С╫Щ╫Я_8_╫Ь-27_╫Ч╫Х╫У╫й╫Щ╫Э_╫С╫Щ╫Ч╫б_╫Ь╫и╫Щ╫С╫Х╫Щ_╫Ю╫з╫и╫Щ╫Э_╫й╫Ь_╫б╫Ч╫и_╫С╫Ч╫й╫Щ╫й_╫С╫Ы╫Ю╫Х╫Щ╫Х╫к_╫Т╫У╫Х╫Ь╫Х╫к_╫Ю╫Р╫Х╫У_╫Х╫й╫Э_╫Ф╫к╫Щ╫з_╫Ф╫б╫к╫Щ╫Щ╫Э_╫С-8_╫Ч╫Х╫У╫й╫Щ_╫Ю╫Р╫б╫и_╫С╫д╫Х╫в╫Ь._╫С╫в╫д"╫Т_10715-02-16
  ЁЯУЬ context_text: ╫Р╫й╫и ╫Ь╫а╫б╫Щ╫С╫Х╫к ╫С╫Щ╫ж╫Х╫в ╫Ф╫в╫С╫Щ╫и╫Ф тАУ ╫С╫Ы