extract_citations

In [8]:
import pandas as pd
import os

# Values of the 'part' column that mark the sections searched for citations.
required_parts = [
    "מתחם הענישה",
    "מתחם ענישה",
    "דיון",
    "ענישה נהוגה",
    "ענישה נוהגת",
    "מתחם העונש",
    "מתחם עונש",
    "מדיניות הענישה",
]

# Citation markers looked up in the 'text' column. Each abbreviation appears
# twice: once written with an ASCII double quote and once with the Hebrew
# gershayim character (U+05F4).
citation_patterns = [
    'ע"פ',
    'ת"פ',
    'עפ"ג',
    'ע״פ',
    'ת״פ',
    'עפ״ג',
]

def extract_citations_with_full_parts(csv_path, output_path):
    """
    Extract citation rows from a CSV, starting at each occurrence of a required part.

    The CSV is expected to have a 'part' and a 'text' column. Two selections
    are combined and de-duplicated:

    * rows whose 'part' equals one of `required_parts` and whose 'text'
      contains one of `citation_patterns`;
    * for every required part, all citation rows from the first row whose
      'part' contains that part down to the end of the file.

    Parameters:
    - csv_path (str): Path to the CSV file generated by `docToCsv`.
    - output_path (str): Path to save the extracted CSV with citations.

    Returns:
    - tuple(bool, bool): `(no_matches, part_flag)`. `no_matches` is True when
      no citation row was selected; `part_flag` is True when at least one row's
      'part' contains a required part. On any error the message is printed and
      `(True, False)` is returned, so the file is treated as missing the
      required parts.
    """
    try:
        # Read CSV
        df = pd.read_csv(csv_path)
        part_flag = False

        # The citation mask does not depend on the part, so compute it once.
        has_citation = df['text'].str.contains('|'.join(citation_patterns), na=False)

        # Rows where 'part' matches exactly any of the required_parts
        primary_matches = df[df['part'].isin(required_parts) & has_citation]

        # Extended logic: citation rows from each required part onward.
        extended_matches = []
        for part in required_parts:
            part_indices = df[df['part'].str.contains(part, na=False)].index
            if len(part_indices) == 0:
                continue
            part_flag = True
            # Only the first occurrence matters: the tail starting at any later
            # occurrence is a subset of this one, and those repeated rows are
            # removed by drop_duplicates() below anyway.
            first_idx = part_indices[0]
            citation_rows = df.loc[first_idx:][has_citation.loc[first_idx:]]
            extended_matches.extend(citation_rows.to_dict('records'))

        # Combine primary and extended matches
        combined_matches = pd.DataFrame(
            primary_matches.to_dict('records') + extended_matches
        ).drop_duplicates()

        # Save the combined results
        combined_matches.to_csv(output_path, index=False, encoding='utf-8-sig')
        print(f"Combined citations extracted and saved to: {output_path}")

        return combined_matches.empty, part_flag
    except Exception as e:
        print(f"Error processing {csv_path}: {e}")
        # Keep the 2-tuple shape the caller unpacks; a bare bool here would
        # raise a TypeError in the batch loop.
        return True, False
def batch_extract_citations(input_dir, output_dir):
    """
    Run `extract_citations_with_full_parts` on every CSV found under a directory.

    The input directory is walked recursively. Each result is written to
    `output_dir` as `filtered_<file name>`; because only the base file name is
    used, equally named files from different sub-directories share one output
    path and one report entry.

    Parameters:
    - input_dir (str): Directory containing CSV files.
    - output_dir (str): Directory to save the filtered citation CSVs
      (created if it does not exist).

    Returns:
    - dict: Maps the file name of every verdict that produced no citation rows
      to a short reason string.
    """
    os.makedirs(output_dir, exist_ok=True)
    report = {}

    for dirpath, _subdirs, filenames in os.walk(input_dir):
        csv_names = [name for name in filenames if name.endswith('.csv')]
        for name in csv_names:
            source = os.path.join(dirpath, name)
            target = os.path.join(output_dir, f"filtered_{name}")
            is_empty, has_part = extract_citations_with_full_parts(source, target)

            # Only files without any extracted citation are reported.
            if not is_empty:
                continue
            report[name] = (
                "part exists but no citation found" if has_part else "no part exists"
            )
    return report

# Example usage: process the 2018 verdict CSVs and list the files that
# produced no citation rows.
if __name__ == "__main__":
    # Directory where `docToCsv` saves CSV files
    input_csv_dir = "/home/liorkob/thesis/lcp/data/docx_csv_2018"
    output_csv_dir = "/home/liorkob/thesis/lcp/data/filtered_citations_csv_2018"

    no_parts_files = batch_extract_citations(input_csv_dir, output_csv_dir)

    print("no_parts_files")
    # Each entry is a (file name, reason) pair.
    for entry in no_parts_files.items():
        print(*entry)


Combined citations extracted and saved to: /home/liorkob/thesis/lcp/data/filtered_citations_csv_2018/filtered_ת"פ 11786-06-16.csv
Combined citations extracted and saved to: /home/liorkob/thesis/lcp/data/filtered_citations_csv_2018/filtered_ת"פ 13632-08-17.csv
Combined citations extracted and saved to: /home/liorkob/thesis/lcp/data/filtered_citations_csv_2018/filtered_ת"פ 20051-12-17.csv
Combined citations extracted and saved to: /home/liorkob/thesis/lcp/data/filtered_citations_csv_2018/filtered_ת"פ 21139-04-17.csv
Combined citations extracted and saved to: /home/liorkob/thesis/lcp/data/filtered_citations_csv_2018/filtered_ת"פ 22830-12-17.csv
Combined citations extracted and saved to: /home/liorkob/thesis/lcp/data/filtered_citations_csv_2018/filtered_ת"פ 17856-06-17.csv
Combined citations extracted and saved to: /home/liorkob/thesis/lcp/data/filtered_citations_csv_2018/filtered_ת"פ 16420-10-16.csv
Combined citations extracted and saved to: /home/liorkob/thesis/lcp/data/filtered_citation

extract_citations_with_gpt

In [None]:
import pandas as pd
import os
import openai  

# Take the key from the OPENAI_API_KEY environment variable so the real secret
# never has to be written into the source. The old placeholder is kept as the
# fallback, so behavior is unchanged when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your-api-key")
openai.api_key = OPENAI_API_KEY

# Values of the 'part' column that select the rows to process.
required_parts = [
    "מתחם הענישה",
    "מתחם ענישה",
    "דיון",
    "ענישה נהוגה",
    "ענישה נוהגת",
    "מתחם העונש",
    "מתחם עונש",
    "מדיניות הענישה",
]

# Markers that identify a legal citation inside the 'text' column. Each
# abbreviation is listed with an ASCII double quote and with the Hebrew
# gershayim character (U+05F4).
citation_patterns = [
    'ע"פ',
    'ת"פ',
    'עפ"ג',
    'ע״פ',
    'ת״פ',
    'עפ״ג',
]

def call_gpt_to_split_text(text):
    """
    Calls GPT API to split a paragraph containing multiple citations into separate paragraphs.

    The model reply is split on blank lines. Blank pieces are discarded, and if
    nothing usable remains the original text is returned so the paragraph is
    never replaced by an empty row.

    Parameters:
    - text (str): The paragraph with multiple citations.

    Returns:
    - list: The non-empty paragraphs of the model reply, or `[text]` when the
      API call fails or the reply contains no text.
    """
    prompt = f"""
    The following Hebrew paragraph contains multiple legal citations. Split it into separate paragraphs so that each one contains exactly one citation without modifying the text:
    
    {text}
    
    Return only the split paragraphs as a list.
    """

    try:
        # NOTE(review): `openai.ChatCompletion` is the pre-1.0 interface of the
        # openai package - confirm the installed version still provides it.
        response = openai.ChatCompletion.create(
            model="gpt-4-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1
        )

        content = response["choices"][0]["message"]["content"]
        pieces = (piece.strip() for piece in content.strip().split("\n\n"))
        paragraphs = [piece for piece in pieces if piece]
        # An empty reply must not wipe out the source paragraph.
        return paragraphs or [text]
    except Exception as e:
        print(f"Error calling GPT API: {e}")
        return [text]  # If API call fails, return the original text unchanged

def extract_citations_with_gpt(csv_path, output_path):
    """
    Extracts citations and ensures each paragraph has only one citation by using GPT API.

    Rows are kept when their 'part' equals one of `required_parts` and their
    'text' contains one of `citation_patterns`. A kept row whose text holds
    more than one citation marker is sent to `call_gpt_to_split_text`, and one
    output row is written per returned paragraph.

    Parameters:
    - csv_path (str): Path to the CSV file containing text data.
    - output_path (str): Path to save the processed CSV.

    Returns:
    - tuple(bool, bool): `(no_citations, part_flag)`. `no_citations` is True
      when no row was written; `part_flag` is True when at least one row
      belongs to a required part. On any error the message is printed and
      `(True, False)` is returned.
    """
    try:
        df = pd.read_csv(csv_path)

        in_required_part = df['part'].isin(required_parts)
        # The caller uses this flag to tell "part exists but no citation found"
        # apart from "no part exists", so it must reflect the presence of a
        # required part, not whether some row needed splitting.
        part_flag = bool(in_required_part.any())

        # Filter rows where 'part' is relevant and 'text' contains citations
        has_citation = df['text'].str.contains('|'.join(citation_patterns), na=False)
        filtered_rows = df[in_required_part & has_citation]

        processed_rows = []
        for _, row in filtered_rows.iterrows():
            text = row["text"]
            citation_count = sum(text.count(pattern) for pattern in citation_patterns)

            if citation_count > 1:  # If multiple citations exist, split using GPT
                for split_text in call_gpt_to_split_text(text):
                    new_row = row.copy()
                    new_row["text"] = split_text
                    processed_rows.append(new_row)
            else:
                processed_rows.append(row)

        # Save results
        processed_df = pd.DataFrame(processed_rows)
        processed_df.to_csv(output_path, index=False, encoding="utf-8-sig")
        print(f"Citations processed and saved to: {output_path}")

        return processed_df.empty, part_flag
    except Exception as e:
        print(f"Error processing {csv_path}: {e}")
        return True, False

def batch_extract_citations_with_gpt(input_dir, output_dir):
    """
    Run `extract_citations_with_gpt` on every CSV found under a directory.

    The input directory is walked recursively and each result is saved in
    `output_dir` as `filtered_<file name>`. Only the base file name is used,
    so equally named files from different sub-directories share one output
    path and one report entry.

    Parameters:
    - input_dir (str): Directory containing CSV files.
    - output_dir (str): Directory to save the processed citation CSVs
      (created if it does not exist).

    Returns:
    - dict: Maps the file name of every verdict that produced no output rows
      to a short reason string.
    """
    os.makedirs(output_dir, exist_ok=True)
    report = {}

    for dirpath, _subdirs, filenames in os.walk(input_dir):
        csv_names = [name for name in filenames if name.endswith(".csv")]
        for name in csv_names:
            source = os.path.join(dirpath, name)
            target = os.path.join(output_dir, f"filtered_{name}")
            is_empty, has_part = extract_citations_with_gpt(source, target)

            # Files that yielded rows need no report entry.
            if not is_empty:
                continue
            if has_part:
                reason = "Part exists but no citation found"
            else:
                reason = "No part exists"
            report[name] = reason

    return report

# Example usage: run the GPT-based splitter over the 2018 verdict CSVs and
# list the files that produced no output rows.
if __name__ == "__main__":
    input_csv_dir = "/home/liorkob/thesis/lcp/data/docx_csv_2018"
    output_csv_dir = "/home/liorkob/thesis/lcp/data/filtered_citations_csv_2018"

    no_parts_files = batch_extract_citations_with_gpt(input_csv_dir, output_csv_dir)

    print("No parts files:")
    # Each entry is a (file name, reason) pair.
    for entry in no_parts_files.items():
        print(*entry)


extract_citations_with_dictalm

In [None]:
import pandas as pd
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Use the GPU when torch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and weights are loaded from the same DictaLM checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dicta-il/dictalm2.0-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "dicta-il/dictalm2.0-instruct",
    torch_dtype=torch.bfloat16,
    device_map=device,
)

# Text-generation pipeline wrapping the loaded model and tokenizer.
dictalm_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Values of the 'part' column that select the rows to process.
required_parts = [
    "מתחם הענישה",
    "מתחם ענישה",
    "דיון",
    "ענישה נהוגה",
    "ענישה נוהגת",
    "מתחם העונש",
    "מתחם עונש",
    "מדיניות הענישה",
]

# Markers that identify a legal citation inside the 'text' column. Each
# abbreviation is listed with an ASCII double quote and with the Hebrew
# gershayim character (U+05F4).
citation_patterns = [
    'ע"פ',
    'ת"פ',
    'עפ"ג',
    'ע״פ',
    'ת״פ',
    'עפ״ג',
]

def query_dictalm(text):
    """
    Calls DictaLM to split paragraphs with multiple citations.

    The generated answer is split on blank lines. Blank pieces are discarded,
    and if nothing usable remains the original text is returned so the
    paragraph is never replaced by an empty row.

    Parameters:
    - text (str): The paragraph with multiple citations.

    Returns:
    - list: The non-empty paragraphs of the model answer, or `[text]` when the
      answer is empty or the GPU runs out of memory.
    """
    prompt = (
        "הטקסט הבא מכיל מספר הפניות לפסקי דין. "
        "עליך לפצל אותו כך שכל פסקה תכיל הפניה אחת בלבד, בלי לשנות את התוכן או הניסוח:\n\n"
        f"טקסט: {text}\n"
        "תשובה:"
    )

    try:
        response = dictalm_generator(
            prompt,
            max_new_tokens=1024,
            num_return_sequences=1,
            # By default the pipeline returns prompt + completion; without this
            # the instruction text itself would end up in the split rows.
            return_full_text=False,
        )
        generated_text = response[0]["generated_text"].strip()
        pieces = (piece.strip() for piece in generated_text.split("\n\n"))
        paragraphs = [piece for piece in pieces if piece]
        # An empty generation must not wipe out the source paragraph.
        return paragraphs or [text]
    except torch.cuda.OutOfMemoryError:
        torch.cuda.empty_cache()
        # No CPU retry is attempted here; the paragraph is simply left unsplit.
        print("GPU out of memory; keeping the original text unsplit.")
        return [text]  # Return original text if failure

def extract_citations_with_dictalm(csv_path, output_path):
    """
    Extracts citations and ensures each paragraph has only one citation using DictaLM.

    Rows are kept when their 'part' equals one of `required_parts` and their
    'text' contains one of `citation_patterns`. A kept row whose text holds
    more than one citation marker is sent to `query_dictalm`, and one output
    row is written per returned paragraph.

    Parameters:
    - csv_path (str): Path to the CSV file containing text data.
    - output_path (str): Path to save the processed CSV.

    Returns:
    - tuple(bool, bool): `(no_citations, part_flag)`. `no_citations` is True
      when no row was written; `part_flag` is True when at least one row
      belongs to a required part. On any error the message is printed and
      `(True, False)` is returned.
    """
    try:
        df = pd.read_csv(csv_path)

        in_required_part = df['part'].isin(required_parts)
        # The caller uses this flag to tell "part exists but no citation found"
        # apart from "no part exists", so it must reflect the presence of a
        # required part, not whether some row needed splitting.
        part_flag = bool(in_required_part.any())

        # Filter rows where 'part' is relevant and 'text' contains citations
        has_citation = df['text'].str.contains('|'.join(citation_patterns), na=False)
        filtered_rows = df[in_required_part & has_citation]

        processed_rows = []
        for _, row in filtered_rows.iterrows():
            text = row["text"]
            citation_count = sum(text.count(pattern) for pattern in citation_patterns)

            if citation_count > 1:  # If multiple citations exist, split using DictaLM
                for split_text in query_dictalm(text):
                    new_row = row.copy()
                    new_row["text"] = split_text
                    processed_rows.append(new_row)
            else:
                processed_rows.append(row)

        # Save results
        processed_df = pd.DataFrame(processed_rows)
        processed_df.to_csv(output_path, index=False, encoding="utf-8-sig")
        print(f"Citations processed and saved to: {output_path}")

        return processed_df.empty, part_flag
    except Exception as e:
        print(f"Error processing {csv_path}: {e}")
        return True, False

def batch_extract_citations_with_dictalm(input_dir, output_dir):
    """
    Run `extract_citations_with_dictalm` on every CSV found under a directory.

    The input directory is walked recursively and each result is saved in
    `output_dir` as `filtered_<file name>`. Only the base file name is used,
    so equally named files from different sub-directories share one output
    path and one report entry.

    Parameters:
    - input_dir (str): Directory containing CSV files.
    - output_dir (str): Directory to save the processed citation CSVs
      (created if it does not exist).

    Returns:
    - dict: Maps the file name of every verdict that produced no output rows
      to a short reason string.
    """
    os.makedirs(output_dir, exist_ok=True)
    report = {}

    for dirpath, _subdirs, filenames in os.walk(input_dir):
        csv_names = [name for name in filenames if name.endswith(".csv")]
        for name in csv_names:
            source = os.path.join(dirpath, name)
            target = os.path.join(output_dir, f"filtered_{name}")
            is_empty, has_part = extract_citations_with_dictalm(source, target)

            # Files that yielded rows need no report entry.
            if not is_empty:
                continue
            if has_part:
                reason = "Part exists but no citation found"
            else:
                reason = "No part exists"
            report[name] = reason

    return report

# Example usage: run the DictaLM-based splitter over the 2018 verdict CSVs and
# list the files that produced no output rows.
if __name__ == "__main__":
    input_csv_dir = "/home/liorkob/thesis/lcp/data/docx_csv_2018"
    output_csv_dir = "/home/liorkob/thesis/lcp/data/filtered_citations_csv_2018"

    no_parts_files = batch_extract_citations_with_dictalm(input_csv_dir, output_csv_dir)

    print("No parts files:")
    # Each entry is a (file name, reason) pair.
    for entry in no_parts_files.items():
        print(*entry)
