### try1

In [None]:
import os
import pandas as pd
import networkx as nx
from sentence_transformers import SentenceTransformer, util
import re

docx_csv_dir = "/home/liorkob/thesis/lcp/data/docx_csv_2018"
citations_dir = "/home/liorkob/thesis/lcp/data/citations_csv_2018_with_tags"


# Citation patterns
citation_patterns = {
    'ע"פ': r'ע"פ (\d+/\d+)',
    'ת"פ': r'ת"פ (\d+[-/]\d+[-/]\d+)',
    'עפ"ג': r'עפ"ג (\d+/\d+)',
    'ע״פ': r'ע״פ (\d+/\d+)',
    'ת״פ': r'ת״פ (\d+[-/]\d+[-/]\d+)',
    'עפ״ג': r'עפ״ג (\d+/\d+)',
}

# Load Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Section search logic
primary_sections = ["הכרעת הדין", "אישום", "רקע", "כללי", "כתב אישום", "כתב האישום"]
secondary_sections = ["תסקיר", "שירות המבחן"]
tertiary_sections = ["גזר דין", "גזר הדין", "בעניינו של ", "פסק דין", "פסק הדין"]

def normalize_case_name(case_name):
    """Normalize case names by removing extra spaces and fixing slashes."""
    return re.sub(r'\s+', ' ', case_name.replace('∕', '/')).strip()

# Step 1: Extract Relevant Text
def extract_relevant_text_verdict(csv_path):
    df = pd.read_csv(csv_path)
    extracted_text = ""
    found_primary = found_secondary = found_tertiary = False

    for index, row in df.iterrows():
        part = row['part']
        text = row['text']

        if any(keyword in part for keyword in primary_sections):
            extracted_text += text + " "
            found_primary = True
        elif not found_primary and any(keyword in part for keyword in secondary_sections):
            extracted_text += text + " "
            found_secondary = True
        elif not found_primary and not found_secondary and any(keyword in part for keyword in tertiary_sections):
            extracted_text += text + " "
            found_tertiary = True

    if not extracted_text.strip():
        print(f"CSV not found: {csv_path}")
        extracted_text = " ".join(df['text'].astype(str).tolist())

    return extracted_text.strip()


from pathlib import Path
# Step 2: Generate Embeddings with Mapping
def generate_embeddings(docx_csv_dir):
    embeddings = {}
    for file in Path(docx_csv_dir).rglob("*.csv"):  # Recursively find all CSV files
        doc_name = os.path.splitext(file.name)[0]  # Extract the filename without extension
        print(f"Processing file: {file}, doc_name: {doc_name}")
        
        # Ensure the full file path is used in extraction
        csv_path = str(file)
        print(f"Looking for CSV at: {csv_path}")
        
        # Extract relevant text using the full file path
        relevant_text = extract_relevant_text_verdict(csv_path)
        if relevant_text:
            try:
                embeddings[doc_name] = model.encode(relevant_text, convert_to_tensor=True)
            except Exception as e:
                print(f"Error encoding {doc_name}: {e}")
        else:
            print(f"Skipped embedding for {doc_name} due to lack of relevant text.")
    return embeddings

# Step 3: Build Graph from Citation Data
def build_graph(citations_dir):
    G = nx.DiGraph()
    node_sources = {}  # Dictionary to track the source of each node

    for file in os.listdir(citations_dir):
        if file.endswith(".csv"):
            file_path = os.path.join(citations_dir, file)
            doc_name = normalize_case_name(os.path.splitext(file)[0])  # Normalize case name

            # Read the CSV and handle empty files
            try:
                citations = pd.read_csv(file_path)
                if citations.empty:
                    print(f"Skipping empty CSV: {file}")
                    continue
            except pd.errors.EmptyDataError:
                print(f"Skipping empty CSV: {file}")
                continue

            # Extract citations and build the graph
            for _, row in citations.iterrows():
                para_text = row['paragraph_text']
                for key, pattern in citation_patterns.items():
                    matches = re.finditer(pattern, para_text)
                    for match in matches:
                        cited_case = normalize_case_name(f'{key} {match.group(1)}')
                        G.add_edge(doc_name, cited_case)  # Add edge between the verdict and cited case
                        print("edge:", doc_name, ",", cited_case)
                        node_sources[cited_case] = doc_name  # Store source of the cited case

    return G, node_sources

# Step 4: Compute Graph Similarity
def compute_graph_similarity(G, verdict_a, verdict_b):
    try:
        shortest_path_length = nx.shortest_path_length(G, source=verdict_a, target=verdict_b)
        return 1 / (1 + shortest_path_length)
    except nx.NetworkXNoPath:
        return 0

# # Step 5: Combine Graph and Textual Similarity
# def compute_combined_similarity(G, embeddings, verdict_a, verdict_b, alpha=0.5):
#     # Check if nodes exist in the graph
#     if verdict_a not in G.nodes:
#         print(f"Node {verdict_a} not found in the graph.")
#         return 0
#     if verdict_b not in G.nodes:
#         print(f"Node {verdict_b} not found in the graph.")
#         return 0

#     # Compute similarities
#     graph_sim = compute_graph_similarity(G, verdict_a, verdict_b)
#     text_sim = 0
#     if verdict_a in embeddings and verdict_b in embeddings:
#         text_sim = util.cos_sim(embeddings[verdict_a], embeddings[verdict_b]).item()

#     # Combine graph and textual similarity
#     return alpha * graph_sim + (1 - alpha) * text_sim
# Step 5: Combine Graph and Textual Similarity
def compute_combined_similarity(G, embeddings, verdict_a, verdict_b, alpha=0.5):
    # Check if nodes exist in the graph
    if verdict_a not in G.nodes:
        print(f"Node {verdict_a} not found in the graph.")
        return 0
    if verdict_b not in G.nodes:
        print(f"Node {verdict_b} not found in the graph.")
        return 0

    # Compute graph similarity
    graph_sim = compute_graph_similarity(G, verdict_a, verdict_b)
    print(f"Graph similarity : {graph_sim}")

    # Compute textual similarity
    text_sim = 0
    if verdict_a in embeddings and verdict_b in embeddings:
        text_sim = util.cos_sim(embeddings[verdict_a], embeddings[verdict_b]).item()
    print(f"Textual similarity : {text_sim}")

    # Combine graph and textual similarity
    combined_similarity = alpha * graph_sim + (1 - alpha) * text_sim
    print(f"Combined similarity between {verdict_a} and {verdict_b}: {combined_similarity}")
    
    return combined_similarity

# Main Execution
if __name__ == "__main__":
    # Build graph
    graph, node_sources = build_graph(citations_dir)
    print(f"Graph loaded with {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges.")
    print("Graph nodes:", graph.nodes)
    
    # Extract document names from docx_csv_dir
    docx_cases = [normalize_case_name(os.path.splitext(file.name)[0]) for file in Path(docx_csv_dir).rglob("*.csv")]
    
    # Find mismatches
    only_in_graph = set(graph.nodes) - set(docx_cases)
    only_in_docx = set(docx_cases) - set(graph.nodes)
    print(only_in_graph)
    print(only_in_docx)

        # # Generate embeddings
    # verdict_embeddings = generate_embeddings(docx_csv_dir)
    # print(f"Generated embeddings for {len(verdict_embeddings)} verdicts.")
    # print("Embedding keys:", verdict_embeddings.keys())

    # # Select verdicts that exist in both graph and embeddings
    # embedding_keys = list(verdict_embeddings.keys())
    
    # # Loop through all pairs of verdicts that exist in both the graph and embeddings
    # verdict_pairs = [(a, b) for a in embedding_keys for b in embedding_keys if a != b and a in graph.nodes and b in graph.nodes]

    # if verdict_pairs:
    #     for verdict_a, verdict_b in verdict_pairs:
    #         print(f"Computing similarity for {verdict_a} and {verdict_b}")
    #         # Compute combined similarity
    #         similarity = compute_combined_similarity(graph, verdict_embeddings, verdict_a, verdict_b, alpha=0.5)
    #         print(f"Final combined similarity between {verdict_a} and {verdict_b}: {similarity}")
    # else:
    #     print("No verdicts found that exist in both the graph and embeddings.")


### try 2

In [None]:
import os
import pandas as pd
import networkx as nx
from pathlib import Path
import re

# Directories
docx_csv_dir = "/home/liorkob/thesis/lcp/data/docx_csv_2018"
citations_dir = "/home/liorkob/thesis/lcp/data/tag_citations_csv_2018"
output_csv = "/home/liorkob/thesis/lcp/graph_data.csv"

# Citation patterns
citation_patterns = {
    'ע"פ': r'ע"פ (\d+/\d+)',
    'ת"פ': r'ת"פ (\d+[-/]\d+[-/]\d+)',
    'עפ"ג': r'עפ"ג (\d+/\d+)',
    'ע״פ': r'ע״פ (\d+/\d+)',
    'ת״פ': r'ת״פ (\d+[-/]\d+[-/]\d+)',
    'עפ״ג': r'עפ״ג (\d+/\d+)',
}

# Section search logic
primary_sections = ["הכרעת הדין", "אישום", "רקע", "כללי", "כתב אישום", "כתב האישום"]
secondary_sections = ["תסקיר", "שירות המבחן"]
tertiary_sections = ["גזר דין", "גזר הדין", "בעניינו של ", "פסק דין", "פסק הדין"]

def normalize_case_name(case_name):
    """Normalize case names by removing extra spaces and fixing slashes."""
    return re.sub(r'\s+', ' ', case_name.replace('∕', '/')).strip()

# Extract relevant text from verdict CSV
def extract_relevant_text(csv_path):
    df = pd.read_csv(csv_path)
    extracted_text = ""
    found_primary = found_secondary = found_tertiary = False

    for _, row in df.iterrows():
        part = row['part']
        text = row['text']

        if any(keyword in part for keyword in primary_sections):
            extracted_text += text + " "
            found_primary = True
        elif not found_primary and any(keyword in part for keyword in secondary_sections):
            extracted_text += text + " "
            found_secondary = True
        elif not found_primary and not found_secondary and any(keyword in part for keyword in tertiary_sections):
            extracted_text += text + " "
            found_tertiary = True

    if not extracted_text.strip():
        extracted_text = " ".join(df['text'].astype(str).tolist())

    return extracted_text.strip()

# Build graph from citation data
def build_graph(citations_dir):
    G = nx.DiGraph()
    rows = []
    
    for file in os.listdir(citations_dir):
        if file.endswith(".csv"):
            file_path = os.path.join(citations_dir, file)
            doc_name = normalize_case_name(os.path.splitext(file)[0])

            try:
                citations = pd.read_csv(file_path)
                if citations.empty:
                    continue
            except pd.errors.EmptyDataError:
                continue

            for _, row in citations.iterrows():
                para_text = row['paragraph_text']
                for key, pattern in citation_patterns.items():
                    matches = re.finditer(pattern, para_text)
                    for match in matches:
                        cited_case = normalize_case_name(f'{key} {match.group(1)}')
                        G.add_edge(doc_name, cited_case)
                        rows.append({
                            "source": doc_name,
                            "target": cited_case,
                            "source_citation": para_text
                        })
    
    return G, pd.DataFrame(rows)

# Extract text for embeddings and update node data
def update_node_texts(df, docx_csv_dir):
    text_data = {}
    
    for file in Path(docx_csv_dir).rglob("*.csv"):
        doc_name = normalize_case_name(os.path.splitext(file.name)[0])
        text_data[doc_name] = extract_relevant_text(str(file))
    
    df["source_text"] = df["source"].map(text_data)
    df["target_text"] = df["target"].map(text_data)
    return df

# Save graph data to CSV
def save_graph_to_csv(df, output_csv):
    df.to_csv(output_csv, index=False)
    print(f"CSV saved to {output_csv}")

# Main Execution
if __name__ == "__main__":
    # Build the citation graph
    graph, df_graph = build_graph(citations_dir)
    print(f"Graph loaded with {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges.")
    
    # Update nodes with text from docx CSVs
    df_graph = update_node_texts(df_graph, docx_csv_dir)
    
    # Save graph structure with text details
    save_graph_to_csv(df_graph, output_csv)


### try 3

In [None]:
# import os
# import pandas as pd
# import networkx as nx
# import matplotlib.pyplot as plt
# import seaborn as sns
# import re
# from pathlib import Path
# from transformers import AutoModel, AutoTokenizer
# import numpy as np

# # Define base directory and years
# base_dir = "/home/liorkob/thesis/lcp/data"
# years = ["2018", "2019", "2020"]  # Adjust years as needed

# # Define directory structure
# dirs = {
#     "docx_csv": [os.path.join(base_dir, f"docx_csv_{year}") for year in years],
#     "citations_csv": [os.path.join(base_dir, f"tag_citations_csv_{year}") for year in years],
# }

# # Define embedding CSV files
# embedding_files = {
#     "verdicts": "processed_verdicts_with_gpt.csv",
#     "appeals": "processed_appeals_with_gpt.csv"
# }

# # Load HeBERT model
# model_name = "avichr/heBERT"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)

# # Citation patterns
# citation_patterns = {
#     'ע"פ': r'ע"פ (\d+/\d+)',
#     'עפ"ג': r'עפ"ג (\d+/\d+)',
#     'ת"פ': r'ת"פ (\d+[-/]\d+[-/]\d+)',
#     'עפ״ג': r'עפ״ג (\d+/\d+)',
#     'רע״פ': r'רע״פ (\d+/\d+)',
#     'תפ"ח': r'תפ"ח\s*(\d+[-/]\d+[-/]\d+)',
# }

# # Normalize case names
# def normalize_case_name(case_name):
#     return re.sub(r'\s+', ' ', case_name.replace('∕', '/')).strip()

# # Build graph from multiple citation directories
# def build_graph(citations_dirs):
#     G = nx.DiGraph()
#     for citations_dir in citations_dirs:
#         if not os.path.exists(citations_dir):
#             print(f"Warning: {citations_dir} not found. Skipping...")
#             continue
        
#         for file in os.listdir(citations_dir):
#             if file.endswith(".csv"):
#                 file_path = os.path.join(citations_dir, file)
#                 doc_name = normalize_case_name(os.path.splitext(file)[0])

#                 try:
#                     citations = pd.read_csv(file_path)
#                     if citations.empty:
#                         continue
#                 except pd.errors.EmptyDataError:
#                     continue

#                 for _, row in citations.iterrows():
#                     if row["predicted_label"]==1:
#                     # para_text = row['paragraph_text']
#                         cited_case = normalize_case_name(row['citation'])
#                         G.add_edge(doc_name, cited_case)
#                         print("edge between:",doc_name, cited_case)
#     return G

# # Compute dataset statistics
# def compute_dataset_statistics(graph):
#     num_cases = graph.number_of_nodes()
#     num_citations = graph.number_of_edges()
#     degrees = [d for _, d in graph.degree()]
#     avg_citations = sum(degrees) / num_cases if num_cases > 0 else 0
    
#     in_degrees = [d for _, d in graph.in_degree()]
#     out_degrees = [d for _, d in graph.out_degree()]
    
#     stats_df = pd.DataFrame({
#         "Metric": ["Total Cases", "Total Citations", "Avg Citations per Case", "Max In-Degree", "Max Out-Degree"],
#         "Value": [num_cases, num_citations, avg_citations, max(in_degrees, default=0), max(out_degrees, default=0)]
#     })
#     return stats_df, degrees, in_degrees, out_degrees

# # Generate plots
# def generate_plots(degrees, in_degrees, out_degrees, graph):
#     plt.figure(figsize=(8, 5))
#     sns.histplot(degrees, bins=20, kde=True)
#     plt.xlabel("Number of Citations")
#     plt.ylabel("Frequency")
#     plt.title("Citation Distribution")
#     plt.show()
    
#     plt.figure(figsize=(8, 5))
#     sns.histplot(in_degrees, bins=20, kde=True)
#     plt.xlabel("In-Degree (Citations Received)")
#     plt.ylabel("Frequency")
#     plt.title("In-Degree Distribution")
#     plt.show()
    
#     plt.figure(figsize=(8, 5))
#     sns.histplot(out_degrees, bins=20, kde=True)
#     plt.xlabel("Out-Degree (Citations Given)")
#     plt.ylabel("Frequency")
#     plt.title("Out-Degree Distribution")
#     plt.show()
    
#     plt.figure(figsize=(10, 6))
#     nx.draw(graph, with_labels=False, node_size=10, alpha=0.5, edge_color="gray")
#     plt.title("Legal Citation Network")
#     plt.show()

# # Run processing with all citation directories
# all_citation_dirs = dirs["citations_csv"]
# graph = build_graph(all_citation_dirs)
# dataset_stats, degrees, in_degrees, out_degrees = compute_dataset_statistics(graph)

# # Display statistics
# print("Dataset Statistics:")
# print(dataset_stats)

# # Generate plots
# generate_plots(degrees, in_degrees, out_degrees, graph)


In [37]:
import os
import torch
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import re
from pathlib import Path
from transformers import AutoModel, AutoTokenizer
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import random

# Define base directory and years
base_dir = "/home/liorkob/thesis/lcp/data"
years = ["2018", "2019", "2020"]

# Define directories
dirs = {
    "docx_csv": [os.path.join(base_dir, f"docx_csv_{year}") for year in years],
    "citations_csv": [os.path.join(base_dir, f"tag_citations_csv_{year}") for year in years],
}

# Define embedding CSV files
embedding_files = {
    "verdicts": "processed_verdicts_with_gpt.csv",
    "appeals": "processed_appeals_with_gpt.csv"
}

# Load HeBERT model
model_name = "avichr/heBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Normalize case names
def normalize_case_name(case_name):
    return re.sub(r'\s+', ' ', case_name.replace('∕', '/')).strip()

# Extract embeddings for case texts
def get_case_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Load precomputed embeddings
def load_precomputed_embeddings(csv_path):
    if not os.path.exists(csv_path):
        print(f"Warning: {csv_path} not found. Skipping...")
        return {}
    
    df = pd.read_csv(csv_path)
    embeddings = {}
    for _, row in df.iterrows():
        verdict = normalize_case_name(row['verdict'])
        try:
            text = row['extracted_gpt_facts'].strip()
            if text:
                embeddings[verdict] = get_case_embedding(text)
            else:
                raise ValueError("Empty extracted_gpt_facts text")
        except Exception as e:
            print(f"Skipping embedding for {verdict} due to error: {e}")
            continue  
    return embeddings

# Compute similarity scores
def compute_similarity(embeddings):
    valid_embeddings = {k: v for k, v in embeddings.items() if isinstance(v, np.ndarray) and v.ndim == 1 and v.size > 0}
    
    if len(valid_embeddings) < len(embeddings):
        print(f"Warning: {len(embeddings) - len(valid_embeddings)} embeddings were invalid and removed.")
    
    case_names = list(valid_embeddings.keys())
    emb_matrix = np.array([valid_embeddings[case] for case in case_names])
    
    if emb_matrix.shape[0] == 0:
        raise ValueError("No valid embeddings available for similarity computation.")
    
    similarity_matrix = cosine_similarity(emb_matrix)
    similarity_df = pd.DataFrame(similarity_matrix, index=case_names, columns=case_names)
    
    # Save the similarity matrix
    similarity_df.to_csv("case_similarity_matrix.csv")
    
    return similarity_df, case_names, emb_matrix

# Extract citation pairs from the citation network
def get_citation_pairs(graph, num_pairs=10):
    edges = list(graph.edges())
    if len(edges) < num_pairs:
        print(f"Warning: Only {len(edges)} citation pairs found in the graph!")
        num_pairs = len(edges)
    return random.sample(edges, num_pairs)

# Generate random pairs that are not citations
def get_random_pairs(graph, num_pairs=10):
    nodes = list(graph.nodes())
    random_pairs = set()
    while len(random_pairs) < num_pairs:
        pair = tuple(random.sample(nodes, 2))
        if pair not in graph.edges() and pair[::-1] not in graph.edges():  
            random_pairs.add(pair)
    return list(random_pairs)

# Compute similarity for selected pairs
def compute_pairwise_similarity(pairs, embeddings, pair_type, num_required=10):
    valid_pairs = []
    similarities = []
    
    for case1, case2 in pairs:
        if case1 in embeddings and case2 in embeddings:
            sim = cosine_similarity([embeddings[case1]], [embeddings[case2]])[0][0]
            valid_pairs.append((case1, case2))
            similarities.append(sim)
        if len(valid_pairs) == num_required:
            break  # Stop once we have enough valid pairs
    
    if len(valid_pairs) < num_required:
        print(f"Warning: Only {len(valid_pairs)} valid {pair_type} pairs found!")
    
    return valid_pairs, similarities

# Build the citation network
def build_graph(citations_dirs):
    G = nx.DiGraph()
    for citations_dir in citations_dirs:
        if not os.path.exists(citations_dir):
            print(f"Warning: {citations_dir} not found. Skipping...")
            continue
        
        for file in os.listdir(citations_dir):
            if file.endswith(".csv"):
                file_path = os.path.join(citations_dir, file)
                doc_name = normalize_case_name(os.path.splitext(file)[0])

                try:
                    citations = pd.read_csv(file_path)
                    if citations.empty:
                        continue
                except pd.errors.EmptyDataError:
                    continue

                for _, row in citations.iterrows():
                    if row["predicted_label"] == 1:
                        cited_case = normalize_case_name(row['citation'])
                        G.add_edge(doc_name, cited_case)
    return G

# Load embeddings
embeddings = {}
for key, csv_file in embedding_files.items():
    embeddings.update(load_precomputed_embeddings(csv_file))

# Compute similarity matrix
similarity_matrix, case_names, emb_matrix = compute_similarity(embeddings)

# Build the citation graph
citation_dirs = dirs["citations_csv"]
graph = build_graph(citation_dirs)
# Select citation and random pairs
citation_pairs = get_citation_pairs(graph, num_pairs=15)  # Get more to ensure 10 valid
random_pairs = get_random_pairs(graph, num_pairs=15)

# Compute similarity scores (ensuring 10 valid pairs)
citation_pairs, citation_similarities = compute_pairwise_similarity(citation_pairs, embeddings, "Citation", 5)
random_pairs, random_similarities = compute_pairwise_similarity(random_pairs, embeddings, "Random", 5)

# Save all pairs to a CSV file
pairs_data = {
    "Case Pairs": [f"{p1} ↔ {p2}" for p1, p2 in citation_pairs + random_pairs],
    "Pair Type": ["Citation Pair"] * len(citation_pairs) + ["Random Pair"] * len(random_pairs),
    "Similarity Score": citation_similarities + random_similarities
}
pairs_df = pd.DataFrame(pairs_data)
pairs_df.to_csv("pairs_similarity.csv", index=False)

# Heatmap plotting
plt.figure(figsize=(12, 6))
heatmap_pivot = pairs_df.pivot(index="Case Pairs", columns="Pair Type", values="Similarity Score")
sns.heatmap(heatmap_pivot, annot=True, cmap="coolwarm", fmt=".2f", cbar=True)
plt.title("Citation Pairs vs. Random Pairs Similarity Heatmap")
plt.xlabel("Pair Type")
plt.ylabel("Case Pairs")
plt.show()


In [1]:
import os
import torch
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import re
from transformers import AutoModel, AutoTokenizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Define base directory and years
base_dir = "/home/liorkob/thesis/lcp/data"
years = ["2018", "2019", "2020"]

# Define directory structure
dirs = {
    "docx_csv": [os.path.join(base_dir, f"docx_csv_{year}") for year in years],
    "citations_csv": [os.path.join(base_dir, f"tag_citations_csv_{year}") for year in years],
}

# Define embedding CSV files
embedding_files = {
    "verdicts": "processed_verdicts_with_gpt.csv",
    "appeals": "processed_appeals_with_gpt.csv"
}

# Load HeBERT model
model_name = "avichr/heBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Normalize case names
def normalize_case_name(case_name):
    return re.sub(r'\s+', ' ', case_name.replace('∕', '/')).strip()

# Extract embeddings for case texts
def get_case_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Build graph from multiple citation directories
def build_graph(citations_dirs):
    G = nx.DiGraph()
    for citations_dir in citations_dirs:
        if not os.path.exists(citations_dir):
            print(f"Warning: {citations_dir} not found. Skipping...")
            continue
        
        for file in os.listdir(citations_dir):
            if file.endswith(".csv"):
                file_path = os.path.join(citations_dir, file)
                doc_name = normalize_case_name(os.path.splitext(file)[0])

                try:
                    citations = pd.read_csv(file_path)
                    if citations.empty:
                        continue
                except pd.errors.EmptyDataError:
                    continue

                for _, row in citations.iterrows():
                    if row["predicted_label"] == 1:
                        cited_case = normalize_case_name(row['citation'])
                        G.add_edge(doc_name, cited_case)
    return G

# Compute graph-based similarity metrics
def compute_graph_similarity(graph, case_pairs):
    similarities = {}

    for case1, case2 in case_pairs:
        sim_scores = {}

        if graph.has_node(case1) and graph.has_node(case2):
            # Compute Jaccard similarity
            neighbors1 = set(graph.neighbors(case1))
            neighbors2 = set(graph.neighbors(case2))
            intersection = len(neighbors1 & neighbors2)
            union = len(neighbors1 | neighbors2)
            sim_scores["jaccard"] = intersection / union if union > 0 else 0

            # Compute shortest path length
            try:
                sim_scores["shortest_path"] = 1 / (1 + nx.shortest_path_length(graph, case1, case2))
            except nx.NetworkXNoPath:
                sim_scores["shortest_path"] = 0

        similarities[(case1, case2)] = sim_scores
    return similarities

# Compute embedding-based similarity
def compute_embedding_similarity(embeddings):
    case_names = list(embeddings.keys())
    emb_matrix = np.array([embeddings[case] for case in case_names])
    similarity_matrix = cosine_similarity(emb_matrix)
    return pd.DataFrame(similarity_matrix, index=case_names, columns=case_names)

# Combine graph and embedding similarities
def compute_combined_similarity(graph, embeddings):
    case_pairs = [(case1, case2) for case1 in embeddings for case2 in embeddings if case1 != case2]
    graph_similarities = compute_graph_similarity(graph, case_pairs)
    embedding_similarities = compute_embedding_similarity(embeddings)

    combined_scores = {}
    for (case1, case2) in case_pairs:
        graph_score = graph_similarities.get((case1, case2), {}).get("jaccard", 0)
        embedding_score = embedding_similarities.at[case1, case2] if case1 in embedding_similarities.index and case2 in embedding_similarities.columns else 0
        combined_scores[(case1, case2)] = (graph_score + embedding_score) / 2  # Simple average

    return combined_scores

# Load precomputed embeddings from CSV files
def load_precomputed_embeddings(csv_path):
    if not os.path.exists(csv_path):
        print(f"Warning: {csv_path} not found. Skipping...")
        return {}
    
    df = pd.read_csv(csv_path)
    embeddings = {}
    for _, row in df.iterrows():
        verdict = normalize_case_name(row['verdict'])
        try:
            text = row['extracted_gpt_facts'].strip()
            if text:
                embeddings[verdict] = get_case_embedding(text)
            else:
                raise ValueError("Empty extracted_gpt_facts text")
        except Exception as e:
            print(f"Skipping embedding for {verdict} due to error: {e}")
            continue  
    return embeddings

# Load and merge precomputed embeddings
embeddings = {}
for key, csv_file in embedding_files.items():
    embeddings.update(load_precomputed_embeddings(csv_file))

# Build citation graph
all_citation_dirs = dirs["citations_csv"]
graph = build_graph(all_citation_dirs)

# Compute similarity metrics
combined_similarities = compute_combined_similarity(graph, embeddings)

# Convert to DataFrame for better readability
similarity_df = pd.DataFrame(combined_similarities.items(), columns=["Case Pair", "Combined Similarity"])
similarity_df = similarity_df.sort_values(by="Combined Similarity", ascending=False)

# Display statistics
print("Top 10 Most Similar Case Pairs:")
print(similarity_df.head(10))

# Generate a heatmap for visualization
plt.figure(figsize=(12, 10))
sns.heatmap(pd.pivot_table(similarity_df, values="Combined Similarity", index="Case Pair", aggfunc="mean"), cmap="coolwarm", annot=False)
plt.title("Combined Case Similarity Heatmap")
plt.show()


In [45]:
import os
import torch
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random
from transformers import AutoModel, AutoTokenizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Define base directory and years
base_dir = "/home/liorkob/thesis/lcp/data"
years = ["2018", "2019", "2020"]

# Define directory structure
dirs = {
    "docx_csv": [os.path.join(base_dir, f"docx_csv_{year}") for year in years],
    "citations_csv": [os.path.join(base_dir, f"tag_citations_csv_{year}") for year in years],
}

# Define embedding CSV files
embedding_files = {
    "verdicts": "processed_verdicts_with_gpt.csv",
    "appeals": "processed_appeals_with_gpt.csv"
}

# Load HeBERT model
model_name = "avichr/Legal-heBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Normalize case names
def normalize_case_name(case_name):
    return re.sub(r'\s+', ' ', case_name.replace('∕', '/')).strip()

# Extract embeddings for case texts
def get_case_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Load precomputed embeddings
def load_precomputed_embeddings(csv_path):
    if not os.path.exists(csv_path):
        print(f"Warning: {csv_path} not found. Skipping...")
        return {}
    
    df = pd.read_csv("processed_verdicts_with_gpt.csv")
    print(df[['verdict', 'extracted_gpt_facts']].sample(5))
    embeddings = {}
    for _, row in df.iterrows():
        verdict = normalize_case_name(row['verdict'])
        try:
            text = row['extracted_gpt_facts'].strip()
            if text:
                embeddings[verdict] = get_case_embedding(text)
            else:
                raise ValueError("Empty extracted_gpt_facts text")
        except Exception as e:
            print(f"Skipping embedding for {verdict} due to error: {e}")
            continue  
    return embeddings

# Train-test split
def train_test_split_cases(positive_pairs, all_cases, test_size=0.2):
    positive_pairs = list(positive_pairs)  # ✅ Fix: Convert set to list
    positive_train, positive_test = train_test_split(positive_pairs, test_size=test_size, random_state=42)

    negative_pairs = set()
    while len(negative_pairs) < len(positive_pairs):
        case1, case2 = random.sample(all_cases, 2)
        if (case1, case2) not in positive_pairs and (case2, case1) not in positive_pairs:
            negative_pairs.add((case1, case2))

    negative_pairs = list(negative_pairs)  # ✅ Convert negative_pairs to list
    negative_train, negative_test = train_test_split(negative_pairs, test_size=test_size, random_state=42)
    
    train_data = positive_train + negative_train
    test_data = positive_test + negative_test
    
    train_labels = [1] * len(positive_train) + [0] * len(negative_train)
    test_labels = [1] * len(positive_test) + [0] * len(negative_test)

    print(f"📊 Train size: {len(train_data)}, Test size: {len(test_data)}")
    print(f"✅ Sample Positive Train Pairs: {train_data[:3]}")
    print(f"❌ Sample Negative Train Pairs: {negative_train[:3]}")
    
    return train_data, test_data, train_labels, test_labels

def compute_graph_similarity(graph, case_pairs):
    similarities = {}

    for case1, case2 in case_pairs:
        sim_scores = {}

        if not graph.has_node(case1) or not graph.has_node(case2):
            print(f"⚠️ Missing node(s) → {case1} or {case2} not in graph")
            continue  # Skip if a case is missing

        # Compute Jaccard similarity
        neighbors1 = set(graph.neighbors(case1))
        neighbors2 = set(graph.neighbors(case2))
        intersection = len(neighbors1 & neighbors2)
        union = len(neighbors1 | neighbors2)
        sim_scores["jaccard"] = intersection / union if union > 0 else 0

        # Compute shortest path length
        try:
            sim_scores["shortest_path"] = 1 / (1 + nx.shortest_path_length(graph, case1, case2))
        except nx.NetworkXNoPath:
            sim_scores["shortest_path"] = 0

        similarities[(case1, case2)] = sim_scores

        # 🔍 Debug: Print valid similarities
        if sim_scores["jaccard"] > 0 or sim_scores["shortest_path"] > 0:
            print(f"📌 Graph Similarity ({case1}, {case2}) → Jaccard: {sim_scores['jaccard']:.3f}, Shortest Path: {sim_scores['shortest_path']:.3f}")

    return similarities


# Compute embedding-based similarity
def compute_embedding_similarity(embeddings):
    case_names = list(embeddings.keys())
    emb_matrix = np.array([embeddings[case] for case in case_names])
    similarity_matrix = cosine_similarity(emb_matrix)
    similarity_df = pd.DataFrame(similarity_matrix, index=case_names, columns=case_names)

    # 🔍 Debug: Print high similarity pairs
    print("📌 Top 5 Embedding Similarities:")
    top_similar_pairs = similarity_df.unstack().sort_values(ascending=False).drop_duplicates().head(5)
    print(top_similar_pairs)

    return similarity_df

# 🚀 **Step 1: Load embeddings**
embeddings = {}
for key, csv_file in embedding_files.items():
    embeddings.update(load_precomputed_embeddings(csv_file))

# 🚀 **Step 2: Get all cases**
all_cases = list(embeddings.keys())

# 🚀 **Step 3: Load citation pairs BEFORE building the graph**
positive_pairs = set()
for citations_dir in dirs["citations_csv"]:
    if os.path.exists(citations_dir):
        for file in os.listdir(citations_dir):
            if file.endswith(".csv"):
                file_path = os.path.join(citations_dir, file)
                doc_name = normalize_case_name(os.path.splitext(file)[0])

                try:
                    citations = pd.read_csv(file_path)
                    for _, row in citations.iterrows():
                        if row["predicted_label"] == 1:
                            cited_case = normalize_case_name(row['citation'])
                            positive_pairs.add((doc_name, cited_case))
                except pd.errors.EmptyDataError:
                    continue

# 🚀 **Step 4: Split Train-Test Before Graph is Built**
train_data, test_data, train_labels, test_labels = train_test_split_cases(positive_pairs, all_cases)

# 🚀 **Step 5: Build Graph ONLY on Train Data**
graph = nx.DiGraph()
for case1, case2 in train_data:
    graph.add_edge(case1, case2)

# 🚀 **Step 6: Compute Similarities**
graph_similarities = compute_graph_similarity(graph, train_data + test_data)
embedding_similarities = compute_embedding_similarity(embeddings)

# 🚀 **Step 7: Compute Final Similarity Scores**
final_scores = []
for case1, case2 in train_data + test_data:
    graph_score = graph_similarities.get((case1, case2), {}).get("jaccard", 0)
    embedding_score = embedding_similarities.at[case1, case2] if case1 in embedding_similarities.index and case2 in embedding_similarities.columns else 0
    if graph_score > 0 and embedding_score > 0:
        combined_score = (graph_score + embedding_score) / 2
    elif graph_score > 0:
        combined_score = graph_score
    elif embedding_score > 0:
        combined_score = embedding_score
    else:
        combined_score = 0  # If both are 0
    final_scores.append(combined_score)

print(f"🔍 Example of computed similarity scores: {final_scores[:5]}")

# 🚀 **Step 8: Evaluate**
roc_auc = roc_auc_score(test_labels, final_scores[len(train_data):])
precision, recall, _ = precision_recall_curve(test_labels, final_scores[len(train_data):])
pr_auc = auc(recall, precision)

print(f"🎯 ROC AUC Score: {roc_auc:.4f}")
print(f"🎯 Precision-Recall AUC: {pr_auc:.4f}")

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

y_true = test_labels
y_pred = [1 if s > 0.5 else 0 for s in final_scores[len(train_data):]]

cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix")
plt.show()


import seaborn as sns

sns.histplot(final_scores[:len(train_data)], bins=20, color="blue", label="Similar Cases", alpha=0.6)
sns.histplot(final_scores[len(train_data):], bins=20, color="red", label="Random Pairs", alpha=0.6)
plt.legend()
plt.xlabel("Similarity Score")
plt.ylabel("Count")
plt.title("Similarity Score Distribution")
plt.show()


# 🚀 **Step 9: Plot Precision-Recall Curve**
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.show()



### first verify

In [13]:
import pandas as pd
import os

import pandas as pd
import os

def verify_verdict_parts_from_csv(output_directory, required_parts):
    """
    Verifies the presence of required parts in each CSV file generated from verdict processing.

    Parameters:
    - output_directory (str): Directory containing the output CSV files.
    - required_parts (list): List of strings representing the required parts.

    Output:
    - Prints the parts for each verdict.
    - Identifies and lists verdicts where none of the required parts are found.
    """
    verdicts_with_no_parts = []  # Store verdicts where none of the parts exist

    for file in os.listdir(output_directory):
        if not file.endswith(".csv"):
            continue
        
        file_path = os.path.join(output_directory, file)
        df = pd.read_csv(file_path)
        verdict_name = os.path.splitext(file)[0]
        
        print(f"Verifying Verdict: {verdict_name}")
        
        # Extract unique parts from the DataFrame
        verdict_parts = df['part'].dropna().astype(str).unique()  # Ensure all parts are strings
        
        # Print all parts for the verdict
        print("  Parts in the verdict:")
        for part in verdict_parts:
            print(f"    - {part}")
        
        # Check if none of the required parts exist
        if not any(req_part in part for req_part in required_parts for part in verdict_parts):
            verdicts_with_no_parts.append((verdict_name,verdict_parts))  # Add to the list of problematic verdicts
        
        print("-" * 40)
    
    # Print verdicts with no matching parts
    if verdicts_with_no_parts:
        print("Verdicts with no matching parts:")
        for verdict,parts in verdicts_with_no_parts:
            print(f"  - {verdict}")
            print(f"parts: {parts}")



    else:
        print("All verdicts have at least one matching part.")

# Define the directory containing the output CSV files
output_directory = "/home/liorkob/thesis/lcp/data/docx_csv_2018"

# Define the required parts (partial matching supported)
required_parts =["חקיקה"]
# required_parts = ["אחידות בענישה","מתחם הענישה","מתחם ענישה", "דיון", "ענישה נהוגה", "הענישה הנוהגת","ענישה נוהגת", "מתחם העונש" ,"מתחם עונש","מדיניות הענישה" "והכרעה", "ההרשעה","מדיניות הענישה הנהוגה"]
# required_parts=["הכרעת הדין", "אישום" ,"רקע" ,"כללי" ,"כתב אישום","כתב האישום"]
# Run the verification
verify_verdict_parts_from_csv(output_directory, required_parts)


FileNotFoundError: [Errno 2] No such file or directory: '/home/liorkob/thesis/lcp/data/docx_csv_2018'

### extract indictment facts

In [24]:
import pandas as pd
import os
import re

# Define start and end patterns based on the 'part' column (for partial matches)
START_PARTS = [
    "עובדותם", "כללי", "כתב האישום", "האישום", "אישום", "רקע", "גזר", "דין", "פסק","מבוא","הרשעת" ,"בעניינו","עבירות","הורשע","עובדות"
]

END_PARTS = [
    "טענות", "עמדת", "תסקיר", "שירות", "מבחן", "דיון", "התסקיר",
    "טיעוני", "הצדדים", "צדדים", "והכרעה", "האישום השני", "ראיות"
]

def extract_indictment_facts(df):
    """
    Extracts the 'Indictment Facts' section based on the 'part' column with partial matches.
    Ensures:
    - If start and end are the same, it extends the search.
    - The text **does not** include the content of the end part, only up to it.
    """
    if df.empty or "part" not in df.columns or "text" not in df.columns:
        return "❌ No indictment facts found", None, None

    # Find the first row where 'part' contains a start pattern (case-insensitive, partial match)
    start_row = df[df["part"].str.contains('|'.join(START_PARTS), case=False, na=False, regex=True)]
    if start_row.empty:
        return "❌ No indictment facts found", None, None
    start_idx = start_row.index.min()
    start_part_name = df.loc[start_idx, "part"]

    # Find the first row where 'part' contains an end pattern **after** the start index
    end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]

    # Ensure end is after start and not identical in name
    if not end_row.empty:
        potential_end_idx = end_row.index.min()

        # If the end part is the same as the start part, look further down
        if df.loc[potential_end_idx, "part"] == df.loc[start_idx, "part"]:
            print(f"⚠️ Warning: Start and End have the same name for verdict '{df['verdict'].iloc[0]}'. Searching for next distinct part.")

            # Find the next part that is different from the start part
            extended_end_row = df[df.index > potential_end_idx][df["part"] != df.loc[start_idx, "part"]]

            if not extended_end_row.empty:
                end_idx = extended_end_row.index.min()
            else:
                end_idx = len(df)  # Default to full text if no better match is found
        else:
            end_idx = potential_end_idx  # Use valid end index if found
    else:
        end_idx = len(df)  # Default to full text if no end marker is found

    # Assign extracted part
    end_part_name = df.loc[end_idx, "part"] if end_idx < len(df) else "❌ No end found (used full text)"

    # Extract text **only until** the end part, excluding it
    extracted_text = "\n".join(df.loc[start_idx:end_idx-1, "text"].dropna().astype(str))  # Exclude the last part

    return extracted_text.strip() if extracted_text else "❌ No indictment facts found", start_part_name, end_part_name

# Tracking statistics
total_files = 0
successful_extractions = 0
failed_extractions = 0
failed_verdicts = []
extracted_results = []
for year in [2018,2019,2020]:
    csv_directory = f"/home/liorkob/thesis/lcp/data/docx_csv_{year}"  # Change this to your actual directory

    # Iterate through all CSV files in the directory
    for filename in os.listdir(csv_directory):
        if filename.endswith(".csv"):
            total_files += 1
            file_path = os.path.join(csv_directory, filename)
            
            # Load CSV file
            df = pd.read_csv(file_path)

            # Ensure necessary columns exist
            if 'verdict' not in df.columns or 'text' not in df.columns or 'part' not in df.columns:
                print(f"Skipping {filename}, missing required columns.")
                continue

            # Extract indictment facts based on 'part'
            extracted_facts, start_part, end_part = extract_indictment_facts(df)

            # Track statistics
            if extracted_facts == "❌ No indictment facts found":
                failed_extractions += 1
                failed_verdicts.append({
                    "verdict": df["verdict"].iloc[0],
                    "all_parts": "; ".join(df["part"].dropna().astype(str).unique())  # Store all parts for debugging
                })
                print(f"\n❌ **Failed Extraction for Verdict: {df['verdict'].iloc[0]}**")
                print(f"📌 Available Parts: {failed_verdicts[-1]['all_parts']}\n")
            else:
                successful_extractions += 1

            # Store results
            extracted_results.append({
                "verdict": df["verdict"].iloc[0],
                "extracted_facts": extracted_facts,
                "start_part": start_part if start_part else "❌ Not Found",
                "end_part": end_part if end_part else "❌ Not Found"
            })

# Convert results to DataFrame
final_df = pd.DataFrame(extracted_results)
failed_df = pd.DataFrame(failed_verdicts) if failed_verdicts else pd.DataFrame(columns=["verdict", "all_parts"])

# Save results
final_df.to_csv("processed_verdicts.csv", index=False, encoding="utf-8-sig")
failed_df.to_csv("failed_verdicts.csv", index=False, encoding="utf-8-sig")

# Display statistics
stats = {
    "Total CSV Files Processed": total_files,
    "Successful Extractions": successful_extractions,
    "Failed Extractions": failed_extractions
}

# Print statistics
print("\n=== Statistics ===")
print(pd.DataFrame([stats]))

# Show failed verdicts (if any)
if not failed_df.empty:
    print("\n=== Sample of Failed Verdicts ===")
    print(failed_df.head())  # Print first few rows for review

# Show extracted results with start and end parts
print("\n=== Sample of Successful Extractions (Start & End Parts) ===")
print(final_df[["verdict", "start_part", "end_part"]].head())  # Print first few rows

print("\n✅ Process complete. Results saved as 'processed_verdicts.csv' and 'failed_verdicts.csv'")


  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  extended_end_row = df[df.index > potential_end_idx][df["part"] != df.loc[start_idx, "part"]]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case



  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'



  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
  end_row = df[df.index > start_idx][df["part"].str.contains('|'


=== Statistics ===
   Total CSV Files Processed  Successful Extractions  Failed Extractions
0                        149                     149                   0

=== Sample of Successful Extractions (Start & End Parts) ===
           verdict  start_part            end_part
0  ת"פ 16420-10-16     גזר דין         עמדת המדינה
1   ת"פ 1995-03-17        כללי  תסקיר שירות המבחן 
2  ת"פ 21139-04-17     גזר דין   תסקיר שירות המבחן
3  ת"פ 13632-08-17     גזר דין    תסקיר שרות מבחן:
4  ת"פ 17856-06-17  כתב-האישום   תסקיר שירות המבחן

✅ Process complete. Results saved as 'processed_verdicts.csv' and 'failed_verdicts.csv'


In [47]:
pip install openai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
60154.03s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Collecting openai
  Downloading openai-1.62.0-py3-none-any.whl.metadata (27 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Downloading anyio-4.5.2-py3-none-any.whl.metadata (4.7 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting sniffio (from openai)
  Using cached sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting exceptiongroup>=1.0.2 (from anyio<5,>=3.5.0->openai)
  Using cached exceptiongroup-1.2.2-py3-none-any.whl.metadata (6.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Using cached httpcore-1.0.7-py3-none-any.whl.metadata (21 kB)
Colle

### extract indictment facts with API gpt-VERDICTS

In [48]:
import pandas as pd
import os
import re
from openai import OpenAI

os.environ["OPENAI_API_KEY"] = "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"  # Replace with actual key

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
models = client.models.list()

# Define start and end patterns based on the 'part' column (for partial matches)
START_PARTS = [
    "עובדותם", "כללי", "כתב האישום", "האישום", "אישום", "רקע", "גזר", "דין", "פסק","מבוא","הרשעת" ,"בעניינו","עבירות","הורשע","עובדות"
]

END_PARTS = [
    "טענות", "עמדת", "תסקיר", "שירות", "מבחן", "דיון", "התסקיר",
    "טיעוני", "הצדדים", "צדדים", "והכרעה",  "ראיות"
]

def extract_indictment_facts(df):
    """
    Extracts the 'Indictment Facts' section based on the 'part' column with partial matches.
    Ensures:
    - If start and end are the same, it extends the search.
    - The text **does not** include the content of the end part, only up to it.
    """
    if df.empty or "part" not in df.columns or "text" not in df.columns:
        return "❌ No indictment facts found", None, None

    # Find the first row where 'part' contains a start pattern (case-insensitive, partial match)
    start_row = df[df["part"].str.contains('|'.join(START_PARTS), case=False, na=False, regex=True)]
    if start_row.empty:
        return "❌ No indictment facts found", None, None
    start_idx = start_row.index.min()
    start_part_name = df.loc[start_idx, "part"]

    # Find the first row where 'part' contains an end pattern **after** the start index
    end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]

    # Handle case where start and end are identical (wrong extraction range)
    if not end_row.empty and end_row.index.min() == start_idx:
        print(f"⚠️ Warning: Start and End are the same for verdict '{df['verdict'].iloc[0]}'. Extending search.")
        end_row = df[df.index > start_idx + 1][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
    
    end_idx = end_row.index.min() if not end_row.empty else len(df)
    end_part_name = df.loc[end_idx, "part"] if not end_row.empty else "❌ No end found (used full text)"

    # Extract text **only until** the end part, excluding it
    extracted_text = "\n".join(df.loc[start_idx:end_idx-1, "text"].dropna().astype(str))  # Exclude the last part

    return extracted_text.strip() if extracted_text else "❌ No indictment facts found", start_part_name, end_part_name


def extract_facts_with_gpt(text):
    """
    Sends extracted text to GPT API and extracts specific facts.
    """
    if text == "❌ No indictment facts found":
        return "❌ No facts extracted"

    prompt = f"""
    **Task:** Extract only the factual allegations from the provided legal text, preserving the original wording without summarizing, rephrasing, or omitting details. Present the extracted facts as a single paragraph rather than a structured list.

    **Guidelines:**
    - Do not include conclusions, arguments, or legal interpretations.
    - Keep the extracted text **exactly as it appears** in the original source.
    - Maintain coherence when merging multiple allegations into a single paragraph.

    **Example:**
    
    **Input:**
    הנאשם הורשע על פי הודאתו בעבירות של החזקת חלק של נשק או תחמושת, לפי סעיף 144 (א) לחוק העונשין, תשל"ז 1977 (להלן: "חוק העונשין") ונשיאה/הובלת חלק של נשק או תחמושת, לפי סעיף 144(ב) לחוק העונשין. על פי הנטען בכתב האישום ביום 28.8.2022, בשעה 00:20 לערך, נהג הנאשם ברכב מסוג קיה ספורטג' נושא לוחית רישוי מספר 13-608-201 אל עבר מעבר הל"ה בדרכו לשטחי האזור, כל זאת כאשר הוא נושא מתחת למושב הנהג ברכב שקית ובה 6 מכלולים של נשק מסוג M16. בנוסף בתא המטען של הרכב נשא הנאשם שבעה ארגזי תחמושת וארגז קרטון אשר הכילו יחדיו כ-9000 כדורים בקוטר 5.56 מ"מ אשר היו מכוסים ומוסתרים.

    **Expected Output:**
    הנאשם הורשע על פי הודאתו בעבירות של החזקת חלק של נשק או תחמושת, לפי סעיף 144 (א) לחוק העונשין, תשל"ז 1977 ונשיאה/הובלת חלק של נשק או תחמושת, לפי סעיף 144(ב) לחוק העונשין. על פי הנטען בכתב האישום, ביום 28.8.2022 בשעה 00:20 לערך, נהג הנאשם ברכב מסוג קיה ספורטג' עם לוחית רישוי מספר 13-608-201 לכיוון מעבר הל"ה בדרכו לשטחי האזור, כאשר מתחת למושב הנהג ברכב הייתה שקית ובה 6 מכלולים של נשק מסוג M16. בנוסף, בתא המטען של הרכב נשא שבעה ארגזי תחמושת וארגז קרטון שהכילו יחדיו כ-9000 כדורים בקוטר 5.56 מ"מ, שהיו מכוסים ומוסתרים.

    **Input Text:**
    {text}

    **Extracted Facts:**
    """

    response = client.chat.completions.create(
        model="gpt-4o", 
        messages=[
            {"role": "system", "content": "You are an AI trained to extract factual allegations from legal texts, ensuring no interpretation or rewording."},
            {"role": "user", "content": prompt}
        ]
    )

    extracted_facts = response.choices[0].message.content.strip()
    print("Input Text:", text)
    print("Extracted Facts:", extracted_facts)
    return extracted_facts


# Tracking statistics
total_files = 0
successful_extractions = 0
failed_extractions = 0
failed_verdicts = []
extracted_results = []


for year in [2018,2019,2020]:
    csv_directory = f"/home/liorkob/thesis/lcp/data/docx_csv_{year}"  # Change this to your actual directory

    # Iterate through all CSV files in the directory
    for filename in os.listdir(csv_directory):
        if filename.endswith(".csv"):
            total_files += 1
            file_path = os.path.join(csv_directory, filename)
            
            # Load CSV file
            df = pd.read_csv(file_path)

            # Ensure necessary columns exist
            if 'verdict' not in df.columns or 'text' not in df.columns or 'part' not in df.columns:
                print(f"Skipping {filename}, missing required columns.")
                continue

            # Extract indictment facts based on 'part'
            extracted_facts, start_part, end_part = extract_indictment_facts(df)

            # Extract facts using GPT
            extracted_gpt_facts = extract_facts_with_gpt(extracted_facts)

            # Track statistics
            if extracted_facts == "❌ No indictment facts found":
                failed_extractions += 1
                failed_verdicts.append({
                    "verdict": df["verdict"].iloc[0],
                    "all_parts": "; ".join(df["part"].dropna().astype(str))  # Store all parts for debugging
                })
                print(f"\n❌ **Failed Extraction for Verdict: {df['verdict'].iloc[0]}**")
                print(f"📌 Available Parts: {failed_verdicts[-1]['all_parts']}\n")
            else:
                successful_extractions += 1

            # Store results
            extracted_results.append({
                "verdict": df["verdict"].iloc[0],
                "extracted_facts": extracted_facts,
                "extracted_gpt_facts": extracted_gpt_facts,
                "start_part": start_part if start_part else "❌ Not Found",
                "end_part": end_part if end_part else "❌ Not Found"
            })

# Convert results to DataFrame
final_df = pd.DataFrame(extracted_results)
failed_df = pd.DataFrame(failed_verdicts) if failed_verdicts else pd.DataFrame(columns=["verdict", "all_parts"])

# Save results
final_df.to_csv("processed_verdicts_with_gpt.csv", index=False, encoding="utf-8-sig")
failed_df.to_csv("failed_verdicts.csv", index=False, encoding="utf-8-sig")

# Display statistics
stats = {
    "Total CSV Files Processed": total_files,
    "Successful Extractions": successful_extractions,
    "Failed Extractions": failed_extractions
}

# Print statistics
print("\n=== Statistics ===")
print(pd.DataFrame([stats]))

# Show failed verdicts (if any)
if not failed_df.empty:
    print("\n=== Sample of Failed Verdicts ===")
    print(failed_df.head())  # Print first few rows for review

# Show extracted results with GPT output
print("\n=== Sample of Successful Extractions (GPT Facts) ===")
print(final_df[["verdict", "extracted_gpt_facts"]].head())  # Print first few rows

print("\n✅ Process complete. Results saved as 'processed_verdicts_with_gpt.csv' and 'failed_verdicts.csv'")


### extract indictment facts with API gpt-APPEALS

In [49]:
import pandas as pd
import os
import re
from openai import OpenAI

os.environ["OPENAI_API_KEY"] = "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"  # Replace with actual key

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
models = client.models.list()

# Define start and end patterns based on the 'part' column (for partial matches)
START_PARTS = [
    "עובדותם", "כללי", "כתב האישום", "האישום", "אישום", "רקע", "גזר", "דין", "פסק","מבוא","הרשעת" ,"בעניינו","עבירות","הורשע","עובדות"
]

END_PARTS = ["אני מסכים"
    "טענות", "עמדת", "תסקיר", "שירות", "מבחן", "דיון", "התסקיר",
    "טיעוני", "הצדדים", "צדדים", "והכרעה", "ראיות","הכרעה"
]

def extract_indictment_facts(df):
    """
    Extracts the 'Indictment Facts' section based on the 'part' column with partial matches.
    Ensures:
    - If start and end are the same, it extends the search.
    - If no start is found, it attempts a secondary search in the text.
    - The text **does not** include the content of the end part, only up to it.
    """
    if df.empty or "part" not in df.columns or "text" not in df.columns:
        return "❌ No indictment facts found", None, None

    # Search for start part in 'part' column
    start_row = df[df["part"].str.contains('|'.join(START_PARTS), case=False, na=False, regex=True)]
    
    if start_row.empty:
        # Secondary search: check if the 'text' column contains possible indicators
        text_match = df[df["text"].str.contains('|'.join(START_PARTS), case=False, na=False, regex=True)]
        if text_match.empty:
            return "❌ No indictment facts found (start section missing)", None, None
        else:
            start_idx = text_match.index.min()
            start_part_name = "🔍 Found in text column"
    else:
        start_idx = start_row.index.min()
        start_part_name = df.loc[start_idx, "part"]

    # Search for the first row containing an end pattern **after** the start index
    end_row = df[df.index > start_idx][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]

    # Handle case where start and end are identical (wrong extraction range)
    if not end_row.empty and end_row.index.min() == start_idx:
        print(f"⚠️ Warning: Start and End are the same for verdict '{df['verdict'].iloc[0]}'. Extending search.")
        end_row = df[df.index > start_idx + 1][df["part"].str.contains('|'.join(END_PARTS), case=False, na=False, regex=True)]
    
    end_idx = end_row.index.min() if not end_row.empty else len(df)
    end_part_name = df.loc[end_idx, "part"] if not end_row.empty else "❌ No end found (used full text)"

    # Extract text **only until** the end part, excluding it
    extracted_text = "\n".join(df.loc[start_idx:end_idx-1, "text"].dropna().astype(str))  # Exclude the last part

    return extracted_text.strip() if extracted_text else "❌ No indictment facts found", start_part_name, end_part_name


def extract_facts_with_gpt(text):
    """
    Sends extracted text to GPT API and extracts the original indictment details from the case being appealed.
    """
    if text == "❌ No indictment facts found":
        return "❌ No facts extracted"

    prompt = f"""
    **Task:** Extract only the original indictment details of the case being appealed. Ignore all references to the appeal decision, legal arguments, and judicial reasoning. The extracted text should contain only the original facts that led to the indictment, exactly as they appear in the text.

    **Guidelines:**
    - Extract **only the factual allegations** from the original indictment.
    - **Do not include** details about the appeal, court rulings, or sentencing decisions.
    - Maintain the **exact wording** of the indictment without summarizing or omitting details.
    - If the indictment contains multiple allegations, present them in a coherent paragraph.

    **Example:**
    
    **Input:**
    ע""פ 761∕07 - ערעור על גזר דינו של בית המשפט המחוזי. 
    באחד מימיו של חודש יוני 2006, בשעות הערב, נהג הנאשם ברכב, וכאשר נעצר על ידי שוטרים לבדיקה, הוא נמצא מחזיק באקדח, מחסנית ותחמושת כשאלה עטופים בגרב ומוסתרים בתחתוניו.
    כן נטען, כי הנאשם הציג בפני השוטרים תעודת זהות של אחר מתוך כוונה להונותם.
    הנאשם הודה בעובדות האמורות, ובעקבות כך הורשע בעבירות של החזקת נשק שלא כדין והפרעה לשוטר במילוי תפקידו, עבירות לפי סעיפים 144 רישא ו-275 לחוק העונשין.

    **Expected Output:**
    באחד מימיו של חודש יוני 2006, בשעות הערב, נהג הנאשם ברכב, וכאשר נעצר על ידי שוטרים לבדיקה, נמצא מחזיק באקדח, מחסנית ותחמושת עטופים בגרב ומוסתרים בתחתוניו. בנוסף, הציג לשוטרים תעודת זהות של אחר בכוונה להונותם. על סמך עובדות אלה, הואשם בעבירות של החזקת נשק שלא כדין והפרעה לשוטר במילוי תפקידו לפי סעיפים 144 רישא ו-275 לחוק העונשין.

    **Input Text:**
    {text}

    **Extracted Indictment Details:**
    """

    response = client.chat.completions.create(
        model="gpt-4o", 
        messages=[
            {"role": "system", "content": "You are an AI trained to extract factual allegations from legal texts, ensuring no interpretation or rewording."},
            {"role": "user", "content": prompt}
        ]
    )

    extracted_facts = response.choices[0].message.content.strip()
    # print("Input Text:", text)
    print("Extracted Facts:", extracted_facts)
    return extracted_facts


# Tracking statistics
total_files = 0
successful_extractions = 0
failed_extractions = 0
failed_verdicts = []
extracted_results = []


for year in [2018,2019,2020]:
    csv_directory = f"/home/liorkob/thesis/lcp/data/docx_citations_csv_{year}"  

    # Iterate through all CSV files in the directory
    for filename in os.listdir(csv_directory):
        if filename.endswith(".csv"):
            total_files += 1
            file_path = os.path.join(csv_directory, filename)
            
            # Load CSV file
            df = pd.read_csv(file_path)

            # Ensure necessary columns exist
            if 'verdict' not in df.columns or 'text' not in df.columns or 'part' not in df.columns:
                print(f"Skipping {filename}, missing required columns.")
                continue

            # Extract indictment facts based on 'part'
            extracted_facts, start_part, end_part = extract_indictment_facts(df)

            # Extract facts using GPT
            extracted_gpt_facts = extract_facts_with_gpt(extracted_facts)

            # Track statistics
            if extracted_facts == "❌ No indictment facts found":
                failed_extractions += 1
                failed_verdicts.append({
                    "verdict": df["verdict"].iloc[0],
                    "all_parts": "; ".join(df["part"].dropna().astype(str))  # Store all parts for debugging
                })
                print(f"\n❌ **Failed Extraction for Verdict: {df['verdict'].iloc[0]}**")
                print(f"📌 Available Parts: {failed_verdicts[-1]['all_parts']}.unique()\n")
            else:
                successful_extractions += 1

            # Store results
            extracted_results.append({
                "verdict": df["verdict"].iloc[0],
                "extracted_facts": extracted_facts,
                "extracted_gpt_facts": extracted_gpt_facts,
                "start_part": start_part if start_part else "❌ Not Found",
                "end_part": end_part if end_part else "❌ Not Found"
            })

# Convert results to DataFrame
final_df = pd.DataFrame(extracted_results)
failed_df = pd.DataFrame(failed_verdicts) if failed_verdicts else pd.DataFrame(columns=["verdict", "all_parts"])

# Save results
final_df.to_csv("processed_appeals_with_gpt.csv", index=False, encoding="utf-8-sig")
failed_df.to_csv("failed_verdicts.csv", index=False, encoding="utf-8-sig")

# Display statistics
stats = {
    "Total CSV Files Processed": total_files,
    "Successful Extractions": successful_extractions,
    "Failed Extractions": failed_extractions
}

# Print statistics
print("\n=== Statistics ===")
print(pd.DataFrame([stats]))

# Show failed verdicts (if any)
if not failed_df.empty:
    print("\n=== Sample of Failed Verdicts ===")
    print(failed_df.head())  # Print first few rows for review

# Show extracted results with GPT output
print("\n=== Sample of Successful Extractions (GPT Facts) ===")
print(final_df[["verdict", "extracted_gpt_facts"]].head())  # Print first few rows

print("\n✅ Process complete. Results saved as 'processed_appeals_with_gpt.csv' and 'failed_verdicts.csv'")
