In [3]:
from dotenv import load_dotenv
import os

load_dotenv(override=True)

# Load and validate Azure AI Services configs
AZURE_AI_SERVICES_ENDPOINT = os.getenv("AZURE_AI_SERVICES_ENDPOINT")
AZURE_AI_SERVICES_API_VERSION = os.getenv("AZURE_AI_SERVICES_API_VERSION")
AZURE_AI_SERVICES_API_KEY = os.getenv("AZURE_AI_SERVICES_API_KEY", None)
AZURE_AI_DOCUMENT_ENDPOINT = os.getenv("AZURE_AI_DOCUMENT_ENDPOINT") or os.getenv("AZURE_AI_SERVICES_ENDPOINT")
AZURE_AI_DOCUMENT_API_KEY = os.getenv("AZURE_AI_DOCUMENT_API_KEY", None)
assert AZURE_AI_SERVICES_ENDPOINT, "AZURE_AI_SERVICES_ENDPOINT must be set"
assert AZURE_AI_SERVICES_API_VERSION, "AZURE_AI_SERVICES_API_VERSION must be set"
assert AZURE_AI_DOCUMENT_ENDPOINT, "AZURE_AI_DOCUMENT_ENDPOINT must be set"

# Load and validate Azure OpenAI configs
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY", None)
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")
AZURE_OPENAI_CHAT_API_VERSION = os.getenv("AZURE_OPENAI_CHAT_API_VERSION")
AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME")
AZURE_OPENAI_EMBEDDINGS_API_VERSION = os.getenv("AZURE_OPENAI_EMBEDDINGS_API_VERSION")
assert AZURE_OPENAI_ENDPOINT, "AZURE_OPENAI_ENDPOINT must be set"
assert (AZURE_OPENAI_CHAT_DEPLOYMENT_NAME), "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME must be set"
assert (AZURE_OPENAI_CHAT_API_VERSION), "AZURE_OPENAI_CHAT_API_VERSION must be set"
assert (AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME), "AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME must be set"
assert (AZURE_OPENAI_EMBEDDINGS_API_VERSION), "AZURE_OPENAI_EMBEDDINGS_API_VERSION must be set"



In [None]:
import json

client = AzureChatOpenAI(
    model=AZURE_OPENAI_CHAT_DEPLOYMENT_NAME,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    openai_api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_CHAT_API_VERSION,
    temperature=0.7
)

def load_clauses(jsonl_file):
    clauses = []
    with open(jsonl_file, "r") as f:
        for line in f:
            data = json.loads(line)
            if "clauseName" in data and "clauseText" in data:
                clauses.append((data["clauseName"], data["clauseText"]))
            else:
                print(f"Skipping line due to missing keys: {line}")
    return clauses

def check_clauses(contract_chunks, required_clauses):
    clause_similarities = {}
    
    for clause_name, clause_text in required_clauses:
        for chunk in contract_chunks:
            prompt = f"You are a legal assistant who is tasked with ensuring legal documents contain the proper clauses. You will be given two clauses to compare. Your job is to determine if the two clauses express the same intent. The first is the golden clause, the second comes from the document we are reviewing. Golden Clause: {clause_text} and Document Clause: {chunk}. Respond with true if the two clauses have the similar intent. Otherwise, respond with false."
    
            response = client.invoke(prompt)
            if response.content.strip().lower() == "true":
                clause_similarities[clause_name] = True
                break
        else:
            clause_similarities[clause_name] = False
    return clause_similarities

jsonl_file = "./master_contract_clause.jsonl"
master_clauses = load_clauses(jsonl_file)

sample_chunks = [
    "Borrower confirms that the loan will be repaid before maturity",
    "The interest starts accruing at signing date.",
    "All notifications, requests, and correspondences must be sent following the specified procedures.",
    "All notices, demands, and communications shall be delivered in accordance with the stated procedures."
]

results = check_clauses(sample_chunks, master_clauses)

# Print results in Markdown table format
print("| Clause Name | Intent |")
print("|:------------|:-----------|")
for clause_name, similarity in results.items():
    print(f"| {clause_name} | {similarity} |")

In [None]:
import json
import warnings
import numpy as np
import tensorflow as tf
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import jaccard_score
import pandas as pd
from IPython.display import display, Markdown

warnings.filterwarnings("ignore", category=FutureWarning)

def embed_text(items, model):
    """
    Given a list of strings, returns a NumPy array of sentence embeddings using the SentenceTransformer model.
    """
    if not items:
        return np.array([])
    return model.encode(items, show_progress_bar=False)

def compute_tfidf_similarity(clause_texts, chunk_texts):
    """
    Computes TF-IDF similarity between each clause and each chunk.
    Returns a 2D array of shape (num_clauses, num_chunks).
    
    For simplicity, we do a dot product on the TF-IDF vectors,
    then normalize the result into a [0..1] range.
    """
    if not clause_texts or not chunk_texts:
        return np.array([])

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(clause_texts + chunk_texts)
    
    # Partition into clause part and chunk part
    clause_tfidf = tfidf_matrix[:len(clause_texts)]
    chunk_tfidf = tfidf_matrix[len(clause_texts):]
    
    # Dot product (sparse matrix multiplication)
    similarity_matrix = clause_tfidf * chunk_tfidf.T
    similarity_matrix = similarity_matrix.toarray()
    
    # Normalize [global min..max] -> [0..1]
    if similarity_matrix.size > 0 and similarity_matrix.max() != similarity_matrix.min():
        similarity_matrix = (
            (similarity_matrix - similarity_matrix.min())
            / (similarity_matrix.max() - similarity_matrix.min())
        )
    return similarity_matrix

def compute_jaccard_similarity(clause_texts, chunk_texts):
    """
    Uses CountVectorizer(binary=True) to convert texts to binary vectors
    and then computes Jaccard similarity for each (clause, chunk) pair.
    """
    if not clause_texts or not chunk_texts:
        return np.array([])

    all_texts = clause_texts + chunk_texts
    vectorizer = CountVectorizer(binary=True)
    binary_matrix = vectorizer.fit_transform(all_texts).toarray()
    
    clause_binary = binary_matrix[:len(clause_texts)]
    chunk_binary = binary_matrix[len(clause_texts):]
    
    similarities = np.zeros((len(clause_binary), len(chunk_binary)))
    for i, clause_vec in enumerate(clause_binary):
        for j, chunk_vec in enumerate(chunk_binary):
            sim = jaccard_score(clause_vec, chunk_vec, average='binary')
            similarities[i, j] = sim
    return similarities

def check_clauses(contract_chunks, required_clauses, model, alpha=0.5, beta=0.3, gamma=0.2):
    """
    Checks if the contract chunks contain the required clauses.
    Returns a dictionary with clause names as keys and a dict
    containing score, best chunk index, and chunk text as values.

    alpha, beta, gamma are the weights for embedding similarity, TF-IDF, and Jaccard, respectively.
    """
    # Extract actual text from dictionaries
    clause_texts = [c["clauseText"].strip().lower() for c in required_clauses if "clauseText" in c and c["clauseText"].strip()]
    chunk_texts  = [ch["text"].strip().lower() for ch in contract_chunks if "text" in ch and ch["text"].strip()]
    
    print(f"\nNumber of clauses: {len(clause_texts)}")
    print(f"Number of chunks: {len(chunk_texts)}")
    
    if not clause_texts:
        print("No valid clause texts found.")
        return {}
    if not chunk_texts:
        print("No valid chunk texts found.")
        return {}
    
    # Embed chunk texts
    chunk_embeddings = embed_text(chunk_texts, model)
    
    # Embed clause texts
    clause_embeddings = embed_text(clause_texts, model)
    
    print(f"Embedded {len(clause_embeddings)} clause embeddings and {len(chunk_embeddings)} chunk embeddings.")
    
    # Compute TF-IDF and Jaccard similarities
    tfidf_similarities = compute_tfidf_similarity(clause_texts, chunk_texts)
    jaccard_similarities = compute_jaccard_similarity(clause_texts, chunk_texts)
    
    print("TF-IDF Similarities:\n", tfidf_similarities)
    print("Jaccard Similarities:\n", jaccard_similarities)
    
    if tfidf_similarities.size == 0 or jaccard_similarities.size == 0:
        print("Similarity matrices are empty.")
        return {}
    
    clause_similarities = {}
    
    for i, clause_emb in enumerate(clause_embeddings):
        best_score = -1
        best_chunk_index = None
        
        for j, chunk_emb in enumerate(chunk_embeddings):
            # Cosine similarity (inverted sign to get actual similarity)
            sim = tf.keras.losses.cosine_similarity(clause_emb, chunk_emb).numpy()
            sim = -sim  # range goes from -1..1; we invert it to get 1 as high similarity
            
            # Normalize cosine similarity from -1..1 to 0..1
            norm_sim = (sim + 1) / 2
            
            # Merge similarities
            tfidf_part = tfidf_similarities[i, j] if tfidf_similarities.size else 0
            jaccard_part = jaccard_similarities[i, j] if jaccard_similarities.size else 0
            
            combined_score = (
                alpha * norm_sim +
                beta  * tfidf_part +
                gamma * jaccard_part
            )
            
            print(f"Clause {i}: Chunk {j} - Norm Sim: {norm_sim:.2f}, TF-IDF: {tfidf_part:.2f}, Jaccard: {jaccard_part:.2f}, Combined: {combined_score:.2f}")
            
            if combined_score > best_score:
                best_score = combined_score
                best_chunk_index = j
        
        clause_name = required_clauses[i]["clauseName"].strip() if "clauseName" in required_clauses[i] else f"Clause_{i}"
        chunk_text = chunk_texts[best_chunk_index] if (best_chunk_index is not None and best_chunk_index < len(chunk_texts)) else None
        
        clause_similarities[clause_name] = {
            "score": best_score,
            "chunk_index": best_chunk_index,
            "chunk_text": chunk_text
        }
    
    return clause_similarities

def load_items(jsonl_path):
    """
    Loads items from a JSONL file.
    Each line should be a valid JSON object.
    """
    items = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for idx, line in enumerate(f, start=1):
            try:
                data = json.loads(line.strip())
                items.append(data)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {idx}: {e}")
    print(f"Loaded {len(items)} items from {jsonl_path}")
    if items:
        print(f"First item: {items[0]}")
    return items

def display_results_as_dataframe(results):
    """
    Displays the results as a formatted Pandas DataFrame.
    """
    if not results:
        display(Markdown("**No results to display.**"))
        return
    
    data = []
    for clause_name, info in results.items():
        data.append({
            "Clause Name": clause_name,
            "Similarity": f"{info['score']:.2f}",
            "Best Chunk Index": info["chunk_index"] if info["chunk_index"] is not None else "None",
            "Chunk Text": info["chunk_text"] if info["chunk_text"] else "None"
        })
    
    df = pd.DataFrame(data)
    df = df.sort_values(by="Similarity", ascending=False)
    
    # Display the dataframe
    display(df)


model_name = "microsoft/deberta-base"
model = SentenceTransformer(model_name)

master_clauses = load_items("master_contract_clause.jsonl")
sample_chunks  = load_items("chunks.jsonl")

results = check_clauses(
    contract_chunks=sample_chunks,
    required_clauses=master_clauses,
    model=model,
    alpha=0.5,  # weight for embedding similarity
    beta=0.3,   # weight for TF-IDF similarity
    gamma=0.2   # weight for Jaccard similarity
)

display_results_as_dataframe(results)

No sentence-transformers model found with name microsoft/deberta-base. Creating a new one with mean pooling.


pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]

Loaded 10 items from master_contract_clause.jsonl
First item: {'clauseNumber': 1, 'clauseName': 'Repayment of Loan Principal', 'clauseText': 'Borrower shall repay the Loan principal in full on or before the Maturity Date.'}
Loaded 131 items from chunks.jsonl
First item: {'text': '<!-- PageHeader="2023 Supportive Housing NOFA Amendment OC Housing and Community Development March 2024" -->\n<!-- PageHeader="ATTACHMENT N 1" -->'}

Number of clauses: 10
Number of chunks: 121
Embedded 10 clause embeddings and 121 chunk embeddings.
TF-IDF Similarities:
 [[0.         0.06862465 0.         ... 0.         0.26856724 0.08155316]
 [0.         0.         0.         ... 0.         0.3656505  0.        ]
 [0.         0.07431727 0.         ... 0.         0.23087339 0.08831823]
 ...
 [0.04697443 0.10877042 0.         ... 0.         0.27310874 0.08335399]
 [0.13586559 0.1780513  0.         ... 0.         0.10125878 0.07881355]
 [0.04200786 0.03454605 0.         ... 0.         0.1441535  0.        ]]
Jac

Unnamed: 0,Clause Name,Similarity,Best Chunk Index,Chunk Text
7,Governing Law,0.8,101,# attachment n 1 \nsuch addresses may be chan...
2,Prepayment,0.77,65,# attachment n 1 \ncounty shall prohibit the ...
0,Repayment of Loan Principal,0.74,62,# attachment n 1 \nand the borrower's sir pro...
1,Accrual of Interest,0.72,62,# attachment n 1 \nand the borrower's sir pro...
8,Amendment and Modification,0.72,104,# attachment n 1 \n9.12 approvals. where an a...
4,Compliance with Representations and Warranties,0.7,29,# article iv disbursement of loan \n4.1\ncond...
5,Insurance Coverage,0.7,37,# attachment n 1 \ninsurance coverage) is $10...
3,Provision of Collateral,0.67,4,# recitals \n1\.\nborrower intends to constru...
9,Notices and Communications,0.67,39,# representations and warranties of borrower ...
6,Default and Remedies,0.63,95,# attachment n 1 \nobligation to make or cont...
