This notebook provides several techniques for comparing the similarity between documents:

- LLM comparison: a large language model rates the semantic similarity between chunks of text.
- TF-IDF (Term Frequency-Inverse Document Frequency) weights terms based on how uniquely important they are in your text, helping to prioritize meaningful words.
- Cosine Similarity measures the angle between vectors (here, sentence embeddings, blended with the TF-IDF vectors), capturing semantic closeness between clauses even if the wording differs slightly.

In [None]:
from dotenv import load_dotenv
import os
import json
import warnings
import numpy as np
import tensorflow as tf
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from IPython.display import display, Markdown

# Read settings from a .env file; override=True is passed so that file values
# take precedence over variables already present in the environment.
load_dotenv(override=True)
warnings.filterwarnings("ignore", category=FutureWarning)

# --- Azure AI Services settings -------------------------------------------
AZURE_AI_SERVICES_ENDPOINT = os.environ.get("AZURE_AI_SERVICES_ENDPOINT")
AZURE_AI_SERVICES_API_VERSION = os.environ.get("AZURE_AI_SERVICES_API_VERSION")
AZURE_AI_SERVICES_API_KEY = os.environ.get("AZURE_AI_SERVICES_API_KEY")
# The document endpoint falls back to the general services endpoint when it
# is unset or empty.
AZURE_AI_DOCUMENT_ENDPOINT = (
    os.environ.get("AZURE_AI_DOCUMENT_ENDPOINT") or AZURE_AI_SERVICES_ENDPOINT
)
AZURE_AI_DOCUMENT_API_KEY = os.environ.get("AZURE_AI_DOCUMENT_API_KEY")

# Fail early when a mandatory setting is missing (API keys are optional here).
assert AZURE_AI_SERVICES_ENDPOINT, "AZURE_AI_SERVICES_ENDPOINT must be set"
assert AZURE_AI_SERVICES_API_VERSION, "AZURE_AI_SERVICES_API_VERSION must be set"
assert AZURE_AI_DOCUMENT_ENDPOINT, "AZURE_AI_DOCUMENT_ENDPOINT must be set"

# --- Azure OpenAI settings -------------------------------------------------
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT_NAME")
AZURE_OPENAI_CHAT_API_VERSION = os.environ.get("AZURE_OPENAI_CHAT_API_VERSION")
AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME = os.environ.get(
    "AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME"
)
AZURE_OPENAI_EMBEDDINGS_API_VERSION = os.environ.get(
    "AZURE_OPENAI_EMBEDDINGS_API_VERSION"
)

assert AZURE_OPENAI_ENDPOINT, "AZURE_OPENAI_ENDPOINT must be set"
assert AZURE_OPENAI_CHAT_DEPLOYMENT_NAME, "AZURE_OPENAI_CHAT_DEPLOYMENT_NAME must be set"
assert AZURE_OPENAI_CHAT_API_VERSION, "AZURE_OPENAI_CHAT_API_VERSION must be set"
assert AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME, "AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME must be set"
assert AZURE_OPENAI_EMBEDDINGS_API_VERSION, "AZURE_OPENAI_EMBEDDINGS_API_VERSION must be set"



: 

In [None]:
def load_items(jsonl_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        jsonl_path: Path to a UTF-8 text file with one JSON document per line.

    Returns:
        A list holding one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(jsonl_path, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines (e.g. a trailing empty line) are not
        # JSON documents; skip them instead of letting json.loads raise.
        return [json.loads(line) for line in f if line.strip()]

def display_results_as_dataframe(results):
    """Render clause-matching results as a table, highest similarity first.

    Args:
        results: Mapping of clause name to a dict with the keys ``"score"``,
            ``"chunk_index"`` and ``"chunk_text"``.

    Returns:
        None. The table (or a "no results" notice when ``results`` is empty)
        is shown through ``IPython.display.display``.
    """
    if not results:
        display(Markdown("**No results to display.**"))
        return

    rows = [
        {
            "Clause Name": clause_name,
            "Similarity": f"{info['score']:.2f}",
            "Best Chunk Index": info["chunk_index"],
            "Chunk Text": info["chunk_text"],
        }
        for clause_name, info in results.items()
    ]
    table = pd.DataFrame(rows)

    # Rank on the raw numeric scores: the "Similarity" column holds formatted
    # strings, and sorting those compares text, which misorders negative
    # values and values of 10 or more.
    numeric_scores = pd.Series(
        [float(info["score"]) for info in results.values()],
        index=table.index,
    )
    ranking = numeric_scores.sort_values(ascending=False, kind="mergesort").index
    display(table.loc[ranking])

In [None]:
import openai
import json
import pandas as pd
from IPython.display import display, Markdown

# Shared Azure OpenAI client used by the chat-completion calls in this notebook.
# AZURE_OPENAI_API_KEY is read with a None default and is not validated above,
# so it may be None here.
# NOTE(review): confirm the intended authentication path when no key is set.
client = openai.AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version=AZURE_OPENAI_CHAT_API_VERSION,
    api_key=AZURE_OPENAI_API_KEY
)

def load_items(jsonl_file):
    """Read a JSON Lines file and return its records as a list.

    Args:
        jsonl_file: Path to a UTF-8 text file with one JSON document per line.

    Returns:
        A list holding one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(jsonl_file, "r", encoding="utf-8") as f:
        # Blank or whitespace-only lines are not JSON documents; skip them
        # instead of letting json.loads raise.
        return [json.loads(line) for line in f if line.strip()]

def check_clauses(contract_chunks, required_clauses, deployment_name=None):
    """Find, for each required clause, the contract chunk an LLM rates most similar.

    Every (clause, chunk) pair is sent to the chat model, which is asked to
    answer with a single number between 0 and 1.

    Args:
        contract_chunks: Iterable of dicts; the chunk text is read from ``"text"``.
        required_clauses: Iterable of dicts; ``"clauseName"`` and
            ``"clauseText"`` are read from each.
        deployment_name: Azure OpenAI chat deployment to call. Defaults to
            ``AZURE_OPENAI_CHAT_DEPLOYMENT_NAME``.

    Returns:
        Dict mapping clause name to ``{"score", "chunk_index", "chunk_text"}``.
        When no chunk scores above 0 the entry keeps score 0, index -1 and an
        empty chunk text.
    """
    # Azure OpenAI routes requests by deployment name, so use the configured
    # deployment rather than a hard-coded model id.
    deployment = deployment_name or AZURE_OPENAI_CHAT_DEPLOYMENT_NAME
    clause_similarities = {}

    for clause in required_clauses:
        clause_name = clause.get("clauseName", "Unknown Clause")
        clause_text = clause.get("clauseText", "")
        best_score = 0
        best_chunk_index = -1
        best_chunk_text = ""

        for idx, chunk in enumerate(contract_chunks):
            chunk_text = chunk.get("text", "")
            prompt = (
                "You are a legal assistant tasked with ensuring legal documents contain the proper clauses. "
                "You will be given two clauses to compare. Your job is to determine if the two clauses express the same intent. "
                f"Golden Clause: '{clause_text}'\nDocument Clause: '{chunk_text}'\n"
                "Respond with a value from 0 to 1, where 0 means the clauses are not similar and 1 means they are similar. "
                "If you are not sure, respond with 0.\n"
                "Please respond with a single number, without any additional text."
            )

            response = client.chat.completions.create(
                model=deployment,
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
                max_tokens=10
            )

            # The message content can be None; treat that as an unusable
            # reply instead of failing on .strip().
            reply = (response.choices[0].message.content or "").strip()

            try:
                score = float(reply)
            except ValueError:
                # Non-numeric reply: skip this chunk.
                continue

            # The prompt asks for a value in [0, 1]; anything else (including
            # NaN, which fails this comparison) is not a valid score.
            if not 0.0 <= score <= 1.0:
                continue

            if score > best_score:
                best_score = score
                best_chunk_index = idx
                best_chunk_text = chunk_text

        clause_similarities[clause_name] = {
            "score": best_score,
            "chunk_index": best_chunk_index,
            "chunk_text": best_chunk_text
        }

    return clause_similarities

# Load the reference clauses and the contract chunks, score every pair with
# the LLM-based check_clauses defined in this cell, and show the best match
# found for each clause.
master_clauses = load_items("master_contract_clause.jsonl")
sample_chunks  = load_items("chunks.jsonl")
# llm_results is reused by the report cell at the end of the notebook.
llm_results = check_clauses(sample_chunks, master_clauses)

display_results_as_dataframe(llm_results)


In [None]:


def embed_text(items, model):
    """Encode ``items`` with ``model``; an empty input yields an empty array.

    Args:
        items: Texts to encode. Any falsy value (empty list, ``None``) skips
            the model call.
        model: Object exposing ``encode(items, show_progress_bar=...)``.

    Returns:
        Whatever ``model.encode`` returns, or ``np.array([])`` for falsy input.
    """
    if items:
        return model.encode(items, show_progress_bar=False)
    return np.array([])

def compute_tfidf_similarity(clause_texts, chunk_texts):
    """Score every clause against every chunk using TF-IDF vectors.

    A single vectorizer is fitted on the clauses followed by the chunks. The
    clause rows are multiplied with the transposed chunk rows, and the
    resulting matrix is min-max rescaled unless all of its entries are equal.

    Args:
        clause_texts: List of clause strings.
        chunk_texts: List of chunk strings.

    Returns:
        A dense array with one row per clause and one column per chunk, or an
        empty array when either input is empty.
    """
    if not (clause_texts and chunk_texts):
        return np.array([])

    n_clauses = len(clause_texts)
    weights = TfidfVectorizer().fit_transform(clause_texts + chunk_texts)

    # Rows [0, n_clauses) belong to the clauses, the remaining rows to the chunks.
    scores = (weights[:n_clauses] * weights[n_clauses:].T).toarray()

    if scores.size > 0:
        lowest = scores.min()
        highest = scores.max()
        # A constant matrix is left untouched to avoid dividing by zero.
        if highest != lowest:
            scores = (scores - lowest) / (highest - lowest)
    return scores

def check_clauses(contract_chunks, required_clauses, model, alpha=0.7, beta=0.3):
    """Match each required clause to its most similar contract chunk.

    The score for a (clause, chunk) pair is
    ``alpha * embedding_similarity + beta * tfidf_similarity``, where the
    embedding similarity is the cosine similarity rescaled from [-1, 1] to
    [0, 1].

    Args:
        contract_chunks: Sequence of dicts; the chunk text is read from ``"text"``.
        required_clauses: Sequence of dicts; ``"clauseText"`` and
            ``"clauseName"`` are read from each.
        model: Embedding model handed to ``embed_text``.
        alpha: Weight of the embedding similarity.
        beta: Weight of the TF-IDF similarity.

    Returns:
        Dict mapping clause name to ``{"score", "chunk_index", "chunk_text"}``.
        ``chunk_index`` is the position in ``contract_chunks`` and
        ``chunk_text`` is the stripped, lower-cased chunk text. Entries whose
        text is missing, empty or not a string are ignored; an empty dict is
        returned when no usable clause or chunk remains.
    """

    def _usable(items, key):
        # Keep (original position, normalized text) pairs so results can be
        # mapped back to the caller's lists after unusable entries are dropped.
        kept = []
        for position, item in enumerate(items):
            value = item.get(key)
            text = value.strip().lower() if isinstance(value, str) else ""
            if text:
                kept.append((position, text))
        return kept

    def _unit_rows(matrix):
        # Guard the division so an all-zero embedding yields similarity 0
        # instead of NaN.
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return matrix / np.maximum(norms, 1e-12)

    clauses = _usable(required_clauses, "clauseText")
    chunks = _usable(contract_chunks, "text")
    if not clauses or not chunks:
        return {}

    clause_texts = [text for _, text in clauses]
    chunk_texts = [text for _, text in chunks]

    chunk_embeddings = np.asarray(embed_text(chunk_texts, model), dtype=float)
    clause_embeddings = np.asarray(embed_text(clause_texts, model), dtype=float)

    tfidf_similarities = compute_tfidf_similarity(clause_texts, chunk_texts)

    # All pairwise cosine similarities at once, then rescaled to [0, 1].
    cosine = _unit_rows(clause_embeddings) @ _unit_rows(chunk_embeddings).T
    combined = alpha * ((cosine + 1) / 2)
    if tfidf_similarities.size:
        combined = combined + beta * tfidf_similarities

    # argmax returns the first maximum, so ties go to the earliest chunk.
    best_columns = combined.argmax(axis=1)

    clause_similarities = {}
    for row, (clause_position, _) in enumerate(clauses):
        best = int(best_columns[row])
        chunk_position, chunk_text = chunks[best]
        # Look the name up by the clause's original position; the filtered row
        # number would point at the wrong clause once any clause was dropped.
        clause_name = required_clauses[clause_position].get(
            "clauseName", f"Clause_{clause_position}"
        ).strip()

        clause_similarities[clause_name] = {
            "score": float(combined[row, best]),
            "chunk_index": chunk_position,
            "chunk_text": chunk_text
        }

    return clause_similarities



# Embedding model used for the cosine-similarity part of the score.
# NOTE(review): "microsoft/deberta-base" looks like a general-purpose
# checkpoint rather than one trained for sentence embeddings — confirm it is
# the intended choice.
model_name = "microsoft/deberta-base"
model = SentenceTransformer(model_name)

# Reload the reference clauses and contract chunks for this cell.
master_clauses = load_items("master_contract_clause.jsonl")
sample_chunks  = load_items("chunks.jsonl")

# Weighted blend: 70% embedding similarity, 30% TF-IDF similarity.
# sim_results is reused by the report cell at the end of the notebook.
sim_results = check_clauses(
    contract_chunks=sample_chunks,
    required_clauses=master_clauses,
    model=model,
    alpha=0.7,
    beta=0.3
)

display_results_as_dataframe(sim_results)


In [None]:
def generate_similarity_report(llm_results, sim_results, deployment_name=None):
    """Display a per-clause report combining both similarity scores.

    For every clause in ``llm_results`` the chat model is asked for a short
    summary based on the two scores; the outcome is displayed as a DataFrame.

    Args:
        llm_results: Mapping of clause name to a dict containing ``"score"``
            (the LLM-judged similarity).
        sim_results: Mapping of clause name to a dict containing ``"score"``
            (the combined TF-IDF and cosine similarity). Clauses missing here
            are reported with a score of 0.
        deployment_name: Azure OpenAI chat deployment to call. Defaults to
            ``AZURE_OPENAI_CHAT_DEPLOYMENT_NAME``.

    Returns:
        None. The report is shown through ``IPython.display.display``.
    """
    # Azure OpenAI routes requests by deployment name, so use the configured
    # deployment rather than a hard-coded model id.
    deployment = deployment_name or AZURE_OPENAI_CHAT_DEPLOYMENT_NAME
    report = []

    for clause_name, llm_info in llm_results.items():
        llm_score = llm_info['score']
        semantic_score = sim_results.get(clause_name, {}).get('score', 0)

        prompt = (
            "You are a legal assistant tasked with generating a concise similarity report for legal clauses. "
            f"For the clause '{clause_name}', the LLM semantic similarity score is {llm_score:.2f}, "
            f"and the combined TF-IDF and cosine similarity score is {semantic_score:.2f}. "
            "Provide a brief summary (1-2 sentences) explaining the similarity between the golden clause and the document clause, "
            "taking into account both scores. Clearly state if the clauses are sufficiently similar or if further review is recommended."
        )

        response = client.chat.completions.create(
            model=deployment,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=100
        )

        # The message content can be None; fall back to an empty summary
        # instead of failing on .strip().
        summary = (response.choices[0].message.content or "").strip()

        report.append({
            "Clause Name": clause_name,
            "LLM Similarity": f"{llm_score:.2f}",
            "TF-IDF & Cosine Similarity": f"{semantic_score:.2f}",
            "Summary": summary
        })

    df_report = pd.DataFrame(report)
    display(df_report)

# Build and display the combined report from the results of the two earlier cells.
generate_similarity_report(llm_results, sim_results)