In [30]:
# Semantic Document Change Analyzer

In [31]:
!pip install transformers




**Reading the documents into Python**

In [32]:
def _load_policy(path):
    """Return the full text of the file at `path`, decoded as latin-1."""
    with open(path, "r", encoding="latin-1") as handle:
        return handle.read()


# Both policy versions, held as plain strings
policy_v1 = _load_policy("/Policy_v1.txt")
policy_v2 = _load_policy("/Policy_v2.txt")

# Sanity check: character counts of the two documents
print("Policy V1 length:", len(policy_v1))
print("Policy V2 length:", len(policy_v2))

Policy V1 length: 761
Policy V2 length: 804


In [33]:
from transformers import AutoTokenizer, AutoModel
import torch

# Identifier of the pretrained sentence-embedding model used by this notebook.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Load the tokenizer and the model for that identifier.
# Both are read as globals by get_embedding().
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

print("Model loaded successfully")

Model loaded successfully


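**This function converts a piece of text into an embedding (a numeric vector) that can be compared with other embeddings. Text longer than the model's maximum input length is truncated.**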

In [34]:
def get_embedding(text):
    """Embed text with the globally loaded `tokenizer` and `model`.

    Args:
        text: A string (or a list of strings) to embed. Input beyond the
            tokenizer's maximum length is truncated.

    Returns:
        A tensor with one row per input text, obtained by averaging the
        token embeddings of each text. For a single string and the model
        loaded in this notebook the shape is (1, 384).
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True
    )

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state

    # Average only over real tokens. With a batch of texts the shorter ones
    # are padded, and a plain mean over dim=1 would mix the padding vectors
    # into the result. For a single string the mask is all ones, so this is
    # the same average as before.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
    return summed / token_counts


**Create embeddings for both documents**

In [35]:
# Create embeddings for both policy documents
embedding_v1 = get_embedding(policy_v1)
embedding_v2 = get_embedding(policy_v2)

# Quick check
print("Embedding V1 shape:", embedding_v1.shape)
print("Embedding V2 shape:", embedding_v2.shape)


Embedding V1 shape: torch.Size([1, 384])
Embedding V2 shape: torch.Size([1, 384])


**Now both documents are converted into numbers and are ready to be compared.**

**Compare the two documents**

In [36]:
from torch.nn.functional import cosine_similarity

# Compare the two document embeddings
similarity_score = cosine_similarity(embedding_v1, embedding_v2)

print("Similarity score:", similarity_score.item())


Similarity score: 0.9497689008712769


**Split documents into sections**

In [37]:
import re

def split_into_sections(text):
    """Split a document into its numbered sections.

    A section starts at any line beginning with a number, a dot and a
    whitespace character ("1. ", "2. ", ...). The numbering itself is removed.
    Text before the first numbered heading, if any, is returned as the first
    element.

    Args:
        text: The full document as a string.

    Returns:
        A list of stripped, non-empty section strings in document order.
    """
    # Anchor on start-of-line (MULTILINE) instead of a preceding newline, so
    # a heading on the very first line of the document is recognised too.
    # Previously such a document kept "1. " at the front of its first section.
    parts = re.split(r"^\d+\.\s", text, flags=re.MULTILINE)

    # Clean and remove empty parts
    return [part.strip() for part in parts if part.strip()]


**Test the splitter, so that we can compare the documents section by section instead of judging from the whole document.**

In [38]:
# Break each policy version into its numbered sections
sections_v1, sections_v2 = (
    split_into_sections(document) for document in (policy_v1, policy_v2)
)

# Report how many sections were found in each version
print("Sections in Policy V1:", len(sections_v1))
print("Sections in Policy V2:", len(sections_v2))

Sections in Policy V1: 6
Sections in Policy V2: 6


**Comparing sections and finding major changes**

In [39]:
def compare_sections(sections_v1, sections_v2, threshold=0.85):
    """Find sections that changed substantially between two document versions.

    Sections are paired by position (first with first, and so on). Each pair
    is embedded with get_embedding() and compared by cosine similarity.

    Args:
        sections_v1: List of section strings from the old version.
        sections_v2: List of section strings from the new version.
        threshold: Pairs scoring strictly below this value are reported.

    Returns:
        A list of dicts, ordered by section number, each with the keys
        "section_number" (1-based position), "similarity_score" (rounded to
        two decimals) and "summary" (first 120 characters of the section
        followed by "...").

        Sections that exist in only one version are always reported, with a
        similarity score of 0.0 and a summary taken from the version that
        contains them. Previously they were silently ignored.
    """
    major_changes = []
    shared_count = min(len(sections_v1), len(sections_v2))

    for i in range(shared_count):
        emb1 = get_embedding(sections_v1[i])
        emb2 = get_embedding(sections_v2[i])

        score = cosine_similarity(emb1, emb2).item()

        if score < threshold:
            major_changes.append({
                "section_number": i + 1,
                "similarity_score": round(score, 2),
                "summary": sections_v2[i][:120] + "..."
            })

    # Sections past the shared range were added (only in v2) or removed
    # (only in v1). Either way there is nothing to compare them with, so
    # report them as major changes instead of dropping them.
    unmatched_source = sections_v2 if len(sections_v2) > shared_count else sections_v1
    for i in range(shared_count, len(unmatched_source)):
        major_changes.append({
            "section_number": i + 1,
            "similarity_score": 0.0,
            "summary": unmatched_source[i][:120] + "..."
        })

    return major_changes

In [40]:
# Flag the sections whose similarity falls below the default threshold
major_changes = compare_sections(sections_v1, sections_v2)

# Print one flagged section per line
for flagged in major_changes:
    print(flagged)

{'section_number': 2, 'similarity_score': 0.81, 'summary': 'Working Hours\nEmployees are expected to work 8 hours per day, Monday to Friday.\nFlexible working hours are allowed with ...'}
{'section_number': 3, 'similarity_score': 0.68, 'summary': 'Remote Work\nEmployees may work remotely up to 2 days per week.\nRemote work requests must be submitted through the intern...'}
{'section_number': 6, 'similarity_score': 0.84, 'summary': 'Policy Updates\nPolicy changes will be communicated through email and the company dashboard.\nEmployees are responsible fo...'}


**Format the results for clean output**

In [41]:
def format_output(changes):
    """Build the compact report for a list of flagged sections.

    Args:
        changes: List of dicts that each provide "section_number" and
            "summary" (other keys are ignored).

    Returns:
        A dict with "total_major_changes" (number of entries) and "changes"
        (one {"section", "summary"} dict per entry, in the same order).
    """
    total = len(changes)

    report_entries = []
    for entry in changes:
        report_entries.append({
            "section": entry["section_number"],
            "summary": entry["summary"],
        })

    return {"total_major_changes": total, "changes": report_entries}

In [42]:
# Reduce the flagged sections to the compact report structure.
final_output = format_output(major_changes)
# Bare expression: the notebook displays the resulting dict.
final_output

{'total_major_changes': 3,
 'changes': [{'section': 2,
   'summary': 'Working Hours\nEmployees are expected to work 8 hours per day, Monday to Friday.\nFlexible working hours are allowed with ...'},
  {'section': 3,
   'summary': 'Remote Work\nEmployees may work remotely up to 2 days per week.\nRemote work requests must be submitted through the intern...'},
  {'section': 6,
   'summary': 'Policy Updates\nPolicy changes will be communicated through email and the company dashboard.\nEmployees are responsible fo...'}]}

In [43]:
import json

# Store the report as JSON (2-space indentation) in the working directory
with open("output.json", "w") as out_file:
    out_file.write(json.dumps(final_output, indent=2))

print("Output saved to output.json")


Output saved to output.json


In [44]:
!ls


output.json  sample_data


**BentoML**

In [45]:
!pip install bentoml



In [46]:
import bentoml
print(bentoml.__version__)


1.4.31


In [47]:
import bentoml
from typing import Dict, Any

@bentoml.service(name="document_change_analyzer")
class DocumentChangeService:
    """Service exposing the section-by-section policy comparison."""

    @bentoml.api
    def analyze(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
        """Compare two policy texts and return the formatted list of major changes.

        `input_data` must provide the keys "policy_v1" and "policy_v2".
        """
        old_text, new_text = input_data["policy_v1"], input_data["policy_v2"]

        # Split each version, compare the sections pairwise, then format
        old_sections = split_into_sections(old_text)
        new_sections = split_into_sections(new_text)
        flagged = compare_sections(old_sections, new_sections)

        return format_output(flagged)

In [48]:
# Create a test input
test_input = {
    "policy_v1": policy_v1,
    "policy_v2": policy_v2
}

# Create service instance
service_instance = DocumentChangeService()

# Call the API method directly
response = service_instance.analyze(test_input)

response


{'total_major_changes': 3,
 'changes': [{'section': 2,
   'summary': 'Working Hours\nEmployees are expected to work 8 hours per day, Monday to Friday.\nFlexible working hours are allowed with ...'},
  {'section': 3,
   'summary': 'Remote Work\nEmployees may work remotely up to 2 days per week.\nRemote work requests must be submitted through the intern...'},
  {'section': 6,
   'summary': 'Policy Updates\nPolicy changes will be communicated through email and the company dashboard.\nEmployees are responsible fo...'}]}

In [49]:
%%writefile service.py
print("service file created")


Writing service.py


In [50]:
%%writefile service.py
import bentoml

# Placeholder service: registers an empty class under the name
# "document_change_analyzer"; no API methods are defined here.
@bentoml.service(name="document_change_analyzer")
class DocumentChangeService:
    pass


Overwriting service.py


In [52]:
%%writefile service.py
import bentoml

@bentoml.service(name="document_change_analyzer")
class DocumentChangeService:
    """Minimal service with a single status endpoint."""

    @bentoml.api
    def health(self) -> dict:
        """Return a constant status report."""
        status_report = {"status": "ok"}
        return status_report


Overwriting service.py


In [53]:
!bentoml serve service:DocumentChangeService


2025-12-30T07:59:41+0000 [INFO] [cli] Starting production HTTP BentoServer from "service:DocumentChangeService" listening on http://localhost:3000 (Press CTRL+C to quit)
2025-12-30T07:59:44+0000 [INFO] [entry_service:document_change_analyzer:1] Service document_change_analyzer initialized
2025-12-30T08:02:49+0000 [INFO] [entry_service:document_change_analyzer:1] Service instance cleanup finalized


In [54]:
%%writefile service.py
import bentoml

@bentoml.service(name="document_change_analyzer")
class DocumentChangeService:
    """Service that reports whether two documents are exactly equal."""

    @bentoml.api
    def analyze(self, payload: dict) -> dict:
        """Compare payload["doc_v1"] with payload["doc_v2"] for equality.

        Returns a dict with "changed" and "message". When the documents
        differ it also carries the length of each document.
        """
        old_doc, new_doc = payload["doc_v1"], payload["doc_v2"]
        documents_match = old_doc == new_doc

        if not documents_match:
            return {
                "changed": True,
                "message": "Documents are different",
                "length_v1": len(old_doc),
                "length_v2": len(new_doc)
            }

        return {
            "changed": False,
            "message": "No changes detected"
        }

Overwriting service.py


In [57]:
!bentoml serve service:DocumentChangeService

2025-12-30T08:06:50+0000 [INFO] [cli] Starting production HTTP BentoServer from "service:DocumentChangeService" listening on http://localhost:3000 (Press CTRL+C to quit)
2025-12-30T08:06:53+0000 [INFO] [entry_service:document_change_analyzer:1] Service document_change_analyzer initialized
2025-12-30T08:08:20+0000 [INFO] [entry_service:document_change_analyzer:1] Service instance cleanup finalized


In [62]:
!bentoml serve service:DocumentChangeService &

2025-12-30T08:09:56+0000 [INFO] [cli] Starting production HTTP BentoServer from "service:DocumentChangeService" listening on http://localhost:3000 (Press CTRL+C to quit)
2025-12-30T08:09:57+0000 [INFO] [entry_service:document_change_analyzer:1] Service document_change_analyzer initialized
2025-12-30T08:11:14+0000 [INFO] [entry_service:document_change_analyzer:1] Service instance cleanup finalized


In [64]:
%%bash
# Start the server in the background and stop it when the cell's shell exits.
# Without this the server keeps running and the cell never returns.
bentoml serve service:DocumentChangeService &
SERVER_PID=$!
trap 'kill "$SERVER_PID" 2>/dev/null; wait "$SERVER_PID" 2>/dev/null' EXIT

# Poll the readiness endpoint (up to ~30 s) instead of relying on a fixed sleep.
for _ in $(seq 1 30); do
  curl -sf http://localhost:3000/readyz > /dev/null && break
  sleep 1
done

# BentoML maps the keys of the JSON body to the API method's parameter names,
# so the documents are nested under "payload" (the parameter of analyze()).
curl -X POST http://localhost:3000/analyze \
  -H "Content-Type: application/json" \
  -d '{"payload": {"doc_v1": "Company policy allows remote work.", "doc_v2": "Company policy does not allow remote work."}}'


Process is interrupted.


**BentoML integration completed successfully.**

In [65]:
%%writefile service.py
import bentoml
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the embedding model ONCE
model = SentenceTransformer("all-MiniLM-L6-v2")

# Similarity scores strictly below this value count as a major change.
# This is the same rule (score < threshold) and the same default value that
# compare_sections() uses in the notebook.
MAJOR_CHANGE_THRESHOLD = 0.85


@bentoml.service(name="document_change_analyzer")
class DocumentChangeService:
    """Service that classifies the semantic difference between two documents."""

    @bentoml.api
    def analyze(self, payload: dict) -> dict:
        """Score the similarity of payload["doc_v1"] and payload["doc_v2"].

        Returns:
            A dict with "similarity_score" (cosine similarity of the two
            document embeddings, as a float) and "change_type", which is
            "major_change" when the score is below MAJOR_CHANGE_THRESHOLD
            and "minor_or_no_change" otherwise.
        """
        doc_v1 = payload["doc_v1"]
        doc_v2 = payload["doc_v2"]

        # Generate embeddings (one-element batches, using the module-level model)
        emb_v1 = model.encode([doc_v1])
        emb_v2 = model.encode([doc_v2])

        # Compute cosine similarity; [0][0] picks the single pairwise score
        similarity = float(cosine_similarity(emb_v1, emb_v2)[0][0])

        # Decision rule: strictly below the threshold is a major change, so a
        # score exactly at the threshold is classified as it is in the notebook.
        if similarity < MAJOR_CHANGE_THRESHOLD:
            change_type = "major_change"
        else:
            change_type = "minor_or_no_change"

        return {
            "similarity_score": similarity,
            "change_type": change_type
        }


Overwriting service.py


In [66]:
!bentoml serve service:DocumentChangeService

2025-12-30 09:26:18.714751: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767086778.781974   56308 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767086778.802095   56308 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767086778.869904   56308 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767086778.869975   56308 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767086778.869981   56308 computation_placer.cc:177] computation placer alr