<a href="https://colab.research.google.com/github/Hema-Vasantha/Regulatory-Compliance-Checker-for-Legal-Contracts-with-leveraging-AI/blob/main/data_pipeline_app.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#step1: data preprocessing
import pandas as pd

# Load the dataset
file_path = "/content/Dataset.xlsx"
df = pd.read_excel(file_path)

# Display all columns
print("Columns in the dataset:", df.columns)

# Select relevant columns dynamically
required_columns = ['Category', 'Document Name', 'Parties', 'Agreement Date',
                    'Effective Date', 'Expiration Date', 'Governing Law']
existing_columns = [col for col in required_columns if col in df.columns]

if not existing_columns:
    print("No matching columns found in the dataset!")
else:
    relevant_columns = df[existing_columns]
    print("Relevant Columns Data:")
    print(relevant_columns.head())

Columns in the dataset: Index(['Category', 'Document Name', 'Parties', 'Agreement Date',
       'Effective Date', 'Expiration Date', 'Renewal Term', 'Governing Law',
       'Notice to terminate renewal', 'Exclusivity',
       'Post-Termination Services', 'Discrepancy',
       'Notice to Terminate Renewal', 'Filename', 'contract', 'Exact Law'],
      dtype='object')
Relevant Columns Data:
      Category                                      Document Name  \
0  Co_Branding             ['CO-BRANDING AND SERVICES AGREEMENT']   
1  Co_Branding                          ['CO-BRANDING AGREEMENT']   
2  Co_Branding  ['PRODUCT DEVELOPMENT AND CO-BRANDING AGREEMENT']   
3  Co_Branding  ['ENDORSEMENT LICENSING AND CO-BRANDING AGREEM...   
4  Co_Branding                          ['CO-BRANDING AGREEMENT']   

                                             Parties  \
0  ['PrimeCall', 'deltathree.com, Inc. (formerly ...   
1  ['Women.com', 'eDiets', 'WOMEN.COM NETWORKS, I...   
2  ['d/b/a Time Life Music

In [2]:
!pip install chromadb



In [3]:
#step2: RAG Implementation
import chromadb

# Initialize the new Chroma client
client = chromadb.PersistentClient(path="./chroma_db")

# Create or get a collection
collection = client.get_or_create_collection("legal_compliance_checker")

# Example: Add data to the collection
documents = ["Sample document text for compliance checking."]
ids = ["doc1"]
embeddings = [[0.1, 0.2, 0.3]]  # Replace with actual embeddings

collection.add(documents=documents, ids=ids, embeddings=embeddings)

# Query the collection
query_result = collection.query(query_embeddings=[[0.1, 0.2, 0.3]], n_results=1)
print(query_result)



{'ids': [['doc1']], 'embeddings': None, 'documents': [['Sample document text for compliance checking.']], 'uris': None, 'data': None, 'metadatas': [[None]], 'distances': [[0.0]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [4]:
pip show torch transformers

Name: torch
Version: 2.5.1+cu121
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3-Clause
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-nccl-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions
Required-by: accelerate, fastai, peft, sentence-transformers, timm, torchaudio, torchvision
---
Name: transformers
Version: 4.48.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)

In [5]:
pip install --upgrade torch transformers



In [6]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import chromadb

# Load the dataset
file_path = "/content/Dataset.xlsx"
df = pd.read_excel(file_path)

# Specify the column to use for embeddings (e.g., 'Document Name')
text_column = 'Document Name'  # Replace with the actual column name from your data

# Ensure the column exists in the DataFrame
if text_column not in df.columns:
    raise KeyError(f"Column '{text_column}' not found in the DataFrame. Available columns are: {df.columns.tolist()}")

# Prepare the data
text_data = df[text_column].dropna().tolist()  # Drop NaN values and convert to a list

# Load the embedding model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# Generate embeddings
def get_embeddings(texts):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy()

embeddings = get_embeddings(text_data)

# Initialize the Chroma client here
client = chromadb.PersistentClient(path="./chroma_db") # This line was moved here

# Add data to Chroma
ids = [f"doc_{i}" for i in range(len(text_data))]
collection = client.get_or_create_collection("legal_compliance_checker_bert")
collection.add(
    documents=text_data,
    embeddings=embeddings.tolist(),
    ids=ids
)

print("Data successfully added to Chroma.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Data successfully added to Chroma.


In [7]:
#step3: create a pipeline for RAG
query_text = "Check if the contract complies with GDPR."
query_embedding = get_embeddings([query_text])

results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=5
)
print("Relevant Documents:", results)

Relevant Documents: {'ids': [['doc_69', 'doc_75', 'doc_77', 'doc_90', 'doc_85']], 'embeddings': None, 'documents': [["['SAMPLE OF NON-DISCLOSURE AND NON-COMPETITION AGREEMENT']", "['Customization Schedule', 'Software License, Customization and Maintenance Agreement']", "['Software License, Customization and Maintenance Agreement', 'Product License Schedule']", "['NON-COMPETITION AND NON-SOLICITATION AGREEMENT']", "['Attachment C to Master Franchise Agreement   MULTI-STATE ADDENDUM   CALIFORNIA APPENDIX']"]], 'uris': None, 'data': None, 'metadatas': [[None, None, None, None, None]], 'distances': [[47.180159931009484, 48.679853704363644, 48.837604446601695, 50.51918571173609, 51.27558044906161]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [8]:
# step4: Analyze and Return Output
def analyze_results(results):
    for doc, score in zip(results['documents'], results['distances']):
        print(f"Document: {doc}\nRelevance Score: {score}\n")

In [9]:
!pip install -q streamlit

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install pyngrok # Install the pyngrok library

In [10]:
from pyngrok import ngrok
import os

# Set the Streamlit script name
streamlit_script = 'your_pipeline_module.py'  # Replace with your file name

# Open a tunnel to the Streamlit port 8501
public_url = ngrok.connect(port='8501')

# Run the Streamlit app
os.system(f"streamlit run {streamlit_script} --server.port=8501 --server.address=0.0.0.0 &")

# Display the public URL
print(f"Streamlit app is live at: {public_url}")

ModuleNotFoundError: No module named 'pyngrok'

In [None]:
!streamlit run your_pipeline_module.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.2:8501[0m
[34m  External URL: [0m[1mhttp://35.229.238.55:8501[0m
[0m
