In [33]:
from pathlib import Path
from typing import List, Any
import os
from langchain_community.document_loaders import PyPDFLoader, TextLoader, CSVLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders.excel import UnstructuredExcelLoader
from langchain_community.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain_groq import ChatGroq
import numpy as np
import chromadb
from chromadb.config import Settings

In [17]:
dir_path = os.getcwd() + "/full_contract_txt"

print(f'The current working directory is {dir_path}\n')

files_list = os.listdir(dir_path)

print(f'There are {len(files_list)} txt files in the directory')

The current working directory is c:\Users\LENOVO\OneDrive\Desktop\MS DS Courses\MS DSP 453\Project Code/full_contract_txt

There are 510 txt files in the directory


In [26]:
for files in files_list:
    if files.endswith('.txt'):
        loader = TextLoader(os.path.join(dir_path, files),encoding='utf8')
        data = loader.load()
        
print(f'All the data from the files is loaded')

All the data from the files is loaded


In [30]:
class EmbeddingPipeline:
    def __init__(self, model_name: str = "all-MiniLM-L6-v2", chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.model = SentenceTransformer(model_name)
        print(f"[INFO] Loaded embedding model: {model_name}")

    def chunk_documents(self, documents: List[Any]) -> List[Any]:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        chunks = splitter.split_documents(documents)
        print(f"[INFO] Split {len(documents)} documents into {len(chunks)} chunks.")
        return chunks

    def embed_chunks(self, chunks: List[Any]) -> np.ndarray:
        texts = [chunk.page_content for chunk in chunks]
        print(f"[INFO] Generating embeddings for {len(texts)} chunks...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"[INFO] Embeddings shape: {embeddings.shape}")
        return embeddings

# Example usage
if __name__ == "__main__":
    
    emb_pipe = EmbeddingPipeline()
    chunks = emb_pipe.chunk_documents(data)
    embeddings = emb_pipe.embed_chunks(chunks)
    # print("[INFO] Example embedding:", embeddings[0] if len(embeddings) > 0 else None)

[INFO] Loaded embedding model: all-MiniLM-L6-v2
[INFO] Split 1 documents into 15 chunks.
[INFO] Generating embeddings for 15 chunks...


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.15s/it]

[INFO] Embeddings shape: (15, 384)





In [None]:
class vectorDB:
    def __init__(self,embeddings,chunks,db_name):
        self.embeddings = embeddings
        self.chunks = chunks
        self.client = chromadb.Client()
        if db_name in [col.name for col in self.client.list_collections()]:
            self.collection = self.client.get_collection(db_name)
        else:
            self.collection = self.client.create_collection(db_name)

    def Loading_data_to_ChromaDB(self):
    # Add embeddings and documents to the collection
        self.collection.add(
            embeddings=self.embeddings,
            documents=[chunk.page_content for chunk in self.chunks],
            metadatas=[{"source": f"contract_{i}"} for i in range(len(self.chunks))],
            ids=[f"contract_{i}" for i in range(len(self.chunks))]
        )
        print(f"[INFO] Added {len(self.chunks)} documents to ChromaDB collection.")

    def queryDB(self,query):
        results = self.collection.query(
            query_texts=[query],
            n_results=2
        )
        return results

    
# Example usage
if __name__ == "__main__":
    emb_pipe = EmbeddingPipeline()
    chunks = emb_pipe.chunk_documents(data)
    embeddings = emb_pipe.embed_chunks(chunks)

    vdb = vectorDB(embeddings,chunks,'contract')
    vdb.Loading_data_to_ChromaDB()
    res = vdb.queryDB("DEFINITION OF CONFIDENTIAL INFORMATION.")
    print(res)

[INFO] Loaded embedding model: all-MiniLM-L6-v2
[INFO] Split 1 documents into 15 chunks.
[INFO] Generating embeddings for 15 chunks...


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.00it/s]


[INFO] Embeddings shape: (15, 384)
[INFO] Added 15 documents to ChromaDB collection.
{'ids': [['contract_5', 'contract_9']], 'embeddings': None, 'documents': [['whenever necessary, which shall be executed by signing supplemental agreements upon consensus of both Parties.   (c) Party B shall ensure the vehicles are in good conditions, the compartments are properly sealed without leakage and the vehicles are equipped with fire-fighting equipment. In the event of parcel damage resulting from leakage or fire, Party B shall indemnify at the standard rate of RMB200 per parcel, and indemnify the actual price for high-end insured parcel (or indemnify by the value of the parcel provided by arbitration department determined by Party A).   (d) Party B shall have valid and legal licenses for national road transportation. In the event of loss caused to Party A by delivery delay due to vehicles detention for the lack of license, Party B shall compensate for any loss to Party A.   (e) Party B shall a

In [None]:
def Groq_model(query):
    groq_api_key = "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"
    model_name = "openai/gpt-oss-120b"

    Groq_model = ChatGroq(groq_api_key=groq_api_key, model_name=model_name)
    print(f"[INFO] Groq LLM initialized: {Groq_model}")
    res = vdb.queryDB(query)

    text = [r for r in res['documents']]
    context = ["\n\n".join(i) for i in text]

    prompt = f"""Summarize the following context for the query: '{query}'\n\nContext:\n{context}\n\nSummary:"""
    response = Groq_model.predict(prompt)
    return response

In [None]:
with open("Summary File.txt", "w" ,encoding = 'UTF-8') as f:
    f.write(Groq_model("DEFINITION OF CONFIDENTIAL INFORMATION."))


[INFO] Groq LLM initialized: client=<groq.resources.chat.completions.Completions object at 0x000002352C594C50> async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x000002352C595650> model_name='openai/gpt-oss-120b' model_kwargs={} groq_api_key=SecretStr('**********')
[INFO] Groq LLM initialized: client=<groq.resources.chat.completions.Completions object at 0x000002352C595E50> async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x000002352C596950> model_name='openai/gpt-oss-120b' model_kwargs={} groq_api_key=SecretStr('**********')


: 