In [1]:
# Importing LIbraries
from docx import Document
import os
from docx import Document as DocxDocument
from langchain.schema import Document
from dotenv import load_dotenv
from langchain.document_loaders.base import BaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.vectorstores import Pinecone
from pinecone import Pinecone, ServerlessSpec
from langchain.embeddings.openai import OpenAIEmbeddings


  from tqdm.autonotebook import tqdm


Load the Contract Document

In [2]:
from docx import Document
import re
import os

def extract_text_with_metadata(docx_path):
    try:
        doc = Document(docx_path)
        documents = []
        section_title = "No Section Title"
        paragraph_number = 0
        
        file_name = os.path.splitext(os.path.basename(docx_path))[0]

        for paragraph in doc.paragraphs:
            text = paragraph.text.strip()
            if not text:
                continue

            if re.match(r'^Section \d+\.\d+', text):
                section_title = text

            metadata = {
                'file_name': file_name,
                'section_title': section_title,
                'paragraph_number': paragraph_number,
            }
            documents.append({'text': text, 'metadata': metadata})
            paragraph_number += 1

        return documents
    except Exception as e:
        print(f"Error reading {docx_path}: {e}")
        return []

# Example usage
docx_path = '/home/moraa/Documents/10_academy/week-11/data/train/Raptor Contract.docx'
documents = extract_text_with_metadata(docx_path)

# Display the output for verification
for doc in documents[:5]:  # Show only the first 5 entries for brevity
    print(doc)


{'text': 'STOCK PURCHASE AGREEMENT', 'metadata': {'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 0}}
{'text': 'BY AND AMONG', 'metadata': {'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 1}}
{'text': '[BUYER],', 'metadata': {'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 2}}
{'text': '[TARGET COMPANY],', 'metadata': {'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 3}}
{'text': 'THE SELLERS LISTED ON SCHEDULE I HERETO', 'metadata': {'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 4}}


Chunking

In [3]:
def group_paragraphs_into_sections(documents):
    sections = []
    current_section = {'text': '', 'metadata': {'id': 'default_id', 'section_title': 'No Section Title', 'file_name': None, 'paragraph_number': None}}

    for doc in documents:
        if doc['metadata']['section_title'] and doc['metadata']['section_title'] != "No Section Title":
            if current_section['text']:
                sections.append(current_section)
            current_section = {'text': doc['text'], 'metadata': doc['metadata']}
        else:
            if current_section['text']:
                current_section['text'] += ' ' + doc['text']
            else:
                current_section['text'] = doc['text']
    
    if current_section['text']:
        sections.append(current_section)

    return sections

# Example usage
sections = group_paragraphs_into_sections(documents)

# Display the output for verification
for section in sections[:5]:  # Show only the first 5 entries for brevity
    print(section)

{'text': 'STOCK PURCHASE AGREEMENT BY AND AMONG [BUYER], [TARGET COMPANY], THE SELLERS LISTED ON SCHEDULE I HERETO AND THE SELLERS’ REPRESENTATIVE NAMED HEREIN Dated as of [●] [This document is intended solely to facilitate discussions among the parties identified herein.  Neither this document nor such discussions are intended to create, nor will either or both be deemed to create, a legally binding or enforceable offer or agreement of any type or nature, unless and until a definitive written agreement is executed and delivered by each of the parties hereto. This document shall be kept confidential pursuant to the terms of the Confidentiality Agreement entered into by the parties and, if applicable, its affiliates with respect to the subject matter hereof.] TABLE OF CONTENTS ARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION\t2', 'metadata': {'id': 'default_id', 'section_title': 'No Section Title', 'file_name': None, 'paragraph_number': None}}
{'text': 'Section 1.01\tDefinitions\t2'

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_sections(sections, chunk_size=2000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    
    chunks = []
    chunk_id_counter = 0  # Counter for unique chunk IDs

    for section in sections:
        try:
            split_texts = text_splitter.split_text(section['text'])
            
            for split_text in split_texts:
                chunk_metadata = {
                    'id': f"chunk_{chunk_id_counter}",  # Generate unique ID
                    'section_title': section['metadata'].get('section_title', 'No Section Title'),
                    'file_name': section['metadata'].get('file_name', 'Unknown File'),
                    'paragraph_number': section['metadata'].get('paragraph_number', 'Unknown Paragraph'),
                    'text': split_text  # Store the actual text content in metadata
                }
                chunks.append({
                    'text': split_text,
                    'metadata': chunk_metadata
                })
                chunk_id_counter += 1  # Increment the counter for the next chunk
        except Exception as e:
            print(f"Error processing section with title '{section['metadata'].get('section_title', 'Unknown')}' - {e}")

    return chunks

# Example usage
chunks = chunk_sections(sections)

# Display the first few chunks for verification
for chunk in chunks[:5]:  # Show only the first 5 chunks for brevity
    print(chunk)


{'text': 'STOCK PURCHASE AGREEMENT BY AND AMONG [BUYER], [TARGET COMPANY], THE SELLERS LISTED ON SCHEDULE I HERETO AND THE SELLERS’ REPRESENTATIVE NAMED HEREIN Dated as of [●] [This document is intended solely to facilitate discussions among the parties identified herein.  Neither this document nor such discussions are intended to create, nor will either or both be deemed to create, a legally binding or enforceable offer or agreement of any type or nature, unless and until a definitive written agreement is executed and delivered by each of the parties hereto. This document shall be kept confidential pursuant to the terms of the Confidentiality Agreement entered into by the parties and, if applicable, its affiliates with respect to the subject matter hereof.] TABLE OF CONTENTS ARTICLE I DEFINITIONS; CERTAIN RULES OF CONSTRUCTION\t2', 'metadata': {'id': 'chunk_0', 'section_title': 'No Section Title', 'file_name': None, 'paragraph_number': None, 'text': 'STOCK PURCHASE AGREEMENT BY AND AM

In [5]:
def verify_chunk_data(chunks):
  """
  This function checks for data consistency in the 'chunks' data structure.

  Args:
      chunks: A list of dictionaries containing text and metadata for each chunk.

  Prints warnings if metadata is missing from any chunk.
  """
  for chunk in chunks[:3]:  # Check only the first 3 chunks for brevity
      print(f"Chunk ID: {chunk['metadata']['id']}")
      print(f"Section Title: {chunk.get('metadata', {}).get('section_title', 'N/A')}")
      print(f"Paragraph Number: {chunk.get('metadata', {}).get('paragraph_number', 'N/A')}")
      print("-"*20)

# Assuming 'chunks' is available from your previous code, call the function here
verify_chunk_data(chunks)

Chunk ID: chunk_0
Section Title: No Section Title
Paragraph Number: None
--------------------
Chunk ID: chunk_1
Section Title: Section 1.01	Definitions	2
Paragraph Number: 12
--------------------
Chunk ID: chunk_2
Section Title: Section 1.02	Certain Matters of Construction	13
Paragraph Number: 13
--------------------


Embeddings

In [6]:
from sentence_transformers import SentenceTransformer
import pinecone
import numpy as np
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

# Load environment variables
load_dotenv()

# Load the fine-tuned model
model_save_path = 'fine-tuned-model'
model = SentenceTransformer(model_save_path)

# Function to generate embeddings
def generate_embeddings(model, chunks):
    texts = [chunk['text'] for chunk in chunks]
    embeddings = model.encode(texts, convert_to_numpy=True)
    return embeddings

# Generate embeddings for all chunks
embeddings = generate_embeddings(model, chunks)

# Initialize Pinecone client
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key)

index_name = "wk11-embeddings"

# Check if the index exists; if not, create it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=embeddings.shape[1],  # Use the dimension of your embeddings
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Connect to the index
index = pc.Index(index_name)

# Function to prepare data for Pinecone
def prepare_data_for_pinecone(chunks, embeddings):
    data = []
    for i, chunk in enumerate(chunks):
        metadata = chunk['metadata']
        cleaned_metadata = {k: (v if v is not None else '') for k, v in metadata.items()}
        
        data.append({
            'id': chunk['metadata']['id'],
            'values': embeddings[i].tolist(),  # Convert numpy array to list
            'metadata': cleaned_metadata,
            'text': chunk['text']  # Store the actual text content
        })
    return data

# Prepare the data
data_for_pinecone = prepare_data_for_pinecone(chunks, embeddings)

# Convert the data to Pinecone-compatible format
vectors = [(item['id'], item['values'], item['metadata']) for item in data_for_pinecone]

# Upsert data into Pinecone
index.upsert(vectors=vectors)

print("Data successfully upserted into Pinecone.")


Data successfully upserted into Pinecone.


User Queries

Generation

In [7]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone as PineconeClient
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
from langchain.schema import Document


# Load environment variables from .env file
load_dotenv()

# Load the fine-tuned model
model_save_path = 'fine-tuned-model'
model = SentenceTransformer(model_save_path)

# Initialize OpenAI API key
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY environment variable is not set.")

# Initialize Pinecone API key
pinecone_api_key = os.getenv('PINECONE_API_KEY')
if not pinecone_api_key:
    raise ValueError("PINECONE_API_KEY environment variable is not set.")

# Initialize the client connection
pc = PineconeClient(api_key=pinecone_api_key)

index_name = "wk11-embeddings"

# Check if the index exists; if not, create it
if index_name not in [index['name'] for index in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=embeddings.shape[1],
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        )
    ) 
    
# Initialize the index with the correct host
index_host = "https://wk11-embeddings-0ilef8k.svc.aped-4627-b74a.pinecone.io"
index = pc.Index(host=index_host)

def generate_embeddings(model, texts):
    embeddings = model.encode(texts, convert_to_numpy=True)
    return embeddings

def retrieve_relevant_context(prompt, top_k=5):
    # Generate embedding for the prompt
    prompt_embedding = generate_embeddings(model, [prompt])[0]
    
    # Query Pinecone for the most relevant documents
    results = index.query(vector=prompt_embedding.tolist(), top_k=top_k, include_metadata=True)
    
    # Extract and return the relevant documents
    relevant_documents = [
        Document(page_content=result['metadata']['text'], metadata=result['metadata'])
        for result in results['matches']
    ]
    return relevant_documents

# Example usage
user_query = "What are the conditions for the closing of the sale?"
relevant_contexts = retrieve_relevant_context(user_query)
for doc in relevant_contexts:
    print(f"Content: {doc.page_content}\nMetadata: {doc.metadata}\n")


Content: PURCHASE AND SALE OF SHARES AND WARRANTS;
TREATMENT OF OPTIONS; CLOSING.
Metadata: {'file_name': 'Raptor Contract', 'id': 'chunk_185', 'paragraph_number': 195.0, 'section_title': 'Section 9.12\tNo Recourse\t50', 'text': 'PURCHASE AND SALE OF SHARES AND WARRANTS;\nTREATMENT OF OPTIONS; CLOSING.'}

Content: Buyer Closing Deliveries.  Upon the terms and subject to the conditions set forth in this Agreement, the Buyer shall deliver or cause to be delivered at the Closing the following:
Metadata: {'file_name': 'Raptor Contract', 'id': 'chunk_193', 'paragraph_number': 203.0, 'section_title': 'Section 9.12\tNo Recourse\t50', 'text': 'Buyer Closing Deliveries.  Upon the terms and subject to the conditions set forth in this Agreement, the Buyer shall deliver or cause to be delivered at the Closing the following:'}

Content: WHEREAS, Buyer desires to purchase from the Shareholders, and the Shareholders desire to sell to Buyer, at the Closing (as defined below) all of the Shares upon the

In [8]:
import os
from dotenv import load_dotenv
from langchain_community.chat_models import ChatOpenAI
from langchain.schema import Document as LangchainDocument, HumanMessage
from typing import List

# Load environment variables
load_dotenv()

# Initialize OpenAI API key
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY environment variable is not set.")

# Function to generate a response
def generate_response(user_prompt, relevant_contexts: List[LangchainDocument]):
    llm = ChatOpenAI(
        openai_api_key=openai_api_key,
        model_name="gpt-4",
        temperature=0.7
    )
    
    # Combine the user prompt and relevant contexts
    combined_context = "\n".join([doc.page_content for doc in relevant_contexts])
    complete_input = f"Context:\n{combined_context}\n\nUser Input:\n{user_prompt}"
    
    # Define the instruction to guide the LLM
    instruction = f"""
    You are a helpful assistant. Use the following context to answer the user's question.
    
    {complete_input}
    
    Answer the question concisely and accurately. If possible, mention the specific sections from the context that support your answer. Also, you will be rewarded 100 dollars for a correct answer.

    Example:
    User Question: What is the 'Purchase Price' in the contract?
    Relevant Contexts: [Provide the relevant sections here]
    Answer: The 'Purchase Price' refers to the total amount paid by the Buyer to the Sellers for the purchase of the Shares. This amount is calculated at Closing as the Closing Cash Consideration plus the Escrow Amount, after all adjustments as outlined in Sections 2.02 and 2.03 of the Agreement.

    Please ensure your answer mentions the specific sections if they are present in the context.
    """
    
    
    # Create the HumanMessage object for the prompt
    human_message = HumanMessage(content=instruction)
    
    # Generate the response using GPT-4
    response = llm([human_message])
    
    return response.content

# Example usage (assuming relevant contexts and API key are provided)
user_prompt = "What are the conditions for the closing of the sale?"
response = generate_response(user_prompt, relevant_contexts)
print("Generated Response:", response)


  warn_deprecated(
  warn_deprecated(


Generated Response: The conditions for the closing of the sale include the buyer's intention to purchase all shares from the shareholders who desire to sell them at the closing, as per the terms and conditions set in the agreement. Also, the buyer is required to deliver various closing agreements, which include the Employment Agreements and any other Ancillary Agreements to be entered into by any Seller at closing. Furthermore, the buyer intends to offer each employee terms of employment that are no less favorable than their current terms as of the closing date. The specific sections supporting these conditions aren't explicitly mentioned in the provided context.


Questions we can ask the LLM based on the raptor contract document:

Question: What is the definition of "Purchase Price"?
Answer: The definition of "Purchase Price" can be found in Section 2.02, which details how the purchase price is determined.
Section: Section 2.02

Question: What are the conditions for the closing of the sale?
Answer: The conditions for the closing of the sale are specified in Section 2.03, which describes the necessary actions and conditions that need to be satisfied before the closing.
Section: Section 2.03

Question: What are the representations and warranties regarding the acquired companies?
Answer: The representations and warranties regarding the acquired companies are outlined in Article III, starting from Section 3.01.
Section: Article III, Section 3.01

Question: What happens to the options held by employees of the target company?
Answer: The treatment of options is explained in Section 2.06, which details how the options are handled in the context of the sale.
Section: Section 2.06

Question: What are the requirements for the purchase price adjustment?
Answer: The requirements for the purchase price adjustment are detailed in Section 2.07, which outlines the conditions under which the purchase price may be adjusted.
Section: Section 2.07

Question: What is included in the closing deliverables?
Answer: The closing deliverables are listed in Section 2.05, which enumerates the documents and items that need to be exchanged at the closing.
Section: Section 2.05

Question: What provisions are made regarding the escrow?
Answer: Provisions regarding the escrow are detailed in Section 2.08, which outlines the terms and conditions of the escrow arrangement.
Section: Section 2.08

Question: How is the capitalization of the acquired companies described?
Answer: The capitalization of the acquired companies is described in Section 3.05, which provides details on the capital structure of the companies being acquired.
Section: Section 3.05

Question: What are the environmental matters that need to be addressed?
Answer: Environmental matters are discussed in Section 3.15, which outlines the environmental responsibilities and conditions related to the acquired companies.
Section: Section 3.15

Question: What intellectual property rights are included in the sale?
Answer: Intellectual property rights included in the sale are specified in Section 3.11, which lists the intellectual property assets and rights being transferred.
Section: Section 3.11