In [1]:
# Importing Libraries
from docx import Document
import re
import os
from langchain.schema import Document
from dotenv import load_dotenv
import openai
from langchain.document_loaders.base import BaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.vectorstores import Pinecone
from pinecone import Pinecone, ServerlessSpec
from langchain.embeddings.openai import OpenAIEmbeddings


  from tqdm.autonotebook import tqdm


Load the Contract Document

In [2]:
from docx import Document
import re
import os

def extract_text_with_metadata(docx_path):
    try:
        doc = Document(docx_path)
        documents = []
        section_title = "No Section Title"
        paragraph_number = 0
        
        # Extract the file name without the path and extension
        file_name = os.path.splitext(os.path.basename(docx_path))[0]

        for paragraph in doc.paragraphs:
            text = paragraph.text.strip()
            if not text:
                continue

            # Print paragraph text for debugging
            print(f"Processing paragraph: {text}")

            # Check if the paragraph is a section title
            if re.match(r'^Section \d+\.\d+', text):
                section_title = text
                print(f"Detected section title: {section_title}")

            # Create metadata for each paragraph
            metadata = {
                'file_name': file_name,
                'section_title': section_title,
                'paragraph_number': paragraph_number,
            }
            documents.append({'text': text, 'metadata': metadata})
            paragraph_number += 1

        # Ensure no paragraphs were skipped
        print(f"Total paragraphs processed: {paragraph_number}")

        return documents
    except Exception as e:
        print(f"Error reading {docx_path}: {e}")
        return []

# Example usage
docx_path = '/home/moraa/Documents/10_academy/week-11/data/train/Raptor Contract.docx'
documents = extract_text_with_metadata(docx_path)

# Display the output for verification
for doc in documents[:5]:  # Show only the first 5 entries for brevity
    print(doc)



Processing paragraph: STOCK PURCHASE AGREEMENT
Processing paragraph: BY AND AMONG
Processing paragraph: [BUYER],
Processing paragraph: [TARGET COMPANY],
Processing paragraph: THE SELLERS LISTED ON SCHEDULE I HERETO
Processing paragraph: AND
Processing paragraph: THE SELLERS’ REPRESENTATIVE NAMED HEREIN
Processing paragraph: Dated as of [●]
Processing paragraph: [This document is intended solely to facilitate discussions among the parties identified herein.  Neither this document nor such discussions are intended to create, nor will either or both be deemed to create, a legally binding or enforceable offer or agreement of any type or nature, unless and until a definitive written agreement is executed and delivered by each of the parties hereto.
Processing paragraph: This document shall be kept confidential pursuant to the terms of the Confidentiality Agreement entered into by the parties and, if applicable, its affiliates with respect to the subject matter hereof.]
Processing paragraph:

In [3]:
documents

[{'text': 'STOCK PURCHASE AGREEMENT',
  'metadata': {'file_name': 'Raptor Contract',
   'section_title': 'No Section Title',
   'paragraph_number': 0}},
 {'text': 'BY AND AMONG',
  'metadata': {'file_name': 'Raptor Contract',
   'section_title': 'No Section Title',
   'paragraph_number': 1}},
 {'text': '[BUYER],',
  'metadata': {'file_name': 'Raptor Contract',
   'section_title': 'No Section Title',
   'paragraph_number': 2}},
 {'text': '[TARGET COMPANY],',
  'metadata': {'file_name': 'Raptor Contract',
   'section_title': 'No Section Title',
   'paragraph_number': 3}},
 {'text': 'THE SELLERS LISTED ON SCHEDULE I HERETO',
  'metadata': {'file_name': 'Raptor Contract',
   'section_title': 'No Section Title',
   'paragraph_number': 4}},
 {'text': 'AND',
  'metadata': {'file_name': 'Raptor Contract',
   'section_title': 'No Section Title',
   'paragraph_number': 5}},
 {'text': 'THE SELLERS’ REPRESENTATIVE NAMED HEREIN',
  'metadata': {'file_name': 'Raptor Contract',
   'section_title': 'N

Chunking

In [4]:
def group_paragraphs_into_sections(documents):
    sections = []
    current_section = {'text': '', 'metadata': {'id': None, 'section_title': None}}

    for doc in documents:
        if doc['metadata']['section_title']:
            if current_section['text']:
                sections.append(current_section)
            current_section = {'text': doc['text'], 'metadata': doc['metadata']}
        else:
            if current_section['text']:
                current_section['text'] += ' ' + doc['text']
            else:
                current_section['text'] = doc['text']
    
    if current_section['text']:
        sections.append(current_section)

    # Verify metadata for each section
    for section in sections:
        if not section['metadata']['section_title']:
            print(f"Warning: Section missing section title metadata - {section['text'][:100]}...")
        if not section['metadata'].get('file_name'):
            print(f"Warning: Section missing file name metadata - {section['text'][:100]}...")
    
    return sections

# Example usage
sections = group_paragraphs_into_sections(documents)

# Display the output for verification
for section in sections[:5]:  # Show only the first 5 entries for brevity
    print(section)



{'text': 'STOCK PURCHASE AGREEMENT', 'metadata': {'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 0}}
{'text': 'BY AND AMONG', 'metadata': {'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 1}}
{'text': '[BUYER],', 'metadata': {'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 2}}
{'text': '[TARGET COMPANY],', 'metadata': {'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 3}}
{'text': 'THE SELLERS LISTED ON SCHEDULE I HERETO', 'metadata': {'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 4}}


In [5]:
def chunk_sections(sections, chunk_size=2000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    
    chunks = []
    for section in sections:
        try:
            # Split the section text into chunks
            split_texts = text_splitter.split_text(section['text'])
            
            for split_text in split_texts:
                chunk_id = section['metadata'].get('id', 'default_id')  # Ensure ID is set
                chunks.append({
                    'text': split_text,
                    'metadata': {
                        'id': chunk_id,
                        **section['metadata']  # Include other metadata
                    }
                })
        except Exception as e:
            print(f"Error processing section with title '{section['metadata'].get('section_title', 'Unknown')}' - {e}")

    return chunks

# Example usage
chunks = chunk_sections(sections)

# Display the first few chunks for verification
for chunk in chunks[:5]:  # Show only the first 5 chunks for brevity
    print(chunk)


{'text': 'STOCK PURCHASE AGREEMENT', 'metadata': {'id': 'default_id', 'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 0}}
{'text': 'BY AND AMONG', 'metadata': {'id': 'default_id', 'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 1}}
{'text': '[BUYER],', 'metadata': {'id': 'default_id', 'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 2}}
{'text': '[TARGET COMPANY],', 'metadata': {'id': 'default_id', 'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 3}}
{'text': 'THE SELLERS LISTED ON SCHEDULE I HERETO', 'metadata': {'id': 'default_id', 'file_name': 'Raptor Contract', 'section_title': 'No Section Title', 'paragraph_number': 4}}


In [6]:
def verify_chunk_data(chunks):
  """
  This function checks for data consistency in the 'chunks' data structure.

  Args:
      chunks: A list of dictionaries containing text and metadata for each chunk.

  Prints warnings if metadata is missing from any chunk.
  """
  for chunk in chunks[:3]:  # Check only the first 3 chunks for brevity
      print(f"Chunk ID: {chunk['metadata']['id']}")
      print(f"Section Title: {chunk.get('metadata', {}).get('section_title', 'N/A')}")
      print(f"Paragraph Number: {chunk.get('metadata', {}).get('paragraph_number', 'N/A')}")
      print("-"*20)

# Assuming 'chunks' is available from your previous code, call the function here
verify_chunk_data(chunks)

Chunk ID: default_id
Section Title: No Section Title
Paragraph Number: 0
--------------------
Chunk ID: default_id
Section Title: No Section Title
Paragraph Number: 1
--------------------
Chunk ID: default_id
Section Title: No Section Title
Paragraph Number: 2
--------------------


Text Embedding

In [7]:
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Initialize the Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can choose other models as well

# Function to generate dense embeddings using Sentence Transformers
def generate_dense_embeddings(chunks):
    texts = [chunk['text'] for chunk in chunks]
    embeddings = model.encode(texts)
    for i, chunk in enumerate(chunks):
        chunk['dense_embedding'] = embeddings[i]
    return chunks

# Function to generate sparse embeddings using TF-IDF
def generate_sparse_embeddings(chunks):
    texts = [chunk['text'] for chunk in chunks]
    vectorizer = TfidfVectorizer()
    sparse_matrix = vectorizer.fit_transform(texts)
    
    for i, chunk in enumerate(chunks):
        chunk['sparse_embedding'] = sparse_matrix[i].toarray().flatten()
    return chunks

# Generate embeddings
chunks = generate_dense_embeddings(chunks)
chunks = generate_sparse_embeddings(chunks)

In [9]:
import pinecone
import os
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import json
from pinecone import Index

# Load environment variables
load_dotenv()

# Initialize Pinecone client
pinecone_api_key = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=pinecone_api_key)

index_name = "wk11-hybrid-search"

# Check if the index exists; if not, create it
if index_name not in pc.list_indexes():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Connect to the index
index = pc.Index(index_name)

# Initialize the Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to store embeddings in Pinecone
def store_embeddings_in_pinecone(chunks):
    for chunk in chunks:
        if (chunk.get('metadata') and 
            chunk['metadata'].get('id') and
            chunk.get('dense_embedding') is not None and 
            chunk.get('sparse_embedding') is not None):
            
            # Serialize other_metadata dictionary to JSON string
            other_metadata_str = json.dumps(chunk['metadata'])
            
            # Convert sparse_embedding list to string
            sparse_embedding_str = json.dumps(chunk['sparse_embedding'].tolist())

            index.upsert([
                {
                    "id": chunk['metadata']['id'],
                    "values": chunk['dense_embedding'].tolist(),  # Convert numpy array to list
                    "metadata": {
                        "text": chunk['text'],
                        "sparse_embedding": sparse_embedding_str,  # Store sparse_embedding as string
                        "other_metadata": other_metadata_str  # Serialize other_metadata
                    }
                }
            ])
        else:
            print(f"Skipping chunk due to missing data: {chunk}")

# Example usage
store_embeddings_in_pinecone(chunks)

Retrieval

In [10]:
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import json

# Initialize the Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to perform dense search in Pinecone
def perform_dense_search(query_embedding, top_k=10):
    # Query Pinecone index
    result = index.query(vector=query_embedding.tolist(), top_k=top_k, include_metadata=True)
    return result

# Function to perform sparse search
def perform_sparse_search(query_embedding, top_k=10):
    sparse_results = []
    for chunk in chunks:
        if len(query_embedding) != len(chunk['sparse_embedding']):
            continue
        score = 1 - cosine(query_embedding, chunk['sparse_embedding'])
        sparse_results.append((chunk, score))
    
    # Print debug info
    print(f"Total sparse results: {len(sparse_results)}")
    
    # Sort results and limit to top_k
    sparse_results = sorted(sparse_results, key=lambda x: x[1], reverse=True)[:top_k]
    
    # Print debug info
    for result in sparse_results:
        print(f"Chunk ID: {result[0]['metadata']['id']}, Score: {result[1]}")
    
    return sparse_results

# Function to combine dense and sparse results
def combine_results(dense_results, sparse_results):
    combined_results = {}

    # Process dense search results
    for result in dense_results['matches']:
        metadata = json.loads(result['metadata'].get('other_metadata', '{}'))
        chunk_id = result['id']
        combined_results[chunk_id] = {
            'text': result['metadata'].get('text', 'N/A'),
            'score': result['score'],
            'metadata': {
                'file_name': metadata.get('file_name', 'N/A'),
                'section_title': metadata.get('section_title', 'N/A'),
                'paragraph_number': metadata.get('paragraph_number', 'N/A')
            }
        }
    
    # Process sparse search results
    for chunk, score in sparse_results:
        metadata = chunk.get('metadata', {})
        chunk_id = metadata.get('id', 'N/A')
        # Update only if the chunk ID is not already present (avoids overriding dense results info)
        if chunk_id not in combined_results:
            combined_results[chunk_id] = {
                'text': chunk.get('text', 'N/A'),
                'score': score,
                'metadata': {
                    'file_name': metadata.get('file_name', 'N/A'),
                    'section_title': metadata.get('section_title', 'N/A'),
                    'paragraph_number': metadata.get('paragraph_number', 'N/A')
                }
            }
        # Otherwise, just add the score to the existing entry
        else:
            combined_results[chunk_id]['score'] += score
    
    combined_results = sorted(combined_results.values(), key=lambda x: x['score'], reverse=True)
    return combined_results[:10]

# Function to perform hybrid search
def hybrid_search(query, top_k=10):
    # Generate query embeddings
    dense_query_embedding = model.encode([query])[0]
    
    vectorizer = TfidfVectorizer()
    sparse_query_embedding = vectorizer.fit_transform([query]).toarray().flatten()
    
    # Perform dense search
    dense_results = perform_dense_search(dense_query_embedding, top_k=top_k)
    
    # Perform sparse search
    sparse_results = perform_sparse_search(sparse_query_embedding, top_k=top_k)
    
    # Combine results
    combined_results = combine_results(dense_results, sparse_results)
    
    return combined_results

# Function to format and print results
def display_results(results):
    for result in results:
        metadata = result.get('metadata', {})
        print(f"Text: {result.get('text', 'N/A')}")
        print(f"File Name: {metadata.get('file_name', 'N/A')}")
        print(f"Section Title: {metadata.get('section_title', 'N/A')}")
        print(f"Paragraph Number: {metadata.get('paragraph_number', 'N/A')}")
        print(f"Score: {result.get('score', 'N/A')}")
        print("="*50)

queries = [
    "Under what circumstances and to what extent the Sellers are responsible for a breach of representations and warranties?",
    "Whose consent is required for the assignment of the Agreement by the Buyer?",
    "Would the aggregate amount payable by the Buyer to the Sellers be affected if it is determined that the actual Closing Debt Amount is greater the estimated Closing Debt Amount?"
]

# Perform search and display results for each query
for query in queries:
    results = hybrid_search(query)
    display_results(results)


Total sparse results: 0
Text: THE SELLERS:		[●]
File Name: Raptor Contract
Section Title: Section 9.12	No Recourse	50
Paragraph Number: 509
Score: 0.468455821
Total sparse results: 0
Text: THE SELLERS:		[●]
File Name: Raptor Contract
Section Title: Section 9.12	No Recourse	50
Paragraph Number: 509
Score: 0.373432904
Total sparse results: 0
Text: THE SELLERS:		[●]
File Name: Raptor Contract
Section Title: Section 9.12	No Recourse	50
Paragraph Number: 509
Score: 0.20627746


Customize Training

In [11]:
from docx import Document
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import Dataset, DataLoader
import torch

# Step 1: Read and Extract Data from DOCX
def read_docx(file_path):
    doc = Document(file_path)
    qa_pairs = []
    paragraphs = [p.text for p in doc.paragraphs if p.text.strip() != '']
    for i in range(0, len(paragraphs) - 1, 2):
        question = paragraphs[i]
        answer = paragraphs[i + 1]
        qa_pairs.append((question, answer))
    return qa_pairs

# Path to the DOCX file
file_path = '/home/moraa/Documents/10_academy/week-11/data/train/Raptor Q&A2.docx'
qa_pairs = read_docx(file_path)

# Step 2: Convert Data to `InputExample` Format
def create_input_examples(qa_pairs):
    return [InputExample(texts=[q, a], label=1) for q, a in qa_pairs]

# Convert Q&A pairs to InputExamples
train_examples = create_input_examples(qa_pairs)

# Step 3: Create Custom Dataset and DataLoader
class CustomDataset(Dataset):
    def __init__(self, examples):
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]

# Create dataset
train_dataset = CustomDataset(train_examples)

# Create a DataLoader
def collate_fn(batch):
    # Return a list of InputExamples directly
    return batch

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8, collate_fn=collate_fn)

# Step 4: Define Loss Function and Train the Model
# Load pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define loss function
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=2
)

# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100
)

# Save the fine-tuned model
model_save_path = 'fine-tuned-model'
model.save(model_save_path)

print(f"Model saved to {model_save_path}")


100%|██████████| 2/2 [00:01<00:00,  1.74it/s]


{'train_runtime': 1.1436, 'train_samples_per_second': 12.242, 'train_steps_per_second': 1.749, 'train_loss': 0.6954153776168823, 'epoch': 1.0}


                                                                     

Model saved to fine-tuned-model


Evaluation and Tuning

In [12]:
from sklearn.metrics import precision_recall_fscore_support
from sentence_transformers import SentenceTransformer, evaluation

# Step 1: Prepare the Evaluation Data
# Assuming you have a separate validation DOCX file
validation_file_path = '/home/moraa/Documents/10_academy/week-11/data/validation/Raptor Q&A2.docx'
validation_qa_pairs = read_docx(validation_file_path)
validation_examples = create_input_examples(validation_qa_pairs)
validation_dataset = CustomDataset(validation_examples)
validation_dataloader = DataLoader(validation_dataset, shuffle=False, batch_size=8, collate_fn=collate_fn)

# Step 2: Define Evaluation Metrics
def compute_metrics(preds, labels):
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Step 3: Perform Evaluation
def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            texts = batch['texts']
            labels = batch['labels']
            embeddings = model.encode(texts, convert_to_tensor=True)
            preds = (embeddings[:, 0] > embeddings[:, 1]).int().tolist()  # Example threshold-based prediction
            all_preds.extend(preds)
            all_labels.extend(labels)
    metrics = compute_metrics(all_preds, all_labels)
    return metrics

# Load the fine-tuned model
model_save_path = 'fine-tuned-model'
model = SentenceTransformer(model_save_path)

# Evaluate the model
evaluation_metrics = evaluate_model(model, validation_dataloader)
print(f"Evaluation Metrics: {evaluation_metrics}")

# Step 4: Perform Qualitative Analysis
def qualitative_analysis(model, qa_pairs):
    for question, answer in qa_pairs[:5]:  # Analyze first 5 pairs for simplicity
        question_embedding = model.encode(question, convert_to_tensor=True)
        answer_embedding = model.encode(answer, convert_to_tensor=True)
        print(f"Question: {question}")
        print(f"Answer: {answer}")
        print(f"Similarity: {torch.cosine_similarity(question_embedding, answer_embedding, dim=0).item()}")
        print("---")

qualitative_analysis(model, validation_qa_pairs)


PackageNotFoundError: Package not found at '/home/moraa/Documents/10_academy/week-11/data/validation/Raptor Q&A2.docx'

Implementation