In [33]:
import groq
import os
import pandas as pd
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [34]:
"""
1. Load environment variables from a .env file (if it exists).
2. Retrieve the GROQ_API_KEY from the environment variables.
3. Initialize the Groq client using the retrieved API key.
"""

load_dotenv()

GROQ_API_KEY = os.getenv("GROQ_API_KEY")
client = groq.Client(api_key=GROQ_API_KEY)

In [35]:
"""
1. This code initializes the UnstructuredWordDocumentLoader with the path to the PDF file "contract_interview.pdf".
2. It then loads the docx document and splits it into individual pages or sections.
"""

loader = UnstructuredWordDocumentLoader("contract_interview.docx")
documents = loader.load_and_split()

print(f"Total documents loaded: {len(documents)}")

Total documents loaded: 38


In [38]:
"""
This code initializes a RecursiveCharacterTextSplitter to split documents into smaller chunks.

Parameters:
    - chunk_size: Maximum size of each chunk (in characters).
    - chunk_overlap: Number of overlapping characters between consecutive chunks.
    - length_function: Function to calculate the length of the text (default is `len`).

The text splitter is then used to split the input documents into smaller, manageable chunks.
"""

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,  
    chunk_overlap=300,  
    length_function=len,
)

text_chunks = text_splitter.split_documents(documents)
print(f"Total chunks: {len(text_chunks)}")

Total chunks: 100


In [40]:
def extract_clauses_with_groq(text_chunk):
    
    """
    Extracts all occurrences of the "Audit & Inspection Rights" clause from a given contract text using the Groq API.

    Args:
        text_chunk (str): The contract text from which to extract the clauses.

    Returns:
        str: The extracted "Audit & Inspection Rights" clauses as a string, formatted by the Groq API.

    """
    
    prompt = """
    Extract all occurrences of the "Audit & Inspection Rights" clause from the following contract text. 
    The clause should include:
    1. The right of one party to audit the books and records of another party.
    2. The right of one party to give or receive access to the books and records of another party.
    3. The right to inspect or examine the premises, books, accounts, records, papers, or other specified items.
    4. The obligation of one party to maintain or retain records, files, books, accounts, or papers for the purpose of audit or inspection.
    5. The obligation of a party to follow certain procedures and rules with respect to conducting the audit.

    Exclude:
    1. Governmental and regulatory audits and inspections.
    2. Inspection rights on the delivery of products specifically if they are related to acceptance testing.
    3. Definitions for “auditor”.

    Contract Text:
    {text_chunk}
    """

    response = client.chat.completions.create(
        model="mixtral-8x7b-32768",
        messages=[{"role": "user", "content": prompt.format(text_chunk=text_chunk)}],
        max_tokens=1024,  
        temperature=0.2,  
    )
    return response.choices[0].message.content.strip()


In [41]:
"""
1. Initialize an empty list to store the extracted clauses from each text chunk.
2. Iterate over each chunk in the list of text chunks.
3. Use the `extract_clauses_with_groq` function to extract clauses from the current chunk's content.
4. Append the extracted clauses to the `extracted_clauses` list.
5. Create a pandas DataFrame with the extracted clauses, storing them in a column named 'extracted_content'.
5. Save the DataFrame to an Excel file named 'extracted_content.xlsx'.
"""

extracted_clauses = []
for chunk in text_chunks:
    clauses = extract_clauses_with_groq(chunk.page_content)
    extracted_clauses.append(clauses)

df=pd.DataFrame({'text_chunk': text_chunks,'Extracted Content': extracted_clauses})
df.to_excel('extracted_content_1.xlsx')
