In [None]:
import os
import re
from PyPDF2 import PdfReader

# Folder holding the contract PDFs, and the mapping {file name: extracted text}.
folder_path = r"final"
output = {}

# os.listdir() returns entries in arbitrary, platform-dependent order. The
# contract numbering used later follows the processing order, so sort the
# listing (case-insensitively) to make every run reproducible.
for file in sorted(os.listdir(folder_path), key=str.lower):
    if not file.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(folder_path, file)
    print(f"\n📄 Processing: {file}")

    # A corrupt or unreadable PDF is reported and skipped instead of aborting
    # the whole run (mirrors the per-page handling below).
    try:
        reader = PdfReader(pdf_path)
        pages = list(reader.pages)
    except Exception as e:
        print(f" Could not read {file}: {e}")
        continue

    all_text = []

    for i, page in enumerate(pages):
        try:
            page_text = page.extract_text()

            if page_text:
                # Collapse every whitespace run (including line breaks) to one space.
                page_text = re.sub(r"\s+", " ", page_text).strip()
                all_text.append(page_text)
            else:
                print(f" No text extracted on page {i+1} of {file}")

        except Exception as e:
            print(f" Error on page {i+1} of {file}: {e}")

    # Save whole PDF text: one line per page that produced text.
    full_text = "\n".join(all_text)
    output[file] = full_text

    print(f"✔ Extracted {len(all_text)} pages from {file}")

print("\n🎉 DONE extracting all PDFs!")



üìÑ Processing: ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINT VENTURE AGREEMENT.PDF
‚úî Extracted 3 pages from ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINT VENTURE AGREEMENT.PDF

üìÑ Processing: ATENTOSA_07_06_2020-EX-99.1-JOINT FILING AGREEMENT.PDF
‚úî Extracted 3 pages from ATENTOSA_07_06_2020-EX-99.1-JOINT FILING AGREEMENT.PDF

üìÑ Processing: BANGIINC_05_25_2005-EX-10-Premium Managed Hosting Agreement.PDF
‚úî Extracted 2 pages from BANGIINC_05_25_2005-EX-10-Premium Managed Hosting Agreement.PDF

üìÑ Processing: BerkshireHillsBancorpInc_20120809_10-Q_EX-10.16_7708169_EX-10.16_Endorsement Agreement.pdf
‚úî Extracted 12 pages from BerkshireHillsBancorpInc_20120809_10-Q_EX-10.16_7708169_EX-10.16_Endorsement Agreement.pdf

üìÑ Processing: BizzingoInc_20120322_8-K_EX-10.17_7504499_EX-10.17_Endorsement Agreement.pdf
‚úî Extracted 11 pages from BizzingoInc_20120322_8-K_EX-10.17_7504499_EX-10.17_Endorsement Agreement.pdf

üìÑ Processing: BLACKROCKMUNIHO

In [4]:
def dict_to_text(output):
    """Render a {file name: text} mapping as one string.

    Each entry becomes a ``===== <file> =====`` header line followed by the
    file's text and a trailing newline; entries are joined with a newline, in
    the mapping's iteration order. An empty mapping yields an empty string.
    """
    return "\n".join(
        f"===== {file} =====\n{full_text}\n"
        for file, full_text in output.items()
    )

converted_text = dict_to_text(output)
# Show only a short preview: printing the entire corpus floods the cell output
# and bloats the saved notebook.
print(converted_text[:2000])


===== ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINT VENTURE AGREEMENT.PDF =====
EXHIBIT 10.13 JOINT VENTURE AGREEMENT Collectible Concepts Group, Inc. ("CCGI") and Pivotal Self Service Tech, Inc. ("PVSS"), (the "Parties" or "Joint Venturers" if referred to collectively, or the "Party" or Joint Venturer" if referred to singularly), by this Agreement associate themselves as business associates, and not as partners, in the formation of a joint venture (the "Joint Venture"), for the purpose of engaging generally in the business provided for by terms and provisions of this Agreement. 1. Name of the Joint Venture. The name of the Joint Venture will be MightyCell Batteries, and may sometimes be referred to as "MightyCell" or the "Joint Venture" in this Agreement. The principal office and place of business shall be located in 1600 Lower State Road, Doylestown, PA 18901. 2. Scope of the Joint Venture Business. The Joint Venture is formed for the purpose of engaging generally in t

In [5]:
# Remove lines that are repeated too often (e.g., headers/footers)
def remove_repeated_lines(text, max_repeats=3):
    """Drop non-blank lines that occur too often in ``text`` (e.g. headers/footers).

    Args:
        text: Newline-separated text.
        max_repeats: A non-blank line is removed (every occurrence of it) when
            it appears more than this many times. Defaults to 3.

    Returns:
        The remaining lines joined with ``"\\n"``, in their original order.
        Blank and whitespace-only lines are always kept. Occurrences are
        counted on the exact line text, so lines that differ only in
        surrounding whitespace are counted separately.
    """
    from collections import Counter

    lines = text.split("\n")
    # Count every line once up front instead of calling lines.count() per line,
    # which made the original quadratic in the number of lines.
    occurrences = Counter(lines)
    kept = [
        line
        for line in lines
        if not line.strip() or occurrences[line] <= max_repeats
    ]
    return "\n".join(kept)

# Drop lines that are repeated too often across the combined corpus.
# NOTE(review): each page was whitespace-collapsed during extraction and pages
# are joined with "\n", so a "line" here is a whole page (or a "=====" header),
# not a printed line -- confirm this still removes the intended headers/footers.
raw_text = remove_repeated_lines(converted_text)

# General text cleaning
def clean_text(text):
    """Normalise extracted text to a single line of space-separated words.

    Removes soft hyphens and non-whitespace control characters, then collapses
    every run of whitespace (including line breaks and tabs) into one space.
    Leading/trailing whitespace is collapsed to a single space, not stripped.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    # Soft hyphen (U+00AD), written as an escape so it survives re-encoding of
    # this file. The second sequence is the same character after a UTF-8 ->
    # Mac Roman mis-decoding, which is what the earlier literal contained.
    text = text.replace("\u00ad", "").replace("\u00ac\u2260", "")
    # Strip control characters that are not whitespace. This runs before the
    # whitespace collapse so that a removed character cannot leave two
    # adjacent spaces behind.
    text = re.sub(r"[\x00-\x08\x0E-\x1B\x7F]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

# Apply the general cleaning pass; the result is the working copy that the
# following cells refine further.
processed_text = clean_text(raw_text)





In [6]:
# Fix hyphenated words at line breaks: "agree- ment" -> "agreement".
# NOTE(review): line breaks were already collapsed to spaces, so this joins ANY
# "word- word" pair, including intentional suspended hyphens such as
# "pre- and post-closing" (-> "preand post-closing"). Verify on real data.
processed_text = re.sub(r"(\w+)-\s+(\w+)", r"\1\2", processed_text)


In [7]:
# Restore headings in all caps
def restore_headings(text):
    """Mark all-caps lines as markdown level-2 headings.

    A heading is an uppercase letter followed by at least three more uppercase
    letters or spaces, sitting alone between two newline characters. Each one
    is rewritten as ``\\n## <heading>\\n``. A candidate on the very first or
    very last line (no newline on one side) is left unchanged.
    """
    heading_pattern = re.compile(r"(?<=\n)([A-Z][A-Z ]{3,})(?=\n)")
    return heading_pattern.sub(r"\n## \1\n", text)

# NOTE(review): clean_text() has already replaced every newline with a space,
# and the heading pattern only matches text enclosed by newlines, so this call
# currently leaves processed_text unchanged. Run it before whitespace is
# collapsed if heading restoration is actually wanted.
processed_text = restore_headings(processed_text)


In [8]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import JsonOutputParser
from langchain_huggingface import HuggingFaceEndpoint,ChatHuggingFace
from langchain_chroma import Chroma

load_dotenv()

True

In [9]:
# Split the corpus back into an alternating [name, text, name, text, ...] list.
#
# The split is done on the complete "===== <file>.pdf =====" header instead of
# on every "=====": a run of equals signs inside a contract is then not read as
# a delimiter. Empty bodies are kept in place; dropping them (as filtering out
# empty strings did) shifts every later entry by one, so a PDF with no
# extractable text would pair each following name with the wrong text.
header_pattern = re.compile(r"=====\s*([^=]+?\.pdf)\s*=====", re.IGNORECASE)
parts = header_pattern.split(processed_text)
# parts[0] is whatever precedes the first header (normally empty); after it the
# captured names and the bodies alternate.
contracts = [part.strip() for part in parts[1:]]


In [10]:
# Pair each even-indexed entry of `contracts` (a file name) with the entry that
# follows it (the contract text) and render the pair as "<name>\n<text>".
# A trailing name without a following entry is paired with an empty text.
combined_contracts = [
    f"{name}\n{text}"
    for name, text in (
        (contracts[pos], contracts[pos + 1] if pos + 1 < len(contracts) else "")
        for pos in range(0, len(contracts), 2)
    )
]


In [None]:

# Hugging Face endpoint for the Llama 3 8B Instruct model, used for text generation.
# NOTE(review): presumably authenticated through environment variables loaded by
# load_dotenv() earlier -- confirm which variable the endpoint expects.
chat=HuggingFaceEndpoint(
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    task="text-generation"
)
 
# Chat-model wrapper around the endpoint; this is the object piped after the prompt.
model=ChatHuggingFace(llm=chat)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
from pydantic import BaseModel,Field
class ContractClauses(BaseModel):
    # Schema of the JSON object requested from the model for one contract.
    # (Comments rather than a docstring, so the generated JSON schema and the
    # format instructions derived from it gain no description text.)

    # Identifier echoed back by the model (e.g. "contract_01"). The prompt's
    # JSON example asks for it, so the schema has to declare it too; optional
    # so that objects without it still validate.
    contract_id: str | None = None
    # Clause texts; None when the contract has no such clause.
    termination_clause: str | None = None
    confidentiality_clause: str | None = None
    liability_clause: str | None = None
    # Short summary of the contract; always required.
    summary: str

# JSON parser bound to the ContractClauses schema. In this notebook only its
# format instructions are used (they are injected into the prompt); the parser
# is not part of the extraction chain, so model output is kept as raw text.
parser = JsonOutputParser(pydantic_object=ContractClauses,strict=False)


In [13]:

# Prompt for per-contract clause extraction.
# - {text} and {contract_id} are supplied on every call; {format_instructions}
#   is pre-filled from the parser.
# - Literal JSON braces are doubled ({{ }}) so the template engine does not read
#   them as placeholders.
# - The quote-escaping example is written \\"...\\" so the rendered prompt really
#   shows backslash-escaped quotes; a plain \" in this non-raw string would
#   render as a bare quote and contradict the instruction.
clause_extraction_prompt = PromptTemplate(
    template="""
You are a contract-analysis Agent. The input will contain one or more contracts pasted together.


Your task:
1. Extract the contract clauses.
2. For each contract, extract the following four items:
   - Termination Clause
   - Confidentiality Clause
   - Liability Clause
   - Summary of the contract (100-150 words)

  Output **only valid JSON** in this format:

{format_instructions}
{{
  "contract_id": "contract_##",
  "termination_clause": "string or null",
  "confidentiality_clause": "string or null",
  "liability_clause": "string or null",
  "summary": "string"
}}
  
Extraction rules:
- Extract the exact clause text from the contract where possible. Do NOT rewrite unless absolutely necessary.
- If a clause is missing, return null.
- Do not merge clauses from different contracts.
- Escape all internal double quotes inside text fields like this: \\"example\\".
- Never output raw quotes inside JSON strings.

- Do not remove important legal language.
- Do not hallucinate missing content.
- Keep all text clean and without formatting artifacts.
 Contract Text:{text}
 Contract ID: {contract_id}

Output **only valid JSON**, with no text before or after it.
""",
    input_variables=["text", "contract_id"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)


In [14]:
# Prompt piped into the chat model. No output parser is attached, so callers
# read the raw reply from the result's `.content` attribute.
extraction_chain = clause_extraction_prompt | model 

In [15]:
# Raw model replies, one string per processed contract, appended by the batch
# cells below. Re-running this cell empties the list; re-running a batch cell
# appends its replies again.
Final_text_extracted = []

In [16]:
# Batch 1: contracts from index 0 up to (excluding) len(combined_contracts) - 43.
# NOTE(review): the offset is hard-coded, so the batch boundaries move whenever
# the number of contracts changes.
batch_stop = len(combined_contracts) - 43
for i in range(batch_stop):
    contract_id = f"contract_{i+1:02d}"
    payload = {"text": combined_contracts[i], "contract_id": contract_id}
    result = extraction_chain.invoke(payload)
    # Echo the raw reply and keep it for the JSON clean-up step.
    print(result.content)
    Final_text_extracted.append(result.content)
 


{
  "contract_id": "contract_01",
  "termination_clause": null,
  "confidentiality_clause": null,
  "liability_clause": null,
  "summary": "Collectible Concepts Group, Inc. and Pivotal Self Service Tech, Inc. associate themselves as business associates in the formation of a joint venture named MightyCell Batteries. The Joint Venture is formed for the purpose of engaging in the business of marketing batteries and related products, including the display of licensed logos, images, brand names, and other labels. The business activities include the purchase, acquisition of licenses, sale, and distribution of products to retailers and distributors. The Joint Venturers shall have full charge of all affairs of the Joint Venture and shall share income and losses equally. The Joint Venture shall maintain adequate books and records, and the fiscal year shall be the calendar year. The Joint Venture shall commence on March 1, 2003, and shall be effective until February 28, 2004 unless extended by w

In [17]:
# Batch 2: contracts at indices [len - 43, len - 39) of combined_contracts.
batch_start = len(combined_contracts) - 43
batch_stop = len(combined_contracts) - 39
for i in range(batch_start, batch_stop):
    contract_id = f"contract_{i+1:02d}"
    payload = {"text": combined_contracts[i], "contract_id": contract_id}
    result = extraction_chain.invoke(payload)
    # Echo the raw reply and keep it for the JSON clean-up step.
    print(result.content)
    Final_text_extracted.append(result.content)
 


{
  "contract_id": "contract_08",
  "termination_clause": null,
  "confidentiality_clause": null,
  "liability_clause": "Party A shall adjust its maintenance time based on the production schedule of Party B. If Party A affects the production of Party B, Party A shall compensate for the loss. Party A cannot transfer or mortgage the CDQ and CDQ power generation systems without the consent of Party B, otherwise it shall be responsible for the losses. The CDQ and CDQ power generation systems shall comply with the national environmental protection standards. If the environment is polluted during the operation of the power plant, Party A shall bear the liability. If the power generation causes upper level power network, each party shall bear their own liabilities based on the determination of the upper level power network operator.",
  "summary": "Xi‚Äôan Zhonghong New Energy Technology Co., Ltd. and Boxing County Chengli Gas Supply Co., Ltd. entered into a Project Cooperation Agreement for 

In [18]:
# Batch 3: contracts at indices [len - 39, len - 30) of combined_contracts.
batch_start = len(combined_contracts) - 39
batch_stop = len(combined_contracts) - 30
for i in range(batch_start, batch_stop):
    contract_id = f"contract_{i+1:02d}"
    payload = {"text": combined_contracts[i], "contract_id": contract_id}
    result = extraction_chain.invoke(payload)
    # Echo the raw reply and keep it for the JSON clean-up step.
    print(result.content)
    Final_text_extracted.append(result.content)
 


{
    "contract_id": "contract_12",
    "termination_clause": null,
    "confidentiality_clause": null,
    "liability_clause": null,
    "summary": "Exhibit A JOINT FILING AGREEMENT Pursuant to and in accordance with the Securities Exchange Act of 1934, as amended, and the rules and regulations thereunder (the \u201cExchange Act\u201d) the undersigned hereby agree to the joint filing of Clarus IV-A, L.P., Clarus IV-B, L.P., Clarus IV-C, L.P., Clarus IV-D, L.P., Clarus IV GP, L.P., Blackstone Clarus GP L.P., Blackstone Clarus GP L.L.C., Blackstone Holdings II L.P., Blackstone Holdings I/II GP L.L.C., The Blackstone Group Inc., Blackstone Group Management L.L.C. and Stephen A. Schwarzman, on behalf of each of them of any filing required by such party under Section 13 of the Exchange Act or any rule or regulation thereunder (including any amendment, restatement, supplement, and/or exhibit thereto) with respect to securities of Galera Therapeutics, Inc., a Delaware corporation, and furthe

In [19]:
# Batch 4: contracts at indices [len - 30, len - 10) of combined_contracts.
batch_start = len(combined_contracts) - 30
batch_stop = len(combined_contracts) - 10
for i in range(batch_start, batch_stop):
    contract_id = f"contract_{i+1:02d}"
    payload = {"text": combined_contracts[i], "contract_id": contract_id}
    result = extraction_chain.invoke(payload)
    # Echo the raw reply and keep it for the JSON clean-up step.
    print(result.content)
    Final_text_extracted.append(result.content)
 


{
  "contract_id": "contract_21",
  "termination_clause": null,
  "confidentiality_clause": null,
  "liability_clause": null,
  "summary": "EXHIBIT D JOINT FILING AGREEMENT MFA FINANCIAL, INC. In accordance with Rule 13d-1(k) under the Securities Exchange Act of 1934, as amended, the undersigned hereby confirm the agreement by and among them to the joint filing on behalf of them of the Statement on Schedule 13D and any and all further amendments thereto, with respect to the securities of the above referenced issuer, and that this Agreement be included as an Exhibit to such filing. This Agreement may be executed in any number of counterparts each of which shall be deemed to be an original and all of which together shall be deemed to constitute one and the same Agreement. IN WITNESS WHEREOF, the undersigned hereby execute this Agreement as of July 6, 2020. ATHENE ANNUITY AND LIFE COMPANY By: Apollo Insurance Solutions Group LP, its investment adviser By: AISG GP Ltd., its general partner

In [20]:
# Batch 5: contracts at indices [len - 10, len - 4) of combined_contracts.
batch_start = len(combined_contracts) - 10
batch_stop = len(combined_contracts) - 4
for i in range(batch_start, batch_stop):
    contract_id = f"contract_{i+1:02d}"
    payload = {"text": combined_contracts[i], "contract_id": contract_id}
    result = extraction_chain.invoke(payload)
    # Echo the raw reply and keep it for the JSON clean-up step.
    print(result.content)
    Final_text_extracted.append(result.content)


{
  "contract_id": "contract_41",
  "termination_clause": "In the event that either party materially or repeatedly defaults in the performance of any of its duties or obligations hereunder and does not substantially cure such default within thirty days after being given written notice specifying the default, or, with respect to those defaults which cannot reasonably be cured within thirty days, if the defaulting party fails to proceed promptly after being given such notice to commence curing the default and thereafter to reasonably proceed to cure the same, then the party not in default may, by giving written notice to the defaulting party, terminate this Agreement as of a date specified in such notice of termination.",
  "confidentiality_clause": "SHPS and HPS each agree that all information communicated to it by the other will be held in strict confidence and will be used only for purposes of this Agreement, and that no such information will be disclosed by the recipient party, its a

In [21]:
# Batch 6: the last four entries of combined_contracts, indices [len - 4, len).
batch_start = len(combined_contracts) - 4
batch_stop = len(combined_contracts)
for i in range(batch_start, batch_stop):
    contract_id = f"contract_{i+1:02d}"
    payload = {"text": combined_contracts[i], "contract_id": contract_id}
    result = extraction_chain.invoke(payload)
    # Echo the raw reply and keep it for the JSON clean-up step.
    print(result.content)
    Final_text_extracted.append(result.content)


{
  "contract_id": "contract_47",
  "termination_clause": "Upon any termination of this Agreement, eGain shall immediately cease providing all Hosting Services, and Customer shall no longer have access to the Software or the eGain System. Except in the event of termination for Customer's breach, eGain shall provide Customer with an electronic copy of the final Reports (covering the month just prior to termination of this Agreement). eGain shall be entitled to retain a copy (whether electronic or otherwise) of the Online Messages and the Reports for its records and internal purposes and shall not disclose such Online Messages or Reports to any third party except as permitted under Section 4. Within fifteen (15) days of any termination of this Agreement, Customer shall pay to eGain all unpaid fees accrued prior to termination. Sections 4, 5 (as to amounts accrued but unpaid), 7, 8, 10.2 and 12 and Exhibit A (as to amounts accrued but unpaid) shall survive any expiration or termination of

In [47]:
# Number of raw model replies collected so far; after running every batch cell
# once this should equal len(combined_contracts).
len(Final_text_extracted)

50

In [33]:
import json
import re

def manual_json_repair(raw):
    """Salvage key/value pairs from a broken JSON-like string.

    Scans ``raw`` for ``"key": "value"`` and ``"key": null`` pairs with a
    regular expression, so it still works when the text is truncated, lacks
    braces, or is surrounded by other text. Only string and null values are
    recognised; nesting is ignored and a repeated key keeps its last value.

    Args:
        raw: The text to scan (e.g. one raw model reply).

    Returns:
        A dict of the recovered pairs (``null`` becomes ``None``), or ``None``
        when no pair could be found.
    """
    # Flatten line breaks so that a value spread over several lines is matched
    # as one token.
    s = raw.replace('\n', ' ').replace('\r', ' ').strip()

    obj = {}
    # Keys may not contain a double quote: a lazy ".*?" could start at a stray
    # quote and swallow everything up to the next '":', yielding a garbage key
    # and hiding the real pair.
    pattern = re.compile(r'"([^"]*)"\s*:\s*(null\b|"(?:\\.|[^"])*")')
    for match in pattern.finditer(s):
        key = match.group(1).strip()
        val = match.group(2)
        if val == 'null':
            obj[key] = None
            continue
        try:
            # Decode the token as a JSON string so that every escape (\" \\ \n
            # \uXXXX ...) is resolved; strict=False tolerates raw control
            # characters inside the value.
            obj[key] = json.loads(val, strict=False)
        except ValueError:
            # Not a valid JSON string (e.g. cut off mid-escape): fall back to
            # dropping the surrounding quotes and unescaping \" only.
            obj[key] = val[1:-1].replace('\\"', '"')
    return obj if obj else None

def _parse_llm_json(raw):
    """Parse one raw model reply; return the parsed object, or None if unusable."""
    try:
        return json.loads(raw)
    except Exception:
        # Not strict JSON (truncated, unescaped quotes, ...): try the regex salvage.
        try:
            return manual_json_repair(raw)
        except Exception:
            return None


cleaned_objects = []

for idx, raw in enumerate(Final_text_extracted):
    obj = _parse_llm_json(raw)

    # Empty or unparseable results are reported and left out of the export.
    if obj:
        cleaned_objects.append(obj)
    else:
        print(f"❌ Still failed to parse index {idx}")

# Save combined JSON
with open("Output_of_text_extraction.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_objects, f, indent=4, ensure_ascii=False)

print(f"✅ Finished! Total valid JSON objects: {len(cleaned_objects)}")


‚úÖ Finished! Total valid JSON objects: 50


# Semantic search using embeddings.

In [38]:
def chunk_contract(text, chunk_size=2000, chunk_overlap=250):
    """Split ``text`` into overlapping chunks for embedding.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length, measured in characters
            (``length_function=len``). Defaults to 2000.
        chunk_overlap: Number of characters shared between consecutive chunks.
            Defaults to 250.

    Returns:
        The list of chunk strings produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Coarsest boundary first: paragraph, line, sentence, word, then any
        # character as the last resort.
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    return splitter.split_text(text)


In [None]:
# Split the cleaned corpus into chunks.
# NOTE(review): the whole multi-contract corpus is chunked as one string, so a
# chunk can span the end of one contract and the start of the next -- confirm
# this is intended for retrieval.
chunks = chunk_contract(processed_text)
print(f"Total chunks: {len(chunks)}")


Total chunks: 339


In [44]:
# Inspect the first chunk as a quick sanity check of the splitter output.
chunks[0]

'===== ACCELERATEDTECHNOLOGIESHOLDINGCORP_04_24_2003-EX-10.13-JOINT VENTURE AGREEMENT.PDF ===== EXHIBIT 10.13 JOINT VENTURE AGREEMENT Collectible Concepts Group, Inc. ("CCGI") and Pivotal Self Service Tech, Inc. ("PVSS"), (the "Parties" or "Joint Venturers" if referred to collectively, or the "Party" or Joint Venturer" if referred to singularly), by this Agreement associate themselves as business associates, and not as partners, in the formation of a joint venture (the "Joint Venture"), for the purpose of engaging generally in the business provided for by terms and provisions of this Agreement. 1. Name of the Joint Venture. The name of the Joint Venture will be MightyCell Batteries, and may sometimes be referred to as "MightyCell" or the "Joint Venture" in this Agreement. The principal office and place of business shall be located in 1600 Lower State Road, Doylestown, PA 18901. 2. Scope of the Joint Venture Business. The Joint Venture is formed for the purpose of engaging generally in 

In [45]:
# Embed every chunk and store it in a Chroma collection persisted under "chroma_store".
embedding_model = OpenAIEmbeddings()
vector_store = Chroma.from_texts(
    texts=chunks,
    embedding=embedding_model,
    # Stable ids let a re-run overwrite the chunks stored under the same ids.
    # Without explicit ids every run would add a further copy of each chunk to
    # the persisted collection and searches would return duplicates.
    ids=[f"chunk_{pos}" for pos in range(len(chunks))],
    persist_directory="chroma_store"
)


In [48]:
# querying the vector store
# Read a free-text query from the user and print the single closest chunk.
query = user_input = input("Enter your query: ")
results = vector_store.similarity_search(query, k=1)

for match in results:
    print(match.page_content)

. ARBITRATION: All disputes in connection with this Contract or the execution thereof shall be settled friendly through negotiations. In case no settlement can be reached, the case may then be submitted for arbitration to the Foreign Economic and Trade Arbitration Committee of the China Beijing Council for the Promotion of International Trade in accordance with its Provisional Rules of Procedures by the said Arbitration Committee. The Arbitration shall take place in Beijing and the decision of the Arbitration Committee shall be final and binding upon both partiesÕæ neither party shall seek recourse to a law court nor other authorities to appeal for revision of the decision. Arbitration fee shall be borne by the losing party. 20. This final price is the confidential information. Dissemination, distribution or duplication of this price is strictly prohibited. 5 Source: LOHA CO. LTD., F-1, 12/9/2019 21. Law application It will be governed by the law of the People‚Äôs Republic of China ,ot