In [1]:
!pip install pytesseract python-docx cohere pandas





In [2]:
!pip install pdfplumber





In [3]:
import os
import json
import pandas as pd
import pytesseract
import tempfile
from PIL import Image
from docx import Document
import pdfplumber
from tqdm import tqdm
import cohere

In [4]:
COHERE_API_KEY = "wAxVCab3xnABNI8EkReLEtEa7tnPUAiF7srUPoFC"
co = cohere.Client(COHERE_API_KEY)

In [5]:
pytesseract.pytesseract.tesseract_cmd = r"C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
tempfile.tempdir = "C:/Users/ASUS/AppData/Local/Temp"

In [6]:
def extract_text_from_png(filepath):
    try:
        image = Image.open(filepath).convert("L")
        return pytesseract.image_to_string(
            image,
            config="--psm 6",
            lang="eng"
        )
    except pytesseract.TesseractNotFoundError as e:
        print(f"❌ Tesseract not found. Please ensure it is installed and in PATH. Skipping: {filepath}")
        return ""
    except Exception as e:
        print(f"❌ Error reading PNG file {filepath}: {e}")
        return ""

In [7]:
def extract_text_from_docx(filepath):
    try:
        doc = Document(filepath)
        return "\n".join([p.text for p in doc.paragraphs])
    except Exception as e:
        print(f"❌ Error reading DOCX file {filepath}: {e}")
        return ""

In [8]:
def extract_text_from_pdf(filepath):
    try:
        text = ""
        with pdfplumber.open(filepath) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or ""
        return text
    except Exception as e:
        print(f"❌ Error reading PDF file {filepath}: {e}")
        return ""

In [9]:
def resolve_full_filename(basename, folder):
    for ext in [".docx", ".pdf", ".png"]:
        full_path = os.path.join(folder, basename + ext)
        if os.path.exists(full_path):
            return basename + ext
    return None  # Not found

In [10]:
def extract_metadata_from_text(text):
    prompt = f"""
You are an intelligent assistant trained to extract metadata from legal agreement documents.
Extract the following fields:
- agreement_value
- start_date
- end_date
- renewal_notice_days
- party_one
- party_two

### Example:
Input:
This Rental Agreement is made between John Doe and Jane Smith. The rent is $2500. It starts on 01.01.2022 and ends on 31.12.2022. Notice for renewal is 60 days prior.

Output:
{{
  "agreement_value": "2500",
  "start_date": "01.01.2022",
  "end_date": "31.12.2022",
  "renewal_notice_days": "60",
  "party_one": "John Doe",
  "party_two": "Jane Smith"
}}

### Now extract from the following document:
{text}

Output as JSON:
"""

    try:
        response = co.generate(
            model="command-r-plus",
            prompt=prompt,
            max_tokens=300,
            temperature=0.2
        )
        return response.generations[0].text
    except Exception as e:
        print(f"❌ Cohere API Error: {e}")
        return ""

In [11]:
# train_df = pd.read_csv("data/train.csv")
# print("📁 Files in data/train/:")
# print(os.listdir("data/train"))
# print(train_df.head())

# # Example: Process one file
# filename = train_df.loc[0, "File Name"]
# filepath = os.path.join("data/train", filename)

# if filename.endswith(".png"):
#     raw_text = extract_text_from_png(filepath)
# elif filename.endswith(".docx"):
#     raw_text = extract_text_from_docx(filepath)
#     print("\n--- Raw Text Extracted ---\n")
#     print(raw_text[:1000])  # First 1000 characters
# else:
#     raw_text = ""

# # Extract metadata using Cohere
# output = extract_metadata_from_text(raw_text)
# print("\nExtracted Metadata:\n", output)

In [12]:
train_df = pd.read_csv("data/train.csv")
results = []
skipped_missing = 0

for idx, row in tqdm(train_df.iterrows(), total=len(train_df)):
    basename = row["File Name"]
    filename = resolve_full_filename(basename, folder="data/train")

    if not filename:
        print(f"❌ File not found for: {basename}")
        skipped_missing += 1
        continue

    filepath = os.path.join("data/train", filename)
    ext = os.path.splitext(filename)[-1].lower()

    if ext == ".png":
        raw_text = extract_text_from_png(filepath)
    elif ext == ".docx":
        raw_text = extract_text_from_docx(filepath)
    elif ext == ".pdf":
        raw_text = extract_text_from_pdf(filepath)
    else:
        print(f"⚠️ Skipping unsupported file type: {filename}")
        continue

    if len(raw_text.strip()) < 10:
        print(f"⚠️ Too little text in: {filename}")
        continue

    metadata_json_str = extract_metadata_from_text(raw_text)

    try:
        metadata = json.loads(metadata_json_str) if metadata_json_str.strip().startswith("{") else {}
    except Exception as e:
        print(f"❌ Failed to parse model output for {filename}: {e}")
        metadata = {}

    results.append({
        "File Name": basename,
        "agreement_value": metadata.get("agreement_value"),
        "start_date": metadata.get("start_date"),
        "end_date": metadata.get("end_date"),
        "renewal_notice_days": metadata.get("renewal_notice_days"),
        "party_one": metadata.get("party_one"),
        "party_two": metadata.get("party_two"),
    })

# Save train output safely
os.makedirs("output", exist_ok=True)
try:
    pd.DataFrame(results).to_csv("output/train_predictions.csv", index=False)
    print(f"\n✅ Saved {len(results)} predictions to output/train_predictions.csv")
except PermissionError:
    print("❌ Could not write to output/train_predictions.csv — is it open in Excel?")

print(f"🧾 Skipped due to missing train files: {skipped_missing}")

 30%|████████████████████████▉                                                          | 3/10 [00:09<00:21,  3.08s/it]

❌ File not found for: 24158401-Rental-Agreement


 50%|█████████████████████████████████████████▌                                         | 5/10 [00:10<00:07,  1.47s/it]

❌ Error reading PNG file data/train\36199312-Rental-Agreement.png: [Errno 2] No such file or directory: 'C:\\Users\\ASUS\\AppData\\Local\\Temp\\tess_lbsruptp.txt'
⚠️ Too little text in: 36199312-Rental-Agreement.png


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.64s/it]


✅ Saved 8 predictions to output/train_predictions.csv
🧾 Skipped due to missing train files: 1





In [13]:
test_df = pd.read_csv("data/test.csv")
test_results = []
skipped_test_missing = 0

for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
    basename = row["File Name"]
    filename = resolve_full_filename(basename, folder="data/test")

    if not filename:
        print(f"❌ Test file not found for: {basename}")
        skipped_test_missing += 1
        continue

    filepath = os.path.join("data/test", filename)
    ext = os.path.splitext(filename)[-1].lower()

    if ext == ".png":
        raw_text = extract_text_from_png(filepath)
    elif ext == ".docx":
        raw_text = extract_text_from_docx(filepath)
    elif ext == ".pdf":
        raw_text = extract_text_from_pdf(filepath)
    else:
        print(f"⚠️ Skipping unsupported test file type: {filename}")
        continue

    if len(raw_text.strip()) < 10:
        print(f"⚠️ Too little text in test file: {filename}")
        continue

    metadata_json_str = extract_metadata_from_text(raw_text)

    try:
        metadata = json.loads(metadata_json_str) if metadata_json_str.strip().startswith("{") else {}
    except Exception as e:
        print(f"❌ Failed to parse model output for test file {filename}: {e}")
        metadata = {}

    test_results.append({
        "File Name": basename,
        "agreement_value": metadata.get("agreement_value"),
        "start_date": metadata.get("start_date"),
        "end_date": metadata.get("end_date"),
        "renewal_notice_days": metadata.get("renewal_notice_days"),
        "party_one": metadata.get("party_one"),
        "party_two": metadata.get("party_two"),
    })

# Save test output safely
try:
    pd.DataFrame(test_results).to_csv("output/test_predictions.csv", index=False)
    print(f"\n✅ Saved {len(test_results)} predictions to output/test_predictions.csv")
except PermissionError:
    print("❌ Could not write to output/test_predictions.csv — is it open in Excel?")

print(f"🧾 Skipped due to missing test files: {skipped_test_missing}")

 25%|█████████████████████                                                               | 1/4 [00:00<00:00,  3.92it/s]

❌ Error reading PNG file data/test\24158401-Rental-Agreement.png: [Errno 2] No such file or directory: 'C:\\Users\\ASUS\\AppData\\Local\\Temp\\tess_vv8bo0_i.txt'
⚠️ Too little text in test file: 24158401-Rental-Agreement.png


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  8.02it/s]

❌ Error reading PNG file data/test\95980236-Rental-Agreement.png: [Errno 2] No such file or directory: 'C:\\Users\\ASUS\\AppData\\Local\\Temp\\tess_hjo65s37.txt'
⚠️ Too little text in test file: 95980236-Rental-Agreement.png
❌ Test file not found for: 156155545-Rental-Agreement-Kns-Home
❌ Test file not found for: 228094620-Rental-Agreement

✅ Saved 0 predictions to output/test_predictions.csv
🧾 Skipped due to missing test files: 2





In [18]:
def compute_recall(pred_path="output/train_predictions.csv", gt_path="data/train.csv"):
    # Load both files
    gt_df = pd.read_csv(gt_path)
    pred_df = pd.read_csv(pred_path)

    # Rename ground truth columns to snake_case for comparison
    gt_df = gt_df.rename(columns={
        "Aggrement Value": "agreement_value",
        "Aggrement Start Date": "start_date",
        "Aggrement End Date": "end_date",
        "Renewal Notice (Days)": "renewal_notice_days",
        "Party One": "party_one",
        "Party Two": "party_two"
    })

    # Merge on 'File Name'
    merged = pd.merge(gt_df, pred_df, on="File Name")

    fields = [
        "agreement_value",
        "start_date",
        "end_date",
        "renewal_notice_days",
        "party_one",
        "party_two"
    ]

    print("📊 Field-wise Recall:\n")
    for field in fields:
        true_values = merged[field + "_x"].fillna("").astype(str).str.strip().str.lower()
        pred_values = merged[field + "_y"].fillna("").astype(str).str.strip().str.lower()

        match = (true_values == pred_values) & (true_values != "")
        total = (true_values != "").sum()
        recall = match.sum() / total if total else 0

        print(f"{field:25s}: Recall = {recall:.2%} ({match.sum()} / {total})")

In [19]:
compute_recall()

📊 Field-wise Recall:

agreement_value          : Recall = 37.50% (3 / 8)
start_date               : Recall = 37.50% (3 / 8)
end_date                 : Recall = 12.50% (1 / 8)
renewal_notice_days      : Recall = 0.00% (0 / 7)
party_one                : Recall = 0.00% (0 / 8)
party_two                : Recall = 12.50% (1 / 8)


In [17]:
print("📄 Columns in train.csv:")
print(pd.read_csv("data/train.csv").columns.tolist())

print("\n📄 Columns in output/train_predictions.csv:")
print(pd.read_csv("output/train_predictions.csv").columns.tolist())

📄 Columns in train.csv:
['File Name', 'Aggrement Value', 'Aggrement Start Date', 'Aggrement End Date', 'Renewal Notice (Days)', 'Party One', 'Party Two']

📄 Columns in output/train_predictions.csv:
['File Name', 'agreement_value', 'start_date', 'end_date', 'renewal_notice_days', 'party_one', 'party_two']
