In [1]:
import pytesseract
import docx
from PIL import Image
import os
import pandas as pd
import ollama

In [2]:
src_train = 'data/train'
files = os.listdir(src_train)
files

['46239065-Standard-Rental-Agreement-Rental-With-Performance-Fee.docx',
 '18325926-Rental-Agreement-1.docx',
 '6683127-House-Rental-Contract-GERALDINE-GALINATO-v2-Page-1.docx',
 '47854715-RENTAL-AGREEMENT.docx',
 '36199312-Rental-Agreement.png',
 '6683129-House-Rental-Contract-Geraldine-Galinato-v2.docx',
 '54770958-Rental-Agreement.png',
 '44737744-Maddireddy-Bhargava-Reddy-Rental-Agreement.docx',
 '50070534-RENTAL-AGREEMENT.docx',
 '54945838-Rental-Agreement.png']

In [9]:
data = pd.read_csv('data/train.csv')

In [65]:
doc_file = os.path.join(src_train, files[1])
doc = docx.Document(doc_file)
doc_text = ' '.join(p.text for p in doc.paragraphs if p.text.strip() != '')

def clean_data(text):
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.replace("\'", ' ')
    text = text.replace('\"', ' ')
    return text
doc_text = clean_data(doc_text)
doc_text

'RENTAL AGREEMENT This deed of rental agreement executed at Bangalore this fifth day of December 2008 between MR.K.Kuttan S/o Kelu Aehari (Late) residing at site No 152 Geethalayam OMH colong S.M. Road 1st main, T.Dasarahalli, Bangalore-57. Here after called the lessor which term shall where the contest so admits mean and includes his legal heirs representatives executors and assign and Sri. P.M. Narayana Namboodri “Laxmi Leela” ground floor 3rd cross Ayyappa Nagar behind Ayyappa Temple, Jalahalli West, Bangalore - 15 referred as tenant which term shall wherever the context so admits mean and includes his legal heirs representations executors assign witnesses. Whereas the lessor is the absolute owner after premises at site No. 8/20 S.M. Road, Jalahalli West, Bangalore - 15 and he has constructed approximate 500 sft area of Ilnd floor with A.C. Sheet roof with power light in the above said premises. Where as the owner has offered to let out the premises for non residential purpose, wher

In [66]:
img = Image.open(os.path.join(src_train, "54945838-Rental-Agreement.png"))
text = pytesseract.image_to_string(img, lang='eng')


text = clean_data(text)
text

'RENTAL AGREEMENT  This Rental Agreement made and executed at Bangalore on this 1st day April two thousand eleven (01/04/2011), between Prof. K. Parthasarathy, S/o. Late T.S.Krishna lyengar aged about 75 years, No. 46, Srinivasa, 2  Main Road, Hanumanthanagar, Bangalore-560019, herein after called “LESSOR” (which expression shall where-ever the context so requires or permits mean  and include here heirs, executors, administrators and assign) of the FIRST PART. AND  Mr. Veerabrahmam Bathini, S/o Mr.Lingaiah Bathini aged about 28 years (Sr. Engineer-PSS working at Power Research and Development Consultants, #5, 11™  cross, 2  stage, West of Chord Road, Bangalore-560086, herein after called “LESSEE” (Tenant) of the SECOND PART.  WITNESSETH AS FOLLOWS:  1. The Lessor hereby covenants with Lessee and warrants that he is the owner and as such as the right and the authority to lease the First Floor of premises No. 146, Krishna Krupa, 3  Cross, 1%t Main, VHBCSL Layout, West of Chord Road, Bang

In [63]:
def get_prompt(text, prev_note=""):
    return f"""
You are a contract parser. From the following legal/rental agreement, extract and return *only* a JSON object with these exact fields:

- "Agreement Value": The monthly payable rent or total financial obligation. It may be mentioned as "rent", "sum of Rs.", etc.
- "Agreement Start Date": The date when the agreement begins. If mentioned indirectly (e.g., "commences from the 1st of April"), interpret accordingly.
- "Agreement End Date": The date the agreement ends. If given as a duration (e.g., "valid for 11 months" or "90 days after start"), compute and convert it.
- "Renewal Notice (Days)": The number of days before the end date that a notice must be given to renew or terminate the agreement. This could be written as "30 days prior notice", "one month notice", "per months", etc.
- "Party One": The name of the first party (e.g., owner, landlord, licensor).
- "Party Two": The name of the second party (e.g., tenant, licensee, lessee).

**Important Formatting Instructions**:
- Dates must be in `dd.mm.yyyy` format.
- If any field is not found or cannot be interpreted, set it as `null`.
- If it says "per months" or similar, interpret it as a monthly payment.
- The Agreement Value should be a number, not a string.

Here is the agreement text:
{text}

previous note (the structure remains the same, update the values as you find them):
{prev_note}
"""

In [64]:
def run_llm(text_chunk, prev_note=""):
    stream = ollama.chat(
        model='qwen3:4b',
        messages=[
            {
                'role': 'system',
                "content": '/set nothink'
            },
            {
                'role': 'user',
                'content': get_prompt(text_chunk, prev_note=prev_note)
            }
        ],
        stream=True
    )

    response = ''
    for chunk in stream:
        response += chunk['message']['content']
    return response

dct = {}
for i in range(0, len(doc_text), 4000):
    note = run_llm(doc_text[:i+4000], prev_note=dct.get('prev_note', ''))
    dct['prev_note'] = note
    print(note)
    print('---')

<think>

</think>

{
  "Agreement Value": 4000,
  "Agreement Start Date": "05.12.2008",
  "Agreement End Date": "15.05.2009",
  "Renewal Notice (Days)": 90,
  "Party One": "MR.K.Kuttan S/o Kelu Aehari (Late)",
  "Party Two": "Sri. P.M. Narayana Namboodri \"Laxmi Leela\""
}
---


In [8]:
data

Unnamed: 0,File Name,Aggrement Value,Aggrement Start Date,Aggrement End Date,Renewal Notice (Days),Party One,Party Two
0,6683127-House-Rental-Contract-GERALDINE-GALINA...,6500,20.05.2007,20.05.2008,15.0,"Antonio Levy S. Ingles, Jr. and/or Mary Rose C...",GERALDINE Q. GALINATO
1,6683129-House-Rental-Contract-Geraldine-Galina...,6500,20.05.2007,20.05.2008,15.0,"Antonio Levy S. Ingles, Jr. and/or Mary Rose C...",GERALDINE Q. GALINATO
2,18325926-Rental-Agreement-1,4000,05.12.2008,31.11.2009,90.0,MR.K.Kuttan,P.M. Narayana Namboodri
3,24158401-Rental-Agreement,12000,01.04.2008,31.03.2009,60.0,Hanumaiah,Vishal Bhardwaj
4,36199312-Rental-Agreement,3800,01.05.2010,31.04.2011,30.0,Balaji.R,Kartheek R
5,44737744-Maddireddy-Bhargava-Reddy-Rental-Agre...,3000,20.09.2010,19.07.2011,,M.V.V. VIJAYA SHANKAR,MADDIREDDY BHARGAVA REDDY
6,47854715-RENTAL-AGREEMENT,9000,01.04.2010,31.02.2011,60.0,P C MATHEW,L GOPINATH
7,50070534-RENTAL-AGREEMENT,10000,01.04.2010,30.03.2011,90.0,P. JohnsonRavikumar,Saravanan BV
8,54770958-Rental-Agreement,8000,01.04.2011,31.03.2012,90.0,K. Parthasarathy,Veerabrahmam Bathini
9,54945838-Rental-Agreement,5500,21.04.2011,19.02.2012,60.0,Asha Ramesh & Ramesh K.N,Sadasivuni Deepthi & Sadasivuni Kiran
