In [1]:
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import io

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as fh:
        # iterate over all pages of PDF document
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True,):
            # creating a resoure manager
            resource_manager = PDFResourceManager()
            
            # create a file handle
            fake_file_handle = io.StringIO()
            
            # creating a text converter object
            converter = TextConverter(
                                resource_manager, 
                                fake_file_handle, 
                                codec='utf-8', 
                                laparams=LAParams()
                        )

            # creating a page interpreter
            page_interpreter = PDFPageInterpreter(
                                resource_manager, 
                                converter
                            )

            # process current page
            page_interpreter.process_page(page)
            
            # extract text
            text = fake_file_handle.getvalue()
            yield text

            # close open handles
            converter.close()
            fake_file_handle.close()


In [3]:
text = ""
for page in extract_text_from_pdf("service letter sample.pdf"):
    text += ' ' + page

In [4]:
text

" SADISA MANAGEMENT & TECHNOLOGIES\nSADISA M&T\nB.R. No. 1998/M3 No. 5/61 138, Thalgaspedesa, Kirimatimulla, Thelijavila.\n077-8603555/071-8081809\nsadisamanagment@gmail.com\n22nd AUGUST 2023\n\nTO WHOM IT MAY CONCERN.\n\nThis is to certify that DON KODITHUWAKKU KARUNARATHNE RAVEEN SHENOL of 28/A/1,\nWALANA, WELIGAMA, bearer of NIC 200118200862 is employed at our organization as a\n'Production Associator 'on sub-contract basis from 20th July 2021 to 30th March 2022.\n\nDuring this period he has been assigned to the production department of Midigama Air Tire\ndivision at Michelin Lanka (pvt) Ltd.\n\nDuring this period, we found him to be honest, loyal & hard working. He performed all the duties\nentrusted to him to the satisfaction of his superiors.\n\nHe bears a good moral character & I have no hesitation in recommending him to any\nprospective employer who needs his service in the same field.\n\nWe wish him all success in his future endeavors.\n\nThis letter was issued on his request 

In [8]:
from spacy.matcher import Matcher
import spacy
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
import pandas as pd
import re

In [9]:
def change_month_to_number(text):
    month_dict = {
        'JANUARY': '01',
        'FEBRUARY': '02',
        'MARCH': '03',
        'APRIL': '04',
        'MAY': '05',
        'JUNE': '06',
        'JULY': '07',
        'AUGUST': '08',
        'SEPTEMBER': '09',
        'OCTOBER': '10',
        'NOVEMBER': '11',
        'DECEMBER': '12'
    }
    for month, value in month_dict.items():
        text = re.sub(r'\b{}\b'.format(month), value, text.upper())
    return text

In [10]:
tt = change_month_to_number(text)

In [12]:
tt = re.sub(r'(\d{2})(?:ST|ND|RD|TH) (\d{2}) (\d{4})', r'\1 \2 \3', tt)
tt = re.sub(r'\.', ' ', tt)

In [13]:
lines = tt.split("\n")

for indx, line in enumerate(lines):
    # print(line)
    if " TO " in line:
        # dates = re.findall(r'\b\d{2}(?:ST|ND|RD|TH) \d{2} \d{4}\b', line)
        dates = re.findall(r'\b\d{2} \d{2} \d{4}\b', line)
        if dates:
            date_indexes = [line.index(date) for date in dates]
            to_index = line.index(" TO ")
            
            for date_index in date_indexes:
                print("Date range:", dates[date_indexes.index(date_index)], "to", dates[date_indexes.index(date_index)+1])
                break

Date range: 20 07 2021 to 30 03 2022


In [21]:
# Process the text with spaCy
doc = nlp(text)

# Extract person names
names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

print("Extracted Person Names:", names)

Extracted Person Names: ['KARUNARATHNE RAVEEN SHENOL', 'WELIGAMA', 'Krishantha']


In [23]:
pattern = r'\b(?:Production Associator|Manager Administration|Developer|Analyst|Engineer)\b'

roles = re.findall(pattern, text)
print("Extracted Job Roles:", roles)

Extracted Job Roles: ['Production Associator', 'Manager Administration']


In [24]:
pattern = r'\bNIC\s*(\d{9,12}[A-Z]?)\b|\b(\d{9,12}[A-Z]?)\s*NIC\b'

# Find all matches
matches = re.findall(pattern, text)

# Extract NIC numbers from matched groups
nic_numbers = [match[0] or match[1] for match in matches]

print("Extracted NIC Numbers:", nic_numbers)

Extracted NIC Numbers: ['200118200862']


In [25]:
pattern_ofz = r'\b(?:Sadisa Management & Technologies)\b'

ofz = re.findall(pattern_ofz, text)
print("Extracted ofz:", ofz)

Extracted ofz: ['Sadisa Management & Technologies']


In [4]:
import requests
import json

url = "http://127.0.0.1:8000/extract-data/"
pdf_file_path = "service letter sample.pdf"
validation_input = {
    "names": "John Doe",
    "roles": "Developer",
    "nic_numbers": "199811820086V",
    "date_ranges": "01 01 2021 to 31 12 2021"
}
ofz_term = "Sadisa"

with open(pdf_file_path, 'rb') as f:
    files = {
        'pdf_file': f,
    }
    data = {
        'ofz_term': ofz_term,
        'validation_input': json.dumps(validation_input)
    }
    response = requests.post(url, files=files, data=data)

print(response.json())


{'detail': [{'loc': ['body', 'validation_input'], 'msg': 'value is not a valid dict', 'type': 'type_error.dict'}]}


In [5]:
import requests
import json

url = "http://127.0.0.1:8000/extract-data/"
validation_input = {
    "names": "John Doe",
    "roles": "Developer",
    "nic_numbers": "199811820086V",
    "date_ranges": "01 01 2021 to 31 12 2021"
}
ofz_term = "Sadisa"

with open(pdf_file_path, 'rb') as f:
    files = {
        'pdf_file': f,
    }
    data = {
        'ofz_term': ofz_term,
        'validation_input': json.dumps(validation_input)  # Encode validation_input as a JSON string
    }
    response = requests.post(url, files=files, data=data)

print(response.json())


{'detail': [{'loc': ['body', 'validation_input'], 'msg': 'value is not a valid dict', 'type': 'type_error.dict'}]}


In [10]:
from datetime import datetime

def parse_date_range(date_range_str):
    start_str, end_str = date_range_str.split(' to ')
    start_date = datetime.strptime(start_str.strip(), '%d %m %Y')
    end_date = datetime.strptime(end_str.strip(), '%d %m %Y')
    return start_date, end_date

# validation_input = {'names': 'John Doe','roles': 'Developer','nic_numbers': '199811820086V','date_ranges': '01 01 2021 to 31 12 2021'}
# extracted_data = {'names': ['KARUNARATHNE RAVEEN SHENOL', 'WELIGAMA', 'Krishantha','John Doe'],'roles': ['Production Associator', 'Manager Administration','Developer'],'nic_numbers': ['200118200862'],'date_ranges': ['Date range: 20 07 2021 to 30 03 2022']}

validation_input = {'names': ['John Doe'], 'roles': ['Manager'], 'nic_numbers': ['199811820086V'], 'date_ranges': ['01 01 2020 to 31 12 2020']}
extracted_data = {'names': ['KARUNARATHNE RAVEEN SHENOL', 'WELIGAMA', 'Krishantha'], 'roles': ['Production Associator', 'Manager Administration'], 'nic_numbers': ['200118200862'], 'ofz': [], 'date_ranges': ['Date range: 20 07 2021 to 30 03 2022']}

validations = {
    "name" : False,
    "role" : False,
    "nic_number" : False,
    "date_ranges": False
}
# Check if the validation_input name is in the extracted_data names
if validation_input['names'] in extracted_data['names']:
    print("Valid")
    validations["name"] = True
if validation_input['roles'] in extracted_data['roles']:
    print("Valid")
    validations["role"] = True
if validation_input['nic_numbers'] in extracted_data['nic_numbers']:
    print("Valid")
    validations["nic_number"] = True

# Parse validation_input date range
val_start, val_end = parse_date_range(validation_input['date_ranges'])

# Check if the validation_input date range is found in any extracted_data date ranges
for range_str in extracted_data['date_ranges']:
    # Extract date range from the string (removing 'Date range:' prefix)
    extracted_range = range_str.replace('Date range:', '').strip()
    ext_start, ext_end = parse_date_range(extracted_range)
    
    # Check for overlap or exact match
    if val_start <= ext_end and val_end >= ext_start:
        validations["date_ranges"] = True
        break

print(validations)

AttributeError: 'list' object has no attribute 'split'

In [12]:
from datetime import datetime

def parse_date_range(date_range_str):
    start_str, end_str = date_range_str.split(' to ')
    start_date = datetime.strptime(start_str.strip(), '%d %m %Y')
    end_date = datetime.strptime(end_str.strip(), '%d %m %Y')
    return start_date, end_date

validation_input = {'names': ['John Doe'], 'roles': ['Manager'], 'nic_numbers': ['199811820086V'], 'date_ranges': ['20 07 2021 to 30 03 2022']}
extracted_data = {'names': ['KARUNARATHNE RAVEEN SHENOL', 'WELIGAMA', 'Krishantha'], 'roles': ['Production Associator', 'Manager Administration'], 'nic_numbers': ['200118200862'], 'ofz': [], 'date_ranges': ['Date range: 20 07 2021 to 30 03 2022']}

validations = {
    "name": False,
    "role": False,
    "nic_number": False,
    "date_ranges": False
}

# Check if the validation_input name is in the extracted_data names
if validation_input['names'][0] in extracted_data['names']:
    print("Valid name")
    validations["name"] = True

# Check if the validation_input role is in the extracted_data roles
if validation_input['roles'][0] in extracted_data['roles']:
    print("Valid role")
    validations["role"] = True

# Check if the validation_input nic_number is in the extracted_data nic_numbers
if validation_input['nic_numbers'][0] in extracted_data['nic_numbers']:
    print("Valid NIC number")
    validations["nic_number"] = True

# Parse validation_input date range
val_start, val_end = parse_date_range(validation_input['date_ranges'][0])

# Check if the validation_input date range is found in any extracted_data date ranges
for range_str in extracted_data['date_ranges']:
    # Extract date range from the string (removing 'Date range:' prefix)
    extracted_range = range_str.replace('Date range:', '').strip()
    ext_start, ext_end = parse_date_range(extracted_range)
    
    # Check for overlap or exact match
    if val_start <= ext_end and val_end >= ext_start:
        validations["date_ranges"] = True
        break

print(validations)


{'name': False, 'role': False, 'nic_number': False, 'date_ranges': True}


In [8]:
from datetime import datetime

# Function to parse date ranges
def parse_date_range(date_range_str):
    start_str, end_str = date_range_str.split(' to ')
    start_date = datetime.strptime(start_str.strip(), '%d %m %Y')
    end_date = datetime.strptime(end_str.strip(), '%d %m %Y')
    return start_date, end_date

validation_input = {'date_ranges': '01 01 2021 to 31 12 2021'}
extracted_data = {'date_ranges': ['Date range: 20 07 2021 to 30 03 2022']}
validations = {"date_ranges": False}

# Parse validation_input date range
val_start, val_end = parse_date_range(validation_input['date_ranges'])

# Check if the validation_input date range is found in any extracted_data date ranges
for range_str in extracted_data['date_ranges']:
    # Extract date range from the string (removing 'Date range:' prefix)
    extracted_range = range_str.replace('Date range:', '').strip()
    ext_start, ext_end = parse_date_range(extracted_range)
    
    # Check for overlap or exact match
    if val_start <= ext_end and val_end >= ext_start:
        validations["date_ranges"] = True
        break

print(validations)


{'date_ranges': True}
