In [26]:
from common_import import *
from ocr_extractor import extract_text_from_pdf

from getpass import getpass

# LLM Models
from openai import OpenAI
import openai
from pydantic import BaseModel, Field, AfterValidator, WithJsonSchema
import instructor

# Typing
from typing import Optional, Iterable, List, Annotated
from datetime import date

# Print
from pprint import pprint
import json

In [27]:
# Setup OpenAI
# Prompt only when OPENAI_API_KEY -- the same variable that is written and
# asserted on below -- is missing. The previous guard looked at a differently
# spelled variable, so an already-configured key was always overwritten.
if os.getenv("OPENAI_API_KEY") is None:
    # Any environment variable whose name mentions VSCODE is treated as a hint
    # that the password prompt will show up in the VS Code UI.
    if any('VSCODE' in x for x in os.environ):
        print("Please enter password in the VS Code prompt at the top of your VS Code window!")

    # strip(): a pasted key frequently carries stray whitespace or a newline,
    # which would otherwise fail the prefix check below.
    os.environ['OPENAI_API_KEY'] = getpass("Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n").strip()

# Mirror the key onto the module-level setting regardless of whether it came
# from the environment or from the prompt.
openai.api_key = os.getenv("OPENAI_API_KEY", "")

assert os.getenv("OPENAI_API_KEY", "").startswith("sk-"), "Doesn't look like an API key"
print("OpenAI API key configured")

Please enter password in the VS Code prompt at the top of your VS Code window!


AssertionError: Doesn't look like an API key

# Structure of resume

In [13]:
def day_validator(day: Optional[int]) -> int:
    """Return a usable day of the month, substituting 15 when none is given.

    Args:
        day: Day of the month, or ``None`` when it is unknown.

    Returns:
        ``day`` unchanged, or 15 when ``day`` is ``None``.

    Raises:
        ValueError: If the day is greater than 31 or negative.
    """
    # Identity test: `== None` would defer to a custom __eq__ on the argument.
    if day is None:
        day = 15
    # NOTE(review): 0 is accepted by this range check -- confirm that is intended.
    if day > 31 or day < 0:
        raise ValueError("Day not in range")
    return day
    

def month_validator(month: Optional[int]) -> int:
    """Return a usable month number, substituting 6 when none is given.

    Args:
        month: Month number, or ``None`` when it is unknown.

    Returns:
        ``month`` unchanged, or 6 when ``month`` is ``None``.

    Raises:
        ValueError: If the month is greater than 12 or negative.
    """
    # Identity test: `== None` would defer to a custom __eq__ on the argument.
    if month is None:
        month = 6

    # NOTE(review): 0 is accepted by this range check -- confirm that is intended.
    if month > 12 or month < 0:
        raise ValueError("Month not in range")

    return month



class ForgivingDate(BaseModel):
    # A calendar date in which the day and the month may be left out; the
    # validators replace a missing value with a mid-range one.
    #
    # The Annotated[...] metadata has to be the field's *annotation*. Written
    # as `day: int = Annotated[...]` it becomes the field's default value, so
    # the validator and the schema override are never applied.
    #
    # validate_default=True makes the AfterValidator run on the None default as
    # well, so an omitted day/month ends up as the validator's fallback value.
    day: Annotated[
        Optional[int],
        AfterValidator(day_validator),
        WithJsonSchema({
            # JSON Schema spells the type 'integer'; 'int' is not a valid type name.
            'type': 'integer',
            'description': 'the day (optional)'
        })
    ] = Field(default=None, validate_default=True)
    month: Annotated[
        Optional[int],
        AfterValidator(month_validator),
        WithJsonSchema({
            'type': 'integer',
            'description': 'the month (optional)'
        })
    ] = Field(default=None, validate_default=True)
    year: int

    # def __init__(self, date: date):
    #     self.day = day_validator(date.day)
    #     self.month = month_validator(date.month)
    #     self.year = date.year



def _today_as_forgiving_date() -> ForgivingDate:
    """Build a ForgivingDate for the current day from a single clock reading."""
    today = date.today()
    return ForgivingDate(day=today.day, month=today.month, year=today.year)


class DateRange(BaseModel):
    # A start/end pair of dates; `end` falls back to the current day.
    start: ForgivingDate
    # default_factory is called each time a DateRange is built without `end`.
    # A plain `default=` is evaluated once, when the class body runs, so a
    # long-lived kernel would keep handing out the day the cell was executed.
    end: ForgivingDate = Field(description='Date of the end', default_factory=_today_as_forgiving_date)


class Skill(BaseModel):
    # One technical tool together with the number of years it was used.
    # The Field descriptions are written as instructions; presumably they reach
    # the LLM through the generated schema -- TODO confirm.
    name: str = Field(
        description="Extract the technical tools in the following text. Technical tools are generally in 2-3 words",
    )
    years: int = Field(
        description='Years of experience deducted from the (number of days between the dates)/365 and rounded up',
    )




class Experience(BaseModel):
    # A single role held by the candidate: when, where, what, and with which tools.
    dates: DateRange
    title: str = Field(
        description='The name of the role',
    )
    company: str = Field(
        description='The employer',
    )
    skills: List[Skill]
    description: str = Field(
        description='spell check all these sentences. Do not summarize anything',
    )


class Education(BaseModel):
    # The degree-granting institution and the period of study.
    college: str = Field(
        description='Institution from which the person received their degree',
    )
    dates: DateRange


class Contact(BaseModel):
    # How to reach the candidate. phone_number and email are required;
    # location falls back to str(), i.e. the empty string.
    phone_number: str
    email: str
    location: str = Field(description='Complete street address wherever possible.', default_factory=str)

class Candidate(BaseModel):
    # Top-level record for one resume: the person's name, contact details,
    # a single education entry, and a list of work experiences.
    # All four fields are required (none declares a default).
    name: str
    contact: Contact
    education: Education
    
    experiences: List[Experience]

class OptionalCandidate(BaseModel):
    # Envelope around a Candidate: either `result` is filled in, or `error`
    # is set and `message` can carry an explanation.
    result: Optional[Candidate] = Field(default=None)
    error: bool = Field(default=False)
    # Optional[...] only allows None as a value; without an explicit default
    # pydantic v2 still treats the field as required. Defaulting to None lets
    # a successful result omit the message, matching the two fields above.
    message: Optional[str] = Field(default=None)

In [28]:
#  for mistral

from pydantic import BaseModel, Field
from typing import List, Optional
from datetime import date
from common_import import *
from ocr_extractor import extract_text_from_pdf

# Your validator functions and Pydantic models
def day_validator(day: int):
    """Return ``day``, or 15 when it is ``None``.

    Raises:
        ValueError: If the resulting day is negative or greater than 31.
    """
    resolved_day = 15 if day is None else day
    if resolved_day < 0 or resolved_day > 31:
        raise ValueError("Day not in range")
    return resolved_day

def month_validator(month: int):
    """Return ``month``, or 6 when it is ``None``.

    Raises:
        ValueError: If the resulting month is negative or greater than 12.
    """
    resolved_month = 6 if month is None else month
    if resolved_month < 0 or resolved_month > 12:
        raise ValueError("Month not in range")
    return resolved_month

class ForgivingDate(BaseModel):
    # A date split into day / month / year integer fields.
    # NOTE(review): the descriptions say "(optional)" but neither field has a
    # default, and day_validator / month_validator defined in this cell are not
    # attached to them -- confirm whether that is intended.
    day: int = Field(
        description='the day (optional)',
    )
    month: int = Field(
        description='the month (optional)',
    )
    year: int

def _today_as_forgiving_date() -> ForgivingDate:
    """Build a ForgivingDate for the current day from a single clock reading."""
    today = date.today()
    return ForgivingDate(day=today.day, month=today.month, year=today.year)


class DateRange(BaseModel):
    # A start/end pair of dates; `end` falls back to the current day.
    start: ForgivingDate
    # default_factory is called each time a DateRange is built without `end`.
    # A plain `default=` is evaluated once, when the class body runs, so a
    # long-lived kernel would keep handing out the day the cell was executed.
    end: ForgivingDate = Field(description='Date of the end', default_factory=_today_as_forgiving_date)

class Skill(BaseModel):
    # One technical tool together with the number of years it was used.
    # The Field descriptions are written as instructions; presumably they reach
    # the LLM through the generated schema -- TODO confirm.
    name: str = Field(
        description="Extract the technical tools in the following text. Technical tools are generally in 2-3 words",
    )
    years: int = Field(
        description='Years of experience deducted from the (number of days between the dates)/365 and rounded up',
    )

class Experience(BaseModel):
    # A single role held by the candidate: when, where, what, and with which tools.
    dates: DateRange
    title: str = Field(
        description='The name of the role',
    )
    company: str = Field(
        description='The employer',
    )
    skills: List[Skill]
    description: str = Field(
        description='spell check all these sentences. Do not summarize anything',
    )

class Education(BaseModel):
    # The degree-granting institution and the period of study.
    college: str = Field(
        description='Institution from which the person received their degree',
    )
    dates: DateRange

class Contact(BaseModel):
    # How to reach the candidate. phone_number and email are required;
    # location falls back to str(), i.e. the empty string.
    phone_number: str
    email: str
    location: str = Field(
        description='Complete street address wherever possible.',
        default_factory=str,
    )

class Candidate(BaseModel):
    # Top-level record for one resume: the person's name, contact details,
    # a single education entry, and a list of work experiences.
    # All four fields are required (none declares a default).
    name: str
    contact: Contact
    education: Education
    experiences: List[Experience]

class OptionalCandidate(BaseModel):
    # Envelope around a Candidate: either `result` is filled in, or `error`
    # is set and `message` can carry an explanation.
    result: Optional[Candidate] = Field(default=None)
    error: bool = Field(default=False)
    # Optional[...] only allows None as a value; without an explicit default
    # pydantic v2 still treats the field as required. Defaulting to None lets
    # a successful result omit the message, matching the two fields above.
    message: Optional[str] = Field(default=None)


# Extracting data from pdf


In [6]:
#for getting information about person

import requests

# The prompt below is an f-string, so {text_chunk} is substituted the moment
# the assignment runs. Load the resume text here so this cell does not depend
# on a cell further down the notebook having been executed first.
text_chunk = extract_text_from_pdf(SETTINGS['source_pdf_file'])

# Person-extraction prompt. Doubled braces ({{ }}) are literal braces in the
# example JSON; the single-brace {text_chunk} is the only interpolated value.
person_prompt_tpl=f"""From the Resume text for a job aspirant below, extract Entities strictly as instructed below
1. First, look for the Person Entity type in the text and extract the needed information defined below:
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. NEVER create new entity types that aren't mentioned below. Document must be summarized and stored inside Person entity under `description` property
    Entity Types:
    label:'Person',id:string,role:string,description:string //Person Node
2. Description property should be a crisp text summary and MUST NOT be more than 100 characters
3. If you cannot find any information on the entities & relationships above, it is okay to return empty value. DO NOT create fictious data
4. Do NOT create duplicate entities
5. Restrict yourself to extract only Person information. No Position, Company, Education or Skill information should be focussed.
6. NEVER Impute missing values
Example Output JSON:
{{"entities": [{{"label":"Person","id":"person1","role":"Prompt Developer","description":"Prompt Developer with more than 30 years of LLM experience"}}]}}

Question: Now, extract the Person for the text below -

{text_chunk}

I only want json format apart from that all things are unncessesary for me so please don't return it in output.

Answer:
"""



import os  # local import: this cell must not rely on a star-import providing `os`

API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"

# Never commit an access token. Read it from the environment and prompt only
# when it is missing. The token that used to be hard-coded on this line has
# been exposed in the notebook and should be revoked.
HF_TOKEN = os.getenv("HF_TOKEN") or getpass("Paste your Hugging Face access token from: https://huggingface.co/settings/tokens\n").strip()
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

def query(payload, timeout=120):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely and a stalled connection
            would hang the cell.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    # Surface HTTP failures here instead of handing an error body to the caller
    # as if it were a model reply.
    response.raise_for_status()
    return response.json()
	
# Send the person-extraction prompt to the inference endpoint.
output = query({"inputs": person_prompt_tpl})

# Skill-extraction prompt. This is an f-string: {text_chunk} is substituted
# when this assignment runs, so `text_chunk` must already be defined at that
# point. Doubled braces ({{ }}) are literal braces in the example JSON.
skill_prompt_tpl=f"""From the Resume text below, extract Entities strictly as instructed below
1. Look for all Skill, framework and coursework Entities in the text. The`id` property of each entity must be alphanumeric and must be unique among the entities. NEVER create new entity types that aren't mentioned below:
    Entity Definition:
    label:'Skill',id:string,name:string,level:string //Skill Node
2. NEVER Impute missing values
3. If you do not find any level information: assume it as `expert` if the experience in that skill is more than 5 years, `intermediate` for 2-5 years and `beginner` otherwise.
Example Output Format:
{{"entities": [{{"label":"Skill","id":"skill1","name":"Neo4j","level":"expert"}},{{"label":"Skill","id":"skill2","name":"Pytorch","level":"expert"}}]}}

Question: Now, extract entities as mentioned above for the text below -
{text_chunk}

Answer:

please return only answer which is in json format to me apart from that all other details which are mentioned is unnecessary.

"""

# Run the skill-extraction prompt, then show both raw API replies, one per line.
output1 = query({"inputs": skill_prompt_tpl})
print(output, output1, sep="\n")

[{'generated_text': 'From the Resume text for a job aspirant below, extract Entities strictly as instructed below\n1. First, look for the Person Entity type in the text and extract the needed information defined below:\n   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. NEVER create new entity types that aren\'t mentioned below. Document must be summarized and stored inside Person entity under `description` property\n    Entity Types:\n    label:\'Person\',id:string,role:string,description:string //Person Node\n2. Description property should be a crisp text summary and MUST NOT be more than 100 characters\n3. If you cannot find any information on the entities & relationships above, it is okay to return empty value. DO NOT create fictious data\n4. Do NOT create duplicate entities\n5. Restrict yourself to extract only Person information. No Position, Company, Educatio

In [4]:
import requests

import os  # local import: this cell must not rely on a star-import providing `os`

# Resume text extracted from the PDF configured in SETTINGS.
text_chunk = extract_text_from_pdf(SETTINGS['source_pdf_file'])

API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2"

# Never commit an access token. Read it from the environment and prompt only
# when it is missing. The token that used to be hard-coded on this line has
# been exposed in the notebook and should be revoked.
HF_TOKEN = os.getenv("HF_TOKEN") or getpass("Paste your Hugging Face access token from: https://huggingface.co/settings/tokens\n").strip()
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

def query(payload, timeout=120):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely and a stalled connection
            would hang the cell.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    # Surface HTTP failures here instead of handing an error body to the caller
    # as if it were a model reply.
    response.raise_for_status()
    return response.json()
	
# Send the resume text to the endpoint, prefixed with a one-line instruction
# that also states today's date.
output = query({"inputs": f"Your role is to extract information from the following resume. The present is {date.today()}\n"+text_chunk})

INFO - Number of pages: 1


Karan Shah
♂phone(+91)910-692-6865 /envel⌢pekarannshah13@gmail.com /linkedinkaranshaah /githubKarannshah1
Education
Nirma University Jun 2020 – Currently
B.Tech in Computer Science and Engineering 7.66 CGPA
Experience
Kavayahcloud PVT LTD. Jan 2024 – Present
Software Engineer (Intern) Gandhinagar, India
•Created user interactive dashboard, and screens with API integration to improve user experience and design patterns.
•Testing code and debugging backend in CI/CD manner in distributed system resulting to team’s productivity
increased by 25%
•Tech stack: Javascript, OutSystems platform, MS SQL Server, MS Azure
Civic Infotech June 2023 – July 2023
Software Engineer (Intern) Ahmedabad, India
•Developed a procurement portal with the best UI design and UX practice in Agile Software Development Life cycle
which resulted in boosting response time by 50%.
•Written code, contributed to documentation and system design for multiple screens and features like tender create,
bidding details and auct

In [None]:
# Get instructed
text_chunk = extract_text_from_pdf(SETTINGS['source_pdf_file'])


# install client
client = instructor.patch(OpenAI(), mode=instructor.Mode.MD_JSON)


def _extract_resume(response_model):
    """Ask the chat model to parse ``text_chunk`` into ``response_model``.

    Args:
        response_model: The pydantic model class passed to the patched client
            as ``response_model``.

    Returns:
        Whatever the patched ``client.chat.completions.create`` returns for
        that ``response_model``.
    """
    return client.chat.completions.create(
        model=SETTINGS['model']['name'],
        response_model=response_model,
        messages=[
            {
                'role': 'system',
                'content': f'Your role is to extract information from the following resume. The present is {date.today()}'
            },
            {
                'role': 'user',
                # The resume text itself. This used to be `output`, the parsed
                # reply of the Hugging Face endpoint (a list of dicts), while
                # the text loaded at the top of this cell went unused.
                'content': text_chunk
            }
        ],
    )


# extractions
# NOTE(review): two separate API calls are made, as before. The first result
# used to be overwritten immediately; it is now kept under its own name.
# Drop this call if the enveloped variant is not needed.
optional_extraction = _extract_resume(OptionalCandidate)

# `extraction` is what the following cells serialise and write to disk.
extraction = _extract_resume(Candidate)

In [None]:
# Serialise the extraction once, then round-trip it through the json module
# to get a 4-space-indented rendering for display.
json_result = extraction.model_dump_json()
pretty_result = json.dumps(json.loads(json_result), indent=4)
print(pretty_result)

In [None]:
# `with` closes the file even when the write raises. The explicit UTF-8
# encoding keeps the output independent of the platform's default encoding,
# which may be unable to encode non-ASCII characters in the extracted text.
with open('../data/example.json', 'w', encoding='utf-8') as writeFile:
    writeFile.write(json_result)