In [36]:
from common_imports import *
from ocr_extractor import extract_text_from_pdf

from getpass import getpass

# LLM Models
from openai import OpenAI
import openai
from pydantic import BaseModel, Field, AfterValidator, WithJsonSchema
import instructor

from typing import Optional, Iterable, List, Annotated
from datetime import date



In [6]:
# Setup OpenAI
if os.getenv("OPEN_AI_KEY") is None:
    if any(['VSCODE' in x for x in os.environ.keys()]):
        print("Please enter password in the VS Code prompt at the top of your VS Code window!")
    
    os.environ['OPENAI_API_KEY'] = getpass("Paste your OpenAI key from: https://platform.openai.com/account/api-keys\n")
    openai.api_key = os.getenv("OPENAI_API_KEY", "")

assert os.getenv("OPENAI_API_KEY", "").startswith("sk-"), "Doesn't look like an API key"
print("OpenAI API key configured")

    

Please enter password in the VS Code prompt at the top of your VS Code window!
OpenAI API key configured


# Create the structure of a resume

Create the structure to help LLMs scrape necessary data from the resume

In [63]:
def day_validator(day: int):
    if day == None:
        day = 15
    if day > 31 or day < 0:
        raise ValueError("Day not in range")
    return day
    

def month_validator(month: int):
    if month == None:
        month = 6
        
    if month > 12 or month < 0:
        raise ValueError("Month not in range")

    return month



class ForgivingDate(BaseModel):
    day: int = Annotated[
        int, 
        AfterValidator(day_validator),
        WithJsonSchema({
            'type': 'int',
            'description': 'the day (optional)'
        })
    ]
    month: int = Annotated[
        int, 
        AfterValidator(month_validator),
        WithJsonSchema({
            'type': 'int',
            'description': 'the month (optional)'
        })
    ]
    year: int


# def date_validator(date: ForgivingDate | str): 
#     if type(date) == str:
#         raise ValueError(f"Date is not of the correct format. Change it based on today's date {date.today()}")


class DateRange(BaseModel):
    start: ForgivingDate
    end: ForgivingDate | str #= Annotated[
    #     ForgivingDate | str,
    #     AfterValidator(date_validator),
    #     WithJsonSchema({
    #         'type': 'ForgivingDate',
    #         'description': f'Today is {date.today()}'
    #     })
    # ]




class Skill(BaseModel):
    name: str
    years: int = Field(description='Years of experience deducted from the dates')



class Experience(BaseModel):
    dates: DateRange
    title: str = Field(description='The name of the role')
    company: str = Field(description='The employer')
    skills: List[Skill]
    points: List[str] = Field(description='Each bullet point in the experience')


class Education(BaseModel):
    college: str = Field(description='Institution from which the person received their degree')
    
    dates: DateRange


class Contact(BaseModel):
    phone_number: str 
    email: str 
    location: str = Field(
        default_factory=str, 
        description='Complete street address wherever possible.'
        )
    
    
    



class Candidate(BaseModel):
    name: str
    contact: Contact
    education: Education
    
    experiences: List[Experience]

class OptionalCandidate(BaseModel):
    result: Optional[Candidate] = Field(default=None)
    error: bool = Field(default=False)
    message: Optional[str]




# Extracting a PDF 

Extract PDF of a resume for a quick RAG 

In [70]:
# Get instructed
text_chunk = extract_text_from_pdf(SETTINGS['source_pdf_file'])


# install client
client = instructor.patch(OpenAI(), mode=instructor.Mode.JSON)


# extractions
extraction = client.chat.completions.create(
    model='gpt-3.5-turbo',#SETTINGS['model']['name'],
    response_model = OptionalCandidate,
    messages=[
        {
            'role': 'system',
            'content': f'Your role is to extract information from the following resume. The present is {date.today()}'
        },
        {
            'role': 'user',
            'content': text_chunk
        }
    ],
) 

INFO - Number of pages: 1


In [72]:
from pprint import pprint
pprint(extraction.model_dump_json(indent=2))

('{\n'
 '  "result": {\n'
 '    "name": "Hemanth Vikash Kannan Rajan",\n'
 '    "contact": {\n'
 '      "phone_number": "+1 540-449-8971",\n'
 '      "email": "hemanthv@vt.edu",\n'
 '      "location": "Brooklyn NYC - 11226"\n'
 '    },\n'
 '    "education": {\n'
 '      "college": "Virginia Tech",\n'
 '      "dates": {\n'
 '        "start": {\n'
 '          "day": 1,\n'
 '          "month": 8,\n'
 '          "year": 2016\n'
 '        },\n'
 '        "end": {\n'
 '          "day": 1,\n'
 '          "month": 5,\n'
 '          "year": 2020\n'
 '        }\n'
 '      }\n'
 '    },\n'
 '    "experiences": [\n'
 '      {\n'
 '        "dates": {\n'
 '          "start": {\n'
 '            "day": 1,\n'
 '            "month": 5,\n'
 '            "year": 2021\n'
 '          },\n'
 '          "end": "Present"\n'
 '        },\n'
 '        "title": "Core Data Scientist",\n'
 '        "company": "Moshman Research",\n'
 '        "skills": [\n'
 '          {\n'
 '            "name": "Custom ML/DL Algori