In [1]:
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
import pandas as pd
import requests
import time
import json
from datetime import datetime
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from langchain.callbacks import get_openai_callback


def printOutput(output):
    print(json.dumps(output, sort_keys=True, indent=3))

In [2]:
openai_api_key = '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'
serpapi_api_key = ''

In [3]:
llm = ChatOpenAI(
    temperature=0,
    max_tokens=2000,
    openai_api_key=openai_api_key
)

In [4]:
person_schema = Object(
    id='person',
    description='Personal information about a person',
    attributes=[
        Text(
            id='first_name',
            description='The first name of a person'
        )
    ],
    examples=[
        ('Alice and Bob are friends', [{'first_name': 'Alice'}, {'first_name': 'Bob'}])
    ]
)

In [5]:
chain = create_extraction_chain(llm, person_schema)

In [6]:
text = """
    My name is Bobby.
    My sister's name is Rachel.
    My brother's name Joe. My dog's name is Spot
"""

output = chain.predict_and_parse(text=text)['data']

printOutput(output)

{
   "person": [
      {
         "first_name": "Bobby"
      },
      {
         "first_name": "Rachel"
      },
      {
         "first_name": "Joe"
      }
   ]
}


In [7]:
output = chain.predict_and_parse(text=('The dog went to the park'))['data']
printOutput(output)

{
   "person": []
}


In [8]:
plant_schema = Object(
    id='plant',
    description='Information about a plant',
    attributes=[
        Text(
            id='plant_type',
            description='The common name of the plant'
        ),
        Text(
            id='color',
            description='The color of the plant'
        ),
        Number(
            id='rating',
            description='The rating of the plant'
        )
    ],
    examples=[
        (
            'Roses are red, lilies are white and a 8 out of 10',
            [
                {'plant_type': 'Roses', 'color': 'red'},
                {'plant_type': 'Lily', 'color': 'white', 'rating': 8}
            ]
        )
    ]
)

In [9]:
text = 'Palm trees are brown with a 6 rating. Sequoia trees are green'

chain = create_extraction_chain(llm, plant_schema)
output = chain.predict_and_parse(text=text)['data']

printOutput(output)

{
   "plant": [
      {
         "color": "brown",
         "plant_type": "Palm trees",
         "rating": "6.0"
      },
      {
         "color": "green",
         "plant_type": "Sequoia trees",
         "rating": ""
      }
   ]
}


In [10]:
parts = Object(
    id='parts',
    description='A single part of a car',
    attributes=[
        Text(
            id='part', description='The name of the part'
        )
    ],
    examples=[
        (
            'the jeep has wheels and windows',
            [
                {'part': 'wheel'},
                {'part': 'window'}
            ]
        )
    ]
)

cars_schema = Object(
    id='car',
    description='Information about a car',
    examples=[
        (
            'the bmw is red and has an engine and steering wheel',
            [
                {'type': 'BMW', 'color': 'red', 'parts': ['engine', 'steering wheel']}
            ]
        )
    ],
    attributes=[
        Text(
            id='type',
            description='The make or brand of the car'
        ),
        Text(
            id='color',
            description='The color of the car'
        ),
        parts
    ]
)

In [11]:
text = 'The blue jeep has rear view mirror, roof, windshield'

chain = create_extraction_chain(
    llm, cars_schema, encoder_or_encoder_class='json')
output = chain.predict_and_parse(text=text)['data']

printOutput(output)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-RoeWzfsGX1BMb27fi9TWyy1L on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-RoeWzfsGX1BMb27fi9TWyy1L on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit ht

{
   "car": {
      "color": "blue",
      "parts": [
         {
            "part": "rear view mirror"
         },
         {
            "part": "roof"
         },
         {
            "part": "windshield"
         }
      ],
      "type": "jeep"
   }
}


In [12]:
prompt = chain.prompt.format_prompt(text=text).to_string()
print(prompt)

Your goal is to extract structured information from the user's input that matches the form described below. When extracting information please make sure it matches the type information exactly. Do not add any attributes that do not appear in the schema shown below.

```TypeScript

car: { // Information about a car
 type: string // The make or brand of the car
 color: string // The color of the car
 parts: { // A single part of a car
  part: string // The name of the part
 }
}
```


Please output the extracted information in JSON format. Do not output anything except for the extracted information. Do not add any clarifying information. Do not add any fields that are not in the schema. If the text contains attributes that do not appear in the schema, please ignore them. All output must be in JSON format and follow the schema specified above. Wrap the JSON in <json> tags.



Input: the bmw is red and has an engine and steering wheel
Output: <json>{"car": [{"type": "BMW", "color": "red", "

In [13]:
schema = Object(
    id="forecaster",
    description=(
        "User is controling an app that makes financial forecasts. "
        "They will give a command to update a forecast in the future"
    ),
    attributes=[
        Text(
            id="year",
            description="Year the user wants to update",
            examples=[("please increase 2014's customers by 15%", "2014")],
            many=True,
        ),
        Text(
            id="metric",
            description="The unit or metric a user would like to influence",
            examples=[("please increase 2014's customers by 15%", "customers")],
            many=True,
        ),
        Text(
            id="amount",
            description="The quantity of a forecast adjustment",
            examples=[("please increase 2014's customers by 15%", ".15")],
            many=True,
        )
    ],
    many=False,
)

In [14]:
chain = create_extraction_chain(llm, schema, encoder_or_encoder_class='json')
output = chain.predict_and_parse(text='please add 15 more units sold to 2023')['data']

printOutput(output)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-RoeWzfsGX1BMb27fi9TWyy1L on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 2.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-RoeWzfsGX1BMb27fi9TWyy1L on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit ht

{
   "forecaster": {
      "amount": [
         "15"
      ],
      "metric": [
         "units sold"
      ],
      "year": [
         "2023"
      ]
   }
}


### Real World Example

In [15]:
llm = ChatOpenAI(
    temperature=0,
    max_tokens=2000, 
    openai_api_key=openai_api_key
)

In [16]:
def pull_from_greenhouse(board_token):
    url = f'https://boards-api.greenhouse.io/v1/boards/{board_token}/jobs?content=true'
    try:
        response = requests.get(url)
    except:
        print('Whoops, error')
        return 
    status_code = response.status_code 
    jobs = response.json()['jobs']
    print(f'{board_token}: {status_code}, Found {len(jobs)} jobs')
    return jobs 

In [17]:
jobs = pull_from_greenhouse('okta')

Whoops, error


In [18]:
job_index = 0

print('Preview:\n')
print(json.dumps(jobs[job_index])[:400])

Preview:



TypeError: 'NoneType' object is not subscriptable

In [None]:
def describeJob(job_description):
    print(f"Job ID: {job_description['id']}")
    print(f"Link: {job_description['absolute_url']}")
    print(f"Updated At: {datetime.fromisoformat(job_description['updated_at'])}")
    print(f"Title: {job_description['title']}\n")
    print(f"Content:\n{job_description['content'][:550]}")

In [None]:
job_id = 4858786

job_description = [item for item in jobs if item['id'] == job_id][0]
    
describeJob(job_description)

Job ID: 4858786
Link: https://www.okta.com/company/careers/opportunity/4858786?gh_jid=4858786
Updated At: 2023-05-12 11:53:14-04:00
Title: Anaplan Manager

Content:
&lt;div class=&quot;content-intro&quot;&gt;&lt;p&gt;&lt;span style=&quot;color: #000000;&quot;&gt;&lt;strong&gt;Get to know Okta&lt;/strong&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p&gt;&lt;span style=&quot;color: #000000;&quot;&gt;&lt;br&gt;&lt;/span&gt;Okta is The World’s Identity Company. We free everyone to safely use any technology—anywhere, on any device or app. Our Workforce and Customer Identity Clouds enable secure yet flexible access, authentication, and automation that transforms how people move through the digital world, putting Identity at t


In [None]:
soup = BeautifulSoup(job_description['content'], 'html.parser')

In [None]:
text = soup.get_text() 
text = md(text)
print(text[:600])

**Get to know Okta**


  
Okta is The World’s Identity Company. We free everyone to safely use any technology—anywhere, on any device or app. Our Workforce and Customer Identity Clouds enable secure yet flexible access, authentication, and automation that transforms how people move through the digital world, putting Identity at the heart of business security and growth.   
  
At Okta, we celebrate a variety of perspectives and experiences. We are not looking for someone who checks every single box, we’re looking for lifelong learners and people who can make us better with their unique experien


In [None]:
tools = Object(
    id='tools',
    description="""
        A tool, application, or other company that is listed in a job description.
        Analytics, eCommerce and GTM are not tools
    """,
    attributes=[
        Text(
            id='tool',
            description='The name of a tool or company'
        )
    ],
    examples=[
        (
            'Experience in working with Netsuite, or Looker a plus',
            [
                {'tool': 'Netsuite'},
                {'tool': 'Looker'}
            ]
        ),
        (
            'Experience with Microsoft Excel',
            [
                {'tool': 'Microsoft Excel'}
            ]
        ),
        (
            'You must know AWS to do well in the job',
            [
                {'tool': 'AWS'}
            ]
        ),
        (
            'Troubleshooting customer issues and debugging from logs (Splunk, Syslogs, etc.)',
            [
                {'tool': 'Splunk'}
            ]
        )
    ],
    many=True
 )

In [None]:
chain = create_extraction_chain(llm, tools, input_formatter='triple_quotes')

output = chain.predict_and_parse(text=text)['data']
printOutput(output)

{
   "tools": [
      {
         "tool": "Anaplan"
      },
      {
         "tool": "Hyperion"
      },
      {
         "tool": "Tableau"
      }
   ]
}


In [None]:
salary_range = Object(
    id='salary_range',
    description="""
        The range of salary offered for a job mentioned in a job description
    """,
    attributes=[
        Number(
            id='low_end',
            description='The low end of a salary range'
        ),
        Number(
            id='high_end',
            description='The high end of a salary range'
        )
    ],
    examples=[
        (
            'This position will make between $140 thousand and $230,000.00',
            [
                {'low_end': 140000, 'high_end': 230000},
            ]
        )
    ]
)

In [None]:
jobs = pull_from_greenhouse('okta')

okta: 200, Found 149 jobs


In [None]:
job_id = 4858786

job_description = [item for item in jobs if item['id'] == job_id][0]
describeJob(job_description)

text = soup.get_text() 
text = md(text)
print(text[:600])

Job ID: 4858786
Link: https://www.okta.com/company/careers/opportunity/4858786?gh_jid=4858786
Updated At: 2023-05-12 11:53:14-04:00
Title: Anaplan Manager

Content:
&lt;div class=&quot;content-intro&quot;&gt;&lt;p&gt;&lt;span style=&quot;color: #000000;&quot;&gt;&lt;strong&gt;Get to know Okta&lt;/strong&gt;&lt;/span&gt;&lt;/p&gt;
&lt;p&gt;&lt;span style=&quot;color: #000000;&quot;&gt;&lt;br&gt;&lt;/span&gt;Okta is The World’s Identity Company. We free everyone to safely use any technology—anywhere, on any device or app. Our Workforce and Customer Identity Clouds enable secure yet flexible access, authentication, and automation that transforms how people move through the digital world, putting Identity at t
**Get to know Okta**


  
Okta is The World’s Identity Company. We free everyone to safely use any technology—anywhere, on any device or app. Our Workforce and Customer Identity Clouds enable secure yet flexible access, authentication, and automation that transforms how people move t

In [None]:
chain = create_extraction_chain(llm, salary_range)
output = chain.predict_and_parse(text=text)['data']

printOutput(output)

{
   "salary_range": [
      {
         "high_end": "165000",
         "low_end": "122000"
      }
   ]
}


In [None]:
with get_openai_callback() as cb:
    result = chain.predict_and_parse(text=text)
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Successful Requests: {cb.successful_requests}")
    print(f"Total Cost (USD): ${cb.total_cost}")

Total Tokens: 1572
Prompt Tokens: 1561
Completion Tokens: 11
Successful Requests: 1
Total Cost (USD): $0.0031439999999999997
