LLM Setup

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()
api_key = os.getenv("API_KEY")

In [3]:
from langchain_groq import ChatGroq
llm = ChatGroq(
    model="llama3-70b-8192",
    temperature = 0,
    groq_api_key = api_key
)

Web Scrapping setup

In [4]:
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://careers.nike.com/lead-data-scientist/job/R-62512")

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [5]:
page_data = loader.load().pop().page_content
page_data

'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLead Data Scientist\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to main content\nOpen Virtual Assistant\n\n\n\n\n\n\n\n\n\n\nHome\n\n\nCareer Areas\n\n\nTotal Rewards\n\n\nLife@Nike\n\n\nPurpose\n\n\n\n\n\n\n\n\n\n\nLanguage\n\n\n\n\n\nSelect a Language\n\n  Deutsch  \n  English  \n  Español (España)  \n  Español (América Latina)  \n  Français  \n  Italiano  \n  Nederlands  \n  Polski  \n  Tiếng Việt  \n  Türkçe  \n  简体中文  \n  繁體中文  \n  עִברִית  \n  한국어  \n  日本語  \n\n\n\n\n\n\n\n\nCareers\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nClose Menu\n\n\n\n\n\n\n\nCareers\n\n\n\n\n\n\nChat\n\n\n\n\n\n\n                                Home\n                            \n\n\n\n                                Career Areas\n                            \n\n\n\n                                Total Rewards\n                            \n\n\n\n                                Life@Nike\n                            \n\n\

PromptTempleate

In [6]:
from langchain_core.prompts import PromptTemplate

prompt_extract = PromptTemplate.from_template(
        """
        ### SCRAPED TEXT FROM WEBSITE:
        {page_data}
        ### INSTRUCTION:
        The scraped text is from the career's page of a website.
        Your job is to extract the job postings and return them in JSON format containing the 
        following keys: `role`, `experience`, `skills` and `description`.
        Only return the valid JSON.
        ### VALID JSON (NO PREAMBLE):    
        """
)

In [7]:
chain_extract = prompt_extract | llm
res = chain_extract.invoke(input={'page_data':page_data})
print(res.content)

[
    {
        "role": "Lead Data Scientist",
        "experience": "4 years",
        "skills": ["Python", "Machine Learning", "Statistics", "Business Intelligence", "Data Analytics", "SQL programming", "Data Visualization", "Time Series Forecasting"],
        "description": "Design, develop, and program methods, processes, and systems to consolidate and analyze structured/unstructured diverse sources to generate actionable insights and solutions for client services and product enhancement; build products for analysis; interact with product and service teams to identify questions and issues for data analysis and experiments; develop and code software programs, algorithms and automated processes to cleanse, integrate, and evaluate large datasets from multiple disparate sources; identify meaningful insights from large data and metadata sources; interpret and communicate insights and findings from analysis and experiments to product, service, and business managers; lead others to solve 

In [8]:
print(type(res.content))

<class 'str'>


parse llm output to json object

In [21]:
from langchain_core.output_parsers.json import JsonOutputParser
json_parse = JsonOutputParser()
json_res = json_parse.parse(res.content)
if isinstance(json_res, list) and len(json_res) == 1:
    json_res = json_res[0]
json_res

{'role': 'Lead Data Scientist',
 'experience': '4 years',
 'skills': ['Python',
  'Machine Learning',
  'Statistics',
  'Business Intelligence',
  'Data Analytics',
  'SQL programming',
  'Data Visualization',
  'Time Series Forecasting'],
 'description': 'Design, develop, and program methods, processes, and systems to consolidate and analyze structured/unstructured diverse sources to generate actionable insights and solutions for client services and product enhancement; build products for analysis; interact with product and service teams to identify questions and issues for data analysis and experiments; develop and code software programs, algorithms and automated processes to cleanse, integrate, and evaluate large datasets from multiple disparate sources; identify meaningful insights from large data and metadata sources; interpret and communicate insights and findings from analysis and experiments to product, service, and business managers; lead others to solve complex problems; and 

In [22]:
type(json_res)

dict

In [23]:
job = json_res
job['skills']

['Python',
 'Machine Learning',
 'Statistics',
 'Business Intelligence',
 'Data Analytics',
 'SQL programming',
 'Data Visualization',
 'Time Series Forecasting']