## Webscarpping and structuring results

In [1]:
from langchain_community.document_loaders import WebBaseLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
def get_page_content(url):
    loader = WebBaseLoader(url)
    page_data = loader.load().pop().page_content
    return page_data

In [3]:
get_page_content("https://www.infineon.com/cms/en/careers/jobsearch/jobsearch/#!view=jobs&functional_area=Data%20Science&country=Germany&job_attributes=Full%20time")



#### TO DO: Aim is to convert above web scraped content into jason, so that it is usable

##### I will pass webscrapped content along with a prompt asking an LLM to convert given text into dict format

In [4]:
from langchain_core.prompts import PromptTemplate

prompt_extract = PromptTemplate.from_template(
        """
        ### SCRAPED TEXT FROM WEBSITE:
        {page_data}
        ### INSTRUCTION:
        The scraped text is from the career's page of a website.
        Your job is to extract the job postings and return them in JSON format containing the 
        following keys: `role`, `experience`, `skills` and `description`.
        Only return the valid JSON.
        ### VALID JSON (NO PREAMBLE):    
        """
) 

In [5]:
## Getting an LLM Model [llama-3.1], (use of groq cloud for faster inference)

from langchain_groq import ChatGroq

llm = ChatGroq(
    model="llama-3.1-70b-versatile",
    groq_api_key = "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",
    temperature=0,
    # max_tokens=None,
    # timeout=None,
    # max_retries=2,
    # # other params...
)

In [6]:
chain_extract = prompt_extract | llm  ## forming chain using pipe operation

In [7]:
url = "https://jobs.bosch.com/en/job/REF221607P-phd-predictive-maintenance-in-manufacturing"
page_data = get_page_content(url)
page_data 

'\n      PhD Predictive Maintenance in Manufacturing                                                                  Skip to main content                                          Close                Bosch Career Portal         PhD Predictive Maintenance in Manufacturing              Location Salzgitter        Fields of work  Manufacturing         Join as  Graduate         Starting date According to arrangement        Working time Vollzeit        Legal entity Robert Bosch Elektronik GmbH            Bosch data privacy statement eRecruiting     SmartRecruiters data privacy statement          Your tasks Als PhD Kandidat bist du Schnittstelle zwischen Forschung und Industrie. Du erforschst die Integration von Predictive Maintenance in das Bosch Production System.\xa0• Du identifizierst Potentiale für Predictive Maintenance in\n\xa0 unserer Elektronikfertigung.\n• Du erforschst die Integrationsmöglichkeiten von Predictive\n\xa0 Maintenance in eine existierende\n\xa0 Wartungsprozesslandscha

In [8]:
res = chain_extract.invoke(input={'page_data':page_data})
res.content

'[\n  {\n    "role": "PhD Predictive Maintenance in Manufacturing",\n    "experience": "Master\'s degree in a relevant field, experience in manufacturing and predictive maintenance",\n    "skills": [\n      "Predictive maintenance",\n      "Manufacturing systems",\n      "Programming skills",\n      "German and English language skills"\n    ],\n    "description": "As a PhD candidate, you will be the interface between research and industry. You will research the integration of predictive maintenance into the Bosch Production System. You will identify potential for predictive maintenance in our electronics manufacturing, research integration possibilities into an existing maintenance process landscape, and implement your research approaches in selected application cases in our production environment."\n  }\n]'

In [9]:
type(res.content)

str

##### note: need to convert str type to jason or dict format

In [10]:
from langchain_core.output_parsers import JsonOutputParser

json_parser = JsonOutputParser()
json_res = json_parser.parse(res.content)
json_res

[{'role': 'PhD Predictive Maintenance in Manufacturing',
  'experience': "Master's degree in a relevant field, experience in manufacturing and predictive maintenance",
  'skills': ['Predictive maintenance',
   'Manufacturing systems',
   'Programming skills',
   'German and English language skills'],
  'description': 'As a PhD candidate, you will be the interface between research and industry. You will research the integration of predictive maintenance into the Bosch Production System. You will identify potential for predictive maintenance in our electronics manufacturing, research integration possibilities into an existing maintenance process landscape, and implement your research approaches in selected application cases in our production environment.'}]

In [11]:
type(json_res[0])

dict

In [12]:
import pandas as pd

df = pd.read_csv("my_portfolio.csv")
df

Unnamed: 0,Techstack,Links
0,"React, Node.js, MongoDB",https://example.com/react-portfolio
1,"Angular,.NET, SQL Server",https://example.com/angular-portfolio
2,"Vue.js, Ruby on Rails, PostgreSQL",https://example.com/vue-portfolio
3,"Python, Django, MySQL",https://example.com/python-portfolio
4,"Java, Spring Boot, Oracle",https://example.com/java-portfolio
5,"Flutter, Firebase, GraphQL",https://example.com/flutter-portfolio
6,"WordPress, PHP, MySQL",https://example.com/wordpress-portfolio
7,"Magento, PHP, MySQL",https://example.com/magento-portfolio
8,"React Native, Node.js, MongoDB",https://example.com/react-native-portfolio
9,"iOS, Swift, Core Data",https://example.com/ios-portfolio


In [14]:
import chromadb

# client = chromadb.Client('Vectorstore') ## this create chormadb in memory
client = chromadb.PersistentClient('Vectorstore') ## this create chormadb in local disk
collection = client.get_or_create_collection(name = "portfolio")  # Get a collection object from an existing collection, by name. If it doesn't exist, create it.

collection.count()

0

In [20]:
import uuid

if not collection.count():
    for _, row in df.iterrows():
        collection.add(documents=row["Techstack"],
                       metadatas={"links": row["Links"]},
                       ids=[str(uuid.uuid4())] ## generates a random UUID (Universally Unique Identifier).
                      ) 

In [21]:
collection.get()

{'ids': ['231199e7-c0be-4bfd-be1e-88ee0d1081b2',
  '2e2f9fd8-ccfc-4401-a7c2-304471188970',
  '3f1ec5df-89ec-47a4-b6ef-165aab913c3b',
  '45f49651-273f-4472-8742-fe31b110740e',
  '545b18bd-149b-4cb9-9944-5ee35eb2dcbb',
  '65d82bbd-7277-4cea-a8da-e1e5307fc9b0',
  '71bacb53-f24e-4b90-87b7-78f524055e8d',
  '853ec0a4-2656-4acb-9c2e-7ba0f3c5c4f1',
  '899579fb-91a4-4246-bb07-e6d8b6cbaeda',
  '9b201292-0bc7-491a-a79a-cca409c06814',
  'a4867712-7b90-4345-86bd-86bc55aadb0b',
  'a5dc585c-0ec4-4061-a869-3f81fc04e896',
  'a78e58c6-7ef1-4319-8c4f-ad56ae8c9ce1',
  'b0448338-9d27-4a84-9100-d6a4692a50db',
  'b107777f-56bc-4df4-b55c-71c4abce7a5e',
  'b70eb747-ab64-4cce-8f88-f19c9ba75978',
  'cad99431-14c2-4bbd-8d9b-f843551ccc4c',
  'd0efa386-b445-42dd-843b-77df830bd96d',
  'dda2476f-0c06-48d0-8454-a4510d209693',
  'e4b6f7d0-59e9-4d59-b592-df4b2ebd9d2e'],
 'embeddings': None,
 'metadatas': [{'links': 'https://example.com/magento-portfolio'},
  {'links': 'https://example.com/python-portfolio'},
  {'links':

In [22]:
## fecthing records based on queries

collection.query(
    query_texts=["experience in java"],
    n_results= 3,
    # where={"metadata_field": "is_equal_to_this"},## tmplate
    # where_document={"$contains":"search_string"} ##template
)


{'ids': [['9b201292-0bc7-491a-a79a-cca409c06814',
   'e4b6f7d0-59e9-4d59-b592-df4b2ebd9d2e',
   'a78e58c6-7ef1-4319-8c4f-ad56ae8c9ce1']],
 'distances': [[1.0862322998041076, 1.1617181510147703, 1.4697257478258057]],
 'metadatas': [[{'links': 'https://example.com/java-portfolio'},
   {'links': 'https://example.com/android-portfolio'},
   {'links': 'https://example.com/kotlin-backend-portfolio'}]],
 'embeddings': None,
 'documents': [['Java, Spring Boot, Oracle',
   'Android, Java, Room Persistence',
   'Backend, Kotlin, Spring Boot']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [27]:
## fecthing records based on queries

collection.query(
    query_texts=["experience in java"],
    n_results= 3,
    # where={"metadata_field": "is_equal_to_this"},## tmplate
    # where_document={"$contains":"search_string"} ##template
).get('metadatas')


[[{'links': 'https://example.com/java-portfolio'},
  {'links': 'https://example.com/android-portfolio'},
  {'links': 'https://example.com/kotlin-backend-portfolio'}]]

In [29]:
links = collection.query(
    query_texts=["experience in java"],
    n_results= 3,
    # where={"metadata_field": "is_equal_to_this"},## tmplate
    # where_document={"$contains":"search_string"} ##template
).get('metadatas')
links

[[{'links': 'https://example.com/java-portfolio'},
  {'links': 'https://example.com/android-portfolio'},
  {'links': 'https://example.com/kotlin-backend-portfolio'}]]

In [31]:
job = json_res[0]
job

{'role': 'PhD Predictive Maintenance in Manufacturing',
 'experience': "Master's degree in a relevant field, experience in manufacturing and predictive maintenance",
 'skills': ['Predictive maintenance',
  'Manufacturing systems',
  'Programming skills',
  'German and English language skills'],
 'description': 'As a PhD candidate, you will be the interface between research and industry. You will research the integration of predictive maintenance into the Bosch Production System. You will identify potential for predictive maintenance in our electronics manufacturing, research integration possibilities into an existing maintenance process landscape, and implement your research approaches in selected application cases in our production environment.'}

In [32]:
prompt_email = PromptTemplate.from_template(
        """
        ### JOB DESCRIPTION:
        {job_description}
        
        ### INSTRUCTION:
        You are Mohan, a business development executive at AtliQ. AtliQ is an AI & Software Consulting company dedicated to facilitating
        the seamless integration of business processes through automated tools. 
        Over our experience, we have empowered numerous enterprises with tailored solutions, fostering scalability, 
        process optimization, cost reduction, and heightened overall efficiency. 
        Your job is to write a cold email to the client regarding the job mentioned above describing the capability of AtliQ 
        in fulfilling their needs.
        Also add the most relevant ones from the following links to showcase Atliq's portfolio: {link_list}
        Remember you are Mohan, BDE at AtliQ. 
        Do not provide a preamble.
        ### EMAIL (NO PREAMBLE): 
        
        """
) 

chain_email = prompt_email | llm  ## forming chain using pipe operation
email_res = chain_email.invoke({"job_description": str(job), "link_list": links})
print(email_res.content)

Subject: Expertise in Predictive Maintenance Solutions for Manufacturing

Dear Hiring Manager,

I came across the PhD position for Predictive Maintenance in Manufacturing at Bosch and was impressed by the company's commitment to innovation and research. As a Business Development Executive at AtliQ, I'd like to introduce our company's capabilities in developing cutting-edge predictive maintenance solutions that can support your research and industry needs.

AtliQ is an AI & Software Consulting company with a proven track record of empowering enterprises with tailored solutions, fostering scalability, process optimization, cost reduction, and heightened overall efficiency. Our expertise in developing predictive maintenance solutions can help you identify potential for predictive maintenance in electronics manufacturing, research integration possibilities into an existing maintenance process landscape, and implement research approaches in selected application cases.

Our team of experts h