In [None]:
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
import openai
from langchain import HuggingFaceHub
import os

import requests
import json

## Data Collection

In [12]:
## the function that collects the job post data

In [None]:
url = "https://jobs-api14.p.rapidapi.com/v2/list"

querystring = {"query":"Data Scientist","location":"United States","autoTranslateLocation":"true","remoteOnly":"false","employmentTypes":"fulltime;parttime;intern;contractor"}

headers = {
	"x-rapidapi-key": os.getenv("JOBS-API-KEY"), # Please title the key as `JOBS-API-KEY`
	"x-rapidapi-host": "jobs-api14.p.rapidapi.com"
}

response = requests.get(url, headers=headers, params=querystring)

print(response.json())

In [None]:
# Save the file info 
with open('scrapedData.json', 'w+') as file:
    json.dump(response.json(), file, indent=4)

"""
Right now it just saves everything, we should talk about how we should
clean the raw data.
"""

## embedding

In [None]:
# Initialize OpenAI Embeddings model
load_dotenv()

In [4]:
# Initialize Chroma (for vector DB)
vector_db = Chroma(collection_name="job-postings")

# This is to contain the returned job postings from data collection function
job_postings = [
    {"job_title": "Data Scientist", "company": "Google", "description": "Full job description here..."},
    {"job_title": "Software Engineer", "company": "Amazon", "description": "Full job description here..."},
    # Add 1000 job postings in a similar format
]

# Initialize LangChain's TextSplitter (you can adjust `chunk_size` based on your tokens)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10, chunk_overlap=0)

  vector_db = Chroma(collection_name="job-postings")


In [21]:
[text_splitter.split_text(job_posting['description']) for job_posting in job_postings]

[['Full job', 'descripti', 'on', 'here...'],
 ['Full job', 'descripti', 'on', 'here...']]

In [24]:
# Process and embed each job posting
for job_post in job_postings:
    # Split the job description into smaller chunks (for large texts)
    chunks = text_splitter.split_text(job_post["description"])

    # Create document objects with metadata (job title, company, location)
    documents = [Document(page_content=chunk, metadata={"job_title": job_post["job_title"], "company": job_post["company"]}) for chunk in chunks]

    # Embed the chunks and store in ChromaDB
    for document in documents:
        # Embed the chunk using the embedding model
        embedding = default_ef(document.page_content)
        
        # Add to ChromaDB
        vector_db.add_documents([document], embeddings=[embedding])

ValueError: You must provide an embedding function to compute embeddings.https://docs.trychroma.com/guides/embeddings in upsert.

In [None]:
load_dotenv()
# Ensure the environment variable is set
huggingfacehub_api_token = os.getenv("HUGGINGFACE_API_TOKEN")
if huggingfacehub_api_token is None:
	raise ValueError("HUGGINGFACE_API_TOKEN environment variable is not set")

from transformers import AutoModel
import torch

model = AutoModel.from_pretrained("google/flan-t5-xl", token=huggingfacehub_api_token)

# Initialize the HuggingFaceHub model
llm = HuggingFaceHub(repo_id="google/flan-t5-xl", huggingfacehub_api_token=huggingfacehub_api_token)

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Use LangChain's Chroma wrapper
vector_store = Chroma(
    persist_directory="./chroma_db",
    embedding_function=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
)

retriever = vector_store.as_retriever(search_kwargs={"k": 3})

qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name="gpt-4o", api_key="your-api-key"),
    retriever=retriever
)

query = "Explain vector databases"
print(qa_chain.run(query))