In [3]:
# API Keys retrieval
from dotenv import load_dotenv
import os
load_dotenv()

# Data Collection
import requests
import pprint

# Storing Data
import uuid 
import requests
import pinecone
from transformers import AutoTokenizer, AutoModel

# ChromaDB
import os
import chromadb
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma

from langchain.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from langchain_core.messages import HumanMessage, SystemMessage

# Data Wrangling & Cleaning
import json
import pandas as pd
import torch
import hashlib
import datetime
import re
from dateutil.relativedelta import relativedelta
import re
import jsonlines

For this part of the RAG model; we'll focus on using templates from langchain-crash-course-main

In [5]:
current_dir = os.getcwd() # NOTE: use this for .ipynb files
persistent_directory = os.path.join(current_dir, "db", "chroma_db_with_metadata")

# Initialize embedding model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# embeddings = OpenAIEmbeddings(
#     model="text-embedding-3-small"
# )

# Init ChromaDB collection once
collection_name = "jobs_collection"
db = Chroma(
    collection_name=collection_name,
    persist_directory=persistent_directory,
    embedding_function=embeddings
)

In [None]:
# Load the existing vector store with the embedding function
db = Chroma(
    persist_directory=persistent_directory,
    embedding_function=embeddings,
    collection_name="jobs_collection"  # Match the name used in push
)

# Define the user's question
query = "What are the skills I need to get a data scientist job?"

# Retrieve relevant documents based on the query
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 20, "score_threshold": 0.25},  # Lower threshold for testing
)
# `k`: Top # of document hits 
# `score_threshold`: Similarity score from text document to query
relevant_docs = retriever.invoke(query)

# Display the relevant results with metadata
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if doc.metadata:
        print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")

Formatting with ChatGPT

In [5]:
"""RAG PORTION"""
# Load the existing vector store with the embedding function
db = Chroma(
    persist_directory=persistent_directory,
    embedding_function=embeddings,
    collection_name="jobs_collection"  # Match the name used in push
)

# Define the user's question
query = "What are the skills I need to get a data scientist job?"

# Retrieve relevant documents based on the query
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 10, "score_threshold": 0.25},  # Lower threshold for testing
)
# `k`: Top # of document hits 
# `score_threshold`: Similarity score from text document to query
relevant_docs = retriever.invoke(query)

# Display the relevant results with metadata
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if doc.metadata:
        print(f"Source: {doc.metadata.get('source', 'Unknown')}\n")

"""AGENTIC AI PORTION"""

# Combine the query and the relevant document contents
combined_input = (
    "Here are some documents that might help answer the question: "
    + query
    + "\n\nRelevant Documents:\n"
    + "\n\n".join([doc.page_content for doc in relevant_docs])
    + "\n\nPlease provide an answer based only on the provided documents. If the answer is not found in the documents, respond with 'I'm not sure'."
)

# Create a ChatOpenAI model
model = ChatOpenAI(model="gpt-4o")

# Define the messages for the model
messages = [
    SystemMessage(content="You are a career advisor."),
    HumanMessage(content=combined_input),
]

# Invoke the model with the combined input
result = model.invoke(messages)

# Display the full result and content only
print("\n--- Generated Response ---")
# print("Full result:")
# print(result)
print("Content only:")
print(result.content)


--- Relevant Documents ---
Document 1:
MINIMUM QUALIFICATIONS:
• Bachelor’s Degree required OR equivalent combination of training, education, and relevant experience may be considered in lieu of a degree.
• Advanced analytical knowledge of data
• Statistical analysis
• Conducting big data analysis
• Data conditioning
• Programming advanced computing
• Developing machine learning algorithms (classification, regression, clustering, model validation, etc.)
• Developing software and data models
• Executing predictive analytics
• Proficient with one or more programming languages (Python, R, SQL, Alteryx, SAS, etc.) and hands-on working experience with database systems, business intelligence, and visual reporting tools (BOBJ, Tableau, Power BI, etc.

WORK ENVIRONMENT:
• Work is performed 4 days in the office, with 1 day remote.
• May require traveling up to 10% of the time.
• Additional duties as required.

COMMUNICATIONS AND INTERPERSONAL SKILLS:

Must have excellent oral and written commu

Attempting to Format Streamlit

In [None]:
from pydantic import BaseModel
from typing import List
import matplotlib.pyplot as plt
from langchain.agents import initialize_agent, Tool, AgentType

#  Pydantic model to structure the skills and their importance
class SkillData(BaseModel):
    skill: str
    importance: int

class SkillsResponse(BaseModel):
    skills: List[SkillData]

"""RAG PORTION"""
# Load the existing vector store with the embedding function
db = Chroma(
    persist_directory=persistent_directory,
    embedding_function=embeddings,
    collection_name="jobs_collection"  # Match the name used in push
)

# Define the user's question
query = "What are the top skills I need to get a data scientist job?"

# Retrieve relevant documents based on the query
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 10, "score_threshold": 0.25},  # Lower threshold for testing
)
# `k`: Top # of document hits 
# `score_threshold`: Similarity score from text document to query
relevant_docs = retriever.invoke(query)

# Display the relevant results with metadata
print("\n--- Relevant Documents ---")
for i, doc in enumerate(relevant_docs, 1):
    print(f"Document {i}:\n{doc.page_content}\n")
    if doc.metadata:
        print(f"Source: {doc.metadata.get('source', 'Unknown')}\n") # DEBUG

"""AGENTIC AI PORTION"""

# Combine the query and the relevant document contents
combined_input = (
    "Here are some documents that might help answer the question: "
    + query
    + "\n\nRelevant Documents:\n"
    + "\n\n".join([doc.page_content for doc in relevant_docs])
    + "\n\nPlease extract the top skills for the following job posting and their importance (on a scale from 1 to 100):\nReturn the skills in this format: [{{'skill': 'Skill Name', 'importance': 95}}, ...]"
)

# Define the messages for the model
messages = [
    SystemMessage(content="You are a career advisor."),
    HumanMessage(content=combined_input),
]

# Create a ChatOpenAI model
model = ChatOpenAI(model="gpt-4o")

# Invoke the model with the combined input
result = model.invoke(messages)
print("------------------CHATGPT OUTPUT------------------")
print(result.content)
print("------------------CHATGPT OUTPUT------------------")

# NOTE: need to extract the skills set 


""" OUTPUT """
# plot_skills(structured_skills)

# Display the full result and content only
print("\n--- Generated Response ---")
# print("Full result:")
# print(result)
print("Content only:")
print(result.content)


--- Relevant Documents ---
Document 1:
PREFERRED QUALIFICATIONS:
• Master’s or PhD in in Statistics, Computer Science, Data Analytics, Operations Research, Mathematics, Physics, Economics, Finance, or other quantitative disciplines.
• Strong problem-solving skills with an emphasis on product development.
• Sound knowledge of statistical methods and machine learning algorithms.
• High degree of proficiency in Python and SQL.
• 1+ years of experience in collecting, integrating, processing, and analyzing data.
• 1+ years of experience in using open source/commercial data visualization libraries/tools.
• 1+ years of experience with cloud services (AWS, Azure).
• 1+ years of experience in version control workflows and technologies (Git, GitHub/Gitlab)
• Ability to analyze, summarize and cogently present quantitative and qualitative information.
• Passion for implementing industry standards / best coding practices.
• Excellent written and verbal communication skills for coordinating across 

In [13]:
from pydantic import BaseModel
from typing import List
import matplotlib.pyplot as plt
from langchain.agents import initialize_agent, Tool, AgentType

#  Pydantic model to structure the skills and their importance
class SkillData(BaseModel):
    skill: str
    importance: int

class SkillsResponse(BaseModel):
    skills: List[SkillData]

"""RAG PORTION"""
# Load the existing vector store with the embedding function
db = Chroma(
    persist_directory=persistent_directory,
    embedding_function=embeddings,
    collection_name="jobs_collection"  # Match the name used in push
)

# Define the user's question
query = "What are the top skills I need to get a data scientist job?"

# Retrieve relevant documents based on the query
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 10, "score_threshold": 0.25},  # Lower threshold for testing
)

# Combine the query and the relevant document contents
combined_input = (
    "Here are some documents that might help answer the question: "
    + query
    + "\n\nRelevant Documents:\n"
    + "\n\n".join([doc.page_content for doc in relevant_docs])
    + "\n\nPlease extract the top skills for the following job posting and their importance (on a scale from 1 to 100):\nReturn the skills in this format: [{{'skill': 'Skill Name', 'importance': 95}}, ...]"
)

# Use the LLM (ChatGPT) to extract the skills and importance scores
response = ChatOpenAI(model="gpt-4o").invoke(combined_input)

print(response)

content="Here's a list of the top skills extracted from the job posting along with their importance:\n\n1. `{{'skill': 'Machine Learning and Artificial Intelligence', 'importance': 95}}`\n2. `{{'skill': 'Experience with programming languages (Python, Java, C++)', 'importance': 90}}`\n3. `{{'skill': 'Data Wrangling over massive datasets using distributed computing platform', 'importance': 88}}`\n4. `{{'skill': 'Strong analytical and problem-solving skills', 'importance': 85}}`\n5. `{{'skill': 'Statistical Analysis', 'importance': 83}}`\n6. `{{'skill': 'Data Visualization and reporting tools (e.g., Tableau)', 'importance': 82}}`\n7. `{{'skill': 'Understanding of big data ecosystem (Hadoop, Spark)', 'importance': 80}}`\n8. `{{'skill': 'Advanced knowledge in SQL', 'importance': 78}}`\n9. `{{'skill': 'Experience with cloud platforms (AWS, Azure)', 'importance': 77}}`\n10. `{{'skill': 'Excellent written and verbal communication skills', 'importance': 76}}`\n11. `{{'skill': 'Familiarity with 

In [16]:
print(response.content)

Here's a list of the top skills extracted from the job posting along with their importance:

1. `{{'skill': 'Machine Learning and Artificial Intelligence', 'importance': 95}}`
2. `{{'skill': 'Experience with programming languages (Python, Java, C++)', 'importance': 90}}`
3. `{{'skill': 'Data Wrangling over massive datasets using distributed computing platform', 'importance': 88}}`
4. `{{'skill': 'Strong analytical and problem-solving skills', 'importance': 85}}`
5. `{{'skill': 'Statistical Analysis', 'importance': 83}}`
6. `{{'skill': 'Data Visualization and reporting tools (e.g., Tableau)', 'importance': 82}}`
7. `{{'skill': 'Understanding of big data ecosystem (Hadoop, Spark)', 'importance': 80}}`
8. `{{'skill': 'Advanced knowledge in SQL', 'importance': 78}}`
9. `{{'skill': 'Experience with cloud platforms (AWS, Azure)', 'importance': 77}}`
10. `{{'skill': 'Excellent written and verbal communication skills', 'importance': 76}}`
11. `{{'skill': 'Familiarity with database technologies

In [19]:
print(result.content)

Based on the provided documents and qualifications, here are the top skills extracted for a data scientist position, along with their importance:

1. {{'skill': 'Programming Proficiency (Python, SQL)', 'importance': 95}}
2. {{'skill': 'Machine Learning and AI Techniques', 'importance': 90}}
3. {{'skill': 'Data Wrangling and Data Analysis', 'importance': 88}}
4. {{'skill': 'Statistical Methods and Analysis', 'importance': 85}}
5. {{'skill': 'Data Visualization and Reporting Tools (Tableau, Power BI)', 'importance': 80}}
6. {{'skill': 'Cloud Services Experience (AWS, Azure)', 'importance': 78}}
7. {{'skill': 'Big Data Technologies (Hadoop, Spark)', 'importance': 75}}
8. {{'skill': 'Strong Problem-Solving Skills', 'importance': 73}}
9. {{'skill': 'Version Control (Git, GitHub/GitLab)', 'importance': 70}}
10. {{'skill': 'Excellent Communication Skills', 'importance': 68}}
11. {{'skill': 'Collaboration and Teamwork', 'importance': 65}}
12. {{'skill': 'Model Deployment and Maintenance', 'imp

We need to format the returned `result.content` into a nice dict of the format that is expected by `SkillsResponse.parse_obj()` 

In [None]:
import re
from icecream import ic
# Get the matches of {'skill': <>, 'importance': <>} using regular expression

pattern = r"\{\{'skill': '(.*?)', 'importance': (\d{2})\}\}"


skills_matched = re.findall(pattern, result.content)
ic(skills_matched)

data = {}
for skill, rank in skills_matched:
    data[skill] = rank

# Then graph out this dict

def plot_skills(skills_response: SkillsResponse):
    skills = [skill.skill for skill in skills_response.skills]
    importance = [skill.importance for skill in skills_response.skills]
    
    # Create the plot
    skills_df = pd.DataFrame({
        "Skill": skills,
        "Importance": importance
    })
    
    plt.figure(figsize=(8, 6))
    plt.barh(skills_df["Skill"], skills_df["Importance"], color="skyblue")
    plt.xlabel("Importance")
    plt.title("Top Skills for Job Role")
    plt.show()





ic| skills_matched: [('Programming Proficiency (Python, SQL)', '95'),
                     ('Machine Learning and AI Techniques', '90'),
                     ('Data Wrangling and Data Analysis', '88'),
                     ('Statistical Methods and Analysis', '85'),
                     ('Data Visualization and Reporting Tools (Tableau, Power BI)', '80'),
                     ('Cloud Services Experience (AWS, Azure)', '78'),
                     ('Big Data Technologies (Hadoop, Spark)', '75'),
                     ('Strong Problem-Solving Skills', '73'),
                     ('Version Control (Git, GitHub/GitLab)', '70'),
                     ('Excellent Communication Skills', '68'),
                     ('Collaboration and Teamwork', '65'),
                     ('Model Deployment and Maintenance', '60')]


TypeError: sequence item 0: expected str instance, tuple found