In [1]:
import sys, httpx, os
import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage, AIMessage

In [2]:
# --- Notebook configuration -------------------------------------------------
# CSV holding the resume corpus (read for its "ID" and "Resume" columns).
DATA_PATH = "../data/main-data/synthetic-resumes.csv"
# Folder containing the persisted FAISS index.
FAISS_PATH = "../vectorstore"
# Number of hits requested from the vector store for every (sub-)query.
RAG_K_THRESHOLD = 10
# Sentence-transformers model used to embed queries.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# API gateway that fronts the chat-completion model.
OPENAI_ENDPOINT = "https://aalto-openai-apigw.azure-api.net"
# Subscription key for the gateway. It is read from the OPENAI_KEY environment
# variable so the secret never has to be typed into (and saved with) the
# notebook; falls back to an empty string when the variable is not set.
OPENAI_KEY = os.environ.get("OPENAI_KEY", "")

In [3]:
# Load the resume corpus and build an ID -> resume-text lookup table.
# IDs are cast to strings because the lookup keys used later in this notebook
# are strings as well.
documents = pd.read_csv(DATA_PATH)
documents = documents.assign(ID=documents["ID"].astype("str"))
id_resume_dict = {
  resume_id: resume_text
  for resume_id, resume_text in zip(documents["ID"], documents["Resume"])
}

In [4]:
# Embedding model handed to the FAISS store below; inference is pinned to the CPU.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, model_kwargs=dict(device="cpu"))

In [5]:
# Load the persisted FAISS index from FAISS_PATH, using the embedding model
# above for query encoding and the cosine distance strategy.
# NOTE(review): recent langchain-community releases appear to require
# allow_dangerous_deserialization=True here — confirm against the installed version.
vectorstore_db = FAISS.load_local(
  FAISS_PATH,
  embedding_model,
  distance_strategy=DistanceStrategy.COSINE,
)

In [6]:
def update_base_url(request: httpx.Request):
  """httpx request hook that reroutes chat-completion calls.

  A request whose URL path is exactly "/chat/completions" gets its path
  replaced with "/v1/chat/gpt-35-turbo-1106"; any other request is left
  untouched. The request is modified in place and nothing is returned.
  """
  if request.url.path != "/chat/completions":
    return
  request.url = request.url.copy_with(path="/v1/chat/gpt-35-turbo-1106")

# Chat model client pointed at the gateway endpoint. The subscription key is
# sent in the "Ocp-Apim-Subscription-Key" header, and every outgoing request
# passes through update_base_url via the httpx "request" event hook.
# NOTE(review): api_key=False presumably disables the regular OpenAI key — confirm.
llm = ChatOpenAI(
  base_url=OPENAI_ENDPOINT,
  api_key=False,
  default_headers={"Ocp-Apim-Subscription-Key": OPENAI_KEY},
  http_client=httpx.Client(event_hooks={"request": [update_base_url]}),
  temperature=0.1,
)

In [7]:
def generate_response(llm, question: str, docs: list):
  """Ask the chat model to choose and justify the best resume for a job.

  Args:
    llm: Chat model object exposing `invoke(messages)`; the returned object
      must have a `content` attribute.
    question: The job description.
    docs: Resume texts (strings) offered to the model as context.

  Returns:
    The `content` of the model's reply.
  """
  # Resumes are separated by a blank line inside the prompt.
  resumes_block = "\n\n".join(docs)

  instructions = SystemMessage(content="""
    You are an expert in talent acquisition that helps determine the best candidate among multiple suitable resumes.
    Use the following pieces of context to determine the best resume given a job description. 
    You should provide some detailed explanations for the best resume choice. Make sure to also return a detailed summary of the original text of the best resume.
    Because there can be applicants with similar names, use the applicant ID to refer to resumes in your response. 
    If you don't know the answer, just say that you don't know, do not try to make up an answer.
  """)
  request = HumanMessage(content=f"""
    Context: {resumes_block}
    Question: {question}
  """)

  return llm.invoke([instructions, request]).content


def generate_subquestions(llm, question: str):
  """Ask the chat model to break a job description into focused sub-queries.

  The conversation sent to the model is, in order: a system instruction, one
  worked example (a WordPress job posting followed by a four-part answer),
  and finally the actual job description.

  Args:
    llm: Chat model object exposing `invoke(messages)`; the returned object
      must have a `content` attribute.
    question: The job description to decompose.

  Returns:
    The `content` of the model's reply, unparsed.
  """
  instructions = SystemMessage(content="""
    You are an expert in talent acquisition. Separate this job description into 3-4 more focused aspects for efficient resume retrieval. 
    Make sure every single relevant aspect of the query is covered in at least one query. You may choose to remove irrelevant information that doesn't contribute to finding resumes such as the expected salary of the job, the ID of the job, the duration of the contract, etc.
    Only use the information provided in the initial query. Do not make up any requirements of your own.
    Put each result in one line, separated by a linebreak.
    """)

  # One-shot demonstration: a sample request ...
  example_request = HumanMessage(content="""
    Generate 3 to 4 sub-queries based on this initial job description:

    Wordpress Developer
    We are looking to hire a skilled WordPress Developer to design and implement attractive and functional websites and Portals for our Business and Clients. You will be responsible for both back-end and front-end development including the implementation of WordPress themes and plugins as well as site integration and security updates.
    To ensure success as a WordPress Developer, you should have in-depth knowledge of front-end programming languages, a good eye for aesthetics, and strong content management skills. Ultimately, a top-class WordPress Developer can create attractive, user-friendly websites that perfectly meet the design and functionality specifications of the client.
    WordPress Developer Responsibilities:
    Meeting with clients to discuss website design and function.
    Designing and building the website front-end.
    Creating the website architecture.
    Designing and managing the website back-end including database and server integration.
    Generating WordPress themes and plugins.
    Conducting website performance tests.
    Troubleshooting content issues.
    Conducting WordPress training with the client.
    Monitoring the performance of the live website.
    WordPress Developer Requirements:
    Bachelors degree in Computer Science or a similar field.
    Proven work experience as a WordPress Developer.
    Knowledge of front-end technologies including CSS3, JavaScript, HTML5, and jQuery.
    Knowledge of code versioning tools including Git, Mercurial, and SVN.
    Experience working with debugging tools such as Chrome Inspector and Firebug.
    Good understanding of website architecture and aesthetics.
    Ability to project manage.
    Good communication skills.
    Contract length: 12 months
    Expected Start Date: 9/11/2020
    Job Types: Full-time, Contract
    Salary: 12,004.00 - 38,614.00 per month
    Schedule:
    Flexible shift
    Experience:
    Wordpress: 3 years (Required)
    web designing: 2 years (Required)
    total work: 3 years (Required)
    Education:
    Bachelor's (Preferred)
    Work Remotely:
    Yes
  """)

  # ... and the answer the model is shown for it.
  example_answer = AIMessage(content="""
    1. WordPress Developer Skills:
      - WordPress, front-end technologies (CSS3, JavaScript, HTML5, jQuery), debugging tools (Chrome Inspector, Firebug), code versioning tools (Git, Mercurial, SVN).
      - Required experience: 3 years in WordPress, 2 years in web designing.
   
    2. WordPress Developer Responsibilities:
      - Meeting with clients for website design discussions.
      - Designing website front-end and architecture.
      - Managing website back-end including database and server integration.
      - Generating WordPress themes and plugins.
      - Conducting website performance tests and troubleshooting content issues.
      - Conducting WordPress training with clients and monitoring live website performance.

    3. WordPress Developer Other Requirements:
      - Education requirement: Bachelor's degree in Computer Science or similar field.
      - Proven work experience as a WordPress Developer.
      - Good understanding of website architecture and aesthetics.
      - Ability to project manage and strong communication skills.

    4. Skills and Qualifications:
      - Degree in Computer Science or related field.
      - Proven experience in WordPress development.
      - Proficiency in front-end technologies and debugging tools.
      - Familiarity with code versioning tools.
      - Strong communication and project management abilities.
  """)

  # The real request, phrased like the example request, goes last.
  actual_request = HumanMessage(content=f"""
    Generate 3 to 4 sub-queries based on this initial job description: 
    {question}
  """)

  conversation = [instructions, example_request, example_answer, actual_request]
  return llm.invoke(conversation).content

In [8]:
def reciprocal_rank_fusion(document_rank_list: list[dict], k=50):
  """Fuse several rankings into one using reciprocal-rank scores.

  Each input dict is treated as a ranking: the position of a key in the
  dict's iteration order (starting at 0) is its rank, and the stored value
  is ignored. A document earns 1 / (rank + k) from every ranking it appears
  in, and the contributions are summed.

  Args:
    document_rank_list: One dict per ranking, keyed by document.
    k: Constant added to the rank in the denominator.

  Returns:
    A dict mapping each document to its fused score, ordered from highest to
    lowest score (ties keep first-seen order).
  """
  totals = {}
  for ranking in document_rank_list:
    for position, (doc_key, _unused_score) in enumerate(ranking.items()):
      totals[doc_key] = totals.get(doc_key, 0) + 1 / (position + k)
  by_score_desc = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
  return dict(by_score_desc)


def retrieve_docs_id(question: str, k=3):
  """Query the vector store and return the hits keyed by document ID.

  Args:
    question: Query text passed to `vectorstore_db.similarity_search_with_score`.
    k: Number of hits requested from the store.

  Returns:
    A dict mapping `str(doc.metadata["ID"])` to the score returned by the
    store, in the order the store returned the hits. If the same ID occurs
    more than once, it keeps its first position and its last score.
  """
  id_to_score = {}
  for hit, hit_score in vectorstore_db.similarity_search_with_score(question, k=k):
    id_to_score[str(hit.metadata["ID"])] = hit_score
  return id_to_score


def retrieve_id_and_rerank(subquestion_list: list):
  """Retrieve hits for every query and fuse the rankings into one.

  Each query is sent to `retrieve_docs_id` with `RAG_K_THRESHOLD` hits
  requested, and the resulting list of per-query dicts is passed to
  `reciprocal_rank_fusion`.

  Args:
    subquestion_list: Query strings to retrieve for.

  Returns:
    Whatever `reciprocal_rank_fusion` returns for the collected rankings.
  """
  per_query_rankings = [
    retrieve_docs_id(query, RAG_K_THRESHOLD) for query in subquestion_list
  ]
  return reciprocal_rank_fusion(per_query_rankings)


def retrieve_documents_with_id(doc_id_with_score: dict, threshold=5):
  """Return the resume texts of the highest-scoring IDs, each labelled with its ID.

  Args:
    doc_id_with_score: Mapping from applicant ID (string) to score; a higher
      score ranks first.
    threshold: Maximum number of resumes to return.

  Returns:
    A list of strings "Applicant ID <id>\\n<resume text>", best score first.

  Raises:
    KeyError: If a selected ID is missing from `id_resume_dict`.
  """
  top_ids = sorted(doc_id_with_score, key=doc_id_with_score.get, reverse=True)[:threshold]
  # Look every resume up first, then prefix each one with its applicant ID.
  resumes = [id_resume_dict[doc_id] for doc_id in top_ids]
  return [
    "Applicant ID " + doc_id + "\n" + resume
    for doc_id, resume in zip(top_ids, resumes)
  ]

### Results on the test set

In [9]:
# Load the evaluation set and pull out the job descriptions and the ground-truth column.
test_set = pd.read_csv("../data/main-data/test-sets/testset-1.csv")
question_list, ground_truth_list = test_set["Job Description"], test_set["Ground Truth"]

# Accumulators for the retrieved contexts and the model answers, one entry per question.
context_list, response_list = [], []

In [14]:
# Start from empty accumulators so that re-running this cell does not append a
# second copy of every result; duplicated entries would make these lists longer
# than question_list and break the DataFrame built from them afterwards.
context_list = []
response_list = []

for question in question_list:
  # The model's reply is split on blank lines; whitespace-only fragments are
  # dropped so that no empty query is sent to the retriever.
  subquestion_list = [
    subquestion
    for subquestion in generate_subquestions(llm, question).split("\n\n")
    if subquestion.strip()
  ]
  # Retrieve with the original description plus every sub-query, then fuse the rankings.
  id_list = retrieve_id_and_rerank([question] + subquestion_list)
  document_list = retrieve_documents_with_id(id_list)
  response = generate_response(llm, question, document_list)

  # Contexts are stored as one string per question, resumes joined by "===".
  context_list.append("===".join(document_list))
  response_list.append(response)

In [48]:
# Gather questions, ground truth, answers and retrieved contexts into one table
# and write it out as CSV without the index column.
# NOTE(review): the output folder is named "gpt4-ragfusion" while the request
# hook in this notebook routes to "gpt-35-turbo-1106" — confirm which is intended.
test_result = pd.DataFrame(
  {
    "question": question_list,
    "ground_truth": ground_truth_list,
    "answer": response_list,
    "contexts": context_list,
  }
)
test_result.to_csv(
  "../data/main-data/gpt4-ragfusion/test-results/testres-1.csv",
  index=False,
)