In [7]:
%%sql

DROP DATABASE IF EXISTS resume_evaluator;
CREATE DATABASE resume_evaluator;

In [9]:
%%sql

CREATE TABLE IF NOT EXISTS resumes_profile_data(
    names text,
    email text,
    phone_no text,
    years_of_experience text,
    skills text,
    profile_name text,
    resume_summary text,
    resume_embeddings blob
);

In [10]:
!pip install -q pdfminer.six openai==1.3.3

In [11]:
import getpass
import numpy as np
import openai
import os
import pandas as pd
import requests
import re
from openai import OpenAI
from pdfminer.high_level import extract_text
from singlestoredb import create_engine
from sqlalchemy import text

In [12]:
os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key: ')
client = openai.OpenAI()

OpenAI API Key:  ········


4. Create a function called get_embedding() to transform textual content into vector embeddings

In [47]:
def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    response = openai.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

5. Create a function called print_pdf_text() to extract and clean the text from PDFThis function is designed to extract and clean the text from a provided PDF, either from a web URL or a local file path.



In [55]:
def print_pdf_text(url=None, file_path=None):
    # Determine the source of the PDF (URL or local file)
    if url:
        response = requests.get(url)
        response.raise_for_status()  # Ensure the request was successful
        temp_file_path = "temp_pdf_file.pdf"
        with open(temp_file_path, 'wb') as temp_file:
            temp_file.write(response.content)  # Save the PDF to a temporary file
        pdf_source = temp_file_path
    elif file_path:
        pdf_source = file_path  # Set the source to the provided local file path
    else:
        raise ValueError("Either url or file_path must be provided.")

    # Extract text using pdfminer
    text = extract_text(pdf_source)

    # Remove special characters except "@", "+", ".", and "/"
    cleaned_text = re.sub(r"[^a-zA-Z0-9\s@+./:,]", "", text)

    # Format the text for better readability
    cleaned_text = cleaned_text.replace("\n\n", " ").replace("\n", " ")
    # If a temporary file was used, delete it
    if url and os.path.exists(temp_file_path):
        os.remove(temp_file_path)

    return cleaned_text

6. Create a function called pinfo_extractor() to extract specific details from a candidate's resume text

Functionality:

Prompt Creation:
A context is formed using the provided resume text.
A detailed question prompt is generated to guide the extraction of desired details from the resume.
OpenAI API Interaction:
Uses the gpt-3.5-turbo model to process the prompt and generate a detailed extraction.
Extracts relevant sections like Name, Email, Phone Number, and more from the generated response.
Data Structuring:
The extracted details are organized into a dictionary.

In [60]:
def pinfo_extractor(resume_text):
    context = f"Resume text: {resume_text}"
    question = """ From above candidate's resume text, extract the only following details:
                Name: (Find the candidate's full name. If not available, specify "not available.")
                Email: (Locate the candidate's email address. If not available, specify "not available.")
                Phone Number: (Identify the candidate's phone number. If not found, specify "not available.")
                Years of Experience: (If not explicitly mentioned, calculate the years of experience by analyzing the time durations at each company or position listed. Sum up the total durations to estimate the years of experience. If not determinable, write "not available.")
                Skills Set: Extract the skills which are purely technical and represent them as: [skill1, skill2,... <other skills from resume>]. If no skills are provided, state "not available."
                Profile: (Identify the candidate's job profile or designation. If not mentioned, specify "not available.")
                Summary: provide a brief summary of the candidate's profile without using more than one newline to segregate sections.
                """

    prompt = f"""
        Based on the below given candidate information, only answer asked question:
        {context}
        Question: {question}
    """
    # print(prompt)
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful HR recruiter."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=700,
        temperature=0.5,
        n=1  # assuming you want one generation per document
    )
    # Extract the generated response
    response_text = response.choices[0].message.content # response['choices'][0]['message']['content']
    print(response_text)
    # Split the response_text into lines
    lines = response_text.strip().split('\n')

    # Now, split each line on the colon to separate the labels from the values
    # Extract the values
    name = lines[0].split(': ')[1]
    email = lines[1].split(': ')[1]
    phone_no = lines[2].split(': ')[1]
    years_of_expiernce = lines[3].split(': ')[1]
    skills = lines[4].split(': ')[1]
    profile = lines[5].split(': ')[1]
    summary = lines[6].split(': ')[1]
    data_dict = {
        'name': name,
        'email': email,
        'phone_no': phone_no,
        'years_of_expiernce': years_of_expiernce,
        'skills': skills,
        'profile': profile,
        'summary': summary
    }
    print(data_dict, "\n")
    return data_dict;

7. Create a function called add_data_to_db()

Functionality:

Database Connection:
Establishes a connection to the database using SQLAlchemy's create_engine with the given connection URL.
Embedding Creation:
Calls the get_embedding() function to generate an embedding for the resume summary.
SQL Query Formation:
Crafts an SQL query to insert the provided data (from the input dictionary) into the resumes_profile_data table in the database.
Data Insertion:
Opens a connection, executes the SQL query, commits the changes, and then closes the connection.


In [62]:
def add_data_to_db(input_dict):
    # Create the SQLAlchemy engine
    # engine = create_engine(f'mysql+pymysql://{user}:{password}@{host}:{port}/{database_name}')

    engine = create_engine(connection_url)
    # Get the embedding for the summary text
    summary = input_dict['summary']
    embedding = get_embedding(summary)
    # Create the SQL query for inserting the data
    query_sql = f"""
        INSERT INTO resumes_profile_data (names, email, phone_no, years_of_experience, skills, profile_name, resume_summary, resume_embeddings)
        VALUES ("{input_dict['name']}", "{input_dict['email']}", "{input_dict['phone_no']}", "{input_dict['years_of_expiernce']}",
        "{input_dict['skills']}", "{input_dict['profile']}", "{summary}", JSON_ARRAY_PACK('{embedding}'));
    """
    with engine.connect() as connection:
        connection.execute(text(query_sql))
        connection.commit()
    print("\nData Written to resumes_profile_data_2 table")

8. Create a function called search_resumes() to search for resumes that are most similar to a given query, leveraging embeddings and database operations.

Functionality:

Embedding Creation:
Converts the given query into its corresponding embedding using the get_embedding() function.
SQL Query Formation:
Creates an SQL query to search for the top 5 resumes in the resumes_profile_data table that have the highest similarity (dot product) to the query embedding.
Database Operations:
Opens a connection to the database, runs the SQL query to fetch the results, and then closes the connection.

Returns a list of the top 5 most relevant resumes based on the given query.

In [69]:
def search_resumes(query):
    query_embed = get_embedding(query)
    query_sql = f"""
            SELECT names, resume_summary, dot_product(
                    JSON_ARRAY_PACK('{query_embed}'),
                    resume_embeddings
                ) AS similarity
                FROM resumes_profile_data
                ORDER BY similarity DESC
                LIMIT 5;
    """
    # print(query_sql,"\n")
    # engine = create_engine(f'mysql+pymysql://{user}:{password}@{host}:{port}/{database_name}')
    engine = create_engine(connection_url)
    connection = engine.connect()
    result = connection.execute(text(query_sql)).fetchall()
    connection.close()
    engine.dispose()
    return result

8. Create a function called evaluate_candidates()

Functionality:

Resume Retrieval:
Utilizes the search_resumes() function to get the top matching resumes based on the job description.
OpenAI API Interaction:
For each retrieved resume, a prompt is crafted, asking to evaluate how well the candidate fits the job description.
Interacts with the gpt-3.5-turbo model to process this prompt and receive an efficient, concise response.
Data Aggregation:
Collects the model's evaluation responses for each candidate in a list.

Returns a list of tuples, where each tuple contains:

Candidate's name.
Evaluation response from the model, describing the compatibility of the candidate with the given job description.

In [72]:
def evaluate_candidates(query):
    result = search_resumes(query)
    responses = []  # List to store responses for each candidate
    for resume_str in result:
        name = resume_str[0]
        context = f"Resume text: {resume_str[1]}"
        question = f"What percentage of the job requirements does the candidate meet for the following job description? answer in 3 lines only and be effcient while answering: {query}."
        prompt = f"""
            Read below candidate information about the candidate:
            {context}
            Question: {question}
        """
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a expert HR analyst and recuriter."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=100,
            temperature=0.2,
            n=1  # assuming you want one generation per document
        )
        # Extract the generated response
        response_text = response.choices[0].message.content # response['choices'][0]['message']['content']
        responses.append((name, response_text))  # Append the name and response_text to the responses list
    return responses

In [74]:
urls = [
    "https://github.com/vishwajeetdabholkar/vishwajeet-resume/raw/main/Vishwajeet_Dabholkar_Resume_June_2023.pdf"
]

for url in urls:
    resume_text = print_pdf_text(url=url).replace('\n',' ')
    print("Resume Text extracted\n")
    ip_data_dict = pinfo_extractor(resume_text)
    print("Information extracted\n")
    add_data_to_db(ip_data_dict)
    print("\n")

Resume Text extracted

Name: Vishwajeet Dabholkar
Email: vishwajeetdabholkar@gmail.com
Phone Number: +91 702 046 9342
Years of Experience: Not available
Skills Set: [Python, SQL, Scala, PySpark, SparkSQL, MemSQL, Pandas, boto3, ChatGPT, Azure data factory, Databricks REST APIs, AWS Glue, RDS, EMR, S3, EC2, API Gateway, KMS, Syniti DB replication, Bigquery, Azkaban, Jupyter notebook, Github, PyCharm, Postman]
Profile: Enterprise Solutions Engineer
Summary: Vishwajeet Dabholkar is an experienced Enterprise Solutions Engineer with expertise in data engineering and analytics. He has worked on various projects involving data ingestion, migration, and optimization using a range of technologies such as Python, SQL, PySpark, AWS, and more. Vishwajeet has a strong technical skill set and has successfully implemented proof of concepts and developed applications in his previous roles. He is proficient in utilizing cloud platforms like AWS and has experience working with different data sources and

In [78]:
%%sql

SELECT * FROM resumes_profile_data;

names,email,phone_no,years_of_experience,skills,profile_name,resume_summary,resume_embeddings
Vishwajeet Dabholkar,vishwajeetdabholkar@gmail.com,+91 702 046 9342,Not available,"[Python, SQL, Scala, PySpark, SparkSQL, MemSQL, Pandas, boto3, ChatGPT, Azure data factory, Databricks REST APIs, AWS Glue, RDS, EMR, S3, EC2, API Gateway, KMS, Syniti DB replication, Bigquery, Azkaban, Jupyter notebook, Github, PyCharm, Postman]",Enterprise Solutions Engineer,"Vishwajeet Dabholkar is an experienced Enterprise Solutions Engineer with expertise in data engineering and analytics. He has worked on various projects involving data ingestion, migration, and optimization using a range of technologies such as Python, SQL, PySpark, AWS, and more. Vishwajeet has a strong technical skill set and has successfully implemented proof of concepts and developed applications in his previous roles. He is proficient in utilizing cloud platforms like AWS and has experience working with different data sources and tools.","b'A=\x07\xbbo\xce1\xbc\x80\xa8A<\xc1U=\xbc\x92\xde\xf5\xbc\xfe\xd7\xa1<\x06\x00>\xbc\x90\xde\x96\xbc\x81\xd6\xd3\xbb\xe8H\xa3\xbc\x02\x039<\xa7\x9b\';F\x960<\x03\xa7s\xba\xa5m\x15\xbc\xb9\xd1\xdb\xb9\x80\x1b\x90<\x90k\xc8\xbc\x18\x927;\xfb\x08\xaf\xba\x15\xac\xbb\xbc\x87\xb9T\xbc\x92\xad\t< \xba\xd3<\xbe\xfc\xf2<=\xb6\xaa\xbcw\xf6M<\x159m\xbb\x02\x039\xbb\x0b?\x84\xbc\xdek\xf7;\xd4\x88\x17;\xe3\x06\x03;\x9b\xec\xae\xbc\x86\xfe\x10<\xb7^\xae;\xbe\xcb\x06\xbc\xff\x05\xb4\xbb\xce\x8e\r\xbd\x16Pv\xbcc\x92\x87\xbcL\xefY\xbc\x05\xd2\xab\xbby;\xe9\xbc4\x91\xe8\xba\x1f\x16\x99\xbc\x91\xb0c;\x9f\x8a\x94<\xb6G%<\xba[3;!\xe8\xe5<9/N:zi\xfb\xbc\x95\x93\x05\xbd\x80\xa8A\xbc`\xaf\xe5<\xd6C[\xbc\x0fm\xf5\xbb\x02y\xe1\xbb\x9e\\\x02=&\xf9\x99<\x9eve<\xd3Z\x05\xbd\x80\xa8\xc1\xbc\x05\xd2\xab<\x88C,<\xf99\xbc\xba\x8f=6<\x994E\xbc.\xc5p;e\xd7""\xbc\xe4K\x9e\xbb\xa7(\xd9\xba\tY\x08\xbc\xefY6<\xec\xe6\x08=\x0e\x9b\xa8\xbcW\xe3\x8e\xbc\x8d\x0f$\xbf\xd8\x9f\xff\xbc\n\xa1}\xbc\x9a\xd5\xa5\xbc\xb2M\xfa\xbbq*V\xbco\xce1<\xd6C[5\xbc\xa5\xe3\xbd<\x90k\xc8<\xe3\x1d\x0c\xbdN\x1a\x12;\xb80{<\xa2p\x10\xbc\x0f\xe0\xc3<\xc5\xdc\x19\x05\xbax$\xe0\xbc\xea\x8d>\xbc\'\xb1\x03\xbb}L\x1d<\x8b\xcdb<\xdc=e:\x80\xa8A\xbc\xf2\xb2\x80\xbc\xa2\xfdA<\x80\xa8A<\xb8\xff\x0e<\xa9\xf7K\xbc{}*;\x1b\x05\xe5\xbb\xab""\x04;\xac\xdd\xc7;\xb0d\xa4\xbc+\xc5\x11=J\x06\x84\xbc\xfag\xce\xbc\xc8\xc2\x95<\xd8\x9c\xa5\xbc\x16\xda\xcd<\xb8\xa3\xc9<%\xcb\x87\xbc\xcf\xd3\xa8\xb5:\x91\x96\x00=\xf7\xdd\x17;=\xcd3;\xc7\x94\x83;\xd8\x9c\xa5<\xb1\x1fh=\x11\x81$\xbc\xbf\xf9\x98<\xad\x951\xbclu\xe7;\xfc\xc3\xf2;;\xfe\xc0:VY7\xbc\xb8\xa3I=\xb2\xc0H\xbcU\xa1M\xbc\xfc\xc3r\xbcpX\t=i\x02\xba\xbc\xad\x08\x80\xbc\xdek\xf7\xbb\xa5p\xef\xbbe\xee+<:\xe7\xb7\xbc\xf4\xf7\x9b<~\xf0W\xbc\x03\x8d\x90<1\x91\x89\xe4<\xfbE<\xa5V\x0c=\xa5p\xef\xbb\x18\x927\xbc\\%\xaf\'\xb1\x83;\xd8)W\xb5<\x97\xd8 \xbd\xdc\x99*;G \x88\xbb|\x1e\x8b<\xc7\xab\x8c\xbcf\x8f\x0c<\xb7^\xae<\xc4\xc8j\xbb_\x0b\xab\xbc\xcb\xc2t\xbcD#\x03=aPF\xbc\xd4\xfe\xbf\xb9\x87,\xa3\xbd\xe07\x10\xbaU+\xa5\xbc\xcdw\x84\xbb""\xe5\x8b;\x16\xc3D\xbclu\xe7<\xa9\x0e\xd5\xbbT\x00m\xbb\xf6\xc9\xe8:\xd0\x01\xbb\xbc\xb8\xff\x0e;\xee\x14\x9b\xbc\xd6\xb6)\xbc\xc8f\xd0\xbc\xc9\x94\xe2\xb8\x14~)=7]\x01;\xd6@\x01::Z\x86;\xc0\'+\xba@@a\xbc!\xce\x02=\x9d\xbb!;=@\x02\xbd,iL;\x1b\x1cn\xbcS\xcf\x80;,\xdc\x9a\xba\n\x87\x9a\xbc\x88\xcd\x03\xbc}5\x94\xbb\xa5p\xef;\xebE\xa8<\x05\xd2+:\x1c\x19\x14\xbc\x9d\xbb\xa1\xe4\xbc<\x9b\x038<\xc7\xab\x0c\xbd7\x01<<\xef\xcc\x04=\xe6y0<\x19\x1c\x0f"


In [80]:
job_description = input("Enter Job description : \n")
evaluate_candidates(job_description)

Enter Job description : 
 Data Scientist


[('Vishwajeet Dabholkar',
  'Based on the provided information, the candidate meets approximately 60% of the job requirements for the Data Scientist position. They have experience in data engineering and analytics, as well as proficiency in Python, SQL, and AWS. However, it is unclear if they have specific experience in statistical modeling and machine learning, which are key requirements for a Data Scientist role.')]

In [None]:
%%sql

DROP DATABASE resume_evaluator