In [1]:
from transformers import AutoTokenizer
import json

model_path = "NousResearch/Hermes-2-Pro-Llama-3-8B"


tokenizer = AutoTokenizer.from_pretrained(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
def get_best_candidate(obj):
    action = obj['action']
    try:
        if len(action):
            return action[0]
    except:
        return action

def get_func_obj_by_response(response):
    result = json.loads(response.replace("`", "").replace("\n", ""))
    func_obj = get_best_candidate(result)
    return func_obj

In [3]:
functions_description = """
Function: _list_data
    Description:
        Display some resumes in the system
    Params:

    Output:
        - List resumes


Function: _find_resume
    Description:
        - Search resumes by criteria: (GPA, major, skills, gender)
    Params:
        GPA
        - Description: GPA score at least
        - Type: float
        major
        - Description: Major of candidate
        - Type: String
        skills
        - Note: Skill is a name.
        - Description: Skill list of candidate
        gender
        - Type: Enum (male, female)
        job_title
        - Description: Past job titles, experience job
        - Type: List of string
    Output:
        - List resumes
        - If there is no information for a field, the value of that data field is not returned

"""

In [4]:
CONTROLLER_PROMPT_TEMPLATE = """You are a controller, you receive below query from user, utilize the insights and choose what is the best one main action from given functions

Query: $$QUERY$$

List function:
$$FUNCTIONS_DECRIPTION$$

The response should be exactly like format and don't say anything else:

```json
{
    "observation": <what is the current situation, what should follow>,
    "guidelines": <what is the most suitable action in this situation and why>,
    "action": {
        "fn": <function name 1>,
        "params": <function param 1>
    }
}
```
RESPONSE:
```json
"""




In [5]:
import ctranslate2

ct2_path = "./generative_model/llama_3_hermes"
generator = ctranslate2.Generator(
    ct2_path,
    device="cuda",
    compute_type = "int8_float16",
    device_index = [0],
    # flash_attention = True,
    inter_threads=4
)

from transformers import AutoTokenizer


model_path = "NousResearch/Hermes-2-Pro-Llama-3-8B"


tokenizer = AutoTokenizer.from_pretrained(model_path)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# query = "Find people has experiment in programming with linux or android"
# query = "Look for candidates with experience in AI and a degree in computer science. Male preferred"
# query = "Look for candidates with experience in AI and have a degree in computer science. Male preferred, priority is given to candidates who have worked in jobs related to data engineer and software engineer and GPA at least 3.8"


In [7]:
query = "Looking for candidates with experience in terms of AI and blockchain."
inputs = CONTROLLER_PROMPT_TEMPLATE.replace("$$QUERY$$", query).replace("$$FUNCTIONS_DECRIPTION$$", functions_description)
chat = [
    {
        "role": "user",
        "content": inputs
    }
]

prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
prompts = [prompt]
token_inputs = [tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt)) for prompt in prompts]


generated_text = generator.generate_batch(
    token_inputs,
    max_length=500,
    sampling_topp = 0.9,
    include_prompt_in_result=False,
    # end_token = [28789, 28749, 28767]
    # end_token = [28789, 28735, 28767]
    # end_token = [32005, 32006]
    # sampling_topk=10,
)
response = tokenizer.decode(generated_text[0].sequences_ids[0], skip_special_tokens=False)

print(response)
func_metadata = get_func_obj_by_response(response)
print(func_metadata)


{
    "observation": "The user is looking for candidates with experience in AI and blockchain. We have resumes in the system that need to be filtered based on these criteria.",
    "guidelines": "To find the most suitable candidates, we should search the resumes based on the given criteria of AI and blockchain experience.",
    "action": {
        "fn": "_find_resume",
        "params": {
            "skills": ["AI", "blockchain"]
        }
    }
}
```
{'fn': '_find_resume', 'params': {'skills': ['AI', 'blockchain']}}


In [8]:
import pymongo
import os
from dotenv import load_dotenv
load_dotenv(override=True)

connection_str = os.getenv("MONGODB_CONNECTION_STRING")
client = pymongo.MongoClient(connection_str)
database_name = os.getenv("CV_MONGO_DATABASE")
database = client[database_name]
collection = database["c_v_info"]

In [9]:
import re
float(re.findall("\d+\.\d+", "'gpa 3.85/4.00")[0])

3.85

In [10]:
def check_gpa(gpa_str, gpa_min):
    try:
        gpa_str = "".join([c for c in gpa_str if c != "'"])
        gpa = float(re.findall("\d+\.\d+", gpa_str)[0])
        if gpa > gpa_min - 0.15:
            return 1
        return 0
    except:
        pass

In [11]:
import difflib
def check_major(arr, major):
    best_matches = difflib.get_close_matches(major, arr, n = 1, cutoff=0.3)
    if len(best_matches):
        return 1
    return 0

arr = [
    "advanced program",
    "ai/software research intern",
    "information and communication technology",
    "computer science"
    ]
# check_major(arr, major = "computer communication")

In [12]:
def check_skill(source_skills, target_skills):
    cnt = 0
    for skill in target_skills:

        # best_matches = difflib.get_close_matches(skill.lower(), source_skills, n = 1, cutoff=0.5)
        # if len(best_matches):
        #     return 1

        process_source_skills = []
        for source_skill in source_skills:
            s = ""
            for c in source_skill:
                if c.isalpha():
                    s += c
                else:
                    process_source_skills.append(s)
                    s = ""
            if s != "":
                process_source_skills.append(s)

        if skill.lower() in process_source_skills + source_skills:
            print(skill)
            cnt += 1

        if "and" in skill:
            sub_skills = skill.split("and")
            for sub_skill in sub_skills:
                if sub_skill.strip().lower() in process_source_skills + source_skills:
                    cnt += 1

    if (cnt == 0):
        return 0


    return cnt / len(target_skills)


In [13]:
def _find_resume(body):
    if 'gender' in body:
        data = list(collection.find({"gender": body['gender'].lower()}))
    else:
        data = list(collection.find({}))

    print(len(data))

    if 'GPA' in body:
        data = [item for item in data if check_gpa(item['gpa'], body['GPA'])]

    if 'major' in body:
        try:
            major = body['major'].lower()
        except:
            major = body['major'][0].lower()
        data = [item for item in data if check_major(item['major'], major)]

    print(len(data))

    total_skills = []
    if 'skills' in body:
        total_skills += [item.lower() for item in body['skills']]
    if 'job_title' in body:
        total_skills += [item.lower() for item in body['job_title']]

    if len(total_skills) == 0:
        names = []
        result = []
        for item in data:
            try:
                if item['name'][0] not in names:
                    names.append(item['name'][0])
                    result.append(item)
            except:
                names.append("anonymous")
        return [{**item, "score": 1} for item in data], names

    process_data = []

    for item in data:
        score = check_skill(item['skill'] + item['major'] + item['job_title'], total_skills)
        # if(score >= 0.8):
        process_data.append({**item, "score": score})



    data = process_data

    names = []
    result = []
    for item in data:
        try:
            if item['name'][0] not in names:
                names.append(item['name'][0])
                result.append(item)
        except:
            names.append("anonymous")

    return data, names
print(func_metadata['params'])
print(_find_resume(func_metadata['params']))

{'skills': ['AI', 'blockchain']}


11
11
ai
ai
ai
ai
ai
ai
blockchain
([{'_id': ObjectId('66c6abfeea4dd50402925c8d'), 'organization': ['ho chi minh university of science', 'soict hackathon', 'vietai hanoi vietnam', 'foundation of deep learning and advanced natural language processing', 'tymlez cohort', 'naver samsung tiki and vietcombank', 'apcs', 'ieee international conference on systems man and cybernetics', 'ieee international conference on software testing verification and validation', 'katalon inc.', 'slu spoken language understanding', 'department of science and technology', 'ho chi minh university of science vietnam', 'advantech group', 'hanoi university of science and technology', 'benit ltd. hanoi vietnam', 'department of information and communications vietnam national university', 'ho chi minh vietnam', 'fodl', 'aiot innoworks', 'department of education and training hcm', 'bkai & naver'], 'major': ['ai/software research intern', 'advanced program', 'information and communication technology', 'computer science'

In [14]:
import gradio as gr
import time

In [15]:
CLASSIFIER_PROMPT_TEMPLATE ="""
You are a classifier, you receive the following query from the user and select which of the following type of query the user request falls into is given in json format with the following {key:value} format: (key is the action type, value is the meaning of the action):
{
"FIND": Command queries execute system functions: search for candidates who meet recruitment criteria, not summary function
"OTHERS": Other types of questions such as summary function, get resume example
}

"Note": If user only mention the above issues and ask questions about resume, the question type is still OTHERS

Query: $$QUERY$$


Please return a single string of the type of the query, not json. Do not return anything else.

RESPONSE: query_type

"""

In [16]:
# query = "Looking for candidates have experience with AI"
# query = "Give me summary information about CV has name: nguyen tien dung"
query = "Give me some resume example in your system"
inputs = CLASSIFIER_PROMPT_TEMPLATE.replace("$$QUERY$$", query)

chat = []
chat.append({
    "role": "user",
    "content": inputs
})

prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
prompts = [prompt]
token_inputs = [tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt)) for prompt in prompts]


generated_text = generator.generate_batch(
    token_inputs,
    max_length=500,
    sampling_topp = 0.9,
    include_prompt_in_result=False,
    # end_token = [28789, 28749, 28767]
    # end_token = [28789, 28735, 28767]
    # end_token = [32005, 32006]
    # sampling_topk=10,
)
response = tokenizer.decode(generated_text[0].sequences_ids[0], skip_special_tokens=False)
print(response)

OTHERS


In [17]:
conversation = None
def create_conversation(data):
    global conversation
    conversation = [
        {
            "role": "system",
            "content": f"You are a helpful assistant. When users ask about CV information, write it down in a summary way. Use below information to answer the question from user. \n Information: {str(data)}"
        }
    ]



In [18]:
init_data = list(collection.find())

In [19]:
MAX_THREAD = 10
examples = json.load(open('./data/examples.json', 'r'))


def slow_echo(message, history):
    query = message
    inputs = CLASSIFIER_PROMPT_TEMPLATE.replace("$$QUERY$$", query)

    chat = []
    chat.append({
        "role": "user",
        "content": inputs
    })

    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    prompts = [prompt]
    token_inputs = [tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt)) for prompt in prompts]


    generated_text = generator.generate_batch(
        token_inputs,
        max_length=500,
        sampling_topp = 0.9,
        include_prompt_in_result=False,
        # end_token = [28789, 28749, 28767]
        # end_token = [28789, 28735, 28767]
        # end_token = [32005, 32006]
        # sampling_topk=10,
    )
    response = tokenizer.decode(generated_text[0].sequences_ids[0], skip_special_tokens=False)

    response = response.replace("`", "").replace("\n", "")
    action_type = response

    if action_type == "FIND":


        inputs = CONTROLLER_PROMPT_TEMPLATE.replace("$$QUERY$$", query).replace("$$FUNCTIONS_DECRIPTION$$", functions_description)

        chat = [
            {
                "role": "system",
                "content": f"Use information in detailed resume data to answer the question from user."
            }
        ]


        chat.append({
            "role": "user",
            "content": inputs
        })

        prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
        prompts = [prompt]
        token_inputs = [tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt)) for prompt in prompts]


        generated_text = generator.generate_batch(
            token_inputs,
            max_length=500,
            sampling_topp = 0.9,
            include_prompt_in_result=False,
            # end_token = [28789, 28749, 28767]
            # end_token = [28789, 28735, 28767]
            # end_token = [32005, 32006]
            # sampling_topk=10,
        )
        response = tokenizer.decode(generated_text[0].sequences_ids[0], skip_special_tokens=False)

        func_metadata = get_func_obj_by_response(response)
        print(func_metadata['params'])
        result, names = _find_resume(func_metadata['params'])
        print(result)
        response = ""

        all_datas = []
        cnt = 1
        for i, item in enumerate(result):
            if item['score']:
                response += f"{cnt}. ***Name:*** {names[i]} - ***Score:*** {item['score']}\n+ ***Link:*** http://localhost:8001{item['file_path']}\n\n"
                del item['score']
                del item['batch_id']
                del item['image_paths']
                all_datas.append(item)
                cnt += 1

        for i in range(len(response)):
            time.sleep(0.01)
            yield "Here are best resume matches:\n " + response[: i+1]

        create_conversation(all_datas)
        # chat.append({
        #     "role": "assistant",
        #     "content": response + f"\n Detailed resume data: {str(all_datas)}"
        # })
    else:

        if conversation is None:
            tmp = []
            for i, item in enumerate(init_data):
                del item['batch_id']
                del item['image_paths']
                tmp.append(item)
            create_conversation(tmp)
        conversation.append({
            "role": "user",
            "content": query
        })

        print(conversation)

        prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
        prompts = [prompt]
        token_inputs = [tokenizer.convert_ids_to_tokens(tokenizer.encode(prompt)) for prompt in prompts]


        generated_text = generator.generate_batch(
            token_inputs,
            max_length=500,
            sampling_topp = 0.9,
            include_prompt_in_result=False,
            # end_token = [28789, 28749, 28767]
            # end_token = [28789, 28735, 28767]
            # end_token = [32005, 32006]
            # sampling_topk=10,
        )
        response = tokenizer.decode(generated_text[0].sequences_ids[0], skip_special_tokens=False)

        yield response



gr.ChatInterface(slow_echo, title = "Resume Search Summer Soict 2024", examples = examples).launch(share=True)

Running on local URL:  http://127.0.0.1:7860


Running on public URL: https://f13b6ccd788b93f7a1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




{'skills': ['AI', 'deep learning', 'machine learning']}
11
11
ai
ai
machine learning
ai
machine learning
ai
deep learning
machine learning
ai
deep learning
machine learning
deep learning
machine learning
ai
deep learning
[{'_id': ObjectId('66c6abfeea4dd50402925c8d'), 'organization': ['ho chi minh university of science', 'soict hackathon', 'vietai hanoi vietnam', 'foundation of deep learning and advanced natural language processing', 'tymlez cohort', 'naver samsung tiki and vietcombank', 'apcs', 'ieee international conference on systems man and cybernetics', 'ieee international conference on software testing verification and validation', 'katalon inc.', 'slu spoken language understanding', 'department of science and technology', 'ho chi minh university of science vietnam', 'advantech group', 'hanoi university of science and technology', 'benit ltd. hanoi vietnam', 'department of information and communications vietnam national university', 'ho chi minh vietnam', 'fodl', 'aiot innoworks',

Traceback (most recent call last):
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/gradio/chat_interface.py", line 592, in _stream_fn
    first_response = await async_iteration(generator)
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/gradio/utils.py", line 657, in async_iteration
    return await iterator.__anext__()
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/gradio/utils.py", line 650, in __anext__
    return await anyio.to_thread.run_sync(
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2177, in run_sync_in_worker_thread
    return await future
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 859, in run
    re

{'gender': 'female', 'skills': ['machine learning', 'deep learning']}
3
3
[{'_id': ObjectId('66c6b46eea4dd50402925c9b'), 'organization': ['safehorizons', 'vietnam korea university of information and communication technology'], 'major': [], 'gpa': ['gpa 3.65'], 'name': ['nguyen thi chau thi'], 'skill': ['assisting students', 'java', 'mysql', 'css', 'firebase', 'inventory tracking', 'debugging', 'php', 'flutter', 'c++', 'restful', 'spring', 'html'], 'gender': ['female'], 'job_title': ['teaching assistant', 'software engineer', 'internship'], 'profile_url': [], 'email': ['chauthi1704@gmail.com'], 'phone': ['84387285692'], 'education': [], 'batch_id': ObjectId('66c6b464ea4dd50402925c98'), 'file_path': '/data/files/cv/page/ChauThi_resume.pdf', 'image_paths': [], 'score': 0}, {'_id': ObjectId('66c6b46eea4dd50402925c9d'), 'organization': ['github', 'nals joint stock company', 'vietnam bank for agriculture and rural development', 'hai chau da nang', 'da nang', 'vietnam korea university of ict'

Traceback (most recent call last):
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/gradio/chat_interface.py", line 592, in _stream_fn
    first_response = await async_iteration(generator)
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/gradio/utils.py", line 657, in async_iteration
    return await iterator.__anext__()
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/gradio/utils.py", line 650, in __anext__
    return await anyio.to_thread.run_sync(
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2177, in run_sync_in_worker_thread
    return await future
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 859, in run
    re

{'gender': 'female', 'skills': 'software engineering'}
3
3
[{'_id': ObjectId('66c6b46eea4dd50402925c9b'), 'organization': ['safehorizons', 'vietnam korea university of information and communication technology'], 'major': [], 'gpa': ['gpa 3.65'], 'name': ['nguyen thi chau thi'], 'skill': ['assisting students', 'java', 'mysql', 'css', 'firebase', 'inventory tracking', 'debugging', 'php', 'flutter', 'c++', 'restful', 'spring', 'html'], 'gender': ['female'], 'job_title': ['teaching assistant', 'software engineer', 'internship'], 'profile_url': [], 'email': ['chauthi1704@gmail.com'], 'phone': ['84387285692'], 'education': [], 'batch_id': ObjectId('66c6b464ea4dd50402925c98'), 'file_path': '/data/files/cv/page/ChauThi_resume.pdf', 'image_paths': [], 'score': 0}, {'_id': ObjectId('66c6b46eea4dd50402925c9d'), 'organization': ['github', 'nals joint stock company', 'vietnam bank for agriculture and rural development', 'hai chau da nang', 'da nang', 'vietnam korea university of ict', 'vietnam kore

Traceback (most recent call last):
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/gradio/chat_interface.py", line 592, in _stream_fn
    first_response = await async_iteration(generator)
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/gradio/utils.py", line 657, in async_iteration
    return await iterator.__anext__()
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/gradio/utils.py", line 650, in __anext__
    return await anyio.to_thread.run_sync(
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2177, in run_sync_in_worker_thread
    return await future
  File "/home/long/anaconda3/envs/mlops-env/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 859, in run
    re

{'gender': 'female', 'skills': ['software engineering', 'web', 'app']}
3
3
web
[{'_id': ObjectId('66c6b46eea4dd50402925c9b'), 'organization': ['safehorizons', 'vietnam korea university of information and communication technology'], 'major': [], 'gpa': ['gpa 3.65'], 'name': ['nguyen thi chau thi'], 'skill': ['assisting students', 'java', 'mysql', 'css', 'firebase', 'inventory tracking', 'debugging', 'php', 'flutter', 'c++', 'restful', 'spring', 'html'], 'gender': ['female'], 'job_title': ['teaching assistant', 'software engineer', 'internship'], 'profile_url': [], 'email': ['chauthi1704@gmail.com'], 'phone': ['84387285692'], 'education': [], 'batch_id': ObjectId('66c6b464ea4dd50402925c98'), 'file_path': '/data/files/cv/page/ChauThi_resume.pdf', 'image_paths': [], 'score': 0}, {'_id': ObjectId('66c6b46eea4dd50402925c9d'), 'organization': ['github', 'nals joint stock company', 'vietnam bank for agriculture and rural development', 'hai chau da nang', 'da nang', 'vietnam korea university of