# NOTE for this notebook:
1. use pdfplumber to parse pdf file to avoid parsing word by word error
2. Add feedback loop

In [1]:
import json
import textwrap
import pandas as pd
import torch
import transformers
import os
import PyPDF2
import re
from langchain_core.output_parsers import JsonOutputParser
import gc
import pdfplumber
import evaluate
from pprint import pprint


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def parse_json_markdown(json_string: str) -> dict:
    try:
        # Try to find JSON string within first and last triple backticks
        if json_string[3:13].lower() == "typescript":
            json_string = json_string.replace(json_string[3:13], "",1)
        
        if 'JSON_OUTPUT_ACCORDING_TO_RESUME_DATA_SCHEMA' in json_string:
            json_string = json_string.replace("JSON_OUTPUT_ACCORDING_TO_RESUME_DATA_SCHEMA", "",1)
        
        if json_string[3:7].lower() == "json":
            json_string = json_string.replace(json_string[3:7], "",1)
        

        # match = re.search(r"""```*
        #                     (?:json)?
        #                     (.*)```""", json_string, flags=re.DOTALL|re.VERBOSE)

        # # If no match found, assume the entire string is a JSON string
        # if match is None:
        #     json_str = json_string
        # else:
        #     # If match found, use the content within the backticks
        #     json_str = match.group(1)

        # # Strip whitespace and newlines from the start and end
        # json_str = json_str.strip()

        # # Parse the JSON string into a Python dictionary while allowing control characters by setting strict to False
        # parsed = json.loads(json_str)
        parser = JsonOutputParser()
        parsed = parser.parse(json_string)

        return parsed
    except Exception as e:
        print(e)
        return None
    
def get_prompt(system_prompt_path: str) -> str:
        """
        Reads the content of the file at the given system_prompt_path and returns it as a string.

        Args:
            system_prompt_path (str): The path to the system prompt file.

        Returns:
            str: The content of the file as a string.
        """
        with open(system_prompt_path, encoding="utf-8") as file:
            return file.read().strip() + "\n"
        
def extract_text(pdf_path: str):
    resume_text = "" 
    with pdfplumber.open(pdf_path) as pdf:
        for page_num in range(len(pdf.pages)):
            resume_text  += pdf.pages[page_num].extract_text()
        return resume_text

    # resume_text = ""
    # with open(pdf_path, 'rb') as file:
    #     pdf_reader = PyPDF2.PdfReader(file)
    #     num_pages = len(pdf_reader.pages)

    #     for page_num in range(num_pages):
    #         page = pdf_reader.pages[page_num]
    #         text = page.extract_text().split("\n")

    #         # Remove Unicode characters from each line
    #         cleaned_text = [re.sub(r'[^\x00-\x7F]+', '', line) for line in text]

    #         # Join the lines into a single string
    #         cleaned_text_string = '\n'.join(cleaned_text)
    #         resume_text += cleaned_text_string
        
    #     return resume_text

In [3]:
# Education, Skills, Work Experience, Projects, Certifications, Achievements
#"/home/gabe/workspace/resume_work/job-llm/zlm/prompts/resume-extractor.txt"
system_prompt_path_heading_info = "resume-extractor_heading_info.txt"
system_prompt_path_education = "resume-extractor_education.txt" 
system_prompt_path_skills = "resume-extractor_skills.txt" 
system_prompt_path_work_experience = "resume-extractor_work_experience.txt" 
system_prompt_path_projects = "resume-extractor_projects.txt"
system_prompt_path_certifications = "resume-extractor_certifications.txt"
system_prompt_path_achievements = "resume-extractor_achievements.txt"

system_prompt_heading_info = get_prompt(system_prompt_path_heading_info)
system_prompt_education = get_prompt(system_prompt_path_education)
system_prompt_skills = get_prompt(system_prompt_path_skills)
system_prompt_work_experience = get_prompt(system_prompt_path_work_experience)
system_prompt_projects = get_prompt(system_prompt_path_projects)
system_prompt_certifications = get_prompt(system_prompt_path_certifications)
system_prompt_achievements = get_prompt(system_prompt_path_achievements)


pdf_path_demo = "/home/gabe/workspace/resume_work/job-llm/zlm/demo_data/user_resume.pdf"
pdf_path_mine = "/home/gabe/workspace/resume_work/job-llm/zlm/demo_data/my_resume.pdf"
resume_text_demo = extract_text(pdf_path_demo)
resume_text_mine = extract_text(pdf_path_mine)
resume_to_use = resume_text_mine

In [4]:
lines = resume_to_use.split("\n")
targets_dict = {
    "Education": ["Education", "Educations", "educational background", "education and training'"],
    "Skills": ["Skills", "Skill"],
    "Work Experience": ["Work", "Work Experience", "Experience"],
    "Projects": ["Projects", "Project"],
    "Certifications": ["Certification", "Certifications"],
    "Achievements": ["Achievements"]
}
section_line_number = {}

meteor = evaluate.load('meteor')

for line in lines:
    for target_name, target_set in targets_dict.items():
        best_score = 0
        name = ""
        for target in target_set:
            line_lower = line.lower()
            target_lower = target.lower()
            #print(line_lower, target)
            results = meteor.compute(predictions=[target_lower], references=[line_lower])
            # use the difference in length as a penalty
            penalty = abs(len(line_lower.split()) - len(target_lower.split()))
            penalty = 1 if penalty == 0 else penalty
            score = results["meteor"] / penalty
            if score > best_score:
                best_score = score
                name = target
        if best_score > 0.18:
            print("score between {} and {}: {}, line number: {}".format(line, name, best_score, lines.index(line)))
            section_line_number[lines.index(line)] = target_name
print(section_line_number)
print([lines[i] for i in section_line_number.keys()])
index_list = list(section_line_number.keys())


[nltk_data] Downloading package wordnet to /home/gabe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/gabe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/gabe/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


score between SKILLS and Skills: 0.5, line number: 3
score between WORK EXPERIENCE and Work Experience: 0.9375, line number: 5
score between PROJECTS and Projects: 0.5, line number: 25
score between EDUCATION and Education: 0.5, line number: 35
{3: 'Skills', 5: 'Work Experience', 25: 'Projects', 35: 'Education'}
['SKILLS', 'WORK EXPERIENCE', 'PROJECTS', 'EDUCATION']


In [5]:
from pprint import pprint
pprint(resume_text_demo)
print(resume_text_mine)

('AMEY SADANAND BHILEGAONKAR\n'
 '(cid:211) 480-616-3980  ameybhilegaonkar3@gmail.com (cid:135) '
 'github.com/ameygoes fl linkedin.com/in/amey-bhilegaonkar\n'
 'Withover3+yearsofexperienceasaDataEngineerspecializinginETLdatapipelinesforlarge-scaledistributeddatabase\n'
 'systems, I bring both expertise and enthusiasm for handling complex data '
 'challenges to your team.\n'
 'WORK EXPERIENCE\n'
 'BigCommerce Austin, Texas\n'
 'Data Science Intern June 2023 - August 2023\n'
 '· Designed and managed a large-scale Snowflake data retrieval pipeline for '
 'efficient data warehousing.\n'
 '· Implemented logistic regression, and predictive models, improving customer '
 'retention prediction accuracy by 12%.\n'
 '· Collaborated with data infrastructure teams to ensure data availability '
 'and resolve data-related issues.\n'
 '· Leveraged advanced data-mining techniques to process and analyze millions '
 'of data points, extracting critical features for\n'
 'search indexing and ranking.\n'
 

In [6]:
sections = {
    "heading_info": resume_to_use.split('\n')[:index_list[0]],
}

for idx, (start_line, section_name) in enumerate(section_line_number.items()):
    if idx < len(index_list) - 1:
        sections[section_name] = resume_to_use.split('\n')[index_list[idx]:index_list[idx+1]]
    else:
        sections[section_name] = resume_to_use.split('\n')[index_list[idx]:],
print(sections)

{'heading_info': ['Gabriel Chen (Shou-Zhong)', '(+1)647-979-9461 szgabrielchen@gmail.com linkedin.com/in/gabrielchen65', '4yearsofMachineLearningEngineerexperience,ledinfraredfacerecognitionproject,deliveredhigh-impactproduct.'], 'Skills': ['SKILLS', 'NLP|LLM|Python|PyTorch|TensorFlow|Ubuntu|GitHub|SQL(MySQL)|ComputerVision'], 'Work Experience': ['WORK EXPERIENCE', 'Egis Technology 2019May–2022June', 'Machine Learning Engineer (Infrared Face Recognition Project with Python, Pytorch)', '● Project lead since October 2020. The product has been in mass production since October 2021.', 'This product drove $5 million in revenue and fostered the following partnership with the client.', '● Analyzedrootcausesofpredictionerrors,andmanageddatacollectionfromin-houseandoutsourcedteams', '● Fine-tunedmodelwithlarge-scaleimagedatasetfromdifferentsourceswithdistributedcomputingandscheduler', '● Proposedandimplementeddatapipelinerefinementstoreduce10%falserejectionrate', '● Reduced5%falserejectionrateu

In [7]:
class Mistral:
    def __init__(self):
        
        model_id= "mistralai/Mistral-7B-Instruct-v0.2" # "mistralai/Mixtral-8x7B-Instruct-v0.1"
        quantization_config = transformers.BitsAndBytesConfig(
                                load_in_8bit=True,
                                bnb_8bit_compute_dtype=torch.bfloat16
                            )        

        model = transformers.AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            device_map='auto',
            quantization_config=quantization_config,
        )

        tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

        self.generate_text = transformers.pipeline(
            model=model, tokenizer=tokenizer,
            return_full_text=False,  # if using langchain set True
            task="text-generation",
            do_sample=False,
            max_new_tokens=4000,  # max number of tokens to generate in the output
            repetition_penalty=1.15,  # if output begins repeating increase
        )         

    def get_response(self, system_prompt, prompt_text, expecting_longer_output=False, need_json_output=False, feedback=False):
        # Special format required by the Mistral Instruct Chat Model 
        # where we can use system messages to provide more context about the task
        prompt = f'<s> [INST] {system_prompt} {prompt_text} [/INST]'

        response = self.generate_text(prompt)[0]["generated_text"]

        #torch.cuda.empty_cache()

        if need_json_output:
            parsed = parse_json_markdown(response)
            if parsed:
                # for debugging
                with open("/home/gabe/workspace/resume_work/job-llm/lab/resume_parsed.json", "w") as f:
                    json.dump(parsed, f, indent=4)
                return parsed, response
            else:
                if feedback:
                    print("Parsing failed, giving feedback to the model and run again...")
                    prompt += f'{prompt} {response} </s> [INST] Your previous output is not a valid JSON format, try to fix that [/INST]'
                    response = self.generate_text(prompt)[0]["generated_text"]
                    parsed = parse_json_markdown(response)
                    return parsed, response
                else:
                    print("parsing failed. Returning raw response")
                    return None, response
        else:
            return response

In [8]:
llm = Mistral()

Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.63s/it]


In [9]:
# heading_info, Education, Skills, Work Experience, Projects, Certifications, Achievements
resume_json_heading_info, response_heading_info = llm.get_response(
                system_prompt=system_prompt_heading_info, 
                prompt_text=sections["heading_info"], need_json_output=True)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [10]:
# heading_info, Education, Skills, Work Experience, Projects, Certifications, Achievements
resume_json_education, response_education = llm.get_response(
                system_prompt=system_prompt_education, 
                prompt_text=sections["Education"], need_json_output=True)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [11]:
resume_json_skills, response_skills = llm.get_response(
                system_prompt=system_prompt_skills, 
                prompt_text=sections["Skills"], need_json_output=True)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [12]:
resume_json_work_experience, response_work_experience = llm.get_response(
                system_prompt=system_prompt_work_experience, 
                prompt_text=sections["Work Experience"], need_json_output=True)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [13]:
resume_json_projects, response_projects = llm.get_response(
                system_prompt=system_prompt_projects, 
                prompt_text=sections["Projects"], need_json_output=True)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [19]:
# Certifications, Achievements
resume_json_certifications, response_certifications = llm.get_response(
                system_prompt=system_prompt_certifications, 
                prompt_text=sections["Certifications"], need_json_output=True)

KeyError: 'Certifications'

In [None]:
# Certifications, Achievements
resume_json_achievements, response_achievements = llm.get_response(
                system_prompt=system_prompt_achievements, 
                prompt_text=sections["Achievements"], need_json_output=True)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Assemble the JSON files from each section

In [17]:
resume_all = {
  "name": resume_json_heading_info["name"],
  "summary": resume_json_heading_info["summary"],
  "phone": resume_json_heading_info["phone"],
  "email": resume_json_heading_info["email"],
  "media": {
    "linkedin": resume_json_heading_info["linkedin"],
    "github": resume_json_heading_info["github"] if "github" in resume_json_heading_info else "",
    "devpost": resume_json_heading_info["devpost"] if "devpost" in resume_json_heading_info else "",
    "medium": resume_json_heading_info["medium"] if "medium" in resume_json_heading_info else "",
    "leetcode": resume_json_heading_info["leetcode"] if "leetcode" in resume_json_heading_info else "",
    "dagshub": resume_json_heading_info["dagshub"] if "dagshub" in resume_json_heading_info else "",
    "kaggle": resume_json_heading_info["kaggle"] if "kaggle" in resume_json_heading_info else "",
    "instagram": resume_json_heading_info["instagram"] if "instagram" in resume_json_heading_info else "",
  },
  "education": resume_json_education,
  "skills": resume_json_skills,
  "work_experience": resume_json_work_experience,
  "projects": resume_json_projects,
  "certifications": [],
  "achievements": []
}

json.dump(resume_all, open("resume_all_generate_by_section.json", "w"), indent=2)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise

def cosine_similarity(document1: str, document2: str) -> float:
    """Calculate the cosine similarity between two documents.

    Args:
        document1 (str): The first document.
        document2 (str): The second document.

    Returns:
        float: The cosine similarity between the two documents.
    """
    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Transform the documents into TF-IDF vectors
    vectors = vectorizer.fit_transform([document1, document2])

    cosine_similarity_score = pairwise.cosine_similarity(vectors[0], vectors[1])
    # Calculate the cosine similarity between the two vectors
    # cosine_similarity = np.dot(vectors[0], vectors[1].T) / (np.linalg.norm(vectors[0].toarray()) * np.linalg.norm(vectors[1].toarray()))

    return cosine_similarity_score.item()

In [None]:
import json
generated = json.dumps(resume_all)
with open("user_profile.json") as f:
    ground_truth = json.dumps(json.load(f))
score = cosine_similarity(generated, ground_truth)
print(score)
score = cosine_similarity(generated, generated)
print(score)
score = cosine_similarity(ground_truth, ground_truth)
print(score)


0.6308746705618988
1.0000000000000009
1.0


In [18]:
import json
generated = json.dumps(resume_all)
with open("my_resume_ground_truth.json") as f:
    ground_truth = json.dumps(json.load(f))
score = cosine_similarity(generated, ground_truth)
print(score)
score = cosine_similarity(generated, generated)
print(score)
score = cosine_similarity(ground_truth, ground_truth)
print(score)

0.9450426644246636
0.9999999999999998
0.9999999999999999


In [None]:
del llm

In [None]:
torch.cuda.empty_cache()
gc.collect() # Python thing

In [1]:
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate

import json
import textwrap
import pandas as pd
import torch
import transformers
import os
import PyPDF2
import re
from langchain_core.output_parsers import JsonOutputParser
import gc
import pdfplumber

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
n_gpu_layers = -1  # The number of layers to put on the GPU. The rest will be on the CPU. If you don't know how many layers there are, you can use -1 to move all to GPU.
n_batch = 4000  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
#model_path = "/home/gabe/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.2/gguf/mistral-7b-instruct-v0.2.Q8_0.gguf"
#model_path = "/home/gabe/workspace/models_library/mixtral-instruct-8x7b-2.34bpw.gguf" #"/home/gabe/workspace/llama.cpp/Mistral-7B-Instruct-v0.2.gguf"
model_path = "/home/gabe/.cache/huggingface/hub/models--mistralai--Mistral-7B-Instruct-v0.2/gguf/mistral-7b-instruct-v0.2.Q8_0.gguf"
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    grammar_path="./json.gbnf",
    n_ctx=32000, # max number of tokens in the input
    rope_freq_base=1e6,
    verbose=False,
    echo=False,
    task="text-generation",
    do_sample=False,
    max_tokens=4000,  # max number of tokens to generate in the output
    repetition_penalty=1.15,  # if output begins repeating increase
    attention_dropout=0.0,
    eos_token_id=2,
    bos_token_id=1,
    hidden_act="silu",
    hidden_size=4096,
    initializer_range=0.02,
    intermediate_size=14336,
    max_position_embeddings=32768,
    model_type="mistral",
    num_attention_heads=32,
    num_hidden_layers=32,
    num_key_value_heads=8,
    rms_norm_eps=1e-05,
    rope_theta=1000000.0,
    #sliding_window=null,
    tie_word_embeddings=False,
    torch_dtype="bfloat16",
    transformers_version="4.36.0",
    use_cache=True,
    vocab_size=32000
)

                task was transferred to model_kwargs.
                Please confirm that task is what you intended.
                do_sample was transferred to model_kwargs.
                Please confirm that do_sample is what you intended.
                repetition_penalty was transferred to model_kwargs.
                Please confirm that repetition_penalty is what you intended.
                attention_dropout was transferred to model_kwargs.
                Please confirm that attention_dropout is what you intended.
                eos_token_id was transferred to model_kwargs.
                Please confirm that eos_token_id is what you intended.
                bos_token_id was transferred to model_kwargs.
                Please confirm that bos_token_id is what you intended.
                hidden_act was transferred to model_kwargs.
                Please confirm that hidden_act is what you intended.
                hidden_size was transferred to model_kwargs.
            

In [21]:
prompt = f'<s> [INST] {system_prompt_heading_info} {sections["heading_info"]} [/INST]'
response = llm.invoke(prompt)
resume_json_heading_info = parse_json_markdown(response)
#pprint(parsed)
#pprint(response)

In [16]:
prompt = f'<s> [INST] {system_prompt_education} {sections["Education"]} [/INST]'
response = llm.invoke(prompt)
resume_json_education = parse_json_markdown(response)

In [17]:
prompt = f'<s> [INST] {system_prompt_skills} {sections["Skills"]} [/INST]'
response = llm.invoke(prompt)
resume_json_skills = parse_json_markdown(response)

In [18]:
prompt = f'<s> [INST] {system_prompt_work_experience} {sections["Work Experience"]} [/INST]'
response = llm.invoke(prompt)
resume_json_work_experience = parse_json_markdown(response)

In [27]:
prompt = f'<s> [INST] {system_prompt_path_projects} {sections["Projects"]} [/INST]'
response = llm.invoke(prompt)
resume_json_projects = parse_json_markdown(response)

In [28]:
resume_all_langchain = {
  "name": resume_json_heading_info["name"],
  "summary": resume_json_heading_info["summary"],
  "phone": resume_json_heading_info["phone"],
  "email": resume_json_heading_info["email"],
  "media": {
    "linkedin": resume_json_heading_info["linkedin"] if "linkedin" in resume_json_heading_info else "",
    "github": resume_json_heading_info["github"] if "github" in resume_json_heading_info else "",
    "devpost": resume_json_heading_info["devpost"] if "devpost" in resume_json_heading_info else "",
    "medium": resume_json_heading_info["medium"] if "medium" in resume_json_heading_info else "",
    "leetcode": resume_json_heading_info["leetcode"] if "leetcode" in resume_json_heading_info else "",
    "dagshub": resume_json_heading_info["dagshub"] if "dagshub" in resume_json_heading_info else "",
    "kaggle": resume_json_heading_info["kaggle"] if "kaggle" in resume_json_heading_info else "",
    "instagram": resume_json_heading_info["instagram"] if "instagram" in resume_json_heading_info else "",
  },
  "education": resume_json_education,
  "skills": resume_json_skills,
  "work_experience": resume_json_work_experience,
  "projects": resume_json_projects,
  "certifications": [],
  "achievements": []
}

json.dump(resume_all_langchain, open("resume_all_langchain_generate_by_section.json", "w"), indent=2)

In [29]:
import json
generated = json.dumps(resume_all_langchain)
with open("my_resume_ground_truth.json") as f:
    ground_truth = json.dumps(json.load(f))
score = cosine_similarity(generated, ground_truth)
print(score)
score = cosine_similarity(generated, generated)
print(score)
score = cosine_similarity(ground_truth, ground_truth)
print(score)

0.8829009171403182
0.9999999999999998
0.9999999999999999
