In [10]:
import json
import pdfplumber
import ollama
from pprint import pprint
from pydantic import BaseModel, RootModel, Field

In [2]:
# Load the job description as plain text. The encoding is pinned so the read
# does not depend on the platform's locale default.
with open("../sample-data/job-desc.txt", "r", encoding="utf-8") as file:
    job_desc = file.read()

# Load the sample resumes; later cells index this by position
# (resumes[0], resumes[1]).
with open("../sample-data/resumes.json", "r", encoding="utf-8") as file:
    resumes = json.load(file)

In [4]:
def extract_pdf_text(pdf_path: str) -> str:
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path to the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        added between pages. Pages for which ``extract_text()`` yields
        nothing contribute an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page without extractable text
        # (e.g. a scanned image), which would make string concatenation raise
        # TypeError; fall back to "" for such pages.
        return "".join(page.extract_text() or "" for page in pdf.pages)

# Plain text of the sample PDF resume, extracted via extract_pdf_text.
my_resume = extract_pdf_text("../sample-data/sample.pdf")

### TF-IDF for keyword extraction

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
import re

In [6]:
# Two-document corpus: the first sample resume and the job description.
documents = (resumes[0], job_desc)

# Strip every run of digits so numbers never become TF-IDF features.
digit_run = re.compile(r"\d+")
corpus = [digit_run.sub("", document) for document in documents]

# Fit TF-IDF over both documents, dropping English stop words.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(corpus)

# Dense (document x term) score matrix, plus the term behind each column.
tfidf_scores = tfidf_matrix.toarray()
feature_names = vectorizer.get_feature_names_out()

feature_names

array(['abc', 'abilities', 'adhering', 'agile', 'apis', 'application',
       'applications', 'architecture', 'attention', 'bachelor', 'best',
       'boot', 'built', 'clean', 'closely', 'code', 'collaborate',
       'collaboration', 'com', 'commerce', 'communication', 'complex',
       'computer', 'cross', 'databases', 'debug', 'degree', 'deliver',
       'deploy', 'description', 'design', 'designers', 'develop',
       'developing', 'development', 'django', 'docker', 'doe', 'downtime',
       'dynamic', 'education', 'efficient', 'engineer', 'engineering',
       'environment', 'equivalent', 'example', 'excellent', 'experience',
       'experienced', 'expertise', 'field', 'frameworks', 'functional',
       'gather', 'git', 'growth', 'high', 'innovation', 'java',
       'javascript', 'job', 'john', 'johndoe', 'join', 'june',
       'knowledge', 'kubernetes', 'languages', 'led', 'linkedin',
       'maintain', 'maintainable', 'maintained', 'managers', 'members',
       'methodologies', '

In [7]:
# Row 0 of the score matrix belongs to the resume (first document in the corpus).
resume_scores = tfidf_scores[0]

# Column indices ordered from highest to lowest TF-IDF score.
ranked_columns = resume_scores.argsort()[::-1]

resume_keywords = [
    (feature_names[column], resume_scores[column]) for column in ranked_columns
]

print("Top Resume Keywords:")
pprint(resume_keywords)

Top Resume Keywords:
[('software', 0.2405245183143361),
 ('engineer', 0.2405245183143361),
 ('javascript', 0.22536587875688172),
 ('com', 0.22536587875688172),
 ('doe', 0.22536587875688172),
 ('john', 0.22536587875688172),
 ('python', 0.1603496788762241),
 ('scalable', 0.1603496788762241),
 ('migration', 0.11268293937844086),
 ('microservices', 0.11268293937844086),
 ('maintained', 0.11268293937844086),
 ('mongodb', 0.11268293937844086),
 ('linkedin', 0.11268293937844086),
 ('monolithic', 0.11268293937844086),
 ('led', 0.11268293937844086),
 ('knowledge', 0.11268293937844086),
 ('passion', 0.11268293937844086),
 ('june', 0.11268293937844086),
 ('platforms', 0.11268293937844086),
 ('postgresql', 0.11268293937844086),
 ('johndoe', 0.11268293937844086),
 ('xyz', 0.11268293937844086),
 ('expertise', 0.11268293937844086),
 ('git', 0.11268293937844086),
 ('experienced', 0.11268293937844086),
 ('example', 0.11268293937844086),
 ('education', 0.11268293937844086),
 ('downtime', 0.1126829393784

TF-IDF isn't ideal for matching keywords between a resume and a job description, since it assigns higher scores to words that are *unique* to a document. If a word appears in both the resume and the job description, its inverse document frequency (and therefore its score) is lower, so the shared terms we actually want to match are down-weighted.

### Using NLP libraries

Part-of-speech tagging

In [8]:
import spacy

# Load spaCy's small English pipeline. The model is installed separately from
# the spacy package; spacy.load raises OSError [E050] when it is missing
# (install with: python -m spacy download en_core_web_sm).
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over the first sample resume.
doc = nlp(resumes[0])

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [17]:
# Displaying a spaCy Doc shows the text it was built from.
doc

John Doe
Software Engineer
john.doe@example.com | (123) 456-7890 | linkedin.com/in/johndoe

Summary:
Experienced software engineer with expertise in developing scalable web applications, strong knowledge of Python and JavaScript, and a passion for solving complex problems.

Skills:
- Programming Languages: Python, JavaScript, Java
- Frameworks: Django, React, Spring Boot
- Tools: Git, Docker, Kubernetes
- Databases: PostgreSQL, MongoDB

Experience:
Software Engineer | ABC Tech | June 2020 - Present
- Built and maintained scalable APIs to support high-traffic e-commerce platforms.
- Led migration of a monolithic application to a microservices architecture, reducing downtime by 30%.

Education:
B.S. in Computer Science | University of XYZ | May 2020

In [25]:
# Collect the nouns of the parsed resume as candidate keywords.
# The pattern is a plain raw string: it has no placeholders to interpolate.
non_letters = re.compile(r"[^a-zA-Z]")

# Keep only the ASCII letters of each NOUN token (drops punctuation and digits).
noun_texts = (
    non_letters.sub("", token.text) for token in doc if token.pos_ == "NOUN"
)

# Discard tokens left empty or reduced to a single letter by the cleanup.
keywords = [text for text in noun_texts if len(text) > 1]
keywords

['software',
 'engineer',
 'expertise',
 'web',
 'applications',
 'knowledge',
 'passion',
 'problems',
 'Skills',
 'Languages',
 'MongoDB',
 'Experience',
 'APIs',
 'traffic',
 'commerce',
 'platforms',
 'migration',
 'application',
 'microservices',
 'architecture',
 'downtime']

Named entity recognition (NER)

In [24]:
# Pair each named entity's text with the label spaCy's NER assigned to it.
entities = [
    (span.text, span.label_)
    for span in doc.ents
]
entities

[('John Doe\n', 'PERSON'),
 ('Software Engineer', 'ORG'),
 ('123', 'CARDINAL'),
 ('456', 'CARDINAL'),
 ('JavaScript', 'PRODUCT'),
 ('JavaScript', 'PRODUCT'),
 ('Java\n- Frameworks:', 'PERSON'),
 ('Git', 'GPE'),
 ('Docker', 'GPE'),
 ('Kubernetes', 'ORG'),
 ('Software Engineer', 'ORG'),
 ('ABC Tech', 'ORG'),
 ('June 2020 - Present', 'DATE'),
 ('30%', 'PERCENT'),
 ('B.S.', 'GPE'),
 ('Computer Science', 'ORG'),
 ('University of XYZ', 'ORG'),
 ('May 2020', 'DATE')]

NER models are good at identifying general-purpose entities, but struggle to identify skills out of the box (e.g., `Git` and `Docker` are tagged as `GPE` above). Perhaps I need to fine-tune a model, or use a transformer-based approach?

### Ollama

In [9]:
from ollama import chat

# Smoke test: send one free-form question to the llama3.2 model.
response = chat(
    model="llama3.2",
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
)

# Attribute access, matching how the structured-output cells read the reply.
response.message.content

"The sky appears blue because of a phenomenon called scattering, which occurs when sunlight interacts with the tiny molecules of gases in the Earth's atmosphere. Here's a simplified explanation:\n\n1. Sunlight enters the Earth's atmosphere and is made up of a spectrum of colors, including all the colors of the visible light spectrum.\n2. When sunlight hits the tiny molecules of gases in the atmosphere, such as nitrogen (N2) and oxygen (O2), these molecules scatter the shorter (blue) wavelengths more than the longer (red) wavelengths.\n3. This is because the smaller molecules are more effective at scattering the shorter wavelengths, which have a higher frequency and energy.\n4. As a result of this scattering, the blue light is dispersed throughout the atmosphere, giving it a diffuse appearance that makes the sky appear blue to our eyes.\n\nIt's worth noting that the color of the sky can also be affected by other factors, such as:\n\n* Dust and pollution in the atmosphere, which can scat

Structured output:

In [37]:
# Response schema for the structured-output demo. Documented with comments
# rather than a docstring because pydantic emits a class docstring as
# "description" in model_json_schema(), which is what is passed to chat().
class Country(BaseModel):
    name: str  # presumably the country's name
    capital: str  # presumably its capital city
    languages: list[str]  # presumably languages spoken there
    
# Ask an open-ended question, constraining the reply to Country's JSON schema.
country_messages = [{"role": "user", "content": "Tell me about the US."}]
response = chat(
    model="llama3.2",
    messages=country_messages,
    format=Country.model_json_schema(),
)

# Parse and validate the JSON reply into a Country instance.
country = Country.model_validate_json(response.message.content)
country

Country(name='United States of America', capital='Washington, D.C.', languages=['English', 'Spanish', 'Chinese', 'French', 'Tagalog', 'Vietnamese'])

In [64]:
# Prompt for LLM-based skill extraction. Fill it in with
# SKILL_EXTRACTION_TEMPLATE.format(resume_text=...); the doubled braces are
# literal braces that survive str.format. The example object is valid JSON
# (comma-separated entries) and its keys match the aliases on the Skills model.
SKILL_EXTRACTION_TEMPLATE = """
You are an expert at parsing skills from resumes.

Given resume text, please parse individual technical (e.g., programming languages, tools, frameworks, databases) and domain-specific (e.g., methodologies, architectures, or specialized techniques) skills contained in the resume. **Ensure that you do not miss any domain-specific skills.**

Format your output as a JSON object as follows:
    {{
        "Technical Skills": ["list", "of", "technical", "skills"],
        "Domain-Specific Skills": ["list", "of", "domain", "specific", "skills"]
    }}

Resume text:
```
{resume_text}
```

Parsed skills:
"""

In [80]:
# Structured result of skill extraction. The aliases are the JSON keys the
# prompt asks the model to emit. Documented with comments rather than a
# docstring because pydantic copies a class docstring into
# model_json_schema() as "description".
class Skills(BaseModel):
    # Technical skills; the prompt lists languages, tools, frameworks, databases.
    technical_skills: list[str] = Field(..., alias="Technical Skills")
    # Domain skills; the prompt lists methodologies, architectures, techniques.
    domain_specific_skills: list[str] = Field(..., alias="Domain-Specific Skills")

# Fill the prompt with the second sample resume.
skill_prompt = SKILL_EXTRACTION_TEMPLATE.format(resume_text=resumes[1])

# Temperature 0 (presumably for repeatable output); the reply is constrained
# to the Skills JSON schema.
response = chat(
    model="llama3.1",
    messages=[{"role": "user", "content": skill_prompt}],
    options={"temperature": 0},
    format=Skills.model_json_schema(),
)

# Validate the JSON reply against the Skills model.
skills = Skills.model_validate_json(response.message.content)

In [81]:
# Show the raw resume text that was sent to the model, for comparison with
# the parsed skills.
pprint(resumes[1])

('Jane Smith\n'
 'Data Scientist\n'
 'jane.smith@example.com | (987) 654-3210 | github.com/janesmith\n'
 '\n'
 'Summary:\n'
 'Data scientist with a strong background in machine learning, statistical '
 'modeling, and data visualization. Skilled in Python, R, and SQL with '
 'experience in predictive analytics.\n'
 '\n'
 'Skills:\n'
 '- Machine Learning: Scikit-learn, TensorFlow, PyTorch\n'
 '- Data Visualization: Tableau, Matplotlib, Seaborn\n'
 '- Databases: MySQL, PostgreSQL\n'
 '- Tools: Jupyter, Excel, Git\n'
 '\n'
 'Experience:\n'
 'Data Scientist | DataCorp | March 2018 - Present\n'
 '- Developed machine learning models to predict customer churn, improving '
 'retention by 20%.\n'
 '- Automated ETL pipelines to streamline data processing, saving 15 hours of '
 'manual work weekly.\n'
 '\n'
 'Education:\n'
 'M.S. in Data Science | University of ABC | December 2017')


In [83]:
# Dump the validated model to a plain dict and pretty-print it; the captured
# output is keyed by field name rather than by alias.
pprint(skills.model_dump())

{'domain_specific_skills': ['Machine Learning',
                            'Predictive Analytics',
                            'Statistical Modeling',
                            'Data Visualization',
                            'ETL Pipelines',
                            'Automated Data Processing'],
 'technical_skills': ['Python',
                      'R',
                      'SQL',
                      'Scikit-learn',
                      'TensorFlow',
                      'PyTorch',
                      'Tableau',
                      'Matplotlib',
                      'Seaborn',
                      'MySQL',
                      'PostgreSQL',
                      'Jupyter',
                      'Excel',
                      'Git']}
