### Information extraction from documents using LLM

### GroqLLM  

In [2]:
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os
load_dotenv()

# os.environ only accepts str values, so assigning os.getenv(...) straight back
# raises "TypeError: str expected, not NoneType" when the key is missing.
# Check for it explicitly and fail with an actionable message instead.
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )
os.environ['GROQ_API_KEY'] = groq_api_key

In [3]:
# Chat client shared by every LLM call in this notebook.
# temperature=0 — presumably chosen to keep extraction output as repeatable as possible.
model_name = "llama-3.1-8b-instant"
llm = ChatGroq(model=model_name, temperature=0, verbose=True)

In [4]:
# Smoke test: a single round trip through the model before doing real work.
question = "What is the capital of india?"
response = llm.invoke(question)
print(response.content)

The capital of India is New Delhi.


#### Extract text from documents

In [5]:
import pdfplumber


def text_extractore(file_path):
    """Return the text of every page in a PDF, one newline-terminated piece per page.

    Args:
        file_path: Path to the PDF; passed straight to ``pdfplumber.open``.

    Returns:
        str: The page texts in document order, each followed by a newline.
        A page without extractable text contributes only the newline.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() can hand back None for a page with no text layer
        # (e.g. a scanned image); `or ""` keeps `None + "\n"` from raising
        # TypeError and aborting the whole document.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]

    # Join once instead of growing a string with += on every page.
    return "".join(page_texts)

# Forward slashes resolve on Windows, macOS and Linux alike, whereas a
# backslash-separated relative path only works on Windows.
file_path = "../documents/bhargavDasRsm.pdf"
# NOTE: despite its name, `chunks` holds the full extracted text as one string.
chunks = text_extractore(file_path=file_path)

In [6]:
# Display the text returned by text_extractore for the resume PDF.
print(chunks)

Bhargav Das
+91 9871079256 | erbhargavdas@gmail.com | linkedin.com/in/bhrgvbhrgv | github.com/bhrgvbhrgv
Education
Baderia Global Institute of Engineering & Management Jabalpur
Bachelor of Technology in Computer Science July 2022 - July 2026
Aditya Convent Senior Secondary School Jabalpur
Class XII July 2021 - July 2022
Experience
Freelanced — Full-Stack Web Developer Aug 2024 - Sept 2024
Creative Interior India
• Tech Stack: React.js, Next.js, CSS, Firebase
• Achieved 90+ Google Lighthouse scores in performance, accessibility, and SEO through optimized architecture and
clean code.
• Attracted 1,000+ unique visitors within the first 3 months of launch through SEO and social sharing.
• Integrated Firebase for scalable backend services including real-time database and secure hosting, ensuring 99.9%
uptime.
• Live Link: https://creativeconstruction.in/
Internship: Front-End Web Developer Mar 2023 - May 2023
DAO Info-Tech
• Contributed to the development and launch of 6+ business websites,

#### Extract skills from resume and JD

In [164]:
from langchain.prompts import PromptTemplate
# Prompt asking the model to list the skills found in {context}.
# The doubled braces {{ }} are template escapes and render as literal { }.
# NOTE(review): the example reply is a brace-wrapped list of bare strings, which
# is not valid JSON even though the instructions call it JSON — confirm intent.
prompt_template = PromptTemplate.from_template("""
You are an expert HR assistant that extracts technical and professional skills from resumes and job descriptions.

Instructions:
- Extract only SKILLS from the given text.
- Normalize variations into a common standard (e.g., VLOOKUP → Excel, Random Forest → Machine Learning).
- Return output strictly in JSON format like:
{{"Skill1", "Skill2", "Skill3"}}
- Don't return any additional text or explanations.

Text:
{context}
""")

In [165]:
# Render the template with a placeholder value to inspect the final prompt text.
prompt_template.format(context="Kamal")

'\nYou are an expert HR assistant that extracts technical and professional skills from resumes and job descriptions.\n\nInstructions:\n- Extract only SKILLS from the given text.\n- Normalize variations into a common standard (e.g., VLOOKUP → Excel, Random Forest → Machine Learning).\n- Return output strictly in JSON format like:\n{"Skill1", "Skill2", "Skill3"}\n- Don\'t return any additional text or explanations.\n\nText:\nKamal\n'

In [166]:
# Fill the prompt with the extracted resume text, send it to the model,
# and show the raw reply.
resume_prompt = prompt_template.format(context=chunks)
response = llm.invoke(input=resume_prompt)
response.content

'```json\n{\n  "JavaScript", \n  "HTML/CSS", \n  "C++", \n  "SQL", \n  "Python", \n  "React.js", \n  "Node.js", \n  "Express.js", \n  "Tailwind CSS", \n  "MongoDB", \n  "THREE.js", \n  "Git", \n  "RESTful APIs", \n  "User Testing Tools", \n  "Firebase", \n  "CSS", \n  "Next.js", \n  "Axios", \n  "Mongoose", \n  "Multer", \n  "JWT", \n  "Bycrpt", \n  "Socket.io", \n  "React Router"\n}\n```'

#### --- Step 2: Extract Skills (using the LLM) ---

In [None]:
from langchain.prompts import PromptTemplate
import json

def extract_skill_from_documents(chunk: str):
    """Ask the LLM which skills appear in ``chunk`` and return its reply unparsed.

    ``chunk`` is plain text taken from a document. The return value is the
    ``content`` of the model's response, exactly as received.
    """
    skill_template = PromptTemplate.from_template("""
    You are an expert HR assistant that extracts technical and professional skills from resumes and job descriptions.

    Instructions:
    - Extract only SKILLS from the given text.
    - Normalize variations into a common standard (e.g., VLOOKUP → Excel, Random Forest → Machine Learning).
    - Return output strictly in JSON format like:
    {{"Skill1", "Skill2", "Skill3"}}
    - Don't return any additional text or explanations.

    Text:
    {context}
    """)

    # Substitute the document text into the template, then make one model call.
    filled_prompt = skill_template.format(context=chunk)
    return llm.invoke(input=filled_prompt).content

In [79]:
def get_set_of_skills(llm_response: str) -> set:
    """Convert an LLM response into a set of skill names.

    Every non-empty double-quoted substring of ``llm_response`` is taken as one
    skill, so whatever wraps the list (code fences, braces, brackets) is ignored.

    Args:
        llm_response: Raw text returned by the model.

    Returns:
        set: The unique quoted strings found in the response; empty if there are none.
    """
    # Kept as a local import so this cell works on its own; it must come after
    # the docstring, otherwise the string above is not treated as a docstring.
    import re

    return set(re.findall(r'"([^"]+)"', llm_response))

In [69]:
# Raw, unparsed LLM reply listing the skills found in the resume text.
skills = extract_skill_from_documents(chunk=chunks)

In [81]:
# Parse the raw reply into a set of unique skill names.
resume_skills_set = get_set_of_skills(llm_response=skills)

In [82]:
# Sample Data Analyst job description used as the JD side of the comparison.
job_description = """
Job Description:
We are seeking a Data Analyst to join our analytics team. The candidate will be responsible for collecting, cleaning, analyzing, and interpreting large datasets to provide insights that support business decision-making.
Key Responsibilities:
Collect, process, and analyze structured and unstructured data.
Build dashboards and reports using Power BI or Tableau.
Write SQL queries to extract data from relational databases.
Apply statistical methods to identify trends and patterns.
Work with cross-functional teams to provide actionable insights.
Present findings in a clear and concise manner to stakeholders.
Required Skills & Qualifications:
Bachelor’s degree in Statistics, Mathematics, Computer Science, Economics, or related field.
Strong knowledge of SQL, Python, Excel.
Hands-on experience with Power BI / Tableau.
Knowledge of statistical analysis, regression, hypothesis testing.
Strong communication and problem-solving skills.
Preferred Skills:
Experience with Big Data tools (Spark, Hadoop).
Familiarity with machine learning basics.
Exposure to cloud platforms (AWS, GCP, Azure).
"""

In [83]:
# Raw, unparsed LLM reply listing the skills found in the job description.
jd_skills = extract_skill_from_documents(chunk=job_description)

In [84]:
# Parse the raw reply into a set of unique skill names.
jd_skills_set = get_set_of_skills(llm_response=jd_skills)

#### --- Step 3: Skill Match Score ---

In [86]:
# Skills present in both sets. Set intersection compares the strings exactly,
# so differences in case or spelling count as different skills.
common_skills = resume_skills_set.intersection(jd_skills_set)
common_skills

{'Python', 'SQL'}

In [87]:
# Share of the JD's skills that also appear in the resume, as a percentage.
# An empty JD skill set (e.g. nothing parseable came back from the model)
# scores 0 instead of raising ZeroDivisionError.
skill_score = len(common_skills) / len(jd_skills_set) * 100 if jd_skills_set else 0.0
print(F"Matching skills score with Resume and JD: {skill_score:.2f} %")

Matching skills score with Resume and JD: 10.53 %


#### --- Step 4: Semantic Similarity (JD vs Resume) ---

In [88]:
import json
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [89]:
# Sentence-embedding model used for the JD-vs-resume similarity below.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [90]:
# Embed each full document as a single input and request tensor output.
# NOTE(review): long inputs may be truncated by the model's maximum sequence
# length, in which case only the start of each text is embedded — confirm.
jd_embedding = model.encode(job_description, convert_to_tensor=True)
resume_embedding = model.encode(chunks, convert_to_tensor=True)

In [91]:
# Cosine similarity between the two embeddings, converted to a plain float
# and scaled by 100 to read as a percentage.
semantic_score = float(util.cos_sim(jd_embedding, resume_embedding)) * 100

In [92]:
# Label this as the embedding-based score so it is not mistaken for the
# skill-overlap percentage, which is printed with the "Matching skills" label.
print(F"Semantic similarity score between Resume and JD: {semantic_score:.2f} %")

Matching skills score with Resume and JD: 42.50 %


#### --- Step 5: Final Weighted Score ---

In [93]:
# Blend the two signals: exact skill overlap carries 70% of the final score,
# embedding similarity the remaining 30%.
SKILL_WEIGHT = 0.7
SEMANTIC_WEIGHT = 0.3
final_weighted_score = SKILL_WEIGHT * skill_score + SEMANTIC_WEIGHT * semantic_score
print(F"Final score for matching skills with Resume and JD: {final_weighted_score:.2f} %")

Final score for matching skills with Resume and JD: 20.12 %


In [94]:
# Summary of the three scores, each rounded to two decimals.
for label, value in (
    ("Skill Score:", skill_score),
    ("Semantic Score:", semantic_score),
    ("Final Match Score:", final_weighted_score),
):
    print(label, round(value, 2), "%")

Skill Score: 10.53 %
Semantic Score: 42.5 %
Final Match Score: 20.12 %


## Cosine similarity testing

In [116]:
# Two single words used to sanity-check the cosine-similarity computation.
text1 = "king"
text2 = "queen"

In [117]:
# SentenceTransformer.encode() takes `convert_to_tensor`; `return_tensors` is a
# Hugging Face tokenizer argument and is not one of encode()'s parameters.
# This matches the JD/resume encode calls elsewhere in the notebook.
text1_encode = model.encode(text1, convert_to_tensor=True)
text2_encode = model.encode(text2, convert_to_tensor=True)

In [118]:
# Cosine similarity of the two word embeddings as a percentage.
# NOTE: this reuses the name `semantic_score`, overwriting the JD-vs-resume
# value computed earlier in the notebook.
semantic_score = float(util.cos_sim(text1_encode, text2_encode)) * 100

In [119]:
# Display the king/queen similarity percentage.
semantic_score

68.07126998901367

In [133]:
# Sample Machine Learning Engineer job description for the second comparison.
jd = """
Job Title: Machine Learning Engineer
Experience Required: 2+ years
Location: Bangalore, India
Job Type: Full-time

Responsibilities:

Design, build, and deploy machine learning models for predictive analytics.

Preprocess and analyze structured and unstructured data.

Implement feature engineering, model training, and hyperparameter tuning.

Work with Python libraries (Pandas, NumPy, Scikit-learn, TensorFlow/PyTorch).

Collaborate with data engineers and software developers to integrate models into production systems.

Monitor model performance and retrain as needed.

Requirements:

Bachelor’s degree in Computer Science, Data Science, or related field.

Strong knowledge of supervised and unsupervised ML algorithms (Linear Regression, Random Forest, SVM, Clustering).

Hands-on experience with Python and ML libraries.

Exposure to deep learning frameworks (TensorFlow, PyTorch).

Familiarity with SQL and cloud platforms (AWS/GCP/Azure).

Excellent problem-solving and analytical skills.

Nice to Have:

Experience with NLP or Computer Vision projects.

Knowledge of MLOps practices (Docker, MLflow, CI/CD)."""

In [136]:
# Sample resume text matched against `jd`.
# The string holds resume content only: any extra commentary inside it would be
# sent to the LLM and the embedding model as if it were part of the resume.
resume = """
Name: Rahul Sharma
Email: rahul.sharma@gmail.com

Phone: +91-9876543210

Summary:
Machine Learning Engineer with 3 years of experience in developing, deploying, and optimizing machine learning models. Skilled in Python, Scikit-learn, TensorFlow, and cloud-based ML deployment. Experienced in working with structured/unstructured data and delivering data-driven solutions for business problems.

Technical Skills:

Programming: Python, SQL, R

ML/DL Frameworks: Scikit-learn, TensorFlow, PyTorch

Data Tools: Pandas, NumPy, Matplotlib, Seaborn

Databases: MySQL, MongoDB

Cloud Platforms: AWS (S3, SageMaker), GCP AI Platform

Other Tools: Git, Docker, MLflow

Work Experience:
Machine Learning Engineer | ABC Tech Solutions | Bangalore | Jul 2022 – Present

Built and deployed predictive ML models for customer churn, improving retention rate by 12%.

Developed recommendation engine using collaborative filtering and deep learning techniques.

Automated feature engineering pipelines, reducing preprocessing time by 40%.

Deployed models on AWS SageMaker with Dockerized microservices.

Collaborated with data engineers to handle 1TB+ datasets efficiently.

Data Analyst (ML Focus) | XYZ Analytics | Pune | Jul 2021 – Jun 2022

Designed regression and classification models for financial forecasting.

Implemented clustering models for customer segmentation.

Conducted feature engineering and data cleaning using Python and SQL.

Visualized insights with Matplotlib, Seaborn, and Power BI.

Education:
B.Tech in Computer Science, Pune University (2017 – 2021)"""

In [137]:
from langchain.prompts import PromptTemplate
from sentence_transformers import SentenceTransformer, util
import torch

In [138]:
# ---------- Step 1: Embedding Model ----------
# NOTE(review): `model` is created earlier in the notebook from
# "all-MiniLM-L6-v2"; this appears to load the same checkpoint a second time
# under another name — confirm whether both are needed.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [190]:
# ---------- Step 2: Prompt for Skill Extraction ----------
# Module-level prompt with a single {context} placeholder for the document text.
# The doubled braces {{ }} are template escapes for literal { }.
# NOTE(review): the example reply is a brace-wrapped list of bare strings, which
# is not valid JSON even though the instructions call it JSON — confirm intent.
skill_prompt = PromptTemplate.from_template("""
    You are an expert HR assistant that extracts technical and professional skills from resumes and job descriptions.

    Instructions:
    - Extract only SKILLS from the given text.
    - Normalize variations into a common standard (e.g., VLOOKUP → Excel, Random Forest → Machine Learning).
    - Return output strictly in JSON format like:
    {{"Skill1", "Skill2", "Skill3"}}
    - Don't return any additional text or explanations.

    Text:
    {context}
    """)

In [None]:
import re
import ast


def extract_skills(text: str) -> set:
    """Extract the set of skills the LLM finds in ``text``.

    The model is prompted to answer with a brace-delimited list of quoted skill
    names; that list is located in the reply and parsed into a Python set.

    Args:
        text: Resume or job-description text to analyse.

    Returns:
        set: Skill names parsed from the brace-delimited section of the reply.

    Raises:
        ValueError: If the reply contains no ``{...}`` section at all.
    """
    # ---------- Step 2: Prompt for Skill Extraction ----------
    skill_prompt = PromptTemplate.from_template("""
            You are an expert HR assistant that extracts technical and professional skills from resumes and job descriptions.

            Instructions:
            - Extract only SKILLS from the given text.
            - Normalize variations into a common standard (e.g., VLOOKUP → Excel, Random Forest → Machine Learning).
            - Return output strictly in JSON format like:
            {{"Skill1", "Skill2", "Skill3"}}
            - Don't return any additional text or explanations.

            Text:
            {context}
            """)

    prompt = skill_prompt.format(context=text)
    reply = llm.invoke(prompt).content.strip()

    # The reply may carry extra text around the list (e.g. a ```json fence);
    # keep only the span from the first "{" to the last "}".
    match = re.search(r"\{.*\}", reply, re.DOTALL)
    if match is None:
        # Without this guard the parsing step would hit an unassigned local
        # and fail with a confusing UnboundLocalError.
        raise ValueError(f"No brace-delimited skill list found in LLM reply: {reply!r}")

    raw = match.group(0)
    try:
        # literal_eval only evaluates Python literals, never arbitrary code.
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Not a clean literal (e.g. an unquoted item): salvage the quoted names.
        parsed = re.findall(r'"([^"]+)"', raw)

    # Always hand back a set so callers can use set operations on the result
    # (an empty "{}" would otherwise evaluate to a dict).
    return set(parsed)

In [239]:
# Skills the LLM extracts from the ML Engineer job description.
jd_skills_set = extract_skills(text=jd)

In [240]:
# Skills the LLM extracts from the sample resume.
resume_skills_set = extract_skills(text=resume)

## Score using intersection

In [241]:
# Skills present in both the resume and the JD (exact string comparison).
common_skills = resume_skills_set.intersection(jd_skills_set)
common_skills

{'AWS',
 'Clustering',
 'Docker',
 'GCP',
 'MLflow',
 'Machine Learning',
 'NumPy',
 'Pandas',
 'PyTorch',
 'Python',
 'SQL',
 'Scikit-learn',
 'TensorFlow'}

In [242]:
# Share of the JD's skills that also appear in the resume, as a percentage.
# An empty JD skill collection scores 0 instead of raising ZeroDivisionError.
skill_score = len(common_skills) / len(jd_skills_set) * 100 if jd_skills_set else 0.0
print(F"Matching skills score with Resume and JD: {skill_score:.2f} %")

Matching skills score with Resume and JD: 61.90 %


## Similarity score using embeddings of LLM-extracted skills

In [225]:
def extract_skills(text: str) -> str:
    """Return the brace-delimited skill list from the LLM reply as raw text.

    NOTE: this re-definition replaces the ``extract_skills`` defined earlier in
    the notebook, which returns the parsed literal rather than a string. Cells
    run after this one get the string-returning version.

    Args:
        text: Resume or job-description text; substituted into the module-level
            ``skill_prompt``.

    Returns:
        str: The part of the model's reply from the first ``{`` to the last ``}``.

    Raises:
        ValueError: If the reply contains no ``{...}`` section.
    """
    import re

    prompt = skill_prompt.format(context=text)
    reply = llm.invoke(prompt).content.strip()

    match = re.search(r"\{.*\}", reply, re.DOTALL)
    if match is None:
        # Without this guard the return statement would reference an
        # unassigned local and fail with UnboundLocalError.
        raise ValueError(f"No brace-delimited skill list found in LLM reply: {reply!r}")

    return match.group(0)


# NOTE: despite its name, `jd_skills_set` holds a raw string here (not a set),
# because the extract_skills defined in this cell returns the matched text.
jd_skills_set = extract_skills(text=jd)
resume_skills = extract_skills(text=resume)

In [230]:
# ---------- Step 3: Compute Similarity ----------
def compute_match_score(jd_skills, resume_skills):
    """Score how closely the resume skills match the JD skills, in percent.

    Both inputs are embedded with ``embed_model`` and compared with cosine
    similarity; the score is the mean of all resulting similarity values
    multiplied by 100.

    Args:
        jd_skills: A single string, or a collection of strings (list, tuple,
            set), describing the skills the job requires.
        resume_skills: Same, for the skills found in the resume.

    Returns:
        float: Mean cosine similarity as a percentage, rounded to two decimals.
        ``0.0`` when either input is empty or a blank string, since there is
        nothing to compare.
    """
    prepared = []
    for skills in (jd_skills, resume_skills):
        if isinstance(skills, str):
            if not skills.strip():
                return 0.0
        else:
            # Materialise sets/tuples/generators as a list so the encoder
            # receives an indexable sequence of strings.
            skills = list(skills)
            if not skills:
                return 0.0
        prepared.append(skills)

    # Convert skills into embeddings
    jd_embeddings = embed_model.encode(prepared[0], convert_to_tensor=True)
    resume_embeddings = embed_model.encode(prepared[1], convert_to_tensor=True)

    # Compute pairwise similarity
    cosine_scores = util.cos_sim(jd_embeddings, resume_embeddings)

    # Final percentage
    return round(cosine_scores.mean().item() * 100, 2)

In [231]:
# Both arguments are the raw brace-delimited strings returned by extract_skills,
# so each side is passed to the encoder as a single string.
compute_match_score(jd_skills=jd_skills_set, resume_skills=resume_skills)

91.52

In [235]:
# ---------- Step 4: Run Pipeline ---------

# Compute the embedding-based match score once more and report it.
score = compute_match_score(jd_skills=jd_skills_set, resume_skills=resume_skills)
print(f"✅ Resume Match Score: {score}%")

✅ Resume Match Score: 91.52%
