### Information extraction from documents using LLM

### GroqLLM  

In [1]:
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os
load_dotenv()

os.environ['GROQ_API_KEY'] = os.getenv("GROQ_API_KEY")

In [2]:
model_name = "llama-3.1-8b-instant"
llm = ChatGroq(model=model_name,
               temperature=0,
               verbose=True
               )

In [3]:
response = llm.invoke("What is the capital of india?")
print(response.content)

The capital of India is New Delhi.


#### Extract text from documents

In [4]:
import pdfplumber


def text_extractore(file_path):
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"

    return text

file_path = r"..\documents\KAMAL_Enterpr_ resume.pdf"
chunks = text_extractore(file_path=file_path)

In [5]:
print(chunks)

KAMAL JIT SINGH
Email: rajk51890@gmail.com
Mob: +91 8830132055
Location: Ara Nawada Chauk, Bhojpur, Bih ar har, Pin: 802301
Objective:
To Work in an environment which encourage me to succeed and grow professionally where I
can utilize my skill and knowledge appropriately.
Activity:
Higher Education:
All outdoor activity
B.sc Physics(Hons)–2019 Learning new skills in my spare time
Skills:
Experience:
Excel
Functio ns: Total 3 Years of Experience
• VLOPKU P 1. Hopscotch Pvt Ltd
• IF 2. Jyoti Enterprise s
COUNT 3. NoBroker
•
PIVAT TABLE
•
And many more formation functions
Visualization Tools:
Experience with Excel:
• Power Bi • I am very good at Excel. I have good
• Advance Excel knowledge about Advance formatting
functions like SUM, SUMIF, IF, IFS,
LOOKUP, VLOOKUP, ALL LOOKUP
Language:
functions and INDEX and MATCH
function.
Hindi, English
• Pivot Table: This is the most important
inbuilt function in excel for advanced
use cases and I have good knowledge
I am g ood at team work and curio

#### Extract skills from resume and JD

In [6]:
template = "You are a expert to extract information form chunk of text." \
"You have to extract common name of the skills from the given context." \
"If you will get skills like: Vlookup, if, count then retun Excel or Advance Excel" \
"or if like liner regression, Random Forest the return Machine Learning" \
"Like this you have retun the common name of the skill."\
"Don't return other text. Return only skills." \
"Context\n" \
"{context}"

In [7]:
template.format(context="Kamal")

"You are a expert to extract information form chunk of text.You have to extract common name of the skills from the given context.If you will get skills like: Vlookup, if, count then retun Excel or Advance Excelor if like liner regression, Random Forest the return Machine LearningLike this you have retun the common name of the skill.Don't return other text. Return only skills.Context\nKamal"

In [8]:
response = llm.invoke(input=template.format(context=chunks))
response.content

'Excel \nMachine Learning \nData Analysis \nPower Bi \nAdvance Excel \nGoogle Sheets \nPivot Table \nData Visualization \nLanguage Skills (Hindi, English)'

#### --- Step 2: Extract Skills (LLM can be used, here simple mock) ---

In [9]:
def extract_skill_from_documents(chunk: str):

    """Chunk is text from documents"""
    template = """
    You are an expert in resume parsing and skill normalization. 
    Your task is to extract skills from the given text and map them to their common standardized names.

    Instructions:
    - Identify all technical and analytical skills mentioned in the context.
    - Group related skills under a single common name. 
    Examples:
        - If you find "VLOOKUP, IF, COUNT, Pivot Table" → return ["Excel / Advanced Excel"]
        - If you find "Linear Regression, Random Forest, XGBoost" → return ["Machine Learning"]
        - If you find "Seaborn, Matplotlib, Power BI, Tableau" → return ["Data Visualization"]
        - If you find "MySQL, PostgreSQL, SQL Server" → return ["SQL"]
    - Return only skills in a Python list format (e.g., ["SQL", "Machine Learning", "Excel"]).
    - Do not include extra explanation or text.

    Context:
    {context}
"""

    prompt = template.format(context=chunk)
    response = llm.invoke(input=prompt)
    return response.content

In [10]:
import ast
resume_skills = extract_skill_from_documents(chunk=chunks)
set_of_resume_skills = set(ast.literal_eval(resume_skills))
print(set_of_resume_skills)

{'English', 'Data Analysis', 'Machine Learning', 'Team Work', 'Excel / Advanced Excel', 'Data Visualization', 'SQL', 'Hindi', 'Problem Solving'}


In [11]:
job_description = """
Job Description:
We are seeking a Data Analyst to join our analytics team. The candidate will be responsible for collecting, cleaning, analyzing, and interpreting large datasets to provide insights that support business decision-making.
Key Responsibilities:
Collect, process, and analyze structured and unstructured data.
Build dashboards and reports using Power BI or Tableau.
Write SQL queries to extract data from relational databases.
Apply statistical methods to identify trends and patterns.
Work with cross-functional teams to provide actionable insights.
Present findings in a clear and concise manner to stakeholders.
Required Skills & Qualifications:
Bachelor’s degree in Statistics, Mathematics, Computer Science, Economics, or related field.
Strong knowledge of SQL, Python, Excel.
Hands-on experience with Power BI / Tableau.
Knowledge of statistical analysis, regression, hypothesis testing.
Strong communication and problem-solving skills.
Preferred Skills:
Experience with Big Data tools (Spark, Hadoop).
Familiarity with machine learning basics.
Exposure to cloud platforms (AWS, GCP, Azure).
"""

In [12]:
import ast
jd_skills = extract_skill_from_documents(chunk=job_description)
set_of_jd_skills = set(ast.literal_eval(jd_skills))
print(set_of_jd_skills)

{'Machine Learning', 'Cloud Computing', 'Data Visualization', 'Excel', 'Big Data', 'Python', 'SQL'}


#### --- Step 3: Skill Match Score ---

In [13]:
common_skills = set_of_resume_skills.intersection(set_of_jd_skills)
common_skills

{'Data Visualization', 'Machine Learning', 'SQL'}

In [14]:
skill_score = len(common_skills) / len(set_of_jd_skills) * 100
print(F"Matching skills score with Resume and JD: {skill_score:.2f} %")

Matching skills score with Resume and JD: 42.86 %


#### # --- Step 4: Semantic Similarity (JD vs Resume) ---

In [15]:
import json
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [20]:
jd_embedding = model.encode(job_description, convert_to_tensor=True)
resume_embedding = model.encode(chunks, convert_to_tensor=True)

In [31]:
semantic_score = float(util.cos_sim(jd_embedding, resume_embedding)) * 100

In [None]:
print(F"Matching skills score with Resume and JD: {semantic_score:.2f} %")

Matching skills score with Resume and JD: 50.33 %


#### --- Step 5: Final Weighted Score ---

In [32]:
final_weighted_score = 0.7 * skill_score + 0.3 * semantic_score
print(F"Final score for matching skills with Resume and JD: {final_weighted_score:.2f} %")

Final score for matching skills with Resume and JD: 45.10 %


In [33]:
print("Skill Score:", round(skill_score,2), "%")
print("Semantic Score:", round(semantic_score,2), "%")
print("Final Match Score:", round(final_weighted_score,2), "%")

Skill Score: 42.86 %
Semantic Score: 50.33 %
Final Match Score: 45.1 %
