<a href="https://colab.research.google.com/github/Jimmynycu/ATS_with_gemini_api/blob/main/ATS_Langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!pip install -q langchain-community langchain pyautogen pypdf pandas openpyxl python-dotenv autogen --quiet
!wget https://raw.githubusercontent.com/Jimmynycu/ATS_with_gemini_api/main/Jimmy_CV.pdf
!curl -fsSL https://ollama.com/install.sh | sh

--2025-09-14 16:56:54--  https://raw.githubusercontent.com/Jimmynycu/ATS_with_gemini_api/main/Jimmy_CV.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 120802 (118K) [application/octet-stream]
Saving to: ‘Jimmy_CV.pdf.1’


2025-09-14 16:56:54 (7.67 MB/s) - ‘Jimmy_CV.pdf.1’ saved [120802/120802]

>>> Cleaning up old version at /usr/local/lib/ollama
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [21]:
import asyncio
import json
import os
import socket
import subprocess
import time
from typing import List, Optional

import pandas as pd
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from pypdf import PdfReader

In [22]:
from google.colab import userdata

# LangSmith tracing is optional for this notebook, so it is only switched on when
# the API key can actually be read from the Colab secrets. Reading the key first
# avoids leaving tracing enabled without credentials.
try:
    langsmith_api_key = userdata.get('LANGSMITH_API_KEY')
except Exception as exc:
    # NOTE(review): google.colab.userdata is expected to raise its own error types
    # when the secret is missing or notebook access is not granted — confirm the
    # exact exception classes and narrow this handler.
    langsmith_api_key = None
    print(f"Could not read the 'LANGSMITH_API_KEY' secret: {exc}")

if langsmith_api_key:
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_API_KEY"] = langsmith_api_key
    os.environ["LANGCHAIN_PROJECT"] = "ATS-Resume-Parser-Project"
else:
    os.environ["LANGCHAIN_TRACING_V2"] = "false"
    print("LangSmith tracing is disabled for this session.")

In [23]:
pid = os.fork()
if pid == 0:
    os.execv('/usr/local/bin/ollama', ['ollama', 'serve'])
else:
    print("Ollama server started in the background.")

await asyncio.sleep(5)
!ollama pull llama3:instruct &

  pid = os.fork()


Ollama server started in the background.
[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l


In [24]:
# Contact details of the candidate; used as ResumeSchema.contact_information.
# Documented with comments (not a docstring) so the class definition itself is unchanged.
class ContactInfo(BaseModel):
    name: str
    email: str
    phone: str
    location: str
    # Optional: defaults to a new empty list when not provided.
    links: List[str] = Field(default_factory=list)

# One work-experience entry; ResumeSchema.work_experience holds a list of these.
class WorkExperience(BaseModel):
    job_title: str
    company: str
    location: str
    # Optional: defaults to a new empty list when not provided.
    responsibilities: List[str] = Field(default_factory=list)

# One education entry; ResumeSchema.education holds a list of these.
# All three fields are required strings.
class Education(BaseModel):
    degree: str
    institution: str
    location: str

# One personal-project entry; ResumeSchema.personal_projects holds a list of these.
class PersonalProjects(BaseModel):
    project_name: str
    description: str

# Top-level structure of a parsed resume. It is handed to JsonOutputParser as the
# pydantic_object for the resume parsing chain.
class ResumeSchema(BaseModel):
    # Required fields.
    contact_information: ContactInfo
    professional_summary: str
    # List fields are optional and each default to a new empty list.
    skills: List[str] = Field(default_factory=list)
    work_experience: List[WorkExperience] = Field(default_factory=list)
    education: List[Education] = Field(default_factory=list)
    personal_projects: List[PersonalProjects] = Field(default_factory=list)

# Keyword comparison result; used as AnalysisSchema.keyword_match.
# NOTE(review): presumably `matched` lists job-description keywords found in the
# resume and `missing` lists those not found — confirm against the analysis prompt.
class KeywordMatch(BaseModel):
    matched: List[str]
    missing: List[str]

# Structure of the recruiter-style analysis. It is handed to JsonOutputParser as the
# pydantic_object for the resume analysis chain.
class AnalysisSchema(BaseModel):
    candidate_name: str
    # Integer score; no range is enforced by this declaration.
    match_score: int
    summary: str
    keyword_match: KeywordMatch

In [25]:
# 1. Instantiate the LLM and the Output Parser
# temperature=0 keeps repeated runs as reproducible as the model allows, and
# format="json" asks Ollama for JSON-only output, which is what the JSON output
# parsers at the end of both chains require.
llm = ChatOllama(
    model="llama3:instruct",
    base_url="http://127.0.0.1:11434",
    temperature=0,
    format="json",
)
# The parser derives its format instructions from ResumeSchema.
parser = JsonOutputParser(pydantic_object=ResumeSchema)

# 2. Define the Prompt Template
# {format_instructions} is filled in once via .partial(); {raw_resume_text} is
# supplied at invocation time.
parser_prompt = ChatPromptTemplate.from_messages([
    ("system", """
     You are a specialist in parsing documents to a structured JSON format.

     Your task is to parse the provided resume text into a single JSON object.

     You MUST follow these rules exactly:
     1. The entire response must be a single JSON object.
     2. DO NOT include any text, commentary, or explanations before or after the JSON.
     3. The JSON object must strictly follow the provided schema.
     4. For any field that is a list of objects (like 'work_experience'), you must return an array of objects, even if there is only one item.
     5. For any field that is a list of strings (like 'skills'), you must return an array of strings.
     6. If a piece of information is missing, use an empty string "" for a string field or an empty list [] for a list field. DO NOT use 'null' or 'None'.

     Adhere to this schema:
     {format_instructions}
    """),
    ("human", """
     Parse the following resume text:

     ---
     {raw_resume_text}
     ---
    """)
]).partial(format_instructions=parser.get_format_instructions())

# 3. Create the Chain: prompt -> LLM -> JSON parser
resume_parsing_chain = parser_prompt | llm | parser

In [26]:
# 1. Instantiate the Output Parser
# The parser derives its format instructions from AnalysisSchema.
analysis_parser = JsonOutputParser(pydantic_object=AnalysisSchema)

# 2. Define the Prompt Template
# {format_instructions} is filled in once via .partial(); {job_description} and
# {candidate_data} are supplied at invocation time.
analysis_prompt = ChatPromptTemplate.from_messages([
    ("system", """
     You are a senior technical recruiter. Your ONLY task is to analyze a candidate's resume data against a job description.

     You must generate a SINGLE, complete JSON object that strictly conforms to the provided schema.

     **STRICT RULES FOR YOUR RESPONSE:**
     1.  The ENTIRE response must be a single JSON object.
     2.  DO NOT include any text, commentary, or explanations before or after the JSON.
     3.  DO NOT include any comments (e.g., //) inside the JSON object.
     4.  For any field that is a list, you must return an array of items.
     5.  If a field is a string, use an empty string ("") if the information is not applicable.
     6.  If a field is a list, use an empty list ([]) if no matches are found.

     Adhere to this schema:
     {format_instructions}
    """),
    ("human", """
     Please analyze the candidate's resume data against the job description.

     Job Description:
     ---
     {job_description}
     ---

     Candidate's Resume Data (JSON):
     ---
     {candidate_data}
     ---
    """)
]).partial(format_instructions=analysis_parser.get_format_instructions())


def _candidate_data_as_json(inputs: dict) -> str:
    """Return inputs['candidate_data'] as JSON text for the analysis prompt.

    The prompt labels this section as JSON, but the parsed resume arrives as a
    Python object whose default string form is a Python repr, not JSON.
    Values that are already strings are passed through unchanged.
    """
    candidate_data = inputs["candidate_data"]
    if isinstance(candidate_data, str):
        return candidate_data
    return json.dumps(candidate_data, indent=2, ensure_ascii=False, default=str)


# 3. Create the Chain: serialize candidate data -> prompt -> LLM -> JSON parser
resume_analysis_chain = (
    RunnablePassthrough.assign(candidate_data=RunnableLambda(_candidate_data_as_json))
    | analysis_prompt
    | llm
    | analysis_parser
)

In [27]:
def load_and_parse_pdf(data: dict):
    """Extract the text of the PDF named by ``data['file_path']``.

    Args:
        data: Input mapping; must contain a non-empty 'file_path' entry.

    Returns:
        The extracted text of all pages joined with newlines. A page for which
        no text is extracted contributes an empty string.

    Raises:
        ValueError: If 'file_path' is missing or empty, or if the file does not
            exist (the original FileNotFoundError is attached as the cause).
    """
    file_path = data.get('file_path')
    if not file_path:
        raise ValueError("Error: 'file_path' key not found in input dictionary.")

    try:
        reader = PdfReader(file_path)
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except FileNotFoundError as exc:
        # Chain the original error so the traceback still shows the real cause.
        raise ValueError(f"Error: The file '{file_path}' not found.") from exc

    # Join with a newline so the last word of one page does not run into the
    # first word of the next page.
    return "\n".join(page_texts)

def get_job_description():
    """Return the hard-coded job description text used for the analysis.

    Every line of the text is preceded by a newline and four spaces, and the
    text ends with a newline followed by four spaces.
    """
    description_lines = (
        "About Us: TSMC IT Business AI team is dedicated to developing next-generation technologies...",
        "Requirements:",
        "* A minimum of a Master's degree in Computer Science, Artificial Intelligence...",
        "* At least 3 years of experience in AI/Machine Learning. Strong proficiency in Python with a proven track record in AI/ML project development. Experience with Natural Language Processing (NLP) and Large Language Models (LLMs), as well as graph embedding techniques...",
        "* Excellent communication and teamwork skills.",
        "* Business domain knowledge in supply chain management is desirable.",
    )
    indented = "".join(f"\n    {line}" for line in description_lines)
    return indented + "\n    "
def save_to_excel(data: dict):
    """Write the parsed resume to 'resume_data.xlsx' and return ``data`` unchanged.

    The workbook gets one sheet each for contact info, skills, work experience,
    education and personal projects, written in that order. If
    ``data['candidate_data']`` is missing or empty, nothing is written. Any
    error raised while building or writing the sheets is printed, not raised.

    Args:
        data: Mapping that carries the parsed resume under 'candidate_data'.

    Returns:
        The same ``data`` object that was passed in.
    """
    resume = data.get('candidate_data')
    if not resume:
        print("Error: 'candidate_data' not found in the input. Skipping Excel save.")
        return data

    print("\nLLM generated the structured JSON. Now saving to Excel...")

    # (sheet name, frame factory) pairs in write order. Factories are called one
    # at a time inside the writer context, so each frame is built right before
    # it is written.
    sheet_builders = (
        ('Contact_Info', lambda: pd.DataFrame([resume.get('contact_information', {})])),
        ('Skills', lambda: pd.DataFrame(resume.get('skills', []), columns=['Skill'])),
        ('Work_Experience', lambda: pd.DataFrame(resume.get('work_experience', []))),
        ('Education', lambda: pd.DataFrame(resume.get('education', []))),
        ('Personal_Projects', lambda: pd.DataFrame(resume.get('personal_projects', []))),
    )

    try:
        # A single writer handles every sheet of the workbook.
        with pd.ExcelWriter("resume_data.xlsx", engine='openpyxl') as workbook:
            for sheet_name, build_frame in sheet_builders:
                build_frame().to_excel(workbook, sheet_name=sheet_name, index=False)

        print("Excel file 'resume_data.xlsx' saved successfully with all sheets.")
    except Exception as e:
        print(f"Error: Could not save the data to Excel. Error: {e}")

    return data
# 1. Define the full workflow
full_workflow = (
    # Add the extracted PDF text to the input under 'raw_resume_text'.
    RunnablePassthrough.assign(raw_resume_text=RunnableLambda(load_and_parse_pdf))
    # Parse the resume with the LLM and carry the job description forward.
    # Only the 'job_description' value is forwarded: passing the whole input
    # through here would place the file path and the raw resume text inside the
    # "Job Description" section of the analysis prompt.
    | {
        "candidate_data": resume_parsing_chain,
        "job_description": RunnableLambda(lambda inputs: inputs["job_description"]),
    }

    # Save the parsed resume to Excel; the data is passed through unchanged.
    | RunnableLambda(save_to_excel)

    # Get the final analysis from the LLM.
    | resume_analysis_chain
)

# 2. Prepare the input dictionary and run the workflow
input_dict = {
    "file_path": "Jimmy_CV.pdf",
    "job_description": get_job_description()
}

try:
    result = full_workflow.invoke(input_dict)

    # Print the final analysis
    print(json.dumps(result, indent=2))
    print("\nFinished. ATS workflow complete.")
except Exception as e:
    # Errors are reported in the cell output instead of raising.
    print(f"An error occurred: {e}")


LLM generated the structured JSON. Now saving to Excel...
Excel file 'resume_data.xlsx' saved successfully with all sheets.
{
  "candidate_name": "JIMMY LIU",
  "match_score": 80,
  "summary": "",
  "keyword_match": {
    "matched": [
      "Python",
      "PyTorch & TensorFlow",
      "Git version control",
      "LLM fine-tuning & deployment",
      "Foundation model dev.",
      "Deep learning optimization",
      "Quantitative analysis"
    ],
    "missing": [
      "Natural Language Processing (NLP)",
      "Large Language Models (LLMs)",
      "graph embedding techniques"
    ]
  }
}

Finished. ATS workflow complete.


In [28]:
# Read the Excel file back and display every sheet.
# The context manager closes the workbook's file handle once all sheets are parsed.
with pd.ExcelFile("resume_data.xlsx") as excel_file:
    for sheet_name in excel_file.sheet_names:
        print(f"\n--- Sheet: {sheet_name} ---")
        df = excel_file.parse(sheet_name)
        display(df)


--- Sheet: Contact_Info ---


Unnamed: 0,name,email,phone,location,links
0,JIMMY LIU,jimmyliu.ii12@nycu.edu.tw,+886 978768367,Taiwan Taipei,['Github: Jimmynyu']



--- Sheet: Skills ---


Unnamed: 0,Skill
0,Agent Coding
1,Python (Advanced)
2,PyTorch & TensorFlow
3,Git version control
4,LLM fine-tuning & deployment
5,Foundation model dev.
6,Deep learning optimization
7,Quantitative analysis
8,Linux administration
9,Verilog



--- Sheet: Work_Experience ---


Unnamed: 0,job_title,company,location,responsibilities
0,EDA Software Engineer,MediaTek,"Hsinchu, Taiwan",['Implemented and optimized LLM-based workflow...
1,Impedance Engineer,Broadcom,"Hsinchu, Taiwan",['Designed and implemented new automated workf...



--- Sheet: Education ---


Unnamed: 0,degree,institution,location
0,Master’s Degree,Institute of Artificial Intelligence Innovatio...,"Hsinchu, Taiwan"



--- Sheet: Personal_Projects ---


Unnamed: 0,project_name,description
0,Algorithmic Trading Agent for TSMC Stock Using...,Developed PPO agent with FinRL based on histor...
1,YouTube Trend Analysis & NLP Application,Developed system extracting/analyzing YouTube ...
2,LLM-Powered Data Retrieval & Filtering,Engineered LLM solution with function calls fo...
3,Multi-Agent AI Resume Parser & Analyzer,Engineered LLM solution to extract information...
