## 1. Mount Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): presumably so STAGE4_MODEL_PATH in app.py can point at a file
# stored on Drive — confirm, since nothing visible here reads from the mount.
from google.colab import drive
drive.mount('/content/drive')

## 2. Install dependencies

In [None]:
# Install the packages app.py imports (streamlit, transformers, graphviz) plus
# pyngrok for the tunnel cell; sentencepiece, accelerate and rich are not
# imported directly anywhere in this notebook. --quiet trims pip's output.
!pip install streamlit pyngrok transformers sentencepiece accelerate graphviz rich --quiet


In [None]:
# pypdf supplies the PdfReader class app.py uses to read the uploaded PDFs.
!pip install pypdf


## 3. ngrok authentication

In [None]:
import os

from pyngrok import ngrok

# Prefer a token supplied through the NGROK_AUTH_TOKEN environment variable so
# the secret does not have to be saved inside the notebook. The literal default
# is only a placeholder and may still be replaced in place.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "ADD_YOUR_NGROK_TOKEN")

# Fail here with a clear message instead of later, when the tunnel is opened
# with a token that was never filled in.
if NGROK_AUTH_TOKEN == "ADD_YOUR_NGROK_TOKEN":
    raise ValueError(
        "Set the NGROK_AUTH_TOKEN environment variable (or replace the "
        "placeholder in this cell) before running the notebook."
    )

ngrok.set_auth_token(NGROK_AUTH_TOKEN)


## 4. app.py


In [None]:
%%writefile app.py
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModel
from graphviz import Digraph
from pypdf import PdfReader
import requests
import json
import os

# Azure GPT connection settings. Each value is taken from the environment when
# the variable is set, so real credentials need not be written into app.py;
# the literal defaults are the original placeholders and can still be edited
# in place.
GPT_ENDPOINT = os.environ.get("AZURE_GPT_ENDPOINT", "AZURE_GPT_ENDPOINT")
GPT_API_KEY = os.environ.get("AZURE_GPT_API_KEY", "API_KEY")
# Model name sent in the "model" field of every chat request payload.
EXTRACTION_MODEL = "gpt-4o-mini"
# NOTE(review): this path is not read anywhere in the visible code, so the
# encoder below runs with the stock checkpoint weights only — confirm whether
# fine-tuned Stage-4 weights were meant to be loaded from here.
STAGE4_MODEL_PATH = os.environ.get("STAGE4_MODEL_PATH", "LOAD_MODEL_FROM_DRIVE")

# Run the encoder on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

class Stage4Model(torch.nn.Module):
    """Text encoder that turns raw text into an L2-normalised embedding.

    Bundles a Hugging Face encoder with its tokenizer so callers can pass
    plain strings to ``forward``.

    NOTE(review): the embedding is the first-token hidden state and no input
    prefix is added; confirm this matches how the Stage-4 weights were
    trained (the E5 checkpoints are commonly used with mean pooling and
    "query: "/"passage: " prefixes).
    """

    def __init__(self, model_name="intfloat/e5-base"):
        """Load the pretrained encoder and its tokenizer.

        Args:
            model_name: Hugging Face model id or local directory. Defaults to
                the checkpoint this class has always loaded.
        """
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def forward(self, text):
        """Embed ``text`` and return unit-length vectors.

        Args:
            text: A string (or whatever batch form the tokenizer accepts).

        Returns:
            Tensor holding the first-token hidden state of each input,
            normalised along the last dimension.
        """
        # Follow the encoder's own device rather than a module-level global,
        # so the module works wherever its weights have been moved.
        target_device = next(self.encoder.parameters()).device
        batch = self.tokenizer(
            text, return_tensors="pt", truncation=True, padding=True
        ).to(target_device)
        first_token = self.encoder(**batch).last_hidden_state[:, 0]
        return torch.nn.functional.normalize(first_token, dim=-1)

@st.cache_resource
def _load_model():
    """Build the encoder once and reuse it across Streamlit script reruns.

    Streamlit re-executes this file on every user interaction; without the
    cache the checkpoint would be loaded again each time.
    """
    encoder = Stage4Model().to(device)
    # Inference only: make sure dropout and similar layers are disabled.
    encoder.eval()
    return encoder


model = _load_model()

def extract_text_from_pdf(pdf_file):
    """Return the text of every readable page in a PDF.

    Args:
        pdf_file: Anything ``PdfReader`` accepts; the app passes the object
            returned by ``st.file_uploader``.

    Returns:
        The page texts joined by newlines, with leading and trailing
        whitespace stripped. Empty string when no text could be extracted.
    """
    reader = PdfReader(pdf_file)
    page_texts = []
    for page in reader.pages:
        try:
            page_text = page.extract_text()
        except Exception:
            # Best effort: one unreadable page must not discard the others.
            # Unlike a bare ``except`` this still lets KeyboardInterrupt and
            # SystemExit propagate.
            continue
        if page_text is None:
            continue
        page_texts.append(page_text)
    return "\n".join(page_texts).strip()

# Upper bound for one chat request, so a stalled endpoint cannot hang the app.
_GPT_TIMEOUT_SECONDS = 60


def _chat_completion_text(prompt, max_tokens):
    """Send one user message to the GPT endpoint and return the reply text.

    Returns an empty string when the response carries no usable completion
    (error payload, non-JSON body, unexpected shape). Transport errors raised
    by ``requests.post`` propagate to the caller.
    """
    import logging  # local import: not part of this file's import block

    headers = {
        "Content-Type": "application/json",
        "api-key": GPT_API_KEY
    }
    payload = {
        "model": EXTRACTION_MODEL,
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "messages": [{"role": "user", "content": prompt}]
    }

    response = requests.post(
        GPT_ENDPOINT, headers=headers, json=payload, timeout=_GPT_TIMEOUT_SECONDS
    )
    try:
        content = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError):
        logging.getLogger(__name__).warning(
            "GPT endpoint returned no usable completion (HTTP %s)",
            response.status_code,
        )
        return ""
    return content if isinstance(content, str) else ""


def _load_json_reply(content):
    """Decode a model reply as JSON, tolerating a Markdown code fence.

    Returns the decoded value, or ``None`` when the reply is not valid JSON.
    """
    cleaned = content.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else ""
        cleaned = cleaned.rsplit("```", 1)[0]
    try:
        return json.loads(cleaned)
    except ValueError:
        return None


def _clean_skills(raw_skills):
    """Lower-case, trim, de-duplicate and sort skills; drop non-strings and 1-char entries."""
    trimmed = (s.lower().strip() for s in raw_skills if isinstance(s, str))
    return sorted({s for s in trimmed if len(s) > 1})


def azure_gpt_extract(text, mode="resume"):
    """Ask the GPT endpoint for structured fields describing ``text``.

    A second, skills-only request is made when the first reply yields fewer
    than five skills; its skills are merged with those already found, so a
    failed fallback never discards earlier results.

    Args:
        text: Raw resume or job-description text.
        mode: Word interpolated into the prompt to describe the text
            (the app passes "resume" or "job").

    Returns:
        Dict with at least the keys ``"skills"`` (sorted, lower-cased, unique
        strings), ``"experience_years"`` and ``"industry"``. Any extra keys the
        model returned are kept. Defaults are used when the reply is unusable.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    prompt = f"""
You are an advanced ATS system. Extract structured information from this {mode} text.

TEXT:
{text}

Return ONLY valid JSON:
{{
  "skills": ["skill1", "skill2"],
  "experience_years": 0,
  "industry": "IT"
}}

RULES:
- ALWAYS extract at least 10 technical skills.
- Experience must be inferred if not explicitly stated.
- Industry must be short (IT, AI, HR, Finance).
- No explanations. Only VALID JSON.
"""

    reply = _load_json_reply(_chat_completion_text(prompt, 1024))
    # Anything other than a JSON object is treated as "nothing extracted".
    parsed = reply if isinstance(reply, dict) else {}
    parsed.setdefault("experience_years", 0)
    parsed.setdefault("industry", "")

    skills = parsed.get("skills")
    if not isinstance(skills, list):
        skills = []

    if len(skills) < 5:

        fallback_prompt = f"""
Extract ONLY a JSON list of skills from this text.

TEXT:
{text}

Return ONLY:
{{
  "skills": ["skill1", "skill2"]
}}
"""
        fallback = _load_json_reply(_chat_completion_text(fallback_prompt, 512))
        # The prompt asks for an object, but accept a bare JSON list as well.
        if isinstance(fallback, dict):
            fallback = fallback.get("skills")
        if isinstance(fallback, list):
            skills = skills + fallback

    parsed["skills"] = _clean_skills(skills)

    return parsed

def compute_similarity(resume, job):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        resume: Resume text.
        job: Job-description text.

    Returns:
        The similarity as a Python float.
    """
    # Inference only: skip autograd bookkeeping instead of building a graph
    # and detaching from it afterwards.
    with torch.no_grad():
        # Call the module rather than .forward() so nn.Module hooks still run.
        resume_vec = model(resume).cpu()
        job_vec = model(job).cpu()
    return torch.nn.functional.cosine_similarity(resume_vec, job_vec).item()

def find_missing(resume_sk, job_sk):
    """Return the job skills that do not appear among the resume skills.

    The comparison is case-insensitive. Returned skills keep the spelling and
    order they have in ``job_sk``.

    Args:
        resume_sk: Iterable of skill strings found in the resume.
        job_sk: Iterable of skill strings required by the job.

    Returns:
        List of entries from ``job_sk`` with no case-insensitive match in
        ``resume_sk``.
    """
    # Build the lookup once; set membership avoids rescanning a list for
    # every job skill.
    resume_keys = {skill.lower() for skill in resume_sk}
    return [skill for skill in job_sk if skill.lower() not in resume_keys]

def generate_summary(match, sim, missing, resume_sk, job_sk):
    """Ask the GPT endpoint for a short written summary of the candidate's fit.

    Args:
        match: Match score, rendered in the prompt followed by "%".
        sim: Similarity score.
        missing: Job skills absent from the resume.
        resume_sk: Skills extracted from the resume.
        job_sk: Skills extracted from the job description.

    Returns:
        The text of the first completion choice.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        RuntimeError: When the response body is not the expected
            chat-completion JSON.
    """
    prompt = f"""
Resume–JD Matching Summary

Match Score: {match}%
Similarity Score: {sim}
Missing Skills: {missing}
Resume Skills: {resume_sk}
Job Skills: {job_sk}

Write a short and professional summary of the candidate's fit.
"""

    headers = {
        "Content-Type": "application/json",
        "api-key": GPT_API_KEY
    }

    payload = {
        "model": EXTRACTION_MODEL,
        "max_tokens": 300,
        "temperature": 0.2,
        "messages": [{"role": "user", "content": prompt}]
    }

    # The timeout keeps a stalled endpoint from hanging the app indefinitely.
    response = requests.post(GPT_ENDPOINT, headers=headers, json=payload, timeout=60)
    # Surface HTTP errors with their status code instead of an opaque KeyError
    # on the error payload further down.
    response.raise_for_status()

    try:
        return response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as err:
        raise RuntimeError(
            "GPT endpoint returned an unexpected response for the summary request"
        ) from err

def plot_graph(resume_sk, job_sk, missing):
    """Render a resume-vs-job skill graph to ``skill_graph.png``.

    Resume skills are drawn green. Job skills are green, or red when listed in
    ``missing``. A job skill also present in the resume gets a green "match"
    edge from that resume skill; otherwise it gets a red "missing" edge from
    the "Resume Skills" box. All comparisons are case-insensitive, and a skill
    repeated within one list is drawn once.

    Args:
        resume_sk: Skill strings extracted from the resume.
        job_sk: Skill strings extracted from the job description.
        missing: Job skills considered absent from the resume.

    Returns:
        The path of the rendered image, ``"skill_graph.png"``.
    """
    g = Digraph("SkillGraph", format="png")
    g.attr(rankdir="LR")

    g.node("R", "Resume Skills", shape="box", fillcolor="#C8FFD4", style="filled")
    g.node("J", "Job Skills", shape="box", fillcolor="#FFE0B5", style="filled")

    # Lower-case once so mixed-case input compares the same way on both sides.
    missing_keys = {s.lower() for s in missing}

    # Node ids are counters rather than the skill text: skills are free-form,
    # and graphviz's edge() reads ':' inside a node name as a port separator.
    # The skill text is still used as the visible label.
    resume_ids = {}
    for s in resume_sk:
        key = s.lower()
        if key not in resume_ids:
            resume_ids[key] = f"R_{len(resume_ids)}"
            g.node(resume_ids[key], s, color="green")

    job_ids = {}
    for s in job_sk:
        key = s.lower()
        if key not in job_ids:
            job_ids[key] = f"J_{len(job_ids)}"
            g.node(job_ids[key], s, color="red" if key in missing_keys else "green")

    for key, job_id in job_ids.items():
        if key in resume_ids:
            g.edge(resume_ids[key], job_id, color="green", label="match")
        else:
            g.edge("R", job_id, color="red", label="missing")

    g.render("skill_graph", cleanup=True)
    return "skill_graph.png"

# Streamlit page: upload two PDFs, then extract fields, score the match,
# list missing skills, summarise and draw the skill graph.
st.title("Resume–JD AI Matcher (CONFIT + Azure GPT)")

resume_pdf = st.file_uploader("Upload Resume (PDF)", type=["pdf"])
jd_pdf = st.file_uploader("Upload Job Description (PDF)", type=["pdf"])

if st.button("Analyze Match"):
    if not resume_pdf or not jd_pdf:
        st.error("Please upload both PDF files!")
        st.stop()

    resume_text = extract_text_from_pdf(resume_pdf)
    jd_text = extract_text_from_pdf(jd_pdf)

    # extract_text_from_pdf returns "" when no page yields text (typically an
    # image-only PDF); stop rather than send empty input to the model and the
    # GPT endpoint.
    if not resume_text or not jd_text:
        st.error("Could not extract any text from one of the PDFs. Please upload text-based PDF files.")
        st.stop()

    st.success("Text extracted from PDFs!")

    st.subheader("Resume Fields")
    resume_data = azure_gpt_extract(resume_text, "resume")
    st.json(resume_data)

    st.subheader("Job Fields")
    jd_data = azure_gpt_extract(jd_text, "job")
    st.json(jd_data)

    st.subheader("Match Score")
    sim = compute_similarity(resume_text, jd_text)
    # Rescale the cosine similarity from [-1, 1] to a 0-100 percentage.
    match = round(((sim + 1) / 2) * 100, 2)

    # Show the percentage this section is named after, next to the raw value.
    st.metric("Match Score", f"{match}%")
    st.metric("Similarity Score", round(sim, 4))

    st.subheader("Missing Skills")
    missing = find_missing(resume_data["skills"], jd_data["skills"])
    st.write(missing)

    st.subheader("AI Summary")
    summary = generate_summary(match, sim, missing, resume_data["skills"], jd_data["skills"])
    st.write(summary)

    st.subheader("Skill Match Graph")
    graph_path = plot_graph(resume_data["skills"], jd_data["skills"], missing)
    # NOTE(review): recent Streamlit releases deprecate use_column_width on
    # st.image — confirm against the installed version before changing it.
    st.image(graph_path, use_column_width=True)


In [None]:
# Open an ngrok tunnel to local port 8501, the port the Streamlit server is
# started on in the next cell.
public_url = ngrok.connect(8501)
# Bare expression so the notebook displays the tunnel returned by ngrok.connect.
public_url


In [None]:
# Serve app.py with Streamlit on port 8501, the same port passed to
# ngrok.connect in the previous cell.
!streamlit run app.py --server.port 8501