In [1]:
import json
import pandas as pd
import pdfplumber
import text_cleaner
import filter_context
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
import re
import pandas as pd
from collections import Counter
from nltk.tokenize import word_tokenize
import text_cleaner
import ollama
import os
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet
import markdown2
from flask import Flask, render_template, request
import compare
import gemenAI
import markdown2  # for rendering markdown nicely
import llama_model

# Load a pre-trained, compact sentence-embedding model ('all-MiniLM-L6-v2').
# It is created once at module level so compute_matches can reuse the same
# instance for every resume it scores instead of reloading it per call.
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


This cell extracts the text from a PDF file using the `pdfplumber` library.

In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF as one string.

    Args:
        pdf_path: Whatever ``pdfplumber.open`` accepts as its first argument.

    Returns:
        The extracted text of each page followed by a newline, concatenated
        in page order. Pages for which no text is extracted are skipped.
    """
    with pdfplumber.open(pdf_path) as document:
        page_chunks = [page.extract_text() for page in document.pages]
    # Falsy results (None or "") contribute nothing, not even a newline.
    return "".join(f"{chunk}\n" for chunk in page_chunks if chunk)


This cell reads job description data from a JSON file, handling both line-delimited and standard JSON formats, converts
the data into a pandas DataFrame, and then selects only the first 100 records for analysis.

In [3]:
json_path = "job-descriptions.json"  # Update path
MAX_JOBS = 100  # number of job postings kept for matching


def _load_job_records(path):
    """Read job records from a line-delimited (JSONL) or standard JSON file.

    The file is first tried as JSONL (one JSON value per non-blank line); if
    any line is not valid JSON on its own, the whole file is parsed as a
    single JSON document instead.

    Args:
        path: Path of the UTF-8 encoded JSON/JSONL file.

    Returns:
        The parsed records, in a form ``pd.DataFrame`` can consume.

    Raises:
        json.JSONDecodeError: If the file is valid neither as JSONL nor as JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw = f.read()

    try:
        # Blank lines (e.g. a trailing newline or spacer lines) are not records;
        # skipping them keeps a valid JSONL file from being rejected.
        records = [json.loads(line) for line in raw.splitlines() if line.strip()]
    except json.JSONDecodeError:
        return json.loads(raw)

    # A standard JSON array written on a single line parses as one "line"
    # holding the whole list; unwrap it so each element becomes a record.
    if len(records) == 1 and isinstance(records[0], list):
        return records[0]
    return records


jobs_data = _load_job_records(json_path)
# .copy() makes jobs_df an independent frame, so the columns added to it later
# do not trigger pandas' chained-assignment warnings.
jobs_df = pd.DataFrame(jobs_data).head(MAX_JOBS).copy()

This cell normalizes the column headers (stripping surrounding whitespace and lowercasing them) and cleans the job description text with the `normalization` function from `filter_context`, storing the result in a new `clean_desc` column. It then converts the cleaned descriptions into numerical features by precomputing their TF-IDF matrix.

In [4]:
# -----------------------------
# 4. Normalize the column headers, then clean each job description
# -----------------------------
jobs_df.columns = (
    jobs_df.columns
    .str.strip()
    .str.lower()
)
jobs_df['clean_desc'] = jobs_df['description'].apply(filter_context.normalization)


# -----------------------------
# 5. Fit the TF-IDF vectorizer once on all cleaned job descriptions
#    (both `vectorizer` and `job_tfidf_matrix` are reused by compute_matches)
# -----------------------------
vectorizer = TfidfVectorizer()
job_texts = jobs_df['clean_desc'].tolist()
job_tfidf_matrix = vectorizer.fit_transform(job_texts)

This function matches a resume against the job postings using several similarity measures. It first cleans the resume and computes its TF-IDF cosine similarity against the precomputed job matrix, then computes a semantic similarity using transformer sentence embeddings, and then counts the keyword overlap between the resume and each job posting. Finally, it combines these measures into a weighted hybrid score and returns the top-matching job postings together with the most important keywords of the resume.

In [5]:
def compute_matches(resume_text, top_n_keywords=10, top_n_jobs=20, weights=(0.3, 0.4, 0.3)):
    """Score every job posting against a resume and return the best matches.

    Three signals are computed per job and blended into ``combined_score``:
    TF-IDF cosine similarity, sentence-embedding cosine similarity, and the
    fraction of top keywords shared between resume and job.

    Args:
        resume_text: Raw resume text.
        top_n_keywords: Number of top keywords extracted from the resume and
            from each job description.
        top_n_jobs: Number of best-scoring jobs to return.
        weights: ``(tfidf, semantic, keyword_overlap)`` weights of the hybrid
            score. The default reproduces the original 0.3 / 0.4 / 0.3 blend.

    Returns:
        ``(resume_keywords, top_matches)``: the resume's keyword list and a
        DataFrame with the ``top_n_jobs`` highest-scoring rows of ``jobs_df``,
        sorted by ``combined_score`` in descending order.

    NOTE(review): the score columns are written onto the module-level
    ``jobs_df``, so calls share state, and the job embeddings and job keywords
    are recomputed on every call. Both are kept as-is here; confirm whether
    other code reads these columns before changing that.
    """
    w_tfidf, w_semantic, w_overlap = weights

    # === 1. Clean + TF-IDF (existing NLTK logic) ===
    resume_clean = filter_context.normalization(resume_text)
    resume_keywords = filter_context.GetTFIDF(resume_clean, top_n=top_n_keywords)

    # Rows of job_tfidf_matrix are expected to line up with the rows of jobs_df.
    resume_vec = vectorizer.transform([resume_clean])
    jobs_df['similarity_tfidf'] = cosine_similarity(resume_vec, job_tfidf_matrix)[0]

    # === 2. Semantic Embeddings (transformers) ===
    resume_emb = semantic_model.encode(resume_clean, convert_to_tensor=True)
    job_embs = semantic_model.encode(jobs_df['clean_desc'].tolist(), convert_to_tensor=True)
    jobs_df['similarity_semantic'] = util.cos_sim(resume_emb, job_embs)[0].cpu().numpy()

    # === 3. Keyword Overlap (from TF-IDF) ===
    # Build the resume keyword set once instead of once per job row.
    resume_keyword_set = set(resume_keywords)
    jobs_df['top_keywords'] = jobs_df['clean_desc'].apply(
        lambda desc: filter_context.GetTFIDF(desc, top_n=top_n_keywords)
    )
    jobs_df['keyword_overlap'] = jobs_df['top_keywords'].apply(
        lambda job_keywords: len(resume_keyword_set.intersection(job_keywords))
    )

    # === 4. Hybrid Combined Score ===
    # With top_n_keywords == 0 the overlap is always 0; dividing by 1 instead
    # of 0 keeps the score finite rather than turning every row into NaN.
    overlap_denominator = max(top_n_keywords, 1)
    jobs_df['combined_score'] = (
        w_tfidf * jobs_df['similarity_tfidf']
        + w_semantic * jobs_df['similarity_semantic']
        + w_overlap * (jobs_df['keyword_overlap'] / overlap_denominator)
    )

    # === 5. Sort and Return ===
    top_matches = jobs_df.sort_values(by='combined_score', ascending=False).head(top_n_jobs)

    return resume_keywords, top_matches

This cell preprocesses text by lowercasing it, removing non-alphanumeric characters, tokenizing it, removing stopwords, and lemmatizing each token to its root form. It also defines a function that examines grammatical structure using POS tagging: for each sentence in the text it counts the frequency of every universal POS tag, and it returns a structured DataFrame showing how the linguistic categories are distributed across the text.

In [6]:
from filter_context import lemmatizer


def preprocess(text):
    """Clean, tokenize, remove stopwords, and lemmatize.

    Args:
        text: Input text; any value is accepted and converted with ``str()``.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # `stopwords` can be the NLTK corpus reader imported from nltk.corpus; the
    # reader object is not a container, so `in` cannot be used on it directly.
    # If it already is a plain collection of words, use it as-is.
    stopword_source = stopwords.words('english') if hasattr(stopwords, 'words') else stopwords
    stop_set = set(stopword_source)

    text = str(text).lower()
    # Collapse every run of non-word characters into a single space.
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(t) for t in tokens if t not in stop_set)

# -----------------------------
# Get each usage for each Tag
# -----------------------------
def get_tag(text, tagset='universal'):
    """Count part-of-speech tags for each '.'-separated sentence of ``text``.

    Args:
        text: Text to analyse; it is split on the '.' character.
        tagset: Tagset name forwarded to ``nltk.pos_tag``.

    Returns:
        An integer DataFrame with one row per split segment and exactly the
        twelve columns listed in ``all_tags`` (in that order). Tags that never
        occur are reported as 0; tags outside ``all_tags`` are dropped.
    """
    all_tags=['ADJ','ADP','ADV','CONJ','DET','NOUN','NUM','PRT','PRON','VERB','.','X']

    per_sentence_counts = [
        Counter(tag for _token, tag in nltk.pos_tag(word_tokenize(sentence), tagset=tagset))
        for sentence in text.split('.')
    ]
    counts_df = pd.DataFrame(per_sentence_counts).fillna(0).astype(int)

    # Add a zero column for every expected tag that did not appear in the text.
    absent_tags = [tag for tag in all_tags if tag not in counts_df.columns]
    for tag in absent_tags:
        counts_df[tag] = 0

    return counts_df[all_tags]

It contains code that defines text normalization functions, keywords extraction functions, TF-IDF calculation, and finally returns the keywords representing the most meaningful part of the text. This is performed through text cleaning, removal of non-alphabetic letters, lemmatization, removal of stopwords, calculation of TF-IDF, sorting, and finally selecting the key words.

In [7]:
def normalization(text):
    """Normalize text for keyword extraction and similarity scoring.

    The text is lowercased, stripped of every character that is not an ASCII
    letter or whitespace, passed through ``text_cleaner.text_cleaner``, and
    then split on whitespace; stopwords are dropped and the remaining words
    are lemmatized.

    Args:
        text: Input text. ``None`` and NaN (e.g. a missing DataFrame cell) are
            treated as empty text; other non-string values are converted with
            ``str()``.

    Returns:
        The normalized words joined by single spaces ("" for empty input).
    """
    # Missing values: None, or NaN (the only value that is not equal to itself).
    if text is None or text != text:
        return ""
    if not isinstance(text, str):
        text = str(text)

    # `stopwords` can be the NLTK corpus reader imported from nltk.corpus; the
    # reader object is not a container, so `in` cannot be used on it directly.
    # If it already is a plain collection of words, use it as-is.
    stopword_source = stopwords.words('english') if hasattr(stopwords, 'words') else stopwords
    stop_set = set(stopword_source)

    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    text = text_cleaner.text_cleaner(text)
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_set]
    return " ".join(tokens)

# -----------------------------
# Get Top 20 keywords
# -----------------------------
def GetTFIDF(text, top_n=20):
    """Return the highest-scoring TF-IDF terms of a single text.

    The text is normalized with ``normalization`` and vectorized on its own
    (a one-document corpus) with English stop words removed; terms are then
    ranked by their score.

    Args:
        text: Text to extract keywords from.
        top_n: Maximum number of keywords to return. ``None`` returns all.

    Returns:
        A list of up to ``top_n`` terms, highest score first. An empty list is
        returned when the input is missing or blank, is not a string, when
        ``top_n`` is not positive, or when no term survives cleaning and
        stop-word removal.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    if top_n is not None and top_n <= 0:
        return []

    cleaner_text = normalization(text)
    if not cleaner_text.strip():
        # Everything was removed by normalization; there is nothing to rank.
        return []

    vector_stop_words = TfidfVectorizer(stop_words='english')

    try:
        tfidf_matrix = vector_stop_words.fit_transform([cleaner_text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when the document
        # contains only stop words or tokens it discards.
        return []

    feature_names = np.array(vector_stop_words.get_feature_names_out())
    scores = tfidf_matrix.toarray().flatten()

    # argsort is ascending; reverse it to rank the highest scores first.
    top_indices = np.argsort(scores)[::-1][:top_n]
    return feature_names[top_indices].tolist()

This code sets up a Google Gemini API service by loading an API key from Environment Variables and also defines a function that constructs a response based on the Google Gemini model after implementing a retry feature in case of temporary server busy errors, so that it does not fail at the first attempt to respond.

In [8]:
from google import genai
import os
import google.genai.errors
import time
from dotenv import load_dotenv

load_dotenv()

# Read the API key once and fail early with a clear message. Writing a missing
# value back into os.environ would raise an opaque
# "TypeError: str expected, not NoneType" instead.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; define it in the environment or in a .env file."
    )

# Gemini client used by getSolution.
client = genai.Client(api_key=api_key)
# -----------------------------
# Get Solution from the model
# -----------------------------
# We retry because the Google API can fail transiently, so a single failed attempt does not immediately surface as an error
def getSolution(prompt, retires=5):
    """Generate text for ``prompt`` with the Gemini model, retrying on 503s.

    Args:
        prompt: Content sent to the model.
        retires: Maximum number of attempts (the parameter name is kept as-is
            for backward compatibility with existing callers).

    Returns:
        The generated text, or ``None`` if every attempt failed with a
        503 server error.

    Raises:
        google.genai.errors.ServerError: For server errors other than 503,
            which are not retried.
    """
    for attempt in range(retires):
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=prompt
            )
            return response.text
        except google.genai.errors.ServerError as e:
            if "503" not in str(e):
                raise
            if attempt == retires - 1:
                # No attempts left: give up now instead of sleeping through
                # one more back-off period for a retry that never happens.
                break
            wait = 2 ** attempt  # exponential backoff
            print(f"[WARN] Gemini overloaded, retrying in {wait}s...")
            time.sleep(wait)

    print("[ERROR] Gemini API still unavailable after retries.")
    return None




This code defines utilities for generating a resume using a local LLaMA model by sending a prompt to Ollama and converting the model's response into a PDF file. It ensures that the output file has a unique name not to overwrite any existing file, transforms the generated markdown content into HTML, and saves the final formatted result as a PDF in the user's Downloads directory.

In [9]:
def get_unique_filename(directory, base_name, extension):
    """Return a path in ``directory`` that does not exist yet.

    The first candidate is ``<base_name><extension>``; if it is taken,
    ``<base_name> (1)<extension>``, ``<base_name> (2)<extension>``, ... are
    tried in order until a free name is found.

    Args:
        directory: Directory the file will be placed in.
        base_name: File name without extension.
        extension: Extension including the leading dot (e.g. ".pdf").

    Returns:
        The full path of the first candidate that does not exist.
    """
    candidate = os.path.join(directory, f"{base_name}{extension}")
    suffix = 1

    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{base_name} ({suffix}){extension}")
        suffix += 1

    return candidate


def useLlamaModel(prompt):
    """Send ``prompt`` to the local Ollama model and save the reply as a PDF.

    The reply is treated as markdown, converted to HTML with ``markdown2``,
    and written as a single paragraph to a new PDF in the user's Downloads
    directory. An existing file is never overwritten; a free name is chosen
    with ``get_unique_filename``.

    Args:
        prompt: User message sent to the model.

    Returns:
        The path of the generated PDF file.

    NOTE(review): ``Paragraph`` receives the full markdown2 HTML; confirm that
    reportlab's paragraph markup accepts every tag markdown2 can emit
    (headings, lists, ...).
    """
    desiredModel = 'llama3.2:3b'

    response = ollama.chat(
        model=desiredModel,
        messages=[{"role": "user", "content": prompt}]
    )

    # An empty or missing reply is rendered as an empty document.
    ollamaResponse = response['message']['content'] or ""

    # Convert markdown → HTML
    html = markdown2.markdown(ollamaResponse)

    # Downloads folder under the user's home directory. Create it if needed:
    # it is not guaranteed to exist (e.g. a fresh account or a server), and
    # building the PDF into a missing directory would fail.
    downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")
    os.makedirs(downloads_dir, exist_ok=True)

    # Generate unique filename
    pdf_path = get_unique_filename(downloads_dir, "Generated_Resume", ".pdf")

    # Create PDF
    doc = SimpleDocTemplate(pdf_path)
    styles = getSampleStyleSheet()
    story = [Paragraph(html, styles["Normal"])]

    doc.build(story)

    return pdf_path

Below is the Flask app used as the backend of the resume evaluation and job matchmaking website. It lets the user upload a resume in PDF form, extracts the text from it, and uses the NLP-based resume matching algorithms to find appropriate job listings as well as the key words of the resume. The `/match-resume` path processes the submitted resume, matches its contents against the job descriptions in the database, and returns the list of matching job listings to the front end. There is also the home path (`/`), which loads the application interface initialized with empty defaults, and an AI-based endpoint (`/build-resume-ai`) that improves the submitted resume according to the instructions provided by the user.

In [10]:

# Flask application object; the @app.route decorators below register their
# view functions on it.
app = Flask(__name__)


# -----------------------------
# Request to get matches for our CV and find its strengths and weaknesses
# -----------------------------
@app.route("/match-resume", methods=["GET", "POST"])
def index():
    """Match an uploaded resume against the job postings and render the results.

    On a POST that carries a ``resume_pdf`` file, the resume text is extracted
    and scored with ``compare.compute_matches``; the resume keywords and the
    matching jobs (as a list of row dicts) are passed to the template. In
    every other case the page is rendered with empty results.

    Strengths/weaknesses generation through ``gemenAI.getSolution`` is
    currently disabled, so ``strengths_weaknesses_html`` is always empty.
    """
    context = {
        "resume_keywords": [],
        "strengths_weaknesses_html": "",
        "matches": [],
    }

    # Only a POST can carry an upload; a GET never looks at request.files.
    uploaded_resume = request.files.get("resume_pdf") if request.method == "POST" else None

    if uploaded_resume:
        resume_text = compare.extract_text_from_pdf(uploaded_resume)
        keywords, top_matches = compare.compute_matches(resume_text)

        context["resume_keywords"] = keywords
        # One dict per matching job, keyed by column name.
        context["matches"] = top_matches.to_dict(orient="records")

    return render_template("main.html", **context)


@app.route("/", methods=["GET"])
def home():
    """Render the landing page with empty keyword, analysis and match results."""
    empty_results = dict(
        resume_keywords=[],
        strengths_weaknesses_html="",
        matches=[],
    )
    return render_template("main.html", **empty_results)


@app.route("/build-resume-ai", methods=["POST"])
def build_resume_ai():
    """Generate an improved resume from an uploaded PDF and a user instruction.

    Expects a ``ai_resume_pdf`` file and an ``ai_prompt`` form field. When both
    are present, the resume text and the instruction are combined into a
    prompt and passed to ``llama_model.useLlamaModel``; the value it returns is
    reported back to the page through ``ai_result``. When either is missing,
    nothing is generated and ``ai_result`` stays empty.

    The template also receives the same empty ``resume_keywords``,
    ``strengths_weaknesses_html`` and ``matches`` values as the home route, so
    ``main.html`` gets one consistent set of variables from every route.
    """
    ai_result_html = ""

    file = request.files.get("ai_resume_pdf")
    ai_prompt = request.form.get("ai_prompt")

    if file and ai_prompt:
        resume_text = compare.extract_text_from_pdf(file)

        prompt = f"""
        Resume:
        {resume_text}

        Instruction:
        {ai_prompt}

        Please generate an improved professional resume.
        """

        # Keep the result instead of discarding it, so the user learns where
        # the resume went. Presumably this is the path of the generated PDF;
        # verify against llama_model.useLlamaModel.
        pdf_path = llama_model.useLlamaModel(prompt)
        if pdf_path:
            ai_result_html = f"Improved resume saved to: {pdf_path}"

    return render_template(
        "main.html",
        ai_result=ai_result_html,
        resume_keywords=[],
        strengths_weaknesses_html="",
        matches=[]
    )


if __name__ == "__main__":
    # Start the Flask development server when this code is run directly.
    # NOTE(review): debug=True turns on Flask's debug mode; keep it to local
    # development and confirm it is never used for a deployed instance.
    # use_reloader=False disables the auto-reloader (presumably because this
    # runs inside a notebook kernel — TODO confirm).
    app.run(debug=True, use_reloader=False)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [07/Jan/2026 00:16:59] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [07/Jan/2026 00:17:44] "POST /build-resume-ai HTTP/1.1" 200 -
127.0.0.1 - - [07/Jan/2026 00:18:35] "POST /build-resume-ai HTTP/1.1" 200 -
127.0.0.1 - - [07/Jan/2026 00:19:31] "POST /match-resume HTTP/1.1" 200 -
