In [2]:
# Install the Snowflake connector if not already installed:
# !pip install snowflake-connector-python

import snowflake.connector
import pandas as pd
import numpy as np

# Connect to Snowflake using given credentials
conn = snowflake.connector.connect(
    user='MUDIT',
    password='Testing@123123',
    account='BCEMHHI-LB94703',
    warehouse='COMPUTE_WH',
    database='JOB_RECOMMENDATIONS',
    schema='JOB_DATA',
    role='ACCOUNTADMIN'
)
print("Connected to Snowflake successfully.")


Connected to Snowflake successfully.


In [4]:
# Execute a query to retrieve all records from the JOBS table
query = "SELECT * FROM JOB_DESC;"
cur = conn.cursor()
cur.execute(query)

# Fetch all results into a pandas DataFrame
jobs_df = cur.fetch_pandas_all()  # Loads all rows into a DataFrame&#8203;:contentReference[oaicite:1]{index=1}
print(f"Retrieved {len(jobs_df)} job postings.")
jobs_df.head(3)  # display first few rows for verification (optional)


Retrieved 1615940 job postings.


Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,2600342200917599,,,$61K-$106K,"Capitol Hill, Saipan",,,,,,...,,Purchasing Agent,Inventory Manager,,An Inventory Manager oversees inventory levels...,,Inventory control Demand forecasting Supply ch...,,Kyndryl Holdings,
1,1097571695278272,,,$57K-$86K,Banjul,,,,,,...,,Graphic Designer,Web Graphic Designer,,Web Graphic Designers create visually appealin...,,"Graphic design tools (e.g., Adobe Creative Sui...",,Ambuja Cements,
2,393705790719989,,,$60K-$103K,Tashkent,,,,,,...,,Physician Assistant,Surgical Physician Assistant,,"Assist surgeons in the operating room, perform...",,Surgical procedures and techniques Operating r...,,Whitehaven Coal,


In [8]:
# Optimized Resume-to-Job Recommender System

import os
import re
import string
import time
import logging
import warnings
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from scipy.sparse import save_npz, load_npz
import joblib

try:
    import fitz  # PyMuPDF for faster PDF processing
except ImportError:
    raise ImportError("The 'fitz' module (PyMuPDF) is not installed. Install it via 'pip install PyMuPDF'")

warnings.filterwarnings('ignore')

# Logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s', filename='job_recommender_optimized.log')
logger = logging.getLogger()

# --- Utility Functions ---

def extract_resume_text(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        return " ".join([page.get_text() for page in doc])
    except Exception as e:
        logger.error(f"PDF extraction failed: {e}")
        return ""

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub(r'\s+', ' ', text).strip()

def weight_text(text, weight):
    if not isinstance(text, str):
        return ""
    weighted = text * int(weight)
    frac = weight - int(weight)
    if frac:
        words = text.split()
        weighted += " " + " ".join(words[:int(len(words) * frac)])
    return weighted

# --- TF-IDF Caching ---

def build_or_load_tfidf(jobs_df, cache_dir="./cache"):
    os.makedirs(cache_dir, exist_ok=True)
    vec_path, model_path = f"{cache_dir}/job_vectors.npz", f"{cache_dir}/vectorizer.pkl"

    if os.path.exists(vec_path) and os.path.exists(model_path):
        logger.info("Loading cached TF-IDF...")
        return load_npz(vec_path), joblib.load(model_path), jobs_df

    weights = {
        'Job Title': 3.0, 'Role': 2.5, 'skills': 2.0, 'Job Description': 1.0, 'Company': 0.8
    }
    jobs_df['Weighted_Text'] = ""
    for field, weight in weights.items():
        if field in jobs_df.columns:
            jobs_df['Weighted_Text'] += jobs_df[field].fillna('').apply(lambda x: weight_text(str(x), weight) + " ")

    jobs_df['Weighted_Text_Clean'] = jobs_df['Weighted_Text'].apply(clean_text)
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, min_df=2, max_df=0.85, ngram_range=(1, 2), sublinear_tf=True)
    job_vectors = vectorizer.fit_transform(jobs_df['Weighted_Text_Clean'])

    save_npz(vec_path, job_vectors)
    joblib.dump(vectorizer, model_path)
    return job_vectors, vectorizer, jobs_df

# --- Similarity Matching ---

def match_resume(resume_text, job_vectors, vectorizer, jobs_df, top_n=10):
    resume_clean = clean_text(resume_text)
    resume_vector = vectorizer.transform([resume_clean])

    similarities = cosine_similarity(resume_vector, job_vectors)[0]
    jobs_df = jobs_df.copy()
    jobs_df['similarity'] = similarities
    top_jobs = jobs_df.sort_values('similarity', ascending=False).head(top_n)
    return top_jobs

# --- Main Recommender ---

def run_job_recommender(resume_path, jobs_df, top_n=10):
    start = time.time()
    logger.info("Loading data...")
    resume_text = extract_resume_text(resume_path)
    if not resume_text:
        logger.warning("Resume extraction failed.")
        return pd.DataFrame({'Message': ['Resume extraction failed']})

    job_vectors, vectorizer, processed_df = build_or_load_tfidf(jobs_df)
    results = match_resume(resume_text, job_vectors, vectorizer, processed_df, top_n)

    logger.info(f"Total execution time: {time.time() - start:.2f}s")
    return results

# --- Example Execution ---
if __name__ == "__main__":
    resume_path = 'sampleresume.pdf'
    top_matches = run_job_recommender(resume_path, jobs_df, top_n=5)
    print(top_matches[['Job Title', 'Company', 'Role', 'location', 'similarity']])


                           Job Title                 Company  \
299643  Digital Marketing Specialist  Advanced Micro Devices   
973676  Digital Marketing Specialist  Advanced Micro Devices   
43516   Digital Marketing Specialist  Advanced Micro Devices   
47202   Digital Marketing Specialist  Advanced Micro Devices   
977761  Digital Marketing Specialist  Advanced Micro Devices   

                        Role          location  similarity  
299643  Social Media Manager             Amman    0.403706  
973676  Social Media Manager  City of Victoria    0.403706  
43516   Social Media Manager            Bissau    0.403706  
47202   Social Media Manager             Macao    0.403706  
977761  Social Media Manager            Zagreb    0.403706  
