In [2]:
# Install the Snowflake connector if not already installed:
# !pip install snowflake-connector-python

import snowflake.connector
import pandas as pd
import numpy as np

# Connect to Snowflake using given credentials
conn = snowflake.connector.connect(
    user='MUDIT',
    password='Testing@123123',
    account='BCEMHHI-LB94703',
    warehouse='COMPUTE_WH',
    database='JOB_RECOMMENDATIONS',
    schema='JOB_DATA',
    role='ACCOUNTADMIN'
)
print("Connected to Snowflake successfully.")


Connected to Snowflake successfully.


In [3]:
# Execute a query to retrieve all records from the JOBS table
query = "SELECT * FROM JOBS;"
cur = conn.cursor()
cur.execute(query)

# Fetch all results into a pandas DataFrame
df_jobs = cur.fetch_pandas_all()  # Loads all rows into a DataFrame&#8203;:contentReference[oaicite:1]{index=1}
print(f"Retrieved {len(df_jobs)} job postings.")
df_jobs.head(3)  # display first few rows for verification (optional)


Retrieved 1615940 job postings.


Unnamed: 0,JOB_ID,JOB_TITLE,ROLE,COMPANY,LOCATION,SALARY_RANGE,SKILLS,JOB_DESCRIPTION,WEIGHTED_TEXT,CREATED_AT
0,52617686664739,Network Security Specialist,Security Consultant,Arrow Electronics,Ankara,$64K-$90K,Security consulting Risk assessment Security a...,A Security Consultant is responsible for asses...,Network Security SpecialistNetwork Security Sp...,2025-04-19 04:59:46.787
1,2398176927291103,Market Research Analyst,Market Researcher,Voltas Limited,San Salvador,$57K-$100K,Market research methodologies Data collection ...,A Market Researcher gathers and analyzes marke...,Market Research AnalystMarket Research Analyst...,2025-04-19 04:59:46.787
2,1984315855795341,Social Media Manager,Community Manager,Charles Schwab,Nur-Sultan,$55K-$125K,Community engagement and moderation Online com...,Community Managers build and engage online com...,Social Media ManagerSocial Media ManagerSocial...,2025-04-19 04:59:46.787


In [4]:
import re

# Text cleaning function: lowercase, remove punctuation/newlines, normalize spaces&#8203;:contentReference[oaicite:4]{index=4}
def clean_text(text):
    if not isinstance(text, str):
        text = str(text) if text is not None else ""
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and newlines (anything not alphanumeric or whitespace)
    text = re.sub(r'[^\w\s]', ' ', text)  # replace punctuation with space&#8203;:contentReference[oaicite:5]{index=5}
    text = re.sub(r'\s+', ' ', text)      # collapse multiple spaces/newlines into one&#8203;:contentReference[oaicite:6]{index=6}
    return text.strip()

# Define field weights based on importance&#8203;:contentReference[oaicite:7]{index=7}
FIELD_WEIGHTS = {
    'JOB_TITLE': 3.0,
    'ROLE': 2.5,
    'SKILLS': 2.0,
    'JOB_DESCRIPTION': 1.0,
    'COMPANY': 0.8
}

# Function to combine and weight important text fields for a job posting
def combine_text_fields(job_row):
    """
    Combine text from multiple fields (Job Title, Role, Skills, Job Description, Company)
    with specified weights. Text is cleaned and fields are duplicated according to weight.
    """
    weighted_parts = []
    for field, weight in FIELD_WEIGHTS.items():
        if field in job_row and pd.notnull(job_row[field]):
            text = clean_text(job_row[field])
            # Duplicate the field text int(weight) times
            if weight >= 1:
                weighted_parts.extend([text] * int(weight))
            # Handle fractional part by adding a proportion of the text's words&#8203;:contentReference[oaicite:8]{index=8}
            frac = weight - int(weight)
            if frac > 0 and text:
                words = text.split()
                cutoff = int(len(words) * frac)
                if cutoff > 0:
                    weighted_parts.append(' '.join(words[:cutoff]))
    # Combine all weighted parts into one string
    combined_text = ' '.join(weighted_parts)
    return combined_text.strip()

# Apply the combination to each job posting to create a new combined text column
df_jobs['COMBINED_TEXT'] = df_jobs.apply(combine_text_fields, axis=1)
print("Sample combined text for first job:\n", df_jobs.loc[0, 'COMBINED_TEXT'][:200], "...")


Sample combined text for first job:
 network security specialist network security specialist network security specialist security consultant security consultant security security consulting risk assessment security audits security policy ...


In [5]:
import os
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from scipy.sparse import save_npz, load_npz
from sklearn.metrics.pairwise import cosine_similarity

# --- Text Cleaning Helper ---
def clean_text(text):
    import re
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# --- TF-IDF Training and Caching ---
def build_and_cache_tfidf(jobs_df, cache_dir="tfidf_cache"):
    os.makedirs(cache_dir, exist_ok=True)
    vec_path = os.path.join(cache_dir, "job_vectors.npz")
    model_path = os.path.join(cache_dir, "vectorizer.pkl")

    vectorizer = TfidfVectorizer(
        stop_words='english',
        max_features=3000,
        min_df=3,
        max_df=0.85,
        ngram_range=(1, 1),
        sublinear_tf=True
    )
    job_matrix = vectorizer.fit_transform(jobs_df['COMBINED_TEXT'])

    save_npz(vec_path, job_matrix)
    joblib.dump(vectorizer, model_path)

    return vectorizer, job_matrix

def load_tfidf_cache(cache_dir="tfidf_cache"):
    vec_path = os.path.join(cache_dir, "job_vectors.npz")
    model_path = os.path.join(cache_dir, "vectorizer.pkl")

    if os.path.exists(vec_path) and os.path.exists(model_path):
        vectorizer = joblib.load(model_path)
        job_matrix = load_npz(vec_path)
        return vectorizer, job_matrix
    return None, None

# --- Main Recommender Function ---
def recommend_jobs_tfidf(resume_text, jobs_df, top_n=5):
    """
    Optimized & cached TF-IDF recommendation system.
    - Reuses or builds cached TF-IDF matrix and vectorizer.
    - Computes cosine similarity via normalized dot product.
    Returns top N indices and similarity scores.
    """
    # Step 1: Load or build cached model
    vectorizer, job_tfidf = load_tfidf_cache()
    if vectorizer is None or job_tfidf is None:
        vectorizer, job_tfidf = build_and_cache_tfidf(jobs_df)

    # Step 2: Normalize job matrix (once for cosine sim)
    job_tfidf = normalize(job_tfidf, axis=1)

    # Step 3: Vectorize and normalize resume
    resume_clean = clean_text(resume_text)
    resume_vec = vectorizer.transform([resume_clean])
    resume_vec = normalize(resume_vec, axis=1)

    # Step 4: Fast cosine similarity using dot product
    sim_scores = resume_vec.dot(job_tfidf.T).toarray().flatten()

    # Step 5: Top N results
    top_idx = sim_scores.argsort()[-top_n:][::-1]
    return top_idx, sim_scores


In [15]:
import os
import numpy as np
import joblib
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import torch
from tqdm import tqdm

# --- Clean Text ---
def clean_text(text):
    import re
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# --- Chunked Encoding Function ---
def encode_in_chunks(texts, model, batch_size=256, chunk_size=10000):
    all_embeddings = []
    for i in range(0, len(texts), chunk_size):
        chunk = texts[i:i + chunk_size]
        print(f"Encoding records {i} to {i + len(chunk) - 1}...")
        embeddings = model.encode(
            chunk,
            batch_size=batch_size,
            convert_to_numpy=True,
            show_progress_bar=True
        )
        all_embeddings.append(embeddings)
    return np.vstack(all_embeddings)

# --- Build and Cache SBERT Embeddings ---
def build_and_cache_sbert(jobs_df, cache_dir="sbert_cache"):
    os.makedirs(cache_dir, exist_ok=True)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
    texts = jobs_df['COMBINED_TEXT'].apply(clean_text).tolist()
    embeddings = encode_in_chunks(texts, model)

    np.save(os.path.join(cache_dir, "job_embeddings.npy"), embeddings)
    joblib.dump(model, os.path.join(cache_dir, "sbert_model.pkl"))
    return model, embeddings

# --- Load Cached Embeddings ---
def load_sbert_cache(cache_dir="sbert_cache"):
    emb_path = os.path.join(cache_dir, "job_embeddings.npy")
    model_path = os.path.join(cache_dir, "sbert_model.pkl")

    if os.path.exists(emb_path) and os.path.exists(model_path):
        embeddings = np.load(emb_path)
        model = joblib.load(model_path)
        return model, embeddings
    return None, None

# --- Main Recommender Function ---
def recommend_jobs_sbert(resume_text, jobs_df, top_n=5):
    """
    Optimized SBERT-based job recommender.
    Caches SBERT model + embeddings for faster reuse.
    """
    model, job_embeddings = load_sbert_cache()
    if model is None or job_embeddings is None:
        model, job_embeddings = build_and_cache_sbert(jobs_df)

    resume_clean = clean_text(resume_text)
    resume_vec = model.encode(resume_clean, convert_to_numpy=True)

    sim_scores = cosine_similarity([resume_vec], job_embeddings)[0]
    top_idx = sim_scores.argsort()[-top_n:][::-1]

    return top_idx, sim_scores


In [7]:
from datasketch import MinHash, MinHashLSHForest
import os
import numpy as np
import joblib

# --- Config ---
SHINGLE_K = 5
NUM_PERM = 128
LSH_CACHE_DIR = "minhash_cache"

# --- Helper: Generate k-shingles ---
def get_shingles(text, k=SHINGLE_K):
    text = text.lower()
    return {text[i:i+k] for i in range(len(text) - k + 1)}

# --- Build and cache LSH index ---
def build_and_cache_minhash_lsh(jobs_df):
    forest = MinHashLSHForest(num_perm=NUM_PERM)
    minhashes = []

    for idx, text in enumerate(jobs_df['COMBINED_TEXT']):
        shingles = get_shingles(text)
        m = MinHash(num_perm=NUM_PERM)
        for sh in shingles:
            m.update(sh.encode('utf8'))
        forest.add(str(idx), m)
        minhashes.append(m)

    forest.index()

    os.makedirs(LSH_CACHE_DIR, exist_ok=True)
    joblib.dump(forest, os.path.join(LSH_CACHE_DIR, "lsh_forest.pkl"))
    joblib.dump(minhashes, os.path.join(LSH_CACHE_DIR, "minhashes.pkl"))
    return forest, minhashes

# --- Load LSH Cache ---
def load_lsh_cache():
    try:
        forest = joblib.load(os.path.join(LSH_CACHE_DIR, "lsh_forest.pkl"))
        minhashes = joblib.load(os.path.join(LSH_CACHE_DIR, "minhashes.pkl"))
        return forest, minhashes
    except:
        return None, None

# --- Main Optimized LSH Recommender ---
def recommend_jobs_lsh(resume_text, jobs_df, top_n=10):
    forest, minhashes = load_lsh_cache()

    if forest is None or minhashes is None:
        forest, minhashes = build_and_cache_minhash_lsh(jobs_df)

    resume_shingles = get_shingles(clean_text(resume_text))
    m_resume = MinHash(num_perm=NUM_PERM)
    for sh in resume_shingles:
        m_resume.update(sh.encode('utf8'))

    # Approximate query
    result_ids = forest.query(m_resume, top_n)
    result_indices = [int(rid) for rid in result_ids]

    # Re-rank with actual Jaccard (optional)
    results = []
    for idx in result_indices:
        jaccard = m_resume.jaccard(minhashes[idx])
        results.append((jaccard, idx))

    results.sort(reverse=True)
    top_idx = [idx for _, idx in results]
    scores = [score for score, _ in results]
    return top_idx, scores


In [18]:
# Optimized MinHash-LSH for 1 Million Records with Caching, Parallelization, and tqdm

import os
import re
import numpy as np
import pandas as pd
import joblib
from tqdm import tqdm
from datasketch import MinHash, MinHashLSHForest
from multiprocessing import Pool, cpu_count

# --- Config ---
SHINGLE_K = 5
NUM_PERM = 128
LSH_CACHE_DIR = "minhash_cache"

# --- Text Cleaning ---
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[\n\r]', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# --- Shingle Function ---
def get_shingles(text, k=SHINGLE_K):
    text = clean_text(text)
    return set(text[i:i+k] for i in range(len(text) - k + 1))

# --- MinHash Creation Function ---
def create_minhash(index_text):
    idx, text = index_text
    shingles = get_shingles(text)
    m = MinHash(num_perm=NUM_PERM)
    for sh in shingles:
        m.update(sh.encode('utf8'))
    return idx, m

# --- Build LSH with Parallel Processing ---
def build_and_cache_minhash_lsh_parallel(jobs_df):
    os.makedirs(LSH_CACHE_DIR, exist_ok=True)
    forest = MinHashLSHForest(num_perm=NUM_PERM)

    # Use multiprocessing Pool
    data = list(enumerate(jobs_df['COMBINED_TEXT']))
    with Pool(cpu_count()) as pool:
        minhash_pairs = list(tqdm(pool.imap(create_minhash, data), total=len(data), desc="Generating MinHashes"))

    minhashes = [None] * len(jobs_df)
    for idx, m in minhash_pairs:
        forest.add(str(idx), m)
        minhashes[idx] = m

    forest.index()
    joblib.dump(forest, os.path.join(LSH_CACHE_DIR, "lsh_forest.pkl"))
    joblib.dump(minhashes, os.path.join(LSH_CACHE_DIR, "minhashes.pkl"))
    return forest, minhashes

# --- Load Cached LSH ---
def load_lsh_cache():
    try:
        forest = joblib.load(os.path.join(LSH_CACHE_DIR, "lsh_forest.pkl"))
        minhashes = joblib.load(os.path.join(LSH_CACHE_DIR, "minhashes.pkl"))
        return forest, minhashes
    except:
        return None, None

# --- LSH-Based Recommender ---
def recommend_jobs_lsh(resume_text, jobs_df, top_n=10):
    forest, minhashes = load_lsh_cache()
    if forest is None or minhashes is None:
        forest, minhashes = build_and_cache_minhash_lsh_parallel(jobs_df)

    resume_shingles = get_shingles(resume_text)
    m_resume = MinHash(num_perm=NUM_PERM)
    for sh in resume_shingles:
        m_resume.update(sh.encode('utf8'))

    candidate_ids = forest.query(m_resume, top_n)
    candidate_indices = [int(cid) for cid in candidate_ids]

    scored = []
    for idx in candidate_indices:
        score = m_resume.jaccard(minhashes[idx])
        scored.append((score, idx))

    scored.sort(reverse=True)
    top_idx = [idx for score, idx in scored]
    scores = [score for score, idx in scored]
    return top_idx, scores


In [8]:
import fitz  # PyMuPDF

def extract_resume_text(pdf_path):
    """Extract all text from a PDF resume using PyMuPDF (fitz)."""
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:  # iterate through pages&#8203;:contentReference[oaicite:49]{index=49}
        text += page.get_text()
    doc.close()
    return text

def display_top_jobs(job_indices, scores, jobs_df, top_n=5):
    """Display the top N job recommendations with their titles and similarity scores."""
    print(f"Top {top_n} Recommended Jobs:")
    for rank, idx in enumerate(job_indices, start=1):
        title = jobs_df.loc[idx, 'JOB_TITLE'] if 'JOB_TITLE' in jobs_df.columns else "Job"
        company = jobs_df.loc[idx, 'COMPANY'] if 'COMPANY' in jobs_df.columns else ""
        score = scores[rank-1] if isinstance(scores, (list, tuple, np.ndarray)) else None
        if score is not None:
            print(f"{rank}. {title} at {company} (Score: {score:.3f})")
        else:
            print(f"{rank}. {title} at {company}")


In [9]:
resume_text = extract_resume_text("sampleresume.pdf")


In [10]:
### TF-IDF Test sample
# After loading or creating your df_jobs
resume_text = extract_resume_text("sampleresume.pdf")  # assume this exists

top_indices, similarities = recommend_jobs_tfidf(resume_text, df_jobs, top_n=5)

# Show results
display_top_jobs(top_indices, similarities, df_jobs)


Top 5 Recommended Jobs:
1. Digital Marketing Specialist at Tech Mahindra (Score: 0.028)
2. Digital Marketing Specialist at Tech Mahindra (Score: 0.100)
3. Digital Marketing Specialist at Tech Mahindra (Score: 0.261)
4. Digital Marketing Specialist at Tech Mahindra (Score: 0.333)
5. Digital Marketing Specialist at Tech Mahindra (Score: 0.044)


In [16]:
resume_text = extract_resume_text("sampleresume.pdf")

top_idx, scores = recommend_jobs_sbert(resume_text, df_jobs, top_n=5)

display_top_jobs(top_idx, scores, df_jobs)


Encoding records 0 to 9999...


Batches: 100%|██████████| 40/40 [03:47<00:00,  5.69s/it]


Encoding records 10000 to 19999...


Batches: 100%|██████████| 40/40 [04:01<00:00,  6.03s/it]


Encoding records 20000 to 29999...


Batches: 100%|██████████| 40/40 [03:45<00:00,  5.63s/it]


Encoding records 30000 to 39999...


Batches: 100%|██████████| 40/40 [03:53<00:00,  5.83s/it]


Encoding records 40000 to 49999...


Batches: 100%|██████████| 40/40 [04:25<00:00,  6.64s/it]


Encoding records 50000 to 59999...


Batches: 100%|██████████| 40/40 [04:12<00:00,  6.31s/it]


Encoding records 60000 to 69999...


Batches: 100%|██████████| 40/40 [04:09<00:00,  6.25s/it]


Encoding records 70000 to 79999...


Batches: 100%|██████████| 40/40 [03:53<00:00,  5.84s/it]


Encoding records 80000 to 89999...


Batches: 100%|██████████| 40/40 [03:21<00:00,  5.03s/it]


Encoding records 90000 to 99999...


Batches: 100%|██████████| 40/40 [09:34<00:00, 14.37s/it] 


Encoding records 100000 to 109999...


Batches: 100%|██████████| 40/40 [03:01<00:00,  4.55s/it]


Encoding records 110000 to 119999...


Batches: 100%|██████████| 40/40 [03:37<00:00,  5.43s/it]


Encoding records 120000 to 129999...


Batches: 100%|██████████| 40/40 [18:56<00:00, 28.42s/it]   


Encoding records 130000 to 139999...


Batches: 100%|██████████| 40/40 [02:51<00:00,  4.30s/it]


Encoding records 140000 to 149999...


Batches: 100%|██████████| 40/40 [02:58<00:00,  4.46s/it]


Encoding records 150000 to 159999...


Batches: 100%|██████████| 40/40 [02:57<00:00,  4.45s/it]


Encoding records 160000 to 169999...


Batches: 100%|██████████| 40/40 [02:59<00:00,  4.49s/it]


Encoding records 170000 to 179999...


Batches: 100%|██████████| 40/40 [03:00<00:00,  4.52s/it]


Encoding records 180000 to 189999...


Batches: 100%|██████████| 40/40 [03:08<00:00,  4.70s/it]


Encoding records 190000 to 199999...


Batches: 100%|██████████| 40/40 [02:53<00:00,  4.35s/it]


Encoding records 200000 to 209999...


Batches: 100%|██████████| 40/40 [02:58<00:00,  4.46s/it]


Encoding records 210000 to 219999...


Batches: 100%|██████████| 40/40 [03:21<00:00,  5.05s/it]


Encoding records 220000 to 229999...


Batches: 100%|██████████| 40/40 [02:49<00:00,  4.23s/it]


Encoding records 230000 to 239999...


Batches: 100%|██████████| 40/40 [03:09<00:00,  4.74s/it]


Encoding records 240000 to 249999...


Batches: 100%|██████████| 40/40 [6:50:54<00:00, 616.37s/it]    


Encoding records 250000 to 259999...


Batches:  15%|█▌        | 6/40 [01:12<06:49, 12.04s/it]


KeyboardInterrupt: 

In [None]:
resume_text = extract_resume_text("sampleresume.pdf")
top_idx, scores = recommend_jobs_lsh(resume_text, df_jobs, top_n=5)
display_top_jobs(top_idx, scores, df_jobs)


Generating MinHashes:   0%|          | 0/1615940 [00:00<?, ?it/s]