In [4]:
!pip install -U sentence-transformers
!pip install rank_bm25




[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2



[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# **Library Setup**

In [23]:
import pandas as pd
import numpy as np
import json
import re
import os
import warnings
import random
from datetime import datetime

# TEXT PROCESSING & SCRAPING 
import requests
from bs4 import BeautifulSoup

# MACHINE LEARNING & DENSE RETRIEVAL
import torch
from sentence_transformers import SentenceTransformer, util

# SPARSE RETRIEVAL
from rank_bm25 import BM25Okapi

# CONFIGURATION 
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 50) 
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print("All Libraries (Standard + Retrieval) Imported Successfully!")

All Libraries (Standard + Retrieval) Imported Successfully!


# **Data Load and Preprocessing**

In [24]:
# Data_Load
try:
    df = pd.read_json('Realtime_Jobs_Data.json')
    
    print(f"Total Rows: {df.shape[0]}, Total Columns: {df.shape[1]}")

    # ‡¶∏‡¶¨ ‡¶ï‡¶≤‡¶æ‡¶Æ‡ßá‡¶∞ ‡¶®‡¶æ‡¶Æ ‡¶™‡ßç‡¶∞‡¶ø‡¶®‡ßç‡¶ü ‡¶ï‡¶∞‡¶æ
    print("\nExisting Columns:")
    print(df.columns.tolist())

    # ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡ß©‡¶ü‡¶ø ‡¶∞‡ßã ‡¶¶‡ßá‡¶ñ‡¶æ
    print("\nData Preview:")
    display(df.head(3)) # ‡¶ú‡ßÅ‡¶™‡¶ø‡¶ü‡¶æ‡¶∞‡ßá print ‡¶è‡¶∞ ‡¶ö‡ßá‡ßü‡ßá display() ‡¶≠‡¶æ‡¶≤‡ßã ‡¶¶‡ßá‡¶ñ‡¶æ‡ßü

except ValueError:
    print("‚ùå Error: 'Realtime_Jobs_Data.json' ‡¶´‡¶æ‡¶á‡¶≤‡¶ü‡¶ø ‡¶™‡¶æ‡¶ì‡ßü‡¶æ ‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡ßá ‡¶®‡¶æ‡•§")

# Mapping all columns
rename_map = {
    'JobId': 'job_id', 'CompnayName': 'company_name', 'CompanyID': 'company_id', 'CategoryID': 'category_id',
    'JobTitle': 'job_title', 'JobDescription': 'job_description', 'JobNature': 'job_type',
    'JobWorkPlace': 'work_place', 'PostedOn': 'posted_on', 'Deadline': 'deadline',
    'JobVacancies': 'vacancies', 'JobLocation': 'raw_location', 'EducationRequirements': 'raw_education',
    'SkillsRequired': 'skills', 'SuggestedSkills': 'suggested_skills', 'experience': 'experience',
    'Age': 'age', 'Gender': 'gender', 'JobSalaryMinSalary': 'min_salary', 'JobSalaryMaxSalary': 'max_salary',
    'JobSalaryRange': 'salary_range', 'ApplyURL': 'apply_url', 'ApplyEmail': 'apply_email',
    'ApplyInstruction': 'apply_instruction', 'CompanyWeb': 'company_web', 'CompanyAddress': 'company_address',
    'CompanyBusiness': 'company_business', 'JobOtherBenifits': 'job_benefits', 'JobSource': 'job_source',
    'OnlineApply': 'online_apply'
}

# ‡¶®‡¶æ‡¶Æ ‡¶™‡¶∞‡¶ø‡¶¨‡¶∞‡ßç‡¶§‡¶® ‡¶ï‡¶∞‡¶æ (‡¶ï‡¶ø‡¶®‡ßç‡¶§‡ßÅ ‡¶∏‡¶¨ ‡¶ï‡¶≤‡¶æ‡¶Æ ‡¶∞‡¶æ‡¶ñ‡¶æ)
df = df.rename(columns=rename_map)

# ID ‡¶ó‡ßÅ‡¶≤‡ßã‡¶ï‡ßá ‡¶∏‡ßç‡¶ü‡ßç‡¶∞‡¶ø‡¶Ç ‡¶¨‡¶æ‡¶®‡¶æ‡¶®‡ßã (‡¶∏‡¶æ‡¶Ø‡¶º‡ßá‡¶®‡ßç‡¶ü‡¶ø‡¶´‡¶ø‡¶ï ‡¶®‡ßã‡¶ü‡ßá‡¶∂‡¶® ‡¶è‡¶°‡¶º‡¶æ‡¶®‡ßã‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø)
for col in ['job_id', 'company_id', 'category_id']:
    if col in df.columns:
        df[col] = df[col].astype(str)

print("Column names standardized!")

#Cleaning HTML contents
def clean_html_text(text):
    if not isinstance(text, str): return text
    if pd.isna(text) or text == "": return None
    try:
        soup = BeautifulSoup(text, "html.parser")
        text = soup.get_text(separator=" ")
    except: pass
    return re.sub(r'\s+', ' ', text).strip()

# ‡¶Ø‡ßá‡¶∏‡¶¨ ‡¶ï‡¶≤‡¶æ‡¶Æ‡ßá HTML ‡¶ü‡ßç‡¶Ø‡¶æ‡¶ó ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá
text_cols = ['job_description', 'job_benefits', 'apply_instruction', 'raw_education', 'skills', 'suggested_skills', 'company_business']

print("Cleaning HTML tags...")

for col in text_cols:
    if col in df.columns:
        df[col] = df[col].apply(clean_html_text)

# ‡¶ó‡¶æ‡¶∞‡¶¨‡ßá‡¶ú ‡¶≠‡ßç‡¶Ø‡¶æ‡¶≤‡ßÅ ‡¶®‡¶æ‡¶≤ (NaN) ‡¶ï‡¶∞‡ßá ‡¶¶‡ßá‡¶ì‡ßü‡¶æ
missing_indicators = ["--", "N/A", "n/a", "Not Applicable", "[]", "", " ", "0000-00-00", "Any"]
df.replace(missing_indicators, np.nan, inplace=True)

print("HTML Cleaning Done!")

# Feature Extraction (Age, Loc, Edu)
# Helper Functions
def extract_range(text):
    """Age ‡¶¨‡¶æ Experience ‡¶•‡ßá‡¶ï‡ßá min/max ‡¶¨‡ßá‡¶∞ ‡¶ï‡¶∞‡ßá"""
    if pd.isna(text): return None, None
    numbers = re.findall(r'\d+', str(text))
    if not numbers: return None, None
    nums = [int(n) for n in numbers]
    if len(nums) >= 2: return nums[0], nums[1]
    elif len(nums) == 1: return nums[0], None
    return None, None

def extract_loc_edu(row):
    """Location & Education Logic"""
    raw_loc = str(row.get('raw_location', '')).strip()
    raw_edu = str(row.get('raw_education', '')).lower().strip()
    
    # Location
    dist, fine = raw_loc, "none"
    if "anywhere in bangladesh" in raw_loc.lower():
        dist = "Anywhere in Bangladesh"
    elif "(" in raw_loc:
        parts = re.findall(r"([^(]+)\s*\(([^)]+)\)", raw_loc)
        if parts: dist, fine = parts[0][0].strip(), parts[0][1].strip()

    # Education
    level, subject = "Any", "Any"
    if any(x in raw_edu for x in ['cse', 'computer science', 'it', 'software']):
        level, subject = "Bachelor", "Computer Science"
    elif any(x in raw_edu for x in ['bba', 'mba', 'business']):
        level, subject = "Bachelor", "Business Administration"
    elif 'diploma' in raw_edu:
        level, subject = "Diploma", "Engineering"
    
    return pd.Series([dist, fine, level, subject])

# --- Applying Logic ---
print("Generating new columns...")

# 1. Location & Education
df[['district', 'fine_grained_location', 'education_level', 'education_subject']] = df.apply(extract_loc_edu, axis=1)

# 2. Age
if 'age' in df.columns:
    age_data = df['age'].apply(extract_range)
    df['min_age'] = age_data.apply(lambda x: x[0])
    df['max_age'] = age_data.apply(lambda x: x[1])

# 3. Experience
if 'experience' in df.columns:
    exp_data = df['experience'].apply(extract_range)
    df['min_experience'] = exp_data.apply(lambda x: x[0])
    df['max_experience'] = exp_data.apply(lambda x: x[1])

# 4. Salary & Vacancies Fix
if 'vacancies' in df.columns:
    df['vacancies'] = pd.to_numeric(df['vacancies'], errors='coerce')
for col in ['min_salary', 'max_salary']:
    if col in df.columns: df[col] = df[col].replace(0, np.nan)

print("New Features Created: min_age, district, education_level, etc.")

import pandas as pd

# ==========================================
# 1. FIXED CATEGORY MAPPING (Based on Analysis)
# ==========================================
# ‡¶Ü‡¶Æ‡¶∞‡¶æ ‡¶°‡ßá‡¶ü‡¶æ ‡¶è‡¶®‡¶æ‡¶≤‡¶æ‡¶á‡¶∏‡¶ø‡¶∏ ‡¶ï‡¶∞‡ßá ‡¶è‡¶á ‡¶Æ‡ßç‡¶Ø‡¶æ‡¶™‡¶ü‡¶ø ‡¶§‡ßà‡¶∞‡¶ø ‡¶ï‡¶∞‡ßá‡¶õ‡¶ø
fixed_category_map = {
    # --- Functional Categories ---
    '1': 'Accounting/Finance',
    '2': 'Bank/Non-Bank Fin. Inst.',
    '3': 'Supply Chain/Procurement',
    '4': 'Education/Training',
    '5': 'Engineer/Architects',
    '6': 'Garments/Textile',
    '7': 'Gen Mgt/Admin',
    '8': 'IT & Telecommunication',
    '9': 'Marketing/Sales',
    '10': 'Digital Marketing/SEO',
    '11': 'Medical/Pharma',
    '12': 'NGO/Development',
    '13': 'Research/Consultancy',
    '14': 'Receptionist/Front Desk',
    '15': 'Data Entry/Operator',
    '16': 'Customer Support/Call Center',
    '17': 'HR/Org. Development',
    '18': 'Design/Creative',
    '19': 'Production/Operation',
    '20': 'Immigration/Visa Consultant',
    '22': 'Law/Legal',
    '24': 'Security/Support Service',
    '26': 'Agro (Plant/Animal/Fisheries)',
    '27': 'Commercial/Logistics',
    '28': 'Secretariat/Media',
    '29': 'Pharma/Medical Promo',
    
    # --- Blue Collar / Skilled Trade ---
    '61': 'Computer Operator',
    '62': 'Electrician/Technician',
    '63': 'Nurse/Patient Care',
    '64': 'Hotel/Restaurant/Chef',
    '65': 'Lab/Radiographer',
    '66': 'Electronics/Technician',
    '67': 'Driver/Motor Technician',
    '68': 'Chef/Cook',
    '69': 'Housekeeping/Domestic',
    '70': 'Security Guard',
    '71': 'Graphic Design/Video Editor',
    '72': 'Welder/Technical',
    '74': 'Garments Operator',
    '75': 'Labour/Helper',
    '76': 'CAD/Draftsman',
    '77': 'Delivery Man',
    '78': 'Machine Operator',
    '79': 'Peon/Messenger',
    '80': 'Cleaner/Support Staff',
    '81': 'Gardener/Mali',
    '82': 'Carpenter',
    '83': 'Salesman/Salesgirl',
    '84': 'Sales Representative (SR)',
    '85': 'Religious/Imam/Moazzin',
    '86': 'Sports/Fitness Trainer',
    '87': 'Interpreter/Translator',
    '88': 'Beautician',
    '89': 'Fire/Safety',
    '90': 'Boiler Operator',
    '91': 'Caregiver/Nanny',
    '92': 'Physiotherapist/Therapist',
    
    # --- Catch-All ---
    '0': 'General/Others',
    '-10': 'Logistics/Transport'
}

print("‚úÖ Fixed Category Map Loaded.")

# ==========================================
# 2. APPLY MAPPING TO DATAFRAME
# ==========================================
# ‡¶Æ‡ßç‡¶Ø‡¶æ‡¶™ ‡¶Ö‡¶®‡ßÅ‡¶Ø‡¶æ‡ßü‡ßÄ ‡¶®‡¶æ‡¶Æ ‡¶¨‡¶∏‡¶æ‡¶®‡ßã (‡¶Ø‡¶¶‡¶ø ‡¶ï‡ßã‡¶®‡ßã ‡¶Ü‡¶á‡¶°‡¶ø ‡¶Æ‡ßç‡¶Ø‡¶æ‡¶™‡ßá ‡¶®‡¶æ ‡¶•‡¶æ‡¶ï‡ßá, 'Other' ‡¶¨‡¶∏‡¶¨‡ßá)
df['category_name'] = df['category_id'].astype(str).map(fixed_category_map).fillna('Other Category')

print("‚úÖ Category Names Applied Successfully!")

# ==========================================
# 3. VERIFY RESULT
# ==========================================
# ‡¶∏‡ßç‡¶Ø‡¶æ‡¶Æ‡ßç‡¶™‡¶≤ ‡¶ö‡ßá‡¶ï ‡¶ï‡¶∞‡¶æ
cols_to_view = ['category_id', 'category_name', 'job_title']
print("\nSample Data with Fixed Categories:")
print(df[cols_to_view].drop_duplicates('category_id').head(10))

# ==========================================
# CLEANING GARBAGE DATA (Bangla Jobs)
# ==========================================
print(f"Original Row Count: {len(df)}")

# ‡ßß. 'Bangla Job' ‡¶≤‡ßá‡¶ñ‡¶æ ‡¶Ü‡¶õ‡ßá ‡¶è‡¶Æ‡¶® ‡¶∏‡¶¨ ‡¶∞‡ßã ‡¶ñ‡ßÅ‡¶Å‡¶ú‡ßá ‡¶¨‡ßá‡¶∞ ‡¶ï‡¶∞‡¶æ
garbage_indices = df[df['job_title'].str.contains("Bangla Job", case=False, na=False)].index

# ‡ß®. ‡¶è‡¶ó‡ßÅ‡¶≤‡ßã ‡¶°‡ßç‡¶∞‡¶™ ‡¶ï‡¶∞‡ßá ‡¶¶‡ßá‡¶ì‡ßü‡¶æ
df_clean = df.drop(garbage_indices)

# ‡¶Æ‡ßá‡¶á‡¶® ‡¶°‡ßá‡¶ü‡¶æ‡¶´‡ßç‡¶∞‡ßá‡¶Æ‡ßá ‡¶Ü‡¶™‡¶°‡ßá‡¶ü ‡¶ï‡¶∞‡¶æ
df = df_clean.copy()

print(f"Removed {len(garbage_indices)} 'Bangla Job' rows.")
print(f"Clean Row Count: {len(df)}")

# ‡¶ö‡ßá‡¶ï ‡¶ï‡¶∞‡ßá ‡¶¶‡ßá‡¶ñ‡¶æ ‡¶Ø‡ßá ‡¶Ü‡¶∞ ‡¶Ü‡¶õ‡ßá ‡¶ï‡¶ø‡¶®‡¶æ
remaining_garbage = df[df['job_title'].str.contains("Bangla Job", case=False, na=False)]
if remaining_garbage.empty:
    print("‚úÖ All 'Bangla Job' garbage successfully removed!")
else:
    print("‚ö†Ô∏è Still some garbage left!")

import pandas as pd

# Final Features Selection
# ==========================================
# 0. DISPLAY SETTINGS (Safety First)
# ==========================================
# ‡¶ü‡ßá‡¶¨‡¶ø‡¶≤ ‡¶Ø‡¶æ‡¶§‡ßá ‡¶≠‡ßá‡¶ô‡ßá ‡¶®‡¶æ ‡¶Ø‡¶æ‡ßü, ‡¶§‡¶æ‡¶á ‡¶Ü‡¶ó‡ßá ‡¶°‡¶ø‡¶∏‡¶™‡ßç‡¶≤‡ßá ‡¶∏‡ßá‡¶ü ‡¶ï‡¶∞‡ßá ‡¶®‡¶ø‡¶ö‡ßç‡¶õ‡¶ø
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# ==========================================
# 1. CUSTOM ID GENERATION
# ==========================================
print("üõ†Ô∏è Generating Custom IDs...")

# Job ID
df['job_id'] = [f"JOB-{i+1:05d}" for i in range(len(df))]

# Company ID
unique_companies = df['company_name'].unique()
company_id_map = {name: f"COM-{5000+i}" for i, name in enumerate(unique_companies)}
df['company_id'] = df['company_name'].map(company_id_map)

print("‚úÖ Custom Job & Company IDs Created.")


# ==========================================
# 2. FINAL COLUMN SELECTION
# ==========================================
# ‡¶è‡¶á ‡¶≤‡¶ø‡¶∏‡ßç‡¶ü‡ßá category_name ‡¶Ø‡ßã‡¶ó ‡¶ï‡¶∞‡¶æ ‡¶π‡ßü‡ßá‡¶õ‡ßá ‡¶†‡¶ø‡¶ï category_id ‡¶è‡¶∞ ‡¶™‡¶∞‡ßá
target_columns = [
    # IDs & Category
    'job_id', 'company_id', 'category_id', 'category_name', 
    
    # Company Info
    'company_name', 'company_web', 'company_address', 'company_business',
    
    # Core Job Info
    'job_title', 'job_description', 'job_type', 'work_place',
    'job_benefits', 'job_source',
    
    # Dates & Vacancy
    'posted_on', 'deadline', 'vacancies',
    
    # Skills & Gender
    'skills', 'suggested_skills', 'gender',
    
    # Application Details
    'apply_url', 'apply_email', 'apply_instruction', 'online_apply',
    
    # Salary
    'min_salary', 'max_salary',
    
    # Processed Location
    'district', 'fine_grained_location',
    
    # Processed Education
    'education_level', 'education_subject',
    
    # Processed Age & Experience
    'min_age', 'max_age', 
    'min_experience', 'max_experience'
]

# ‡¶∏‡ßá‡¶á‡¶´‡¶≤‡¶ø ‡¶ï‡¶≤‡¶æ‡¶Æ ‡¶∏‡¶ø‡¶≤‡ßá‡¶ï‡ßç‡¶ü ‡¶ï‡¶∞‡¶æ (‡¶Ø‡¶¶‡¶ø category_name ‡¶Ü‡¶ó‡ßá‡¶∞ ‡¶∏‡ßç‡¶ü‡ßá‡¶™‡ßá ‡¶§‡ßà‡¶∞‡¶ø ‡¶®‡¶æ ‡¶π‡ßü‡ßá ‡¶•‡¶æ‡¶ï‡ßá, ‡¶§‡¶¨‡ßá ‡¶è‡¶∞‡¶∞ ‡¶¶‡¶ø‡¶¨‡ßá ‡¶®‡¶æ)
existing_cols = [c for c in target_columns if c in df.columns]
df_final = df[existing_cols].copy()

# ==========================================
# 3. FINAL OUTPUT PREVIEW
# ==========================================
print("\n" + "="*40)
print("üéâ FINAL DATASET READY")
print("="*40)
print(f"Total Columns: {len(df_final.columns)}")
print(f"Total Rows: {len(df_final)}")

# ‡¶Ü‡¶Æ‡¶∞‡¶æ ‡¶è‡¶ñ‡¶® ‡¶∏‡ßç‡¶™‡ßá‡¶∏‡¶ø‡¶´‡¶ø‡¶ï ‡¶ï‡¶≤‡¶æ‡¶Æ‡¶ó‡ßÅ‡¶≤‡ßã ‡¶¶‡ßá‡¶ñ‡¶¨‡ßã ‡¶Ø‡¶æ‡¶§‡ßá ‡¶®‡¶ø‡¶∂‡ßç‡¶ö‡¶ø‡¶§ ‡¶π‡¶ì‡ßü‡¶æ ‡¶Ø‡¶æ‡ßü ‡¶Ö‡¶∞‡ßç‡¶°‡¶æ‡¶∞ ‡¶†‡¶ø‡¶ï ‡¶Ü‡¶õ‡ßá
display_cols = ['job_id', 'category_id', 'category_name', 'job_title', 'company_name']
print("\nSample Data (Checking Category Order):")
# ‡¶ú‡ßÅ‡¶™‡¶ø‡¶ü‡¶æ‡¶∞‡ßá display() ‡¶´‡¶æ‡¶Ç‡¶∂‡¶® ‡¶ü‡ßá‡¶¨‡¶ø‡¶≤ ‡¶∏‡ßÅ‡¶®‡ßç‡¶¶‡¶∞ ‡¶¶‡ßá‡¶ñ‡¶æ‡ßü
try:
    display(df_final[display_cols].head(5))
except:
    print(df_final[display_cols].head(5))

df_final.head(10)

Total Rows: 13597, Total Columns: 70

Existing Columns:
['test', 'JobId', 'JobFound', 'error', 'CompnayName', 'JobTitle', 'PostedOn', 'Deadline', 'DeadlineDB', 'JobVacancies', 'JobDescription', 'JobNature', 'JobWorkPlace', 'EducationRequirements', 'SkillsRequired', 'SuggestedSkills', 'Publications', 'Age', 'experience', 'Gender', 'AdditionJobRequirements', 'JobLocation', 'OnlineApply', 'CompanyBusiness', 'CompanyAddress', 'CompanyHideAddress', 'CompanyWeb', 'JobAppliedEmail', 'JobSource', 'JobOtherBenifits', 'RecruitmentProcessingInformation', 'RecruitingCompanysProfile', 'JobSalaryRange', 'JobSalaryRangeText', 'JobSalaryMinSalary', 'JobSalaryMaxSalary', 'ShowSalary', 'overseasnote', 'JobAdType', 'JobLOgoName', 'JobKeyPoints', 'ApplyInstruction', 'ApplyEmail', 'HardCopy', 'WalkInInterview', 'ApplyURL', 'Photograph', 'PhotographMsg', 'JObIMage', 'upcoming', 'upcomingln', 'CompanyOtherJ0bs', 'CompanyID', 'CompanyNameENG', 'AssessmentRequired', 'Context', 'RLNO', 'PreferVideoResume', 'Att

Unnamed: 0,test,JobId,JobFound,error,CompnayName,JobTitle,PostedOn,Deadline,DeadlineDB,JobVacancies,JobDescription,JobNature,JobWorkPlace,EducationRequirements,SkillsRequired,SuggestedSkills,Publications,Age,experience,Gender,AdditionJobRequirements,JobLocation,OnlineApply,CompanyBusiness,CompanyAddress,CompanyHideAddress,CompanyWeb,JobAppliedEmail,JobSource,JobOtherBenifits,RecruitmentProcessingInformation,RecruitingCompanysProfile,JobSalaryRange,JobSalaryRangeText,JobSalaryMinSalary,JobSalaryMaxSalary,ShowSalary,overseasnote,JobAdType,JobLOgoName,JobKeyPoints,ApplyInstruction,ApplyEmail,HardCopy,WalkInInterview,ApplyURL,Photograph,PhotographMsg,JObIMage,upcoming,upcomingln,CompanyOtherJ0bs,CompanyID,CompanyNameENG,AssessmentRequired,Context,RLNO,PreferVideoResume,AttachedResume,bottomAlertMsg,ProUser,ApplicantMatchingScore,CategoryID,C2C,MobileNo,CONFIDENTIAL,NewspaperJob,Closed,ApplyRedirectUrl,AccessibilityAware
0,1,1436370,True,0,RK Supply Ltd.,Data Analyst / Accounts Officer,"Dec 6, 2025","Dec 31, 2025",12/31/2025 00:00:00,6,"<p><strong><span style=""color:rgb(51, 51, 51);...",Full Time,Work at office,<ul><ul><li>Bachelor of Business Administratio...,"Accounting,Accounting Data Entry,Canva Pro,Dat...","Accounting Data Entry,Accounting Software,Acco...",0,20 to 35 years,<ul><li>At least 1 year</li><li>The applicants...,"M,F",<ul><li>Age 20 to 35 years</li></ul>,Dhaka (Banani),True,,,True,,,,,,,Tk. 20000 - 35000 (Monthly),,20000,35000,1,,1,,"<p><strong><span style=""color:rgb(51, 51, 51)""...",Selected candidates for this role will be trai...,,,,,-,-,,,,0,109093,RK Supply Ltd.,No,,,0,0,‡¶¨‡¶ø‡¶°‡¶ø‡¶ú‡¶¨‡¶∏-‡¶è ‡¶™‡ßç‡¶∞‡¶ï‡¶æ‡¶∂‡¶ø‡¶§ ‡¶Ø‡ßá‡¶ï‡ßã‡¶®‡ßã ‡¶ö‡¶æ‡¶ï‡¶∞‡¶ø ‡¶∏‡¶Ç‡¶ï‡ßç‡¶∞‡¶æ‡¶®‡ßç‡¶§ ‡¶§‡¶•‡ßç‡¶Ø...,0,0,8,0,,,False,0,//mybdjobs.bdjobs.com/mybdjobs/signin.asp?c7`6...,[]
1,1,1437042,True,0,Pridesys IT Limited,Processor/Senior Processor,"Dec 6, 2025","Jan 5, 2026",01/05/2026 00:00:00,2,<ul><li>Review property preservation work orde...,Full Time,Work at office,<ul><ul><li>Bachelor/Honors</li></ul></ul>,"Flexibility and Adaptability,Google Sheets,MS ...","Client Service,Property Management,Property Pr...",0,20 to 35 years,<ul><li>At most 2 years</li><li>Freshers are a...,M,<ul><li>Age 20 to 35 years</li><li>Only Male</...,Dhaka (Kawran Bazar),True,Pridesys IT Ltd. own developed ERP product for...,"Level-11, Vision 2021 Tower-1, Software Techno...",False,,,,<ul><li>Salary Review: Yearly</li><li>Festival...,,,Negotiable,,0,0,1,,1,https://corporate.bdjobs.com/logos/38072_0.png,,,,,,,-,-,,,,0,38072,Pridesys IT Limited,No,,,0,0,‡¶¨‡¶ø‡¶°‡¶ø‡¶ú‡¶¨‡¶∏-‡¶è ‡¶™‡ßç‡¶∞‡¶ï‡¶æ‡¶∂‡¶ø‡¶§ ‡¶Ø‡ßá‡¶ï‡ßã‡¶®‡ßã ‡¶ö‡¶æ‡¶ï‡¶∞‡¶ø ‡¶∏‡¶Ç‡¶ï‡ßç‡¶∞‡¶æ‡¶®‡ßç‡¶§ ‡¶§‡¶•‡ßç‡¶Ø...,0,0,8,0,,,False,0,//mybdjobs.bdjobs.com/mybdjobs/signin.asp?a4]7...,[]
2,1,1437039,True,0,Aalok Healthcare & Hospital,Assistant Manager (Customer Care / Admin),"Dec 6, 2025","Dec 26, 2025",12/26/2025 00:00:00,4,<h3>Shift Management and Operational Leadershi...,Full Time,Work at office,<ul><ul><li>Masters</li></ul></ul>,"Computer Literacy,Customer Service,Health/ Med...","Administration,Complaint Management,Hospital B...",0,Na,<ul><li>3 to 5 years</li><li>The applicants sh...,M,<ul><li>Only Male</li></ul> <ul><li><p>3 to 5 ...,Dhaka (Mirpur),True,,"Corporate Office: House# 3, Road # 4, Block # ...",False,https://www.aalokhealthcare.com/,,,<ul><li>Provident fund</li><li>Salary Review: ...,,,--,,0,0,0,,1,https://corporate.bdjobs.com/logos/92299_3.png,,<p>Read before Apply and email us mention as E...,Send your CV to the given email hr.aalok@gmail...,,,,-,-,,,,4,92299,Aalok Healthcare & Hospital,No,,,1,0,‡¶¨‡¶ø‡¶°‡¶ø‡¶ú‡¶¨‡¶∏-‡¶è ‡¶™‡ßç‡¶∞‡¶ï‡¶æ‡¶∂‡¶ø‡¶§ ‡¶Ø‡ßá‡¶ï‡ßã‡¶®‡ßã ‡¶ö‡¶æ‡¶ï‡¶∞‡¶ø ‡¶∏‡¶Ç‡¶ï‡ßç‡¶∞‡¶æ‡¶®‡ßç‡¶§ ‡¶§‡¶•‡ßç‡¶Ø...,0,0,11,0,,,False,0,//mybdjobs.bdjobs.com/mybdjobs/signin.asp?3_7a...,[{'Name': 'Do you have Disability Inclusion Po...


Column names standardized!
Cleaning HTML tags...
HTML Cleaning Done!
Generating new columns...
New Features Created: min_age, district, education_level, etc.
‚úÖ Fixed Category Map Loaded.
‚úÖ Category Names Applied Successfully!

Sample Data with Fixed Categories:
   category_id                  category_name                                          job_title
0            8         IT & Telecommunication                    Data Analyst / Accounts Officer
2           11                 Medical/Pharma          Assistant Manager (Customer Care / Admin)
3           84      Sales Representative (SR)                          Sales Representative (SR)
4            9                Marketing/Sales  Sales Hero (Sales Executive - Freshers) RANGS ...
5           17            HR/Org. Development                                HR Manager - Female
6            6               Garments/Textile  Senior Executive - Dyeing (Textile Division, G...
8           19           Production/Operation          

Unnamed: 0,job_id,category_id,category_name,job_title,company_name
0,JOB-00001,8,IT & Telecommunication,Data Analyst / Accounts Officer,RK Supply Ltd.
1,JOB-00002,8,IT & Telecommunication,Processor/Senior Processor,Pridesys IT Limited
2,JOB-00003,11,Medical/Pharma,Assistant Manager (Customer Care / Admin),Aalok Healthcare & Hospital
3,JOB-00004,84,Sales Representative (SR),Sales Representative (SR),Eureka food and beverage
4,JOB-00005,9,Marketing/Sales,Sales Hero (Sales Executive - Freshers) RANGS ...,Rancon Holdings Limited


Unnamed: 0,job_id,company_id,category_id,category_name,company_name,company_web,company_address,company_business,job_title,job_description,job_type,work_place,job_benefits,job_source,posted_on,deadline,vacancies,skills,suggested_skills,gender,apply_url,apply_email,apply_instruction,online_apply,min_salary,max_salary,district,fine_grained_location,education_level,education_subject,min_age,max_age,min_experience,max_experience
0,JOB-00001,COM-5000,8,IT & Telecommunication,RK Supply Ltd.,,,,Data Analyst / Accounts Officer,"RK Supply Ltd. (Company No. 09740433), establi...",Full Time,Work at office,,,"Dec 6, 2025","Dec 31, 2025",6.0,"Accounting,Accounting Data Entry,Canva Pro,Dat...","Accounting Data Entry,Accounting Software,Acco...","M,F",,,Selected candidates for this role will be trai...,True,20000.0,35000.0,Dhaka,Banani,Bachelor,Computer Science,20.0,35.0,1.0,2.0
1,JOB-00002,COM-5001,8,IT & Telecommunication,Pridesys IT Limited,,"Level-11, Vision 2021 Tower-1, Software Techno...",Pridesys IT Ltd. own developed ERP product for...,Processor/Senior Processor,Review property preservation work orders submi...,Full Time,Work at office,Salary Review: Yearly Festival Bonus: 2 Dinner...,,"Dec 6, 2025","Jan 5, 2026",2.0,"Flexibility and Adaptability,Google Sheets,MS ...","Client Service,Property Management,Property Pr...",M,,,,True,,,Dhaka,Kawran Bazar,Any,Any,20.0,35.0,2.0,
2,JOB-00003,COM-5002,11,Medical/Pharma,Aalok Healthcare & Hospital,https://www.aalokhealthcare.com/,"Corporate Office: House# 3, Road # 4, Block # ...",,Assistant Manager (Customer Care / Admin),Shift Management and Operational Leadership (2...,Full Time,Work at office,Provident fund Salary Review: Yearly Festival ...,,"Dec 6, 2025","Dec 26, 2025",4.0,"Computer Literacy,Customer Service,Health/ Med...","Administration,Complaint Management,Hospital B...",M,,Send your CV to the given email hr.aalok@gmail...,Read before Apply and email us mention as Expe...,True,,,Dhaka,Mirpur,Any,Any,,,3.0,5.0
3,JOB-00004,COM-5003,84,Sales Representative (SR),Eureka food and beverage,,"Jong & sons market,Anwar jong road,Ashulia, Sa...","We manufacture, market, and sell all types of ...",Sales Representative (SR),‡¶¶‡¶æ‡¶Ø‡¶º‡¶ø‡¶§‡ßç‡¶¨ ‡¶ì ‡¶ï‡¶∞‡ßç‡¶§‡¶¨‡ßç‡¶Ø: ‡¶ï‡ßã‡¶Æ‡ßç‡¶™‡¶æ‡¶®‡¶ø‡¶∞ ‡¶™‡¶£‡ßç‡¶Ø ‡¶¨‡¶ø‡¶ï‡ßç‡¶∞‡¶Ø‡¶º ‡¶ï‡¶∞‡¶æ...,Full Time,,‡¶Ö‡¶®‡ßç‡¶Ø‡¶æ‡¶®‡ßç‡¶Ø ‡¶∏‡ßÅ‡¶¨‡¶ø‡¶ß‡¶æ: ‡¶Ü‡¶≤‡ßã‡¶ö‡¶®‡¶æ ‡¶∏‡¶æ‡¶™‡ßá‡¶ï‡ßç‡¶∑‡ßá,,"Dec 6, 2025","Jan 5, 2026",,,"Market Research,Marketing,Sales & Marketing,Sa...",M,,,‡¶Ü‡¶ó‡ßç‡¶∞‡¶π‡ßÄ ‡¶™‡ßç‡¶∞‡¶æ‡¶∞‡ßç‡¶•‡ßÄ‡¶¶‡ßá‡¶∞ ‡¶Ü‡¶¨‡ßá‡¶¶‡¶® ‡¶ï‡¶∞‡¶æ‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø ‡¶â‡ßé‡¶∏‡¶æ‡¶π‡¶ø‡¶§ ‡¶ï‡¶∞‡¶æ...,True,18000.0,22000.0,Anywhere in Bangladesh,none,Any,Any,22.0,40.0,2.0,5.0
4,JOB-00005,COM-5004,9,Marketing/Sales,Rancon Holdings Limited,,"117/A Old Airport Road, Bijoy Sharani","A local conglomerate, doing diversified busine...",Sales Hero (Sales Executive - Freshers) RANGS ...,Greet and assist walk-in customers with a frie...,Full Time,Work at office,"T/A,Mobile bill,Performance bonus Salary Revie...",,"Dec 6, 2025","Dec 31, 2025",20.0,,"Marketing and Sales,Retail Sales,Sales,Sales &...",M,,,,True,20000.0,20000.0,Anywhere in Bangladesh,none,Any,Any,24.0,30.0,,
5,JOB-00006,COM-5005,17,HR/Org. Development,ALTERNATIVE RECRUITMENT LTD.,,"BNS Center, Level 08\r\nSector 07\r\nUttara ,D...",Alternative Recruitment is a certified Educati...,HR Manager - Female,Join Our Global Team ‚Äì Female HR Manager Wante...,Full Time,"Work from home,Work at office","Profit share,Mobile bill Salary Review: Yearly...",,"Dec 6, 2025","Jan 5, 2026",15.0,"Computer Literacy,English typing,FLUENCY IN EN...","Human Resource Management,Management,Recruitme...",F,,Send your CV to the given email hiring@thealte...,Please read carefully : This position is open ...,True,20000.0,30000.0,Dhaka,"Uttara Sector 10, Uttara Sector 12",Diploma,Engineering,25.0,38.0,1.0,5.0
6,JOB-00007,COM-5006,6,Garments/Textile,Epyllion Group,,"Corporate Office: NINAKABBO, Level: 12, 227/A,...","Textile, Garments, Garments Accessories, Washi...","Senior Executive - Dyeing (Textile Division, G...",Ensure daily shift-wise dyeing production with...,Full Time,Work at office,"Mobile bill,Insurance,Gratuity,Medical allowan...",,"Dec 6, 2025","Dec 31, 2025",1.0,"Dyeing,Wet Processing","Dyeing Finishing,Dyeing Production,Textile and...","M,F",,,,True,,,Gazipur,none,Bachelor,Computer Science,27.0,,3.0,5.0
7,JOB-00008,COM-5002,11,Medical/Pharma,Aalok Healthcare & Hospital,https://www.aalokhealthcare.com/,"Corporate Office: House# 3, Road # 4, Block # ...",,Medical Officer,"Medical Officer Weekly 48 Hours duty, Shifting...",Full Time,Work at office,Provident fund Salary Review: Yearly Festival ...,,"Dec 6, 2025","Dec 26, 2025",21.0,"Emergency patient management,Have capacity to ...",,"M,F",,Send your CV to the given email hr.aalok@gmail...,Ready to do shifting duty. Weekly 48 Hours dut...,True,,,Dhaka,Mirpur,Any,Any,,,1.0,3.0
8,JOB-00009,COM-5006,19,Production/Operation,Epyllion Group,,"Corporate Office: NINAKABBO, Level: 12, 227/A,...","Textile, Garments, Garments Accessories, Washi...","Officer - Sewing (Production), Gazipur","Plan, analyze, and set up daily and weekly pro...",Full Time,Work at office,"Mobile bill,Medical allowance,Insurance,Gratui...",,"Dec 6, 2025","Dec 31, 2025",2.0,"Attention to details,Problem solving and decis...",,"M,F",,,,True,,,Gazipur,none,Bachelor,Computer Science,28.0,,5.0,
9,JOB-00010,COM-5007,16,Customer Support/Call Center,Ever Skin Care,,House No: 11/C-3 Modina Tower Mirpur-1.,"Ever Skincare is a clean, nature-inspired skin...",Customer Service Executive,Providing guidance and support to team members...,Full Time,Work at office,,,"Dec 6, 2025","Jan 5, 2026",10.0,,"Client Service,Customer Relations,Customer Ser...",F,,,,True,15000.0,30000.0,"Dhaka, Dhaka","Kawran Bazar, Mirpur, Uttara",Any,Any,20.0,35.0,,


# **Dense Retreival**

# **1st Step**

In [25]:
def create_improved_context(row):
    # ‡¶Ö‡¶≠‡¶ø‡¶ú‡ßç‡¶û‡¶§‡¶æ‡¶∞ ‡¶§‡¶•‡ßç‡¶Ø‡¶ï‡ßá ‡¶ü‡ßá‡¶ï‡ßç‡¶∏‡¶ü ‡¶π‡¶ø‡¶∏‡ßá‡¶¨‡ßá ‡¶∏‡¶æ‡¶ú‡¶æ‡¶®‡ßã (‡¶Ø‡ßá‡¶Æ‡¶®: 2 years experience)
    experience_text = f"{row['min_experience']} to {row['max_experience']} years experience" if row['min_experience'] != "" else "Freshers"
    
    parts = [
        str(row['job_title']),
        str(row['category_name']), # ‡¶®‡¶§‡ßÅ‡¶®
        str(row['company_name']),  # ‡¶®‡¶§‡ßÅ‡¶®
        BeautifulSoup(str(row['job_description']), 'html.parser').get_text(),
        str(row['skills']),
        str(row['job_type']),       # ‡¶®‡¶§‡ßÅ‡¶®
        str(row['education_level']),
        str(row['education_subject']),
        str(row['district']),
        str(row['fine_grained_location']), # ‡¶®‡¶§‡ßÅ‡¶®
        experience_text             # ‡¶®‡¶§‡ßÅ‡¶®
    ]
    
    # ‡¶∏‡¶¨ ‡¶ü‡ßá‡¶ï‡ßç‡¶∏‡¶ü ‡¶õ‡ßã‡¶ü ‡¶π‡¶æ‡¶§‡ßá‡¶∞ ‡¶Ö‡¶ï‡ßç‡¶∑‡¶∞‡ßá ‡¶è‡¶¨‡¶Ç ‡¶Ö‡¶§‡¶ø‡¶∞‡¶ø‡¶ï‡ßç‡¶§ ‡¶∏‡ßç‡¶™‡ßá‡¶∏ ‡¶∞‡¶ø‡¶Æ‡ßÅ‡¶≠ ‡¶ï‡¶∞‡ßá ‡¶∞‡¶ø‡¶ü‡¶æ‡¶∞‡ßç‡¶® ‡¶ï‡¶∞‡¶æ
    combined = " ".join([str(p) for p in parts if p]).lower()
    return combined

# ‡¶®‡¶§‡ßÅ‡¶® ‡¶ï‡¶≤‡¶æ‡¶Æ ‡¶§‡ßà‡¶∞‡¶ø
df_final['combined_text'] = df_final.apply(create_improved_context, axis=1)

# ‡¶Ü‡¶ó‡ßá‡¶∞ ‡¶Æ‡¶§‡ßã‡¶á ‡¶™‡¶æ‡¶ô‡ßç‡¶ï‡¶ö‡ßÅ‡¶Ø‡¶º‡ßá‡¶∂‡¶® ‡¶∞‡¶ø‡¶Æ‡ßÅ‡¶≠ ‡¶ï‡¶∞‡¶æ (‡¶Æ‡¶°‡ßá‡¶≤‡¶ï‡ßá ‡¶ï‡ßç‡¶≤‡¶ø‡¶® ‡¶°‡¶æ‡¶ü‡¶æ ‡¶¶‡ßá‡¶ì‡ßü‡¶æ‡¶∞ ‡¶ú‡¶®‡ßç‡¶Ø)
df_final['combined_text'] = df_final['combined_text'].str.replace(r'[^\w\s]', ' ', regex=True)

# Verify the result
print("Combined Text Sample:")
print(df_final['combined_text'].iloc[0][:200] + "...")

Combined Text Sample:
data analyst   accounts officer it   telecommunication rk supply ltd  rk supply ltd   company no  09740433   established in august 2015 and based in london  is a leading uk supplier of sim cards and m...


# ** step -02**

In [26]:
from sentence_transformers import SentenceTransformer
import torch

# 1. Load the model (this stays the same)
model = SentenceTransformer('all-MiniLM-L6-v2')

# 2. Encode the job data - MAKE SURE TO USE df_final
# This might take a few minutes if you have 10,000+ jobs
job_embeddings = model.encode(
    df_final['combined_text'].tolist(), 
    show_progress_bar=True, 
    convert_to_tensor=True
)

print("‚úÖ Job Embeddings Generated Successfully!")

Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 276.13it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 423/423 [11:35<00:00,  1.64s/it]

‚úÖ Job Embeddings Generated Successfully!





# **Step 1: Tokenization**

In [27]:
from rank_bm25 import BM25Okapi

# Simple tokenization: lowercase and split by space
tokenized_corpus = [doc.lower().split(" ") for doc in df_final['combined_text'].tolist()]
bm25 = BM25Okapi(tokenized_corpus)

# **Step 3: Creating the Retrieval Logic (The "Super Function")**

In [28]:
from sentence_transformers import util

def get_recommendations(query, k=5):
    # --- 1. Dense Search ---
    query_embedding = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_embedding, job_embeddings)[0]
    top_dense_indices = torch.topk(cos_scores, k=k).indices.tolist()
    
    # --- 2. Sparse Search ---
    tokenized_query = query.lower().split(" ")
    top_sparse_results = bm25.get_top_n(tokenized_query, df.to_dict('records'), n=k)
    
    return {
        "dense_results": df.iloc[top_dense_indices][['job_title', 'company_name']],
        "sparse_results": top_sparse_results
    }

# Example Usage
# results = get_recommendations("Looking for a data analyst job in Dhaka")

# **‡¶π‡¶æ‡¶á‡¶¨‡ßç‡¶∞‡¶ø‡¶° ‡¶∏‡¶æ‡¶∞‡ßç‡¶ö ‡¶´‡¶æ‡¶Ç‡¶∂‡¶® (The Solution)**

In [38]:
def get_hybrid_recommendations(query, k=5, alpha=0.5):
    """
    alpha: Dense ‡¶è‡¶¨‡¶Ç Sparse ‡¶è‡¶∞ ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßç‡¶¨ ‡¶†‡¶ø‡¶ï ‡¶ï‡¶∞‡ßá‡•§ 
    0.5 ‡¶Æ‡¶æ‡¶®‡ßá ‡¶¶‡ßÅ‡¶á‡¶ü‡¶æ‡¶á ‡¶∏‡¶Æ‡¶æ‡¶® ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßç‡¶¨‡¶™‡ßÇ‡¶∞‡ßç‡¶£‡•§
    """
    # 1. Dense Score (Cosine Similarity)
    query_embedding = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_embedding, job_embeddings)[0].cpu().numpy()
    
    # 2. Sparse Score (BM25)
    tokenized_query = query.lower().split()
    bm25_scores = bm25.get_scores(tokenized_query)
    
    # ‡ß©. Normalization (‡¶∏‡ßç‡¶ï‡ßã‡¶∞‡¶ó‡ßÅ‡¶≤‡ßã‡¶ï‡ßá ‡ß¶ ‡¶•‡ßá‡¶ï‡ßá ‡ßß ‡¶è‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ‡¶Ü‡¶®‡¶æ)
    # Dense scores are already roughly 0-1, but BM25 isn't.
    if np.max(bm25_scores) != 0:
        bm25_scores = bm25_scores / np.max(bm25_scores)
    
    # ‡ß™. Hybrid Score calculation
    # Formula: $Score = \alpha \times \text{Dense} + (1 - \alpha) \times \text{Sparse}$
    final_scores = (alpha * cos_scores) + ((1 - alpha) * bm25_scores)
    
    # ‡ß´. Top K indices ‡¶¨‡ßá‡¶∞ ‡¶ï‡¶∞‡¶æ
    top_indices = np.argsort(final_scores)[::-1][:k]
    
    return df_final.iloc[top_indices][['job_title', 'company_name', 'district', 'skills', 'job_type','fine_grained_location']]

# ‡¶ü‡ßá‡¶∏‡ßç‡¶ü ‡¶ï‡¶∞‡¶æ‡¶∞ ‡¶®‡¶ø‡ßü‡¶Æ
#results = get_hybrid_recommendations("Looking for a Python Developer job in Dhaka", k=5)
# display(results)

In [44]:
results = get_hybrid_recommendations("Part time Software Engineer", k=5)
display(results)

Unnamed: 0,job_title,company_name,district,skills,job_type,fine_grained_location
3227,Software Engineer (.NET Core & Angular),HawarIT Software Service Ltd.,Dhaka,"Angular,ASP.NET Core,ASP.NET Core MVC,ASP.NET ...",Full Time,none
2865,Software Engineer (Full-Stack),mPower Social Enterprises Ltd.,Dhaka,,Full Time,Gulshan
6922,Architect / Civil Engineer,A Reputed Import Machineries Company,Dhaka,"Architecture, Civil Engineering",Full Time,Mirpur
12357,Asst. Software Engineer/Software Engineer,A Reputed Garments Manufacturing Company in Ba...,Dhaka,"ASP.NET Core, JavaScript, React | Node JS | Ja...",Full Time,none
13206,Software Engineer ( .Net ),Leading Multinational Software Company,Anywhere in Bangladesh,Dot Net,Full Time,none
