### Obtain candidates based on search critetia

In [None]:
#GET https://api.lusha.com/v2/person?firstName=:firstName&lastName=:lastName&companyName=:companyName -H api_key:API_KEY

In [1]:
import pandas as pd
import numpy as np
import requests
import logging
import warnings
import re
import os


# Set up logging and ignore warnings
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def search_employees_one_row_per_employee_dedup(
    query,
    country_filter=None,
    location_filter=None,
    company_filter=None,
    university_filter=None,
    industry_filter=None,
    skills_filter=None,
    certifications_filter=None,
    languages_filter=None,
    max_to_fetch=None
):
    """
    Search employees by:
      - 'query' (e.g. 'CEO', 'CEO OR CFO', etc.)
      - Optional filters:
            country_filter (e.g. 'South Africa'),
            location_filter (e.g. 'Johannesburg, Gauteng, South Africa'),
            company_filter (search in company names),
            university_filter (search in university names),
            industry_filter (search in the top-level industry field),
            skills_filter (search in skills),
            certifications_filter (search in certifications),
            languages_filter (search in languages),
            projects_filter (provided for consistency but not used in the search query).

    In the final DataFrame (one row per employee):
      - Keeps: ID, Name, Headline/Title, Location, Country, URL, Canonical_URL, Industry,
               Experience Count, Summary.
      - Includes: deduplicated Experiences (with duration), Educations, Skills, Certifications,
                  Languages, and Projects.
    """
    # Build the list of must clauses.
    must_clauses = []

    # Base clause: search in experience title.
    must_clauses.append({
        "nested": {
            "path": "member_experience_collection",
            "query": {
                "query_string": {
                    "query": query,
                    "default_field": "member_experience_collection.title",
                    "default_operator": "and"
                }
            }
        }
    })

    # Additional filter: Company Name (in experience collection)
    if company_filter:
        must_clauses.append({
            "nested": {
                "path": "member_experience_collection",
                "query": {
                    "match_phrase": {
                        "member_experience_collection.company_name": company_filter
                    }
                }
            }
        })

    # Additional filter: University Name (in education collection)
    if university_filter:
        must_clauses.append({
            "nested": {
                "path": "member_education_collection",
                "query": {
                    "match_phrase": {
                        "member_education_collection.title": university_filter
                    }
                }
            }
        })

    # Additional filter: Industry (top-level field)
    if industry_filter:
        must_clauses.append({
            "match_phrase": {
                "industry": industry_filter
            }
        })

    # Additional filter: Skills (in skills collection)
    if skills_filter:
        must_clauses.append({
            "nested": {
                "path": "member_skills_collection",
                "query": {
                    "match_phrase": {
                        "member_skills_collection.member_skill_list.skill": skills_filter
                    }
                }
            }
        })

    # Additional filter: Certifications (in certifications collection)
    if certifications_filter:
        must_clauses.append({
            "nested": {
                "path": "member_certifications_collection",
                "query": {
                    "match_phrase": {
                        "member_certifications_collection.name": certifications_filter
                    }
                }
            }
        })

    # Additional filter: Languages (in languages collection)
    if languages_filter:
        # Convert the search term to lower case so that "English" matches stored "english"
        must_clauses.append({
            "nested": {
                "path": "member_languages_collection",
                "query": {
                    "match_phrase": {
                        "member_languages_collection.member_language_list.language": languages_filter.lower()
                    }
                }
            }
        })

    # Exclude patterns in titles
    exclude_patterns = ["PA to", "Assistant to", "Personal Assistant", "EA to","Executive Assistant to"]
    must_not_clauses = [
        {
            "nested": {
                "path": "member_experience_collection",
                "query": {
                    "query_string": {
                        "query": f"member_experience_collection.title:({pattern})",
                        "default_operator": "or"
                    }
                }
            }
        }
        for pattern in exclude_patterns
    ]

    # Build the complete payload with country and location filters added.
    payload = {
        "query": {
            "bool": {
                "must": must_clauses,
                "must_not": must_not_clauses
            }
        }
    }

    if country_filter:
        payload["query"]["bool"]["must"].append({
            "term": {
                "country": country_filter
            }
        })

    if location_filter:
        payload["query"]["bool"]["must"].append({
            "match_phrase": {
                "location": location_filter
            }
        })

    # Uncomment for debugging:
    # print(json.dumps(payload, indent=2))

    # Send the search request.
    search_url = "https://api.coresignal.com/cdapi/v1/professional_network/employee/search/es_dsl"
    headers = {
    'Content-Type': 'application/json',
    'Authorization': 'Bearer eyJhbGciOiJFZERTQSIsImtpZCI6IjYzOGY5Y2YyLTUyM2UtOGJmMC0zZmFlLTEyY2UwNTUzOTQ1YiJ9.eyJhdWQiOiJzdHVkZW50LnVqLmFjLnphIiwiZXhwIjoxNzczMjc2NTkzLCJpYXQiOjE3NDE3MTk2NDEsImlzcyI6Imh0dHBzOi8vb3BzLmNvcmVzaWduYWwuY29tOjgzMDAvdjEvaWRlbnRpdHkvb2lkYyIsIm5hbWVzcGFjZSI6InJvb3QiLCJwcmVmZXJyZWRfdXNlcm5hbWUiOiJzdHVkZW50LnVqLmFjLnphIiwic3ViIjoiOTc4OGQ4OTYtMjcwYy01ODY4LTE2NDItOTFhYmQ5NDBhMDg2IiwidXNlcmluZm8iOnsic2NvcGVzIjoiY2RhcGkifX0.GYI_XfOwh_DiuBMu9q_JRL39v4bOgJixOWIxPG0ZujADWVFtQQKO1tNJ71ig-ncoRJJE7R6z0WbG4Bxjs_qkDw'
    }

    resp = requests.post(search_url, headers=headers, json=payload)
    resp.raise_for_status()
    employee_ids = resp.json()

    if not isinstance(employee_ids, list):
        print("Unexpected structure in search response.")
        return pd.DataFrame()

    # Collect data for each employee ID.
    rows = []
    for emp_id in employee_ids[:max_to_fetch]:
        collect_url = f"https://api.coresignal.com/cdapi/v1/professional_network/employee/collect/{emp_id}"
        r = requests.get(collect_url, headers=headers)
        r.raise_for_status()
        employee = r.json()


        # Basic fields
        id_val = employee.get("id")
        name_val = employee.get("name")
        headline_val = employee.get("title")
        location_val = employee.get("location")
        country_val = employee.get("country")
        url_val = employee.get("url")
        canonical_url = employee.get("canonical_url")
        industry_val = employee.get("industry")
        experience_count_val = employee.get("experience_count")
        summary_val = employee.get("summary")

        # ----- EXPERIENCE (deduplicate) -----
        raw_exps = employee.get("member_experience_collection", [])
        unique_exps = []
        seen_exps = set()
        for exp in raw_exps:
            key = (
                exp.get("title", "N/A"),
                exp.get("company_name", "N/A"),
                exp.get("date_from", "N/A"),
                exp.get("date_to", "N/A")
            )
            if key not in seen_exps:
                seen_exps.add(key)
                unique_exps.append(exp)
        experiences_str = "\n".join(
            f"Role: {exp.get('title','N/A')} | Company: {exp.get('company_name','N/A')} | From: {exp.get('date_from','N/A')} | To: {exp.get('date_to','N/A')} | Duration: {exp.get('duration','N/A')}"
            for exp in unique_exps
        )

        # ----- EDUCATION (deduplicate) -----
        raw_edu = employee.get("member_education_collection", [])
        unique_edu = []
        seen_edu = set()
        for edu in raw_edu:
            key = (
                edu.get("title", "N/A"),
                edu.get("subtitle", "N/A"),
                edu.get("date_from", "N/A"),
                edu.get("date_to", "N/A")
            )
            if key not in seen_edu:
                seen_edu.add(key)
                unique_edu.append(edu)
        educations_str = "\n".join(
            f"Institution: {edu.get('title','N/A')} | Degree: {edu.get('subtitle','N/A')} | From: {edu.get('date_from','N/A')} | To: {edu.get('date_to','N/A')}"
            for edu in unique_edu
        )

        # ----- SKILLS (deduplicate) -----
        raw_skills = employee.get("member_skills_collection", [])
        seen_skills = set()
        for skill_entry in raw_skills:
            skill_name = skill_entry.get("member_skill_list", {}).get("skill", "N/A")
            seen_skills.add(skill_name)
        skills_str = ", ".join(seen_skills) if seen_skills else ""

        # ----- CERTIFICATIONS (deduplicate) -----
        raw_certifications = employee.get("member_certifications_collection", [])
        seen_certs = set()
        for cert in raw_certifications:
            cert_name = cert.get("name", "N/A")
            seen_certs.add(cert_name)
        certifications_str = ", ".join(seen_certs) if seen_certs else ""

        # ----- LANGUAGES (deduplicate) -----
        raw_languages = employee.get("member_languages_collection", [])
        seen_langs = set()
        for lang in raw_languages:
            language_name = lang.get("member_language_list", {}).get("language", "N/A")
            seen_langs.add(language_name)
        languages_str = ", ".join(seen_langs) if seen_langs else ""

        # ----- PROJECTS (deduplicate) -----
        raw_projects = employee.get("member_projects_collection", [])
        seen_projects = set()
        for proj in raw_projects:
            proj_name = proj.get("name", "N/A")
            seen_projects.add(proj_name)
        projects_str = ", ".join([str(x) for x in seen_projects if x is not None]) if seen_projects else ""

        # Build the final row dictionary.
        row = {
            "ID": id_val,
            "Name": name_val,
            "Headline/Title": headline_val,
            "Location": location_val,
            "Country": country_val,
            "URL": url_val,
            "Canonical_URL": canonical_url,
            "Industry": industry_val,
            "Experience Count": experience_count_val,
            "Summary": summary_val,
            "Experiences": experiences_str,
            "Educations": educations_str,
            "Skills": skills_str,
            "Certifications": certifications_str,
            "Languages": languages_str,
            "Projects": projects_str
        }
        rows.append(row)

    # After the search API call
    print(f"Number of employee IDs returned: {len(employee_ids)}")
    df = pd.DataFrame(rows)

    return df

if __name__ == "__main__":
    user_query = '("Head of Legal")'
    country = "South Africa"
    location = "Johannesburg"
    # company = "PWC"
    # university = "University of Cape Town"
    # industry = "Accounting"
    # skills = "managerial finance"
    # certifications = "Assessor"
    # languages = "English"

    df_employees = search_employees_one_row_per_employee_dedup(
        query=user_query,
        #country_filter=country,
        #location_filter=location,
        # company_filter=company,
        # university_filter=university,
        # industry_filter=industry,
        # skills_filter=skills,
        # certifications_filter=certifications,
        # languages_filter=languages,
        max_to_fetch=1
    )

Number of employee IDs returned: 1000


In [19]:
import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import requests
import logging
import warnings
import re
from typing import List, Optional
import io
import hashlib
import pickle
import os
from datetime import datetime, timedelta

# Set up logging and ignore warnings
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def search_employees_one_row_per_employee_dedup(
    query,
    country_filter=None,
    location_filter=None,
    company_filter=None,
    university_filter=None,
    industry_filter=None,
    skills_filter=None,
    certifications_filter=None,
    languages_filter=None,
    max_to_fetch=None
):
    """
    Search employees by:
      - 'query' (e.g. 'CEO', 'CEO OR CFO', etc.)
      - Optional filters:
            country_filter (e.g. 'South Africa'),
            location_filter (e.g. 'Johannesburg, Gauteng, South Africa'),
            company_filter (search in company names),
            university_filter (search in university names),
            industry_filter (search in the top-level industry field),
            skills_filter (search in skills),
            certifications_filter (search in certifications),
            languages_filter (search in languages),
            projects_filter (provided for consistency but not used in the search query).

    In the final DataFrame (one row per employee):
      - Keeps: ID, Name, Headline/Title, Location, Country, URL, Canonical_URL, Industry,
               Experience Count, Summary.
      - Includes: deduplicated Experiences (with duration), Educations, Skills, Certifications,
                  Languages, and Projects.
    """
    # Build the list of must clauses.
    must_clauses = []

    # Base clause: search in experience title
    must_clauses.append({
        "nested": {
            "path": "experience",
            "query": {
                "query_string": {
                    "query": query,
                    "default_field": "experience.position_title",
                    "default_operator": "and"
                }
            }
        }
    })

    # Additional filter: Company Name (in experience)
    if company_filter:
        must_clauses.append({
            "nested": {
                "path": "experience",
                "query": {
                    "query_string": {
                        "query": company_filter,
                        "default_field": "experience.company_name",
                        "default_operator": "or"
                    }
                }
            }
        })

    # Additional filter: University Name (in education)
    if university_filter:
        must_clauses.append({
            "nested": {
                "path": "education",
                "query": {
                    "query_string": {
                        "query": university_filter,
                        "default_field": "education.institution_name",
                        "default_operator": "or"
                    }
                }
            }
        })

    # Additional filter: Industry (in experience)
    if industry_filter:
        must_clauses.append({
            "nested": {
                "path": "experience",
                "query": {
                    "query_string": {
                        "query": industry_filter,
                        "default_field": "experience.company_industry",
                        "default_operator": "or"
                    }
                }
            }
        })

    # Additional filter: Skills (in inferred_skills)
    if skills_filter:
        must_clauses.append({
            "query_string": {
                "query": skills_filter,
                "default_field": "inferred_skills",
                "default_operator": "or"
            }
        })

    # Additional filter: Certifications
    if certifications_filter:
        must_clauses.append({
            "nested": {
                "path": "certifications",
                "query": {
                    "query_string": {
                        "query": certifications_filter,
                        "default_field": "certifications.title",
                        "default_operator": "or"
                    }
                }
            }
        })

    # Additional filter: Languages
    if languages_filter:
        must_clauses.append({
            "nested": {
                "path": "languages",
                "query": {
                    "query_string": {
                        "query": languages_filter.lower(),
                        "default_field": "languages.language",
                        "default_operator": "or"
                    }
                }
            }
        })

    # Exclude patterns in titles
    exclude_patterns = ["PA to", "Assistant to", "Personal Assistant", "EA to", "Executive Assistant to","Head of the Office of the CFO","Head of the Office of the CEO"]
    must_not_clauses = [
        {
            "nested": {
                "path": "experience",
                "query": {
                    "query_string": {
                        "query": f"experience.position_title:({pattern})",
                        "default_operator": "or"
                    }
                }
            }
        }
        for pattern in exclude_patterns
    ]

    # Build the complete payload with country and location filters added.
    payload = {
        "query": {
            "bool": {
                "must": must_clauses,
                "must_not": must_not_clauses
            }
        }
    }

    if country_filter:
        payload["query"]["bool"]["must"].append({
            "query_string": {
                "query": country_filter,
                "default_field": "location_country",
                "default_operator": "and"
            }
        })

    if location_filter:
        payload["query"]["bool"]["must"].append({
            "query_string": {
                "query": location_filter,
                "default_field": "location_full",
                "default_operator": "and"
            }
        })

    # Uncomment for debugging:
    # print(json.dumps(payload, indent=2))

    # Send the search request.
    search_url = "https://api.coresignal.com/cdapi/v1/multi_source/employee/search/es_dsl"
    headers = {
    'Content-Type': 'application/json',
    'Authorization': 'Bearer eyJhbGciOiJFZERTQSIsImtpZCI6IjMzNjEyYzA1LWQ2MDYtYzllYy0zNGVjLWRiYmJiNGI0ZjgyMCJ9.eyJhdWQiOiJtdWx0aWNob2ljZS5jby56YSIsImV4cCI6MTc3MzQwNjg1OCwiaWF0IjoxNzQxODQ5OTA2LCJpc3MiOiJodHRwczovL29wcy5jb3Jlc2lnbmFsLmNvbTo4MzAwL3YxL2lkZW50aXR5L29pZGMiLCJuYW1lc3BhY2UiOiJyb290IiwicHJlZmVycmVkX3VzZXJuYW1lIjoibXVsdGljaG9pY2UuY28uemEiLCJzdWIiOiI5Nzg4ZDg5Ni0yNzBjLTU4NjgtMTY0Mi05MWFiZDk0MGEwODYiLCJ1c2VyaW5mbyI6eyJzY29wZXMiOiJjZGFwaSJ9fQ.GFaoIY_j8e3TKs9-iQ0H6O7NVz87T3Z7ZWIWPRHo17IrWqmehNvvJ8sD3BMaDVatHs9rr9C3hpUykkwS53HrAw' 
    }

    resp = requests.post(search_url, headers=headers, json=payload)
    resp.raise_for_status()
    employee_ids = resp.json()

    if not isinstance(employee_ids, list):
        print("Unexpected structure in search response.")
        return pd.DataFrame()

    # Collect data for each employee ID.
    rows = []
    for emp_id in employee_ids[:max_to_fetch]:
        
        collect_url = f"https://api.coresignal.com/cdapi/v1/multi_source/employee/collect/{emp_id}"
        r = requests.get(collect_url, headers=headers)
        r.raise_for_status()
        employee = r.json()

        # Basic fields
        id_val = employee.get("id")
        name_val = employee.get("full_name")
        headline_val = employee.get("headline")
        location_val = employee.get("location_full")
        country_val = employee.get("location_country")
        url_val = employee.get("linkedin_url")
        canonical_url = employee.get("linkedin_url")  # Using LinkedIn URL as canonical
        industry_val = None  # Not available in top level, will need to be extracted from experience
        experience_count_val = len(employee.get("experience", []))
        summary_val = employee.get("summary")
        
        # Get email information
        primary_email = employee.get("primary_professional_email")
        
        # Get all email addresses from collection
        email_collection = employee.get("professional_emails_collection", [])
        all_emails = [email_info.get("professional_email") for email_info in email_collection if email_info.get("professional_email")]
        all_emails_str = ", ".join(all_emails) if all_emails else ""

        # ----- EXPERIENCE (deduplicate) -----
        raw_exps = employee.get("experience", [])
        unique_exps = []
        seen_exps = set()
        company_industries = set()  # Set to collect unique industries
        for exp in raw_exps:
            key = (
                exp.get("position_title", "N/A"),
                exp.get("company_name", "N/A"),
                exp.get("date_from", "N/A"),
                exp.get("date_to", "N/A")
            )
            if key not in seen_exps:
                seen_exps.add(key)
                unique_exps.append(exp)
                # Add industry to the set if it exists
                if exp.get("company_industry"):
                    company_industries.add(exp.get("company_industry"))

        experiences_str = "\n".join(
            f"Role: {exp.get('position_title','N/A')} | Company: {exp.get('company_name','N/A')} | From: {exp.get('date_from','N/A')} | To: {exp.get('date_to','N/A')} | Duration: {exp.get('duration_months','N/A')} months"
            for exp in unique_exps
        )

        # Create a formatted string of industries
        company_industry_str = " | ".join(sorted(company_industries)) if company_industries else "N/A"

        # ----- EDUCATION (deduplicate) -----
        raw_edu = employee.get("education", [])
        unique_edu = []
        seen_edu = set()
        for edu in raw_edu:
            key = (
                edu.get("institution_name", "N/A"),
                edu.get("degree", "N/A"),
                str(edu.get("date_from_year", "N/A")),
                str(edu.get("date_to_year", "N/A"))
            )
            if key not in seen_edu:
                seen_edu.add(key)
                unique_edu.append(edu)
        educations_str = "\n".join(
            f"Institution: {edu.get('institution_name','N/A')} | Degree: {edu.get('degree','N/A')} | From: {edu.get('date_from_year','N/A')} | To: {edu.get('date_to_year','N/A')}"
            for edu in unique_edu
        )

        # ----- SKILLS (deduplicate) -----
        skills = employee.get("inferred_skills", [])
        skills_str = ", ".join(skills) if skills else ""

        # ----- CERTIFICATIONS (deduplicate) -----
        raw_certifications = employee.get("certifications", [])
        seen_certs = set()
        for cert in raw_certifications:
            cert_name = cert.get("title", "N/A")
            seen_certs.add(cert_name)
        certifications_str = ", ".join(seen_certs) if seen_certs else ""

        # ----- LANGUAGES (deduplicate) -----
        raw_languages = employee.get("languages", [])
        seen_langs = set()
        for lang in raw_languages:
            language_name = lang.get("language", "N/A")
            seen_langs.add(language_name)
        languages_str = ", ".join(seen_langs) if seen_langs else ""

        # ----- PROJECTS (deduplicate) -----
        raw_projects = employee.get("projects", [])
        seen_projects = set()
        for proj in raw_projects:
            proj_name = proj.get("name", "N/A")
            seen_projects.add(proj_name)
        projects_str = ", ".join([str(x) for x in seen_projects if x is not None]) if seen_projects else ""

        # ----- AWARDS (deduplicate) -----
        raw_awards = employee.get("awards", [])
        seen_awards = set()
        for award in raw_awards:
            award_name = award.get("title", "N/A")
            seen_awards.add(award_name)
        awards_str = ", ".join(seen_awards) if seen_awards else ""

        # ----- PATENTS (deduplicate) -----
        raw_patents = employee.get("patents", [])
        seen_patents = set()
        for patent in raw_patents:
            patent_name = patent.get("title", "N/A")
            seen_patents.add(patent_name)
        patents_str = ", ".join(seen_patents) if seen_patents else ""

        # ----- PUBLICATIONS (deduplicate) -----
        raw_publications = employee.get("publications", [])
        seen_publications = set()
        for pub in raw_publications:
            pub_name = pub.get("title", "N/A")
            seen_publications.add(pub_name)
        publications_str = ", ".join(seen_publications) if seen_publications else ""

        # ----- SALARY INFORMATION -----
        projected_base_salary_median = employee.get("projected_base_salary_median")
        projected_base_salary_currency = employee.get("projected_base_salary_currency")
        projected_base_salary_period = employee.get("projected_base_salary_period")
        
        salary_str = ""
        if projected_base_salary_median:
            salary_str = f"{projected_base_salary_currency}{projected_base_salary_median:,.2f} {projected_base_salary_period}"

        # Build the final row dictionary.
        row = {
            "ID": id_val,
            "Name": name_val,
            "Headline/Title": headline_val,
            "Location": location_val,
            "Country": country_val,
            "URL": url_val,
            "Primary Email": primary_email,
            "All Emails": all_emails_str,
            "Industry": company_industry_str, 
            "Experience Count": experience_count_val,
            "Summary": summary_val,
            "Experiences": experiences_str,
            "Educations": educations_str,
            "Skills": skills_str,
            "Certifications": certifications_str,
            "Languages": languages_str,
        }
        rows.append(row)

    # After the search API call
    df = pd.DataFrame(rows)

    return df

if __name__ == "__main__":
    user_query = "('Head Of Legal')"

    df_employees = search_employees_one_row_per_employee_dedup(
        query=user_query,
        max_to_fetch=1
    )

In [20]:
df_employees

### Modelling - compare candidate profiles with job description to get similarity scores and rank based on that

In [88]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import logging
import warnings
from typing import List, Optional
import re

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def build_user_text(row, text_columns: List[str]) -> str:
    """
    Combine relevant text fields into a single string for semantic comparison.
    Handles both string and list-type columns.
    
    Args:
        row: DataFrame row containing user information
        text_columns: List of columns to include in combined text
        
    Returns:
        Combined text string
    """
    parts = []
    for col in text_columns:
        val = row.get(col)
        if pd.notnull(val):
            if isinstance(val, list):
                parts.append(' '.join(map(str, val)))
            else:
                parts.append(str(val))
    return " ".join(parts).strip()

def preprocess_text(text: str) -> str:
    """
    Clean and normalize text input by:
    - Removing emojis and special characters
    - Removing extra whitespace
    - Converting to lowercase
    """
    # Remove emojis using Unicode range patterns
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # CJK symbols
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    # Remove special characters and punctuation (keep alphanumeric and whitespace)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Convert to lowercase and clean whitespace
    text = text.lower()
    text = ' '.join(text.strip().split())
    return text

def rank_candidates_semantic(
    df_employees: pd.DataFrame,
    job_description: str,
    text_columns: Optional[List[str]] = None,
    model_name: str = 'all-MiniLM-L6-v2',
    batch_size: int = 32
) -> pd.DataFrame:
    """
    Rank candidates based on semantic similarity to job description.
    """
    try:
        logger.info("Starting candidate ranking process...")
        
        # Create working copy to avoid modifying original dataframe
        df = df_employees.copy()
        
        # Set columns for corpus
        if text_columns is None:
            text_columns = ['Summary', 'Experiences', 'Educations', 
                           'Headline/Title', 'Industry', 'Skills'
                           'Certifications','Projects']
            logger.debug(f"Using default text columns: {text_columns}")
        else:
            logger.debug(f"Using custom text columns: {text_columns}")

        # 1) Create combined text for each user
        logger.info("Combining candidate text fields...")
        df['combined_text'] = df.apply(
            lambda x: build_user_text(x, text_columns), 
            axis=1
        )
        logger.info(f"Processed {len(df)} candidate profiles")

        # Handle empty texts to avoid encoding issues
        logger.info("Filtering empty candidate texts...")
        initial_count = len(df)
        df['combined_text'] = df['combined_text'].replace(r'^\s*$', np.nan, regex=True)
        df = df.dropna(subset=['combined_text']).reset_index(drop=True)
        filtered_count = len(df)
        logger.info(f"Removed {initial_count - filtered_count} empty profiles, {filtered_count} remaining")

        if df.empty:
            logger.warning("No valid candidate texts found after preprocessing")
            return pd.DataFrame()

        # 2) Initialize sentence transformer model
        logger.info(f"Initializing sentence transformer model: {model_name}")
        model = SentenceTransformer(model_name)
        
        # 3) Preprocess and embed job description
        logger.info("Preprocessing job description...")
        clean_jd = preprocess_text(job_description)
        logger.debug(f"Job description length: {len(clean_jd.split())} words")
        
        logger.info("Encoding job description...")
        job_embedding = model.encode(clean_jd, convert_to_tensor=True)
        logger.debug(f"Job embedding shape: {job_embedding.shape}")

        # 4) Embed candidate texts in batches
        logger.info("Preprocessing candidate texts...")
        user_texts = df['combined_text'].apply(preprocess_text).tolist()
        logger.debug(f"First candidate text preview: {user_texts[0][:200]}...")
        
        logger.info(f"Encoding candidate texts in batches of {batch_size}...")
        user_embeddings = model.encode(
            user_texts,
            convert_to_tensor=True,
            batch_size=batch_size,
            show_progress_bar=True
        )
        logger.info(f"Successfully encoded {len(user_texts)} candidate texts")
        logger.debug(f"Embeddings matrix shape: {user_embeddings.shape}")

        # 5) Calculate cosine similarities
        logger.info("Calculating cosine similarities...")
        similarities = util.cos_sim(job_embedding, user_embeddings)
        df['similarity_score'] = similarities.cpu().numpy().flatten()
        
        # Calculate score statistics
        min_score = df['similarity_score'].min()
        max_score = df['similarity_score'].max()
        logger.info(f"Similarity scores range: {min_score:.3f} - {max_score:.3f}")
        logger.debug(f"Score distribution:\n{df['similarity_score'].describe()}")

        # 6) Sort and return results
        logger.info("Sorting candidates by similarity score...")
        df_sorted = df.sort_values(by='similarity_score', ascending=False)\
                      .reset_index(drop=True)
        
        logger.info(f"Top candidate score: {df_sorted.iloc[0]['similarity_score']:.3f}")
        logger.info("Ranking process completed successfully")
        
        return df_sorted

    except Exception as e:
        logger.error(f"Error in ranking candidates: {str(e)}")
        raise

if __name__ == "__main__":
    job_description_text = """
        About the job
        A well-established investment management company with nationwide operations are 
        looking for a highly skilled and experienced Chief Financial Officer (CFO) to join 
        their dynamic team. This leadership role is based in Stellenbosch, Western Cape.

        As CFO, you will lead financial strategy, ensure strong financial management, 
        and drive sustainable growth, optimising operations and profitability for long-term success. 
        Additionally, the CFO will oversee all financial operations, including budgeting, 
        forecasting, reporting, cash flow management, and regulatory compliance.

        Key Responsibilities:

        Prepare accurate financial reports, including monthly, quarterly, and annual statements.
        Provide financial analysis to support strategic planning and decision-making.
        Monitor key metrics to ensure financial health and performance.
        Develop and manage budgets in alignment with company goals.
        Track financial performance, highlight variances, and recommend corrective actions.
        Oversee cash flow, receivables, and payables to maintain liquidity.
        Ensure compliance with financial regulations, tax laws, and industry standards.
        Lead external audits and coordinate with auditors.
        Collaborate with leadership on long-term financial strategies.
        Conduct financial modelling to support growth, investments, and cost management.
        Work cross-functionally with other departments within the organization (intercompany loans etc.)

        Qualifications and Experience:

        Qualified CA(SA) or CGMA
        5 years minimum of experience working in a managerial/leadership role
        Proven track record in financial modelling and analysis
    
    """ 
    try:
        df_ranked = rank_candidates_semantic(
            df_employees=df_employees,
            job_description=job_description_text,
            model_name='all-MiniLM-L6-v2'
        )
        logger.info("Script execution completed")
    except Exception as e:
        logger.critical(f"Critical error in main execution: {e}")
        print(f"Error processing candidates: {e}")

INFO:datasets:PyTorch version 2.4.0 available.
INFO:datasets:TensorFlow version 2.17.0 available.
INFO:__main__:Starting candidate ranking process...
INFO:__main__:Combining candidate text fields...
INFO:__main__:Processed 1 candidate profiles
INFO:__main__:Filtering empty candidate texts...
INFO:__main__:Removed 0 empty profiles, 1 remaining
INFO:__main__:Initializing sentence transformer model: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:__main__:Preprocessing job description...
INFO:__main__:Encoding job description...
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.82it/s]
INFO:__main__:Preprocessing candidate texts...
INFO:__main__:Encoding candidate texts in batches of 32...
Batches: 100%|██████████| 1/1 [00:00<00:00, 16.35it/s]
INFO:__main__:Successfully encoded 1 candidate texts
INFO:__main__:Calculating cosine similarities...

In [89]:
df_ranked

Unnamed: 0,ID,Name,Headline/Title,Location,Country,URL,Canonical_URL,Industry,Experience Count,Summary,Experiences,Educations,Skills,Certifications,Languages,Projects,combined_text,similarity_score
0,24938138,Puseletso Gobinca CA(SA),CFO at Lonsa Everite,"City of Johannesburg, Gauteng, South Africa",South Africa,https://www.linkedin.com/in/puseletso-gobinca-...,https://www.linkedin.com/in/puseletso-gobinca-...,Accounting,17,,Role: Senior Corporate Finance Specialist | Co...,Institution: University of Cape Town | Degree:...,"credit analysis, managerial finance, legal con...",Assessor,english,TOPP Programme Implementantion: National Treas...,Role: Senior Corporate Finance Specialist | Co...,0.687969


In [2]:
# View content of a pickle file

import pickle
def view_pickle_file(file_path):
    """Opens and displays the contents of a pickle file."""
    try:
        with open(file_path, 'rb') as file:
            data = pickle.load(file)
            print(data) 
            return data
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except pickle.UnpicklingError:
        print("Error: Invalid pickle file.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

file_path = 'users.pkl'
view_pickle_file(file_path)

{'admin': {'password': '9c8fda2d3b2d5d51d71ceaab96f8a23980250021c2320e033b92351c1100bbfb', 'email': 'admin@example.com', 'created_at': datetime.datetime(2025, 3, 10, 21, 17, 51, 15493), 'role': 'admin'}, 'Lindani': {'password': '709ab09a04fb33a48680f243cafaa781753870f90ff4e395821b2bcaade5c002', 'email': 'leemncwabe29@gmail.com', 'created_at': datetime.datetime(2025, 3, 10, 22, 44, 1, 394674), 'role': 'user', 'reset_token': '98f1d158-4b87-430f-a39a-51c1582c1275', 'reset_token_expiry': datetime.datetime(2025, 3, 16, 17, 5, 25, 850308)}, 'LeeM': {'password': '8d0d43bc7ae6dd3970389ffae089fdc6ba85e300c332f7e4a2bef8b01d52de2b', 'email': '3621029@myuwc.ac.za', 'created_at': datetime.datetime(2025, 3, 16, 16, 49, 53, 358297), 'role': 'user'}}


{'admin': {'password': '9c8fda2d3b2d5d51d71ceaab96f8a23980250021c2320e033b92351c1100bbfb',
  'email': 'admin@example.com',
  'created_at': datetime.datetime(2025, 3, 10, 21, 17, 51, 15493),
  'role': 'admin'},
 'Lindani': {'password': '709ab09a04fb33a48680f243cafaa781753870f90ff4e395821b2bcaade5c002',
  'email': 'leemncwabe29@gmail.com',
  'created_at': datetime.datetime(2025, 3, 10, 22, 44, 1, 394674),
  'role': 'user',
  'reset_token': '98f1d158-4b87-430f-a39a-51c1582c1275',
  'reset_token_expiry': datetime.datetime(2025, 3, 16, 17, 5, 25, 850308)},
 'LeeM': {'password': '8d0d43bc7ae6dd3970389ffae089fdc6ba85e300c332f7e4a2bef8b01d52de2b',
  'email': '3621029@myuwc.ac.za',
  'created_at': datetime.datetime(2025, 3, 16, 16, 49, 53, 358297),
  'role': 'user'}}