### Obtain candidates based on search critetia

In [1]:
import requests
import json
import pandas as pd

def search_employees_one_row_per_employee_dedup(
    query,
    country_filter=None,
    location_filter=None,
    max_to_fetch=5
):
    """
    Search employees by:
      - 'query' (e.g. 'CEO', 'CEO OR CFO', etc.), 
      - optional 'country_filter' (e.g. 'South Africa'), 
      - optional 'location_filter' (e.g. 'Johannesburg, Gauteng, South Africa').

    In the final DataFrame (one row per employee):
      - Keep: ID, Name, Headline/Title, Location, Country, URL, Industry, experience_count, summary
      - Include: deduplicated Experiences (with 'duration'), Educations, and Skills
      - Remove: first_name, last_name
    """

    # 1) Build the Elasticsearch DSL query
    must_clauses = []

    # a) The nested query for experience titles
    must_clauses.append({
        "nested": {
            "path": "member_experience_collection",
            "query": {
                "query_string": {
                    "query": query,
                    "default_field": "member_experience_collection.title",
                    "default_operator": "and"
                }
            }
        }
    })

    # b) If user wants to filter by a specific country (exact match)
    if country_filter:
        must_clauses.append({
            "term": {
                "country": country_filter
            }
        })

    # c) If user wants to filter by a specific location (phrase match)
    if location_filter:
        must_clauses.append({
            "match_phrase": {
                "location": location_filter
            }
        })

    # Combine into a bool query
    payload = {
        "query": {
            "bool": {
                "must": must_clauses
            }
        }
    }

    # 2) Send the search request
    search_url = "https://api.coresignal.com/cdapi/v1/professional_network/employee/search/es_dsl"

    headers = {
    'Content-Type': 'application/json',
    'Authorization': 'Bearer eyJhbGciOiJFZERTQSIsImtpZCI6IjEwYTYwZWRhLWNhNzEtMTIxZS1jY2JhLTBmNjRjMzg4Yjg0ZCJ9.eyJhdWQiOiJheW9iYS5tZSIsImV4cCI6MTc3MzEwNjAyMSwiaWF0IjoxNzQxNTQ5MDY5LCJpc3MiOiJodHRwczovL29wcy5jb3Jlc2lnbmFsLmNvbTo4MzAwL3YxL2lkZW50aXR5L29pZGMiLCJuYW1lc3BhY2UiOiJyb290IiwicHJlZmVycmVkX3VzZXJuYW1lIjoiYXlvYmEubWUiLCJzdWIiOiI5Nzg4ZDg5Ni0yNzBjLTU4NjgtMTY0Mi05MWFiZDk0MGEwODYiLCJ1c2VyaW5mbyI6eyJzY29wZXMiOiJjZGFwaSJ9fQ.BeR_ci_7346iPkfP64QZCwxILa1v1_HGIE1SdhOl9qHtM_HcwiiWIf26DNhcDPl7Bs16JAEfjBntMoyJymtYDA'
    }

    resp = requests.post(search_url, headers=headers, json=payload)
    resp.raise_for_status()
    employee_ids = resp.json()

    if not isinstance(employee_ids, list):
        print("Unexpected structure in search response.")
        return pd.DataFrame()

    # 3) Collect data for each employee ID
    rows = []
    for emp_id in employee_ids[:max_to_fetch]:
        collect_url = f"https://api.coresignal.com/cdapi/v1/professional_network/employee/collect/{emp_id}"
        r = requests.get(collect_url, headers=headers)
        r.raise_for_status()

        employee = r.json()

        # Basic fields
        id_val = employee.get('id')
        name_val = employee.get('name')
        headline_val = employee.get('title')
        location_val = employee.get('location')
        country_val = employee.get('country')
        url_val = employee.get('url')
        industry_val = employee.get('industry')
        experience_count_val = employee.get('experience_count')
        summary_val = employee.get('summary')

        # ----- EXPERIENCE (deduplicate) -----
        raw_exps = employee.get('member_experience_collection', [])
        unique_exps = []
        seen_exps = set()
        for exp in raw_exps:
            key = (
                exp.get('title', 'N/A'),
                exp.get('company_name', 'N/A'),
                exp.get('date_from', 'N/A'),
                exp.get('date_to', 'N/A')
            )
            if key not in seen_exps:
                seen_exps.add(key)
                unique_exps.append(exp)

        experiences_str = "\n".join(
            f"Role: {exp.get('title','N/A')} | Company: {exp.get('company_name','N/A')} "
            f"| From: {exp.get('date_from','N/A')} | To: {exp.get('date_to','N/A')} "
            f"| Duration: {exp.get('duration','N/A')}"
            for exp in unique_exps
        )

        # ----- EDUCATION (deduplicate) -----
        raw_edu = employee.get('member_education_collection', [])
        unique_edu = []
        seen_edu = set()
        for edu in raw_edu:
            key = (
                edu.get('title', 'N/A'),
                edu.get('subtitle', 'N/A'),
                edu.get('date_from', 'N/A'),
                edu.get('date_to', 'N/A')
            )
            if key not in seen_edu:
                seen_edu.add(key)
                unique_edu.append(edu)

        educations_str = "\n".join(
            f"Institution: {edu.get('title','N/A')} | Degree: {edu.get('subtitle','N/A')} "
            f"| From: {edu.get('date_from','N/A')} | To: {edu.get('date_to','N/A')}"
            for edu in unique_edu
        )

        # ----- SKILLS (deduplicate) -----
        raw_skills = employee.get('member_skills_collection', [])
        seen_skills = set()
        for skill_entry in raw_skills:
            skill_name = skill_entry.get('member_skill_list', {}).get('skill', 'N/A')
            if skill_name not in seen_skills:
                seen_skills.add(skill_name)

        skills_str = ", ".join(seen_skills) if seen_skills else ""

        # Build final row
        row = {
            "ID": id_val,
            "Name": name_val,
            "Headline/Title": headline_val,
            "Location": location_val,
            "Country": country_val,
            "URL": url_val,
            "Industry": industry_val,
            "Experience Count": experience_count_val,
            "Summary": summary_val,
            "Experiences": experiences_str,
            "Educations": educations_str,
            "Skills": skills_str
        }
        rows.append(row)

    df = pd.DataFrame(rows)
    return df

if __name__ == "__main__":
    user_query = "(Chief Financial Officer) OR (CFO)"
    country = "South Africa"
    location = "Johannesburg"
    df_employees = search_employees_one_row_per_employee_dedup(
        query=user_query,
        country_filter=country,  
        location_filter=location,
        max_to_fetch=6
    )

In [2]:
df_employees

Unnamed: 0,ID,Name,Headline/Title,Location,Country,URL,Industry,Experience Count,Summary,Experiences,Educations,Skills
0,1353328,Barbara Moodley,PA to Managing Director/Group Chief Financial ...,"City of Johannesburg, Gauteng, South Africa",South Africa,https://www.linkedin.com/in/barbara-moodley-08...,Food & Beverages,1,,Role: PA to Managing Director/Group Chief Fina...,,
1,1496114,Bella Nel,Chief Financial Officer at MLA SQUARS,"City of Johannesburg, Gauteng, South Africa",South Africa,https://www.linkedin.com/in/bella-nel-247142119,Real Estate,1,,Role: Chief Financial Officer | Company: MLA S...,,
2,1695886,Beyers Müller,CFO,"City of Johannesburg, Gauteng, South Africa",South Africa,https://www.linkedin.com/in/beyers-m%c3%bcller...,Accounting,7,,Role: Group Accountant | Company: Omnia Holdin...,Institution: University of Johannesburg | Degr...,"management consulting, organizational design, ..."
3,1727569,Bobby Kitching,FINANCIAL MANGER at PROCESS AUTOMATION,"Johannesburg Area, South Africa",South Africa,https://www.linkedin.com/in/bobby-kitching-257...,Electrical & Electronic Manufacturing,2,,Role: AUDIT MANAGER | Company: RSM Betty & Dic...,Institution: CHARTERED ACCOUNTANT | Degree: CA...,"account reconciliation, budgets, financial rep..."
4,1912299,Bokang Molatlhwa,Senior Education Specialist,Johannesburg Metropolitan Area,South Africa,https://www.linkedin.com/in/bokang-molatlhwa-5...,Environmental Services,7,I am a Bsc(Hons) educated geographer and Chief...,Role: Senior Education Specialist Geography | ...,Institution: Mpolokang High School | Degree: 1...,"motivational speaking, public speaking, micros..."
5,1936470,Bonita Penny,Executive Assistant: COO to Group DIO team at ...,"City of Johannesburg, Gauteng, South Africa",South Africa,https://www.linkedin.com/in/bonita-penny-3763aa60,Information Technology & Services,8,I was not made to give up. I strive to create ...,Role: Executive Assistant to Chief Operations ...,Institution: Bracken High | Degree: - | From: ...,"corporate social responsibility, event plannin..."


### Modelling - compare candidate profiles with job description to get similarity scores and rank based on that

In [3]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import logging
import warnings
from typing import List, Optional
import re

warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def build_user_text(row, text_columns: List[str]) -> str:
    """
    Combine relevant text fields into a single string for semantic comparison.
    Handles both string and list-type columns.
    
    Args:
        row: DataFrame row containing user information
        text_columns: List of columns to include in combined text
        
    Returns:
        Combined text string
    """
    parts = []
    for col in text_columns:
        val = row.get(col)
        if pd.notnull(val):
            if isinstance(val, list):
                parts.append(' '.join(map(str, val)))
            else:
                parts.append(str(val))
    return " ".join(parts).strip()

def preprocess_text(text: str) -> str:
    """
    Clean and normalize text input by:
    - Removing emojis and special characters
    - Removing extra whitespace
    - Converting to lowercase
    """
    # Remove emojis using Unicode range patterns
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # CJK symbols
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    # Remove special characters and punctuation (keep alphanumeric and whitespace)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Convert to lowercase and clean whitespace
    text = text.lower()
    text = ' '.join(text.strip().split())
    return text

def rank_candidates_semantic(
    df_employees: pd.DataFrame,
    job_description: str,
    text_columns: Optional[List[str]] = None,
    model_name: str = 'all-MiniLM-L6-v2',
    batch_size: int = 32
) -> pd.DataFrame:
    """
    Rank candidates based on semantic similarity to job description.
    """
    try:
        logger.info("Starting candidate ranking process...")
        
        # Create working copy to avoid modifying original dataframe
        df = df_employees.copy()
        
        # Set columns for corpus
        if text_columns is None:
            text_columns = ['Summary', 'Experiences', 'Educations', 
                           'Headline/Title', 'Industry', 'Skills']
            logger.debug(f"Using default text columns: {text_columns}")
        else:
            logger.debug(f"Using custom text columns: {text_columns}")

        # 1) Create combined text for each user
        logger.info("Combining candidate text fields...")
        df['combined_text'] = df.apply(
            lambda x: build_user_text(x, text_columns), 
            axis=1
        )
        logger.info(f"Processed {len(df)} candidate profiles")

        # Handle empty texts to avoid encoding issues
        logger.info("Filtering empty candidate texts...")
        initial_count = len(df)
        df['combined_text'] = df['combined_text'].replace(r'^\s*$', np.nan, regex=True)
        df = df.dropna(subset=['combined_text']).reset_index(drop=True)
        filtered_count = len(df)
        logger.info(f"Removed {initial_count - filtered_count} empty profiles, {filtered_count} remaining")

        if df.empty:
            logger.warning("No valid candidate texts found after preprocessing")
            return pd.DataFrame()

        # 2) Initialize sentence transformer model
        logger.info(f"Initializing sentence transformer model: {model_name}")
        model = SentenceTransformer(model_name)
        
        # 3) Preprocess and embed job description
        logger.info("Preprocessing job description...")
        clean_jd = preprocess_text(job_description)
        logger.debug(f"Job description length: {len(clean_jd.split())} words")
        
        logger.info("Encoding job description...")
        job_embedding = model.encode(clean_jd, convert_to_tensor=True)
        logger.debug(f"Job embedding shape: {job_embedding.shape}")

        # 4) Embed candidate texts in batches
        logger.info("Preprocessing candidate texts...")
        user_texts = df['combined_text'].apply(preprocess_text).tolist()
        logger.debug(f"First candidate text preview: {user_texts[0][:200]}...")
        
        logger.info(f"Encoding candidate texts in batches of {batch_size}...")
        user_embeddings = model.encode(
            user_texts,
            convert_to_tensor=True,
            batch_size=batch_size,
            show_progress_bar=True
        )
        logger.info(f"Successfully encoded {len(user_texts)} candidate texts")
        logger.debug(f"Embeddings matrix shape: {user_embeddings.shape}")

        # 5) Calculate cosine similarities
        logger.info("Calculating cosine similarities...")
        similarities = util.cos_sim(job_embedding, user_embeddings)
        df['similarity_score'] = similarities.cpu().numpy().flatten()
        
        # Calculate score statistics
        min_score = df['similarity_score'].min()
        max_score = df['similarity_score'].max()
        logger.info(f"Similarity scores range: {min_score:.3f} - {max_score:.3f}")
        logger.debug(f"Score distribution:\n{df['similarity_score'].describe()}")

        # 6) Sort and return results
        logger.info("Sorting candidates by similarity score...")
        df_sorted = df.sort_values(by='similarity_score', ascending=False)\
                      .reset_index(drop=True)
        
        logger.info(f"Top candidate score: {df_sorted.iloc[0]['similarity_score']:.3f}")
        logger.info("Ranking process completed successfully")
        
        return df_sorted

    except Exception as e:
        logger.error(f"Error in ranking candidates: {str(e)}")
        raise

if __name__ == "__main__":
    job_description_text = """
        About the job
        A well-established investment management company with nationwide operations are 
        looking for a highly skilled and experienced Chief Financial Officer (CFO) to join 
        their dynamic team. This leadership role is based in Stellenbosch, Western Cape.

        As CFO, you will lead financial strategy, ensure strong financial management, 
        and drive sustainable growth, optimising operations and profitability for long-term success. 
        Additionally, the CFO will oversee all financial operations, including budgeting, 
        forecasting, reporting, cash flow management, and regulatory compliance.

        Key Responsibilities:

        Prepare accurate financial reports, including monthly, quarterly, and annual statements.
        Provide financial analysis to support strategic planning and decision-making.
        Monitor key metrics to ensure financial health and performance.
        Develop and manage budgets in alignment with company goals.
        Track financial performance, highlight variances, and recommend corrective actions.
        Oversee cash flow, receivables, and payables to maintain liquidity.
        Ensure compliance with financial regulations, tax laws, and industry standards.
        Lead external audits and coordinate with auditors.
        Collaborate with leadership on long-term financial strategies.
        Conduct financial modelling to support growth, investments, and cost management.
        Work cross-functionally with other departments within the organization (intercompany loans etc.)

        Qualifications and Experience:

        Qualified CA(SA) or CGMA
        5 years minimum of experience working in a managerial/leadership role
        Proven track record in financial modelling and analysis
    
    """ 
    try:
        df_ranked = rank_candidates_semantic(
            df_employees=df_employees,
            job_description=job_description_text,
            model_name='all-MiniLM-L6-v2'
        )
        logger.info("Script execution completed")
    except Exception as e:
        logger.critical(f"Critical error in main execution: {e}")
        print(f"Error processing candidates: {e}")

  from .autonotebook import tqdm as notebook_tqdm
INFO:__main__:Starting candidate ranking process...
INFO:__main__:Combining candidate text fields...
INFO:__main__:Processed 6 candidate profiles
INFO:__main__:Filtering empty candidate texts...
INFO:__main__:Removed 0 empty profiles, 6 remaining
INFO:__main__:Initializing sentence transformer model: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:__main__:Preprocessing job description...
INFO:__main__:Encoding job description...
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.65it/s]
INFO:__main__:Preprocessing candidate texts...
INFO:__main__:Encoding candidate texts in batches of 32...
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.17it/s]
INFO:__main__:Successfully encoded 6 candidate texts
INFO:__main__:Calculating cosine similarities...
INFO:__main__:Similarity scores range: 0.352 - 

In [4]:
df_ranked

Unnamed: 0,ID,Name,Headline/Title,Location,Country,URL,Industry,Experience Count,Summary,Experiences,Educations,Skills,combined_text,similarity_score
0,1695886,Beyers Müller,CFO,"City of Johannesburg, Gauteng, South Africa",South Africa,https://www.linkedin.com/in/beyers-m%c3%bcller...,Accounting,7,,Role: Group Accountant | Company: Omnia Holdin...,Institution: University of Johannesburg | Degr...,"management consulting, organizational design, ...",Role: Group Accountant | Company: Omnia Holdin...,0.652508
1,1727569,Bobby Kitching,FINANCIAL MANGER at PROCESS AUTOMATION,"Johannesburg Area, South Africa",South Africa,https://www.linkedin.com/in/bobby-kitching-257...,Electrical & Electronic Manufacturing,2,,Role: AUDIT MANAGER | Company: RSM Betty & Dic...,Institution: CHARTERED ACCOUNTANT | Degree: CA...,"account reconciliation, budgets, financial rep...",Role: AUDIT MANAGER | Company: RSM Betty & Dic...,0.598147
2,1936470,Bonita Penny,Executive Assistant: COO to Group DIO team at ...,"City of Johannesburg, Gauteng, South Africa",South Africa,https://www.linkedin.com/in/bonita-penny-3763aa60,Information Technology & Services,8,I was not made to give up. I strive to create ...,Role: Executive Assistant to Chief Operations ...,Institution: Bracken High | Degree: - | From: ...,"corporate social responsibility, event plannin...",I was not made to give up. I strive to create ...,0.56221
3,1496114,Bella Nel,Chief Financial Officer at MLA SQUARS,"City of Johannesburg, Gauteng, South Africa",South Africa,https://www.linkedin.com/in/bella-nel-247142119,Real Estate,1,,Role: Chief Financial Officer | Company: MLA S...,,,Role: Chief Financial Officer | Company: MLA S...,0.494944
4,1353328,Barbara Moodley,PA to Managing Director/Group Chief Financial ...,"City of Johannesburg, Gauteng, South Africa",South Africa,https://www.linkedin.com/in/barbara-moodley-08...,Food & Beverages,1,,Role: PA to Managing Director/Group Chief Fina...,,,Role: PA to Managing Director/Group Chief Fina...,0.481925
5,1912299,Bokang Molatlhwa,Senior Education Specialist,Johannesburg Metropolitan Area,South Africa,https://www.linkedin.com/in/bokang-molatlhwa-5...,Environmental Services,7,I am a Bsc(Hons) educated geographer and Chief...,Role: Senior Education Specialist Geography | ...,Institution: Mpolokang High School | Degree: 1...,"motivational speaking, public speaking, micros...",I am a Bsc(Hons) educated geographer and Chief...,0.351727
