pip installs

In [None]:
# !pip install -U sentence-transformers
# !python -m spacy download en_core_web_sm

imports

In [None]:
import torch
import pandas as pd
import numpy as np
import glob
import re
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer, util
from sentence_transformers.util import batch_to_device, cos_sim
import spacy
import sys
from IPython.display import display

sys.path.append("..")
from csv_job_translator import translate_csv

# Load the small English model
nlp = spacy.load("en_core_web_sm")

Load Translated job data


In [None]:
all_files = glob.glob(f"../data/Translated_data/*.csv")

dfs = []

# Combine all files, and add search keyword and file column
for file in all_files:
    df = pd.read_csv(file)
    # Extract Country from filename
    keyword = file.split("\\")[1].split("_")[0]
    print(keyword)
    df["country"] = keyword.lower()
    df["source_file"] = file
    dfs.append(df)
combined_df = pd.concat(dfs, ignore_index=True)

print(f"The combined dataframe has {len(combined_df)} records")

# Deduplicate based on id + title + location + company
dedupe_cols = ["title", "location", "company", ]

combined_df = combined_df.drop_duplicates(subset=dedupe_cols, keep="first").reset_index(drop=True)

print(f"The deduped dataframe has {len(combined_df)} records")


# combined_df.to_csv(
#     f"../data/{country}_deduped_and_combined.csv", 
#     index=False,
#     quotechar='"',
#     escapechar='\\',
#     encoding='utf-8-sig')

Cambodia
Indonesia
Malaysia
Myanmar
Philippines
Singapore
Thailand
Vietnam
The combined dataframe has 51625 records
The deduped dataframe has 50621 records


JobBert Dictionaries

In [23]:
taxonomy_aliases = {
    "data analyst": ["data analyst", "analytics analyst", "analytics specialist"],
    "data scientist": ["data scientist", "data science specialist"],
    "data engineer": ["data engineer", "big data engineer", "etl engineer", "data pipeline engineer"],
    "machine learning engineer": ["machine learning engineer", "ml engineer", "machine learning developer"],
    "artificial intelligence specialist": ["artificial intelligence specialist", "ai specialist", "ai expert"],
    "ai engineer": ["ai engineer", "artificial intelligence engineer", "ai developer"],
    "ai researcher": ["ai researcher", "ai scientist", "artificial intelligence researcher"],
    "deep learning engineer": ["deep learning engineer", "dl engineer", "deep learning specialist"],
    "nlp engineer": ["nlp engineer", "natural language processing engineer", "nlp specialist"],
    "computer vision specialist": ["computer vision specialist", "cv engineer", "computer vision engineer", "computer vision developer"],
    "data science consultant": ["data science consultant", "data science advisor"],
    "business intelligence Analyst": ["business intelligence analyst", "bi analyst", "business intelligence specialist"],
    "data-driven decision maker": ["data-driven decision maker", "data decision maker"],
    "data strategy consultant": ["data strategy consultant", "data strategy advisor"],
    "analytics consultant": ["analytics consultant", "analytics advisor"],
    "data product manager": ["data product manager", "data pm"],
    "ai product manager": ["ai product manager", "artificial intelligence product manager"],
    "ai strategist": ["ai strategist", "artificial intelligence strategist"],
    "data architect": ["data architect", "big data architect"],
    "generative ai developer": ["generative ai developer", "genai developer", "generative ai engineer", "genai engineer", "generative ai"],
    "power bi analyst": ["power bi analyst", "powerbi analyst"],
    "tableau developer": ["tableau developer", "tableau specialist"],
    "sql developer": ["sql developer", "database developer", "sql programmer"],
    "database engineer": ["database engineer", "db engineer"],
    "sql engineer": ["sql engineer", "database engineer", "db engineer"]
}

Title Normalization: Batch JobBert

In [24]:
# --- Flatten aliases for embedding ---
alias_texts = []
alias_to_canonical = []
for canonical, aliases in taxonomy_aliases.items():
    for alias in aliases:
        alias_texts.append(alias)
        alias_to_canonical.append(canonical)

# --- Load JobBERT and encode all aliases (real text) ---
model = SentenceTransformer("TechWolf/JobBERT-v3")
alias_embeddings = model.encode(alias_texts, convert_to_tensor=True)

# --- Batch Normalization with Aliases ---
def batch_normalize_with_aliases(titles, alias_embeddings, alias_to_canonical, threshold=0.70, batch_size=1024):
    normalized_titles = []
    for i in range(0, len(titles), batch_size):
        batch = titles[i:i+batch_size]
        batch_emb = model.encode(batch, convert_to_tensor=True)
        cos_sim = util.cos_sim(batch_emb, alias_embeddings)  # shape: (batch_size, n_aliases)
        best_scores, best_indices = torch.max(cos_sim, dim=1)
        for score, idx in zip(best_scores.tolist(), best_indices.tolist()):
            normalized_titles.append(alias_to_canonical[idx] if score >= threshold else "Unclassified")
    return normalized_titles

# --- Normalize Titles (JobBERT + Aliases, batch) ---
combined_df["normalized_title"] = batch_normalize_with_aliases(
    combined_df["title"].astype(str).tolist(),
    alias_embeddings,
    alias_to_canonical,
    threshold=0.90 # Higher threshold = Higher Accuracy = More Unclassified Data
)


print("Normalized Title Counts: ")
combined_df["normalized_title"].value_counts()

Normalized Title Counts: 


normalized_title
Unclassified                          49827
data engineer                           245
data analyst                            232
data scientist                          119
ai engineer                              67
business intelligence Analyst            40
machine learning engineer                32
data architect                           16
database engineer                        12
computer vision specialist                8
sql developer                             8
ai product manager                        6
artificial intelligence specialist        3
data product manager                      2
ai researcher                             2
data science consultant                   1
sql engineer                              1
Name: count, dtype: int64

Rule-Based Dictionaries

In [25]:
abbr_dict = {
    "ai": {"artificial", "intelligence"},
    "ml": {"machine",  "learn"},
    "nlp": {"natural",  "language", "process"},
    "llm": {"large", "language", "model"},
}

job_variant_mapping = {
    "administrator": "administration",
    "analysts": "analyst", "analysis": "analyst", "analyzing": "analyst",
    "analytics": "analytics", "analytic": "analytics",
    "architecture": "architect", "architecting": "architect",
    "consultants": "consultant", "consulting": "consultant",
    "developers": "developer", "developing": "developer", "development": "developer",
    "engineers": "engineer", "engineering": "engineer",
    "learning": "learn", "learns": "learn", "learned": "learn",
    "managers": "manager", "managing": "manager", "mangement": "manager",
    "operations": "operation", "operating": "operation",
    "researchers": "research", "researcher": "research", "researching": "research",
    "specialists": "specialist", "specializing": "specialist",
    "strategists": "strategy", "strategist": "strategy", "strategizing": "strategy", "strategies": "strategy",
    "systems": "system",
    "testers": "test", "testing": "test",
}

# Words to ignore for core roles
noise_words = {
    "based", "contract", "english", "fluency", "required", "welcome",  "enterprise", "urgent",  
    "grad", "intern", "internship", "junior", "level", "mid", "new", "senior", "sr",
    "onsite", "hybrid", "remote",
}


data_job_title_mapping = {
    "ai engineer": {
        "required": {"artificial", "intelligence"},
        "any_of": {"developer", "engineer", "architect"},
    },
    "ai architect": {"artificial", "intelligence", "architect"},
    "ai product manager": {
        "required": {"artificial", "intelligence", "product"},
        "any_of": {"manager", "director", "lead"},
    },
    "ai researcher": {
        "alternatives": [
            {
                "required": {"artificial", "intelligence"},
                "any_of": {"scientist", "research"},
            },
            {
                "required": {"machine", "learn"},
                "any_of": {"scientist", "research"},
            },
            {
                "required": {"large", "model"},
                "any_of": {"scientist", "research"},
            },
            {
                "required": {"large", "language", "model"},
                "any_of": {"scientist", "research"},
            },
            {
                "required": {"natural",  "language", "process"},
                "any_of": {"scientist", "research"},
            },
            {
                "required": {"genai"},
                "any_of": {"scientist", "research"},
            }
        ]
    },
    "ai strategist": {
        "alternatives": [
            {
                "required": {"artificial", "intelligence"},
                "any_of": {"strategy", "governance", "planning", "solutions", "consultant"}
            },
            {
                "required": {"artificial", "intelligence", "risk", "manager"},
            },
            {
                "required": {"genai", "strategy"},
            }
        ]
    },
    "artificial intelligence specialist": {
        "required": {"artificial", "intelligence"},
        "any_of": {"specialist", "operator"}
    },
    "analytics consultant": {"analytics", "consultant"},
    "business intelligence analyst": {
        "alternatives": [
            {
                "required": {"business", "intelligence"},
            },
            {
                "required": {"data", "warehouse"},
                "any_of": {"officer"}
            }
        ]
    },
    "computer vision specialist": {
        "required": {"vision"},
        "any_of": {"specialist", "computer", "engineer" },
    },
    "data analyst": {
        "required": {"data"},
        "any_of": {"analytics", "analyst", "specialist"},
    },
    "data architect": {"data", "architect"},
    "data driven decision maker": {"data", "driven", "decision", "maker"},
    "data engineer": {
        "alternatives": [
            {
                "required": {"data"},
                "any_of": {"engineer", "modeler", "migration"}
            },
            {
                "required": {"data", "platform"},
                "any_of": {"engineer", "modeler", "developer"}
            },
            {
                "required": {"data", "warehouse"},
                "any_of": {"engineer", "modeler", "developer"}
            },
            {
                "required": {"etl"},
                "any_of": {"engineer", "modeler", "developer"}
            }
        ]
    },
    "data governance": {
        "alternatives": [
            {
                "required": {"data"},
                "any_of": {"governance", "steward"},
            },
            {
                "required": {"data", "risk"},
                "any_of": {"manager"},
            }
        ]
        
    },
    "data product manager": {
        "alternatives": [
            {
                "required": {"data", "product"},
                "any_of": {"manager", "director", "lead"},
            },
            {
                "required": {"data", "project"},
                "any_of": {"manager", "director", "lead"},
            }
        ]
    },
    "data science consultant": {
        "required": {"data", "science"},
        "any_of": {"consultant", "adviser"},
    },
    "data scientist": {
        "required": {"data"},
        "any_of": {"scientist", "science"},
    },
    "data strategy consultant": {
        "alternatives": [
            {
                "required": {"data", "strategy"},
                "any_of": {"consultant"},
            },
            {
                "required": {"data", "strategy"}
            }
        ]
    },
    "database engineer": {
        "required": {"database"},
        "any_of": {"engineer", "administration", "analyst"},
    },
    "deep learning engineer": {"deep", "learn"},

    "generative ai developer": {
        "alternatives": [
            {
                "required": {"generative", "artificial", "intelligence"},
                "any_of": {"developer", "engineer", "architect"},
            },
            {
                "required": {"gen", "artificial", "intelligence"},
                "any_of": {"developer", "engineer", "architect"},
            },
            {
                "required": {"genai"},
                "any_of": {"developer", "engineer", "architect"}
            },
            {
                "required": {"large", "language", "model"},
                "any_of": {"developer", "engineer", "architect"},
            },

        ]
    },
    "machine learning engineer": {
        "alternatives": [
            {
                "required": {"machine", "learn"},
                "any_of": {"engineer"},
            },
            {
                "required": {"mlops"},
                "any_of": {"engineer"},
            },
            {
                "required": {"mllm"},
                "any_of": {"engineer"}
            },
        ]
    },  
    "nlp engineer": {"natural", "language", "process", "engineer"},
    "power bi analyst": {
        "required": {"power", "bi"},
        "any_of": {"visualization", "analyst", "tool", "developer", "specialist"}
    },
    "sql developer": {
        "required": {"developer"},
        "any_of": {"postgresql", "mysql", "sql", "oracle", "plsql"},
    },
    "sql engineer": {
        "required": {"engineer"},
        "any_of": {"postgresql", "postgre", "mysql", "sql", "oracle", "plsql"},
    },
    "tableau analyst": {
        "required": {"tableau"},
        "any_of": {"visualization", "analyst", "tool", "developer", "specialist"}
    },
}

Title Normalization: Dictionary-based Approach 

In [26]:
# --- Token normalization ---
def normalize_job_title(title):
    # Tokenize the title 
    doc = nlp(title.lower())

    # List to store normalized tokens
    tokens_set = set()

    for token in doc:
        if token.is_punct or token.is_space or token.is_stop:
            continue
        
        # Get both original and lemma forms
        original_word = re.sub(r"[-_.,(){}\[\]/]", " ", token.text.lower())
        lemma_word = re.sub(r"[-_.,(){}\[\]/]", " ", token.lemma_.lower())

        # Check if the word is an abbreviation or a variant
        for word in {original_word, lemma_word}:
            for w in word.split():
                # Ignore noise words
                if not w or w in noise_words:
                    continue

                # Expand abbreviations
                if w in abbr_dict:
                    tokens_set.update(abbr_dict[w])

                # Check if the word is a variant
                elif w in job_variant_mapping:
                    tokens_set.add(job_variant_mapping[w])
                else:
                    tokens_set.add(w)

    print(title, "->", tokens_set)
    return tokens_set


# --- Match function ---
def match_job_title(tokens_set, mapping):
    # Alternative mapping
    if isinstance(mapping, dict) and "alternatives" in mapping:
        for option in mapping["alternatives"]:
            required = option.get("required", set())
            any_of = option.get("any_of", set())
            one_of = option.get("one_of", set())
            
            if (
                required.issubset(tokens_set) and 
                (not one_of or tokens_set & one_of) and
                (not any_of or tokens_set & any_of)
            ):
                return True
        return False

    # Flexible mapping
    if isinstance(mapping, dict):
        required = mapping.get("required", set())
        any_of = mapping.get("any_of", set())
        one_of = mapping.get("one_of", set())

        if (
            required.issubset(tokens_set) and 
            (not one_of or tokens_set & one_of) and
            (not any_of or tokens_set & any_of)
        ):
            return True

    # Strict mapping
    elif isinstance(mapping, set):
        if mapping.issubset(tokens_set):
            return True

    return False


# --- Dictionary fallback (only for Unclassified) ---
def dict_fallback(row, title_cache, mapping_dict):
    # Keep existing classification if not Unclassified
    if row["normalized_title"] != "Unclassified":
        return row["normalized_title"]

    raw_words_set = title_cache[row["title"]]
    search_kw = row["search_keyword"].lower()

    # Step 1: Search keyword priority
    if search_kw in mapping_dict:
        mapping = mapping_dict[search_kw]
        if match_job_title(raw_words_set, mapping):
            return search_kw

    # Step 2: Check against dictionary
    for normalized, mapping in mapping_dict.items():
        if match_job_title(raw_words_set, mapping):
            return normalized

    # Step 3: Still unclassified
    return "Unclassified"


# --- Pre-process unique titles ---
unique_titles = combined_df["title"].unique()
title_cache = {title: normalize_job_title(title) for title in unique_titles}

# --- Apply dictionary only to Unclassified ---
combined_df["normalized_title"] = combined_df.apply(
    lambda row: dict_fallback(row, title_cache, data_job_title_mapping),
    axis=1
)


Civil Engineer -> {'civil', 'engineer'}
Assistant Manager, IT Security Engineer -> {'manager', 'security', 'engineer', 'assistant'}
Production Engineer -> {'production', 'engineer'}
Engineering Manager -> {'engineer', 'manager'}
Production Engineering (PPSEZ) -> {'production', 'ppsez', 'engineer'}
Senior Sales Engineer -> {'engineer', 'sales', 'sale'}
Electrical Engineer -> {'engineer', 'electrical'}
Maintenance Electrical Engineer -> {'engineer', 'electrical', 'maintenance'}
Assistant Mgr, IT Security Engineer -> {'engineer', 'security', 'mgr', 'assistant'}
IT And ERP Engineer -> {'engineer', 'erp'}
ELV Design Engineer (TMY68276 - 1) -> {'1', 'tmy68276', 'engineer', 'elv', 'design'}
Security Engineer -> {'engineer', 'security'}
Senior Manager, DevOps Engineer -> {'manager', 'devops', 'engineer'}
QA Engineering (Bavet) -> {'engineer', 'qa', 'bavet'}
Senior ELV Design Engineer (TMY68276) -> {'design', 'elv', 'tmy68276', 'engineer'}
Test Automation Engineer -> {'engineer', 'test', 'autom

In [27]:
print("Normalized Title Counts: ")
combined_df["normalized_title"].value_counts()

Normalized Title Counts: 


normalized_title
Unclassified                          45982
data analyst                           1455
data engineer                           945
ai engineer                             547
data scientist                          331
database engineer                       229
business intelligence analyst           164
machine learning engineer               134
ai researcher                           122
sql developer                            94
data architect                           78
data product manager                     75
artificial intelligence specialist       69
ai strategist                            66
data governance                          62
power bi analyst                         45
computer vision specialist               42
business intelligence Analyst            40
analytics consultant                     34
sql engineer                             24
ai product manager                       24
generative ai developer                  24
data strategy c

More Filtering

In [None]:
data_center_mask = combined_df['title'].str.lower().str.contains('data center|data centre')
combined_df.loc[data_center_mask, 'normalized_title'] = 'Unclassified'

# combined_df[data_center_mask][['title', 'normalized_title']].drop_duplicates()

Unnamed: 0,title,normalized_title
66,"Assistant Manager, IT Data Center & Server",Unclassified
86,"Manager, IT Data Center & Server",Unclassified
153,Project Director - Data Center Infrastructure,Unclassified
169,Pre Sales Director - Data Center Infrastructure,Unclassified
203,Project Manager (Data Center),Unclassified
1189,Data Center Staff,Unclassified
1912,Data Center Lead,Unclassified
1929,"Account Management, Data Center Lead",Unclassified
1934,HVAC Engineer (Data Center),Unclassified
1936,Alibaba Data Center Facility Engineer,Unclassified


Saving File

In [29]:
combined_df.to_csv(
    f"../data/ASEAN_combined_with_normalized_title.csv", 
    index=False,
    quotechar='"',
    escapechar='\\',
    encoding='utf-8-sig')

Display/Save Data

In [None]:
# --- Display setting ---

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# --- preview_df_unclassified ---

preview_df_unclassified = combined_df.loc[
    combined_df["normalized_title"] == "Unclassified",
    ["title", "search_keyword", "normalized_title"]
].drop_duplicates(subset=["title"])


# To view the unclassified dataset in a csv file format (title, search_keyword, normalized_title):
preview_df_unclassified.to_csv(
    f"../data/ASEAN_preview_unclassified.csv", 
    index=False,
    quotechar='"',
    escapechar='\\',
    encoding='utf-8-sig')


# --- df_unclassified ---

unclassified_df = combined_df[combined_df["normalized_title"] == "Unclassified"]

# To save all of the unclassified data in a csv file format
unclassified_df.to_csv(
    f"../data/ASEAN_unclassified.csv", 
    index=False,
    quotechar='"',
    escapechar='\\',
    encoding='utf-8-sig')


# --- df_classified ---

classified_df = combined_df[combined_df["normalized_title"] != "Unclassified"]
# To save all of the unclassified data in a csv file format
classified_df.to_csv(
    f"../data/ASEAN_classified.csv", 
    index=False,
    quotechar='"',
    escapechar='\\',
    encoding='utf-8-sig')


# --- Display data ---

# preview_df_unclassified
# unclassified_df
# classified_df
classified_df["normalized_title"].value_counts()



normalized_title
data analyst                          1436
data engineer                          797
ai engineer                            547
data scientist                         331
database engineer                      229
business intelligence analyst          164
machine learning engineer              134
ai researcher                          122
sql developer                           94
data architect                          70
artificial intelligence specialist      69
ai strategist                           66
data governance                         62
power bi analyst                        45
computer vision specialist              42
data product manager                    40
business intelligence Analyst           40
analytics consultant                    34
sql engineer                            24
ai product manager                      24
generative ai developer                 24
data strategy consultant                15
tableau analyst                      

Show Specific Job Title

In [31]:
# fix text wrapping, and display
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", None)   # show all rows

# preview_df_gen_ai_engineer = combined_df.loc[
#     combined_df["normalized_title"] == "generative ai developer",
#     ["title", "search_keyword", "normalized_title", "site", "country"]
# ].drop_duplicates(subset=["title"])

# preview_df_gen_ai_engineer

# preview_df_ai_engineer = combined_df.loc[
#     combined_df["normalized_title"] == "ai engineer",
#     ["title", "search_keyword", "normalized_title"]
# ].drop_duplicates(subset=["title"])

# preview_df_ai_engineer


# preview_df_ai_specialist = combined_df.loc[
#     combined_df["normalized_title"] == "ai specialist",
#     ["title", "search_keyword", "normalized_title"]
# ].drop_duplicates(subset=["title"])

# preview_df_ai_specialist

# preview_df_ai_product_manager = combined_df.loc[
#     combined_df["normalized_title"] == "ai product manager",
#     ["title", "search_keyword", "normalized_title"]
# ].drop_duplicates(subset=["title"])

# preview_df_ai_product_manager

# preview_df_data_scientist = combined_df.loc[
#     combined_df["normalized_title"] == "data scientist",
#     ["title", "search_keyword", "normalized_title"]
# ].drop_duplicates(subset=["title"])

# preview_df_data_scientist


# preview_df_data_analyst = combined_df.loc[
#     combined_df["normalized_title"] == "data analyst",
#     ["title", "search_keyword", "normalized_title"]
# ].drop_duplicates(subset=["title"])

# preview_df_data_analyst

# preview_df_data_engineer = combined_df.loc[
#     combined_df["normalized_title"] == "data engineer",
#     ["title", "search_keyword", "normalized_title"]
# ].drop_duplicates(subset=["title"])

# preview_df_data_engineer


# preview_df_machine_learning_engineer = combined_df.loc[
#     combined_df["normalized_title"] == "machine learning engineer",
#     ["title", "search_keyword", "normalized_title"]
# ].drop_duplicates(subset=["title"])

# preview_df_machine_learning_engineer


# preview_df_power_bi_analyst = combined_df.loc[
#     combined_df["normalized_title"] == "power bi analyst",
#     ["title", "search_keyword", "normalized_title"]
# ].drop_duplicates(subset=["title"])

# preview_df_power_bi_analyst

preview_df_data_centre = combined_df.loc[
    combined_df["title"].str.contains("data center", case=False, na=False),
    ["title", "search_keyword", "normalized_title", "site", "country"]
].drop_duplicates(subset=["title"])

preview_df_data_centre



Unnamed: 0,title,search_keyword,normalized_title,site,country
66,"Assistant Manager, IT Data Center & Server",data analyst,Unclassified,jobnet,cambodia
86,"Manager, IT Data Center & Server",data product manager,Unclassified,jobnet,cambodia
153,Project Director - Data Center Infrastructure,ai engineer,Unclassified,jobstreet,indonesia
169,Pre Sales Director - Data Center Infrastructure,ai engineer,Unclassified,jobstreet,indonesia
203,Project Manager (Data Center),ai engineer,Unclassified,jobstreet,indonesia
1189,Data Center Staff,data analyst,Unclassified,jobstreet,indonesia
1912,Data Center Lead,data driven decision maker,Unclassified,jobstreet,indonesia
1929,"Account Management, Data Center Lead",data driven decision maker,Unclassified,jobstreet,indonesia
1934,HVAC Engineer (Data Center),data engineer,Unclassified,jobstreet,indonesia
1936,Alibaba Data Center Facility Engineer,data engineer,Unclassified,jobstreet,indonesia
