In [3]:
import re
import pandas as pd
import numpy as np
from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoModel
import torch

import os
import openai
from openai import OpenAI

from difflib import get_close_matches # sequence (character based) comparision

In [2]:
# ! pip install openai

Collecting openai
  Downloading openai-1.52.2-py3-none-any.whl.metadata (24 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.52.2-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.9/386.9 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.6-py3-none-any.whl (78 kB)
[2K   [90m━

In [4]:
# OPENAI_API_KEY = ""
client = OpenAI(
  api_key=OPENAI_API_KEY,  # this is also the default, it can be omitted
)

In [5]:
job_levels_dict = {
    'C-Level': 19,
    'Executive VP': 18,
    'Senior VP': 17,
    'VP': 16,
    'Head': 15,
    'Senior Director': 14,
    'Director': 13,
    'Senior Manager': 12,
    'Manager': 11,
    'Architect': 10,
    'Supervisor': 9,
    'Senior Employee': 8,
    'Coordinator': 7,
    'Analyst': 6,
    'Specialist': 5,
    'Administrator': 4,
    'Engineer': 3,
    'Developer': 2,
    'Consultant': 1
}

job_areas_dict = {
    'Executives':	["CLO / Chief Learning Office","Treasurer","Owner / Co-owner","Executives","Project/Program Management","CCO / Chief Compliance Officer","CDO / Chief Data Officer","CHRO / Chief Human Resource Officer","CISO / Chief Information Security Officer","CPO / Chief People Officer","CSO / Chief Security Officer","General Counsel","Managing Partner","Senior Management","Proprietor","Principal","President","Other CXO","CMO / Chief Marketing Officer","CTO / Chief Technology Officer","COO / Chief Operations Officer","CFO / Chief Financial Officer","CIO / Chief Information Officer","Board Member","CEO","Partner","General Manager","Fellow","Chairman","Chancellor"],
    'Banking / Mortgage':	["Project/Program Management","Data Management / Analytics","Teller","Branch Manager","Audit","Mergers / Acquisitions","General Management","Lending","Credit","Title / Escrow","Banking","Underwriter","Analyst","Banking / Mortgage","Senior Management","Broker"],
    'Real Estate':	["Underwriter","Property Manager","Appraising","General Management","Data Management / Analytics","Real Estate","Title Officer","Project/Program Management","Agent / Broker / Realtor","Senior Management","Investor","Legal","Leasing / Acquisition"],
    'Biotech':	["Data Management / Analytics","Project/Program Management","Biological Research","Chemical Research","General Management","Geological Testing / Analysis","Mathematical Statistical Research","Pharmacy / Pharmaceutical","Biotech","Clinical Research","Senior Management","Environmental Testing / Analysis"],
    'Building Construction':	["Contractor / General Contractor","Equipment Operator","Senior Management","Laborer","Project Manager","Welder","Boilermaker","Plumbing / Pipefitting","HVAC","Industrial Design","Ironwork / Metal Fabrication","Painting / Wallpapering","General Management","Flooring / Tiling","Roofing","Sheetrock / Plastering","Site Superintendent","Surveying","Town / City Planning","Building Construction","Electrical","Concrete / Masonry","Carpentry / Framing","Data Management / Analytics","Architect","Heavy Equipment"],
    'Business':	["Project/Program Management","Senior Management","Relations / Communications / Partnerships","Risk / Compliance","Data Management / Analytics","General Operations","Fundraising","Analysis / Research","Public Health Administration","Titles w/o Job Function","Business","Strategic Planning / Management","Services / Support","Professional Services","Product Management","General Management","Franchise"],
    'IT / Computers / Electronics':	["Business Intelligence / Data Science","Artificial Intelligence","Desktop Service / Support","Desktop Systems","Electrical / Electronics Engineering","Electronics Design","Enterprise Software Implementation / Consulting","General Management","Graphic Arts / Illustration","Hardware R&D","Cyber Security / Information Security","Data Center","Animation / Multimedia","ASIC / Layout Design","Computer Engineering","Computer / Network Security","Data Storage / Processing","Network Systems","Database Management","Senior Management","Product Development","IT Infrastructure","Devops & Application Management","Information Systems Management","Telecommunications","Software Engineer","IT / Computers / Electronics","Web Design","Usability / Information Architecture","System Analysis","Software / Web R&D","Software / System Architecture","RF / Wireless","CAD / Drafting","IT Project Management","Network / Server Administration","Product QA"],
    'Creative / Design':	["Electronics Design","Project/Program Management","Photography / Videography","Interior Design","Industrial Design","Creative / Design","Graphic Arts / Illustration","General Management","Fashion / Accessories","Data Management / Analytics","Creative Direction","Painting / Sculpting / Printmaking","Senior Management","Advertising","Animation / Multimedia","Architecture","Web Design"],
    'Customer Support / Client Services':	["Account Management","Data Management / Analytics","Customer Service/Support","Customer Support / Client Services","Technical Support","Call Center","General Management","Customer Training","Project/Program Management","Product Support","Senior Management"],
    'Education':	["Project/Program Management","Data Management / Analytics","Senior Management","Administrator","General Management","Education","Teacher/Professor","Student","Special Education","Provost","Librarian","Fitness / Sports Training","Admissions / Advising","Archivist / Curator","Corporate Training","Early Childhood Care / Development","Faculty"],
    'Engineering':	["Enterprise Software Implementation / Consulting","Project/Program Management","Environmental / Geological Engineering","Electrical / Electronics Engineering","Facilities Engineering","Field","General Management","Industrial / Manufacturing Engineering","Energy / Nuclear Engineering","Computer Engineering","Test Engineering","Senior Management","Six Sigma / Black Belt / TQM","Acoustical Engineering","Aerospace / Avionic Engineering","Bio-Engineering","Chemical Engineering","Civil Structural Engineering","ASIC / Layout Design","Automotive Engineering","Design Engineering","Measurement Engineering","Packaging Engineering","CAD / Drafting","Engineering","System / Process Engineering","Semiconductors","RF / Wireless Engineering","Plant Engineering","Naval Architecture / Marine Engineering","Mining Engineering","Metallurgical / Materials Engineering","Data Management / Analytics","Mechanical Engineering"],
    'Finance / Accounting':	["Financial Control","Financial Planning / Advising","Financial Product Sales / Brokerage","Finance / Accounting","General Management","Investment Management","Fund Accounting","Risk Management / Compliance","Procurement","Tax Assessment / Collections","Securities Analysis / Research","Data Management / Analytics","Tax Accounting","Project/Program Management","Financial Analysis / Research","Registered Rep","Registered Investment Advisor","Senior Management","Analyst","Accounts Payable / Receivable","Audit","Bookkeeping","Collections","Controller","Corporate","Credit Review / Analysis"],
    'Insurance':	["Independent Insurance Agent","Employee Benefits Broker / Consultant","Third Party Administrator","Data Management / Analytics","Project/Program Management","Senior Management","Appraiser","Insurance","Claims Review / Adjusting","Audit","Insurance Brokerage","Tax Assessment / Collections","Tax Accounting","Risk Management / Compliance","Policy Underwriting","Planning / Advising","Investment Management","General Management","Financial Analysis / Research","Actuarial Analysis","Bookkeeping"],
    'Food Services / Hospitality':	["Project/Program Management","Data Management / Analytics","Food Services / Hospitality","Winery","Restaurant Management","Host / Hostess","Senior Management","Guide","Guest Services / Concierge","General Management","Front Desk / Reception","Food / Beverage Serving","Food Preparation / Cooking","Hotel / Lodging Management"],
    'Human Resources':	["Payroll / Benefits","Data Management / Analytics","Diversity Management / EEO / Compliance","General Management","Project/Program Management","Compensation / Benefit Policy","Corporate Communications","Corporate Development","Recruiting / Sourcing","Human Resources","Wellness","Learning/Training","HR Systems Administration","Organizational Development","Senior Management","Talent Management"],
    'Installation / Maintenance / Repair':	["Field Services","Plumbing / Pipefitting","Locksmith","Installation / Maintenance / Repair","Telecommunications","Wire / Cable","Data Management / Analytics","Oil Rig / Pipeline","Automotive / Vehicle","Computer / Electronics","Electrician","Equipment","General Management","HVAC","Senior Management","Project/Program Management","Janitorial / Cleaning","Facilities"],
    'Legal':	["Senior Management","Tax Lawyer","Secretary","Regulatory / Compliance","Patent / IP","Real Estate","Litigation","Judge","Legal","General Management","Attorney","Law Firm Attorney","Contracts Administration","Legal Operations","Jury Consultant","Arbitrator/Mediator","Labor / Employment","Paralegal","Project/Program Management","Data Management / Analytics"],
    'Logistics / Transportation':	["Purchasing","Public Transportation","Piloting: Air / Marine","Car / Truck / Van / Bus Driving","Merchandise Planning / Buying","Cargo / Baggage Handling","Data Management / Analytics","Maritime / Marine / Shipbuilding","Inventory Planning / Management","General Management","Import / Export Administration","Senior Management","Project/Program Management","Hazardous Material Handling","Recreational Vehicles","Logistics / Transportation","Vehicle Dispatch / Routing / Scheduling","Train / Rail Operator","Supply Chain Management","Equipment / Forklift / Crane Operation","Logistics","Shipping / Receiving / Warehousing"],
    'Manufacturing / Production / Operations':	["Operations","Manufacturing / Production / Operations","Assembly / Assembly Line","Machining / CNC","General Management","Hazardous Material Handling","Moldmaking / Casting","Operations / Plant Management","Packaging","Materials","Planning","Production","Project/Program Management","Scientific / Technical Production","Layout / Prepress / Printing / Binding","Equipment","Data Management / Analytics","Senior Management","Field Operations","Metal Fabrication / Welding"],
    'Marketing':	["Affiliate Marketing","Brand / Product Marketing","Social Media","SEO","Marketing","Channel Marketing","Public Relations","Telemarketing","Visual / Display Merchandising","Advertising","Data Management / Analytics","Market Research","Project/Program Management","Content Marketing","Demand / Lead Generation","Marketing Operations","Product Management","Media Planning / Buying","Marketing Production / Traffic","Copy Writing / Editing","Corporate Marketing","Direct Marketing","Senior Management","Events / Promotional Marketing","Fundraising","General Management","Digital Marketing","Investor / Public / Media Relations","Marketing Communications"],
    'Medical and Health':	["Data Management / Analytics","Dental Practitioner","Project/Program Management","Medical Therapy / Rehab Services","Senior Management","Mental Health","Research","Ophthalmology","Healthcare Aid","Internal Medicine","Laboratory / Pathology","Medical Device","Radiology","Public Health","Pharmacy / Pharmaceutical","Orthotist","Medical Imaging","Nutrition / Dieting","Medical Practitioner","Nursing / Hospital","General Management","Hospital / Clinic","EMT / Paramedic","Neurology","Dental Assistant / Hygienist","Chiropractic","Veterinary / Animal Care","Sports Medicine","Social Service","Medical and Health"],
    'Quality Assurance / Safety':	["Occupational Heal / Safety","ISO Certification","Building Inspection","Senior Management","Quality Systems","General Management","Project/Program Management","Fraud Investigation","Production Quality","Vehicle Inspection","Quality Assurance / Safety","Food Safety / Inspection","Data Management / Analytics","Environmental Protection","Process Improvement"],
    'Security Services':	["Project/Program Management","Store Security / Loss Prevention","Data Management / Analytics","Security Services","Senior Management","Security Intelligence / Analysis","Military Combat","Security Guard","Police-Law Enforcement","General Management","Firefighting / Rescue","Customs / Immigration","Correctional Officer","Airport Security / Screening"],
    'Journalism / Media / Entertainment':	["Data Management / Analytics","Translation / Interpretation","Senior Management","Project/Program Management","Music / Composing","Journalism / Media / Entertainment","Motion / Talk / Sound / Commercial Pictures","Journalism","General Management","Editing / Proofreading","Painting / Sculpting / Printmaking","Public Relations","Radio / Broadcasting","Documentation / Technical Writing","Video / Television","Author","Digital Content Development"],
    'Agriculture':	["Fertilizer / Agriculture Chemicals","Project/Program Management","Feed / Grain / Milling","Meats / Provisions","Livestock / Breed","Health Foods / Natural Foods","General Management","Forestry / Logging","Food Processing / Distribution","Florists / Floriculture","Field Crops / Soil Management","Poultry / Poultry Products","Senior Management","Dairy Products","Farming / Farming Equipment","Produce (Fruits / Vegetables)","Seed / Nursery Trade","Agriculture","Data Management / Analytics","Fishing"],
    'Retail':	["Retail","Wholesale / Reselling","Visual / Display Merchandising","General Management","Store / Branch Management","Retail Services","eCommerce","Retail / Counter Sales","Merchandise Planning / Buying","Event Ticketing","Cashier","Project/Program Management","Senior Management","Data Management / Analytics"],
    'Sales':	["International Sales","Business Development","Data Management / Analytics","Project/Program Management","General Management","Media / Advertising Sales","Product Sales","General Sales","Telesales","Sales","Senior Management","Sales Operations","Field Service","Agent / Broker"],
    'Hardware And Product Design / Maintenance':	["Hardware And Product Design / Maintenance","Hardware R&D","Usability / Information Architecture","Graphic Arts / Illustration","CAD / Drafting","Computer Engineering","Electrical / Electronics Engineering","Electronics Design","Animation / Multimedia","General Management","RF / Wireless","Telecommunications","ASIC / Layout Design"],
    'Other':	["Other"],
    'Religious':	["Religious"],
    'Government':	["Government"],
    'Line Of Business':	["Human Resources","Creative / Design","Line Of Business","Insurance","Finance / Accounting","Engineering","Education","Customer Support / Client Services","IT / Computers / Electronics","Business","Building Construction","Biotech","Real Estate","Banking / Mortgage","Food Services / Hospitality","Installation / Maintenance / Repair","Legal","Sales","General Management","Retail","Agriculture","Journalism / Media / Entertainment","Security Services","Quality Assurance / Safety","Medical and Health","Marketing","Manufacturing / Production / Operations","Logistics / Transportation","Executives"],
    'Unusable':	["Unusable"]
}

keywords = ['keyword', 'anything with', 'must contain', 'must have', 'containing']

exclude_keywords = ['exclude', 'unwanted', 'suppress', 'avoid', 'not be accept', 'not target', 'omit', 'no', 'exclusion', 'can not accept']

In [6]:
# Example input text from your description
input_text = {
    "geo": "UKI - 25%\nGermany - 20%\nFrance - 15%\nME (UAE+SA) - 10%\nNL - 7%\nNordics (Denmark, SW, FI, NO) - 6%\nSpain - 3%\nBelux - 10%",
    "revenue": "1 - 1M\n1M - 5M\n5M - 20M\n20M - 50M\n50M - 100M\n100M - 500M\n500M - 1B\n1B+",
    "sub_cid": "LC7481-1",
    "industry": "Accommodation and Food Services\nAdministrative and Support and Waste Management and Remediation Services\nAgriculture, Forestry, Fishing and Hunting\nArts, Entertainment, and Recreation\nConstruction\nFinance and Insurance\nHealth Care and Social Assistance\nInformation\nManagement of Companies and Enterprises\nManufacturing\nMining, Quarrying, and Oil and Gas Extraction\nOther Services (except Public Administration)\nProfessional, Scientific, and Technical Services\nPublic Administration\nReal Estate and Rental and Leasing\nRetail Trade\nTransportation and Warehousing\nUtilities\nWholesale Trade",
    "job_level": "VP+\nManager",
    "job_title": "Any C-suite: CDO, CTO, CIO, CMO, CEO, CFO, CSO, COO\nData Team Manager+\nMachine Learning (ML)\nHead of Finance\nVP in Technology\nCloud Architects\nData Engineers Data Analysts\nData insights Analytics titles\nArtificial Intelligence (AI)\nData Science (DS) titles\nVP+ in Marketing\nPredictive Marketing\nVP+ in IT\nVP+ in Finance\nHead of Audit\nVP+ in Software\nVP+ in Sales Operations\nVP+ in Statistics\nData Scientists\nBusiness Intelligence titles\nData Architects\nData/Digital Transformation titles\nData/Digital Innovation titles\nData Execs\nPrivacy, Risk, Governance, Compliance, Security & Audit titles",
    "company_size": "1 - 50\n51 - 100\n101 - 250\n251 - 500\n500 - 1000\n501 - 1000\n1001 - 2000\n1001 - 5000\n2001 - 5000\n5001 - 10000\n10000+\n10001+",
    "job_function": "Engineering"
}

In [7]:
# Function to clean and normalize text
def clean_text(text):
    return text.strip().lower()

def get_input_list(input_text, key):
    try:
        return input_text.get(key, '').split('\n')
    except AttributeError:
        return []

# Global caches
embedding_cache = {}
key_embeddings_cache = defaultdict(dict)  # For storing embeddings per dictionary and model

def get_embedding(text, model="text-embedding-ada-002"):
    cache_key = (text, model)
    if cache_key in embedding_cache:
        return embedding_cache[cache_key]
    else:
        response = client.embeddings.create(
            input=text,
            model=model
        )
        embedding = np.array(response.data[0].embedding)
        embedding_cache[cache_key] = embedding
        return embedding

def get_normalized_embeddings(texts, model="text-embedding-ada-002"):
    embeddings = []
    for text in texts:
        emb = get_embedding(text, model)
        norm_emb = emb / np.linalg.norm(emb)
        embeddings.append(norm_emb)
    return np.array(embeddings)

def match_with_dict(item, dictionary, area=None, model_type="default", threshold=0.8):
    # Select keys based on area
    if area is not None:
        keys = dictionary.get(area, [])
    else:
        keys = list(dictionary.keys())

    if not keys:
        return 'NA'

    if model_type == "gpt":
        # Normalize input embedding
        input_embedding = get_embedding(item)
        input_embedding_norm = input_embedding / np.linalg.norm(input_embedding)

        # Check if embeddings for these keys are already cached
        dict_id = id(dictionary)
        if dict_id in key_embeddings_cache and model_type in key_embeddings_cache[dict_id]:
            key_embeddings = key_embeddings_cache[dict_id][model_type]
        else:
            # Precompute and cache normalized embeddings for dictionary keys
            key_embeddings = {}
            for key in dictionary.keys():
                emb = get_embedding(key)
                key_embeddings[key] = emb / np.linalg.norm(emb)
            key_embeddings_cache[dict_id][model_type] = key_embeddings

        # Filter embeddings for relevant keys
        relevant_keys = [key for key in keys if key in key_embeddings]
        if not relevant_keys:
            return 'NA'
        embeddings = np.array([key_embeddings[key] for key in relevant_keys])

        # Compute cosine similarities using vectorized operations
        scores = embeddings @ input_embedding_norm

        # Find the best match
        max_score = np.max(scores)
        if max_score >= threshold:
            match_idx = np.argmax(scores)
            return relevant_keys[match_idx]
        else:
            return 'NA'

    elif model_type == "bert":
        from transformers import AutoTokenizer, AutoModel
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        model = AutoModel.from_pretrained("bert-base-uncased")
        model.eval()

        # Cache embeddings
        def compute_bert_embedding(text):
            if text in embedding_cache:
                return embedding_cache[text]
            inputs = tokenizer(text, return_tensors='pt')
            outputs = model(**inputs)
            embedding = outputs.last_hidden_state.mean(dim=1).detach().numpy().flatten()
            embedding_cache[text] = embedding / np.linalg.norm(embedding)
            return embedding_cache[text]

        input_embedding = compute_bert_embedding(item)
        embeddings = np.array([compute_bert_embedding(key) for key in keys])

        scores = embeddings @ input_embedding
        max_score = np.max(scores)
        if max_score >= threshold:
            match_idx = np.argmax(scores)
            return keys[match_idx]
        else:
            return 'NA'

    elif model_type == "TFIDF":
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity

        vectorizer = TfidfVectorizer()
        all_texts = [item] + keys
        tfidf_matrix = vectorizer.fit_transform(all_texts)

        # Compute cosine similarities
        scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

        max_score = np.max(scores)
        if max_score >= threshold:
            match_idx = np.argmax(scores)
            return keys[match_idx]
        else:
            return 'NA'

    else:
        # Default case: sequence similarity
        from difflib import get_close_matches
        closest_match = get_close_matches(item, keys, n=1, cutoff=threshold)
        if closest_match:
            return closest_match[0]
        else:
            return 'NA'

## To add the case handling when two or more keys (eg. levels) are present in item


# Parsing and matching function
def parse_job_titles(job_titles, job_levels_dict, job_areas_dict, exclude_keywords):
    output = []

    for title in job_titles:
        clean_title = clean_text(title)

        # Check for exclusion keywords
        if any(exclude_kw in clean_title for exclude_kw in exclude_keywords):
            output.append(["Exclude", None, None, None, clean_title])
            continue

        # Match with job levels
        job_level = match_with_dict(clean_title, job_levels_dict)

        # Match with job areas
        job_area = match_with_dict(clean_title, job_areas_dict)

        # If not able to match, set NA
        if not job_level: job_level = 'N/A'
        if not job_area: job_area = 'N/A'

        # Add to output
        output.append(["Include", job_level, job_area, None, clean_title])

    return output

# Function to compute similarity score using LLM embeddings
def compute_llm_similarity(text1, text2, model, tokenizer):
    # Tokenizing and encoding the inputs, using the newly added pad token if necessary
    inputs1 = tokenizer(text1, return_tensors='pt', truncation=True, padding=False)
    inputs2 = tokenizer(text2, return_tensors='pt', truncation=True, padding=False)
    # Generating embeddings
    with torch.no_grad():
        outputs1 = model(**inputs1).last_hidden_state.mean(dim=1)  # Take the mean of hidden states for simplicity
        outputs2 = model(**inputs2).last_hidden_state.mean(dim=1)
    # Compuinge cosine similarity between embeddings
    cosine_sim = torch.nn.functional.cosine_similarity(outputs1, outputs2).item()
    return cosine_sim

In [33]:
def process_items(input_text, match_model="gpt"):
    # initialize the output
    output = []

    # Extract items from raw
    job_titles_raw = get_input_list(input_text, 'job_title')
    print('raw titles: ', job_titles_raw)
    job_levels_raw = get_input_list(input_text, 'job_level')
    print('raw levels: ', job_levels_raw)
    job_functions_raw = get_input_list(input_text, 'job_function')
    print('raw functions: ', job_functions_raw)

    # get job levels check
    titles_contain_level = any([match_with_dict(t_item, job_levels_dict, model_type=match_model) for t_item in job_titles_raw])
    functions_contain_level = any([match_with_dict(f_item, job_levels_dict, model_type=match_model) for f_item in job_functions_raw])

    ## Placeholder for paired cases implementation

    # Level - Case 1: Check if job_level block is not empty and job title/function contain higher job level
    if len(job_levels_raw) > 0 and (titles_contain_level or functions_contain_level):

      # Get highest levels to compare
      max_level = max(list(filter(lambda x: x is not None, [job_levels_dict.get(match_with_dict(clean_text(level), job_levels_dict, model_type=match_model)) for level in job_levels_raw]))+[-1])
      max_t_level = max(list(filter(lambda x: x is not None, [job_levels_dict.get(match_with_dict(t_item, job_levels_dict, model_type=match_model)) for t_item in job_titles_raw]))+[-1])
      max_f_level = max(list(filter(lambda x: x is not None, [job_levels_dict.get(match_with_dict(f_item, job_levels_dict, model_type=match_model)) for f_item in job_functions_raw]))+[-1])

      if max_level > max_t_level and max_level > max_f_level:
        levels = list(filter(lambda x: x is not None, [match_with_dict(clean_text(level), job_levels_dict, model_type=match_model) for level in job_levels_raw]))
      elif max_t_level > max_f_level:
        levels = list(filter(lambda x: x is not None, [match_with_dict(t_item, job_levels_dict, model_type=match_model) for t_item in job_titles_raw]))
      else:
        levels = list(filter(lambda x: x is not None, [match_with_dict(f_item, job_levels_dict, model_type=match_model) for f_item in job_functions_raw]))


    # Lavel - Case 2: levels are available only in levels block (apply to all areas and functions)
    elif len(job_levels_raw) > 0:
      levels = [match_with_dict(clean_text(level), job_levels_dict, model_type=match_model) for level in job_levels_raw]

    # Level - Case 3: a/ titles contain job levels
    elif titles_contain_level:
      levels = [job_levels_dict.get(match_with_dict(t_item, job_levels_dict, model_type=match_model)) for t_item in job_titles_raw]
    elif functions_contain_level:
      levels = [job_levels_dict.get(match_with_dict(f_item, job_levels_dict, model_type=match_model)) for f_item in job_functions_raw]

    else: # no such cases
      levels = []

    # Rule 18: Handle the + case (melting) as the default
    min_level = min(list(filter(lambda x: x is not None, [job_levels_dict.get(match_with_dict(clean_text(level), job_levels_dict, model_type=match_model)) for level in levels]))+[100])
    levels = levels + [missing_level for missing_level in job_levels_dict.keys() if job_levels_dict.get(missing_level) > min_level and missing_level not in levels]
    ## Target audience output placeholder
    target_audince_level = list(job_levels_dict.keys())[list(job_levels_dict.values()).index(min_level)] + '+'
    ## Are there except cases for levels? Placeholder to handle

    # get job areas check
    functions_area = list(set([match_with_dict(f_item, job_areas_dict, model_type=match_model) for f_item in job_functions_raw]))
    functions_contain_area = any(functions_area)
    titles_area = list(set([match_with_dict(t_item, job_areas_dict, model_type='gpt') for t_item in job_titles_raw]))
    titles_contain_area = any(titles_area)
    # print('areas in titles', [match_with_dict(t_item, job_areas_dict, model_type='gpt') for t_item in job_titles_raw])
    functions_contain_keywords = any([max([kw in item for kw in keywords]) for item in job_functions_raw])
    titles_contain_keywords = any([max([kw in item for kw in keywords]) for item in job_titles_raw])
    # Rule 2 implementation (functions higher priority over titles)
    search_areas = functions_area + titles_area

    for level in levels:
      # Area case 1: area can be found
      if functions_contain_area or titles_contain_area:
        for item_a in search_areas:
            matched_area = match_with_dict(clean_text(item_a), job_areas_dict)
            if matched_area not in ('NA', None):
                job_area_is_matched = True
                area = matched_area
                functions_contain_function = any([match_with_dict(f_item, job_areas_dict, area, model_type=match_model) for f_item in job_functions_raw])
                titles_contain_function = any([match_with_dict(t_item, job_areas_dict, area, model_type=match_model) for t_item in job_titles_raw])
                # Option 0: customFunction case - no mapping available and no keywords
                if not (functions_contain_function or titles_contain_function or functions_contain_keywords or titles_contain_keywords):
                  # Option 0.1: customFunction through functions
                  for item_f in job_functions_raw:
                    if not (match_with_dict(item_f, job_levels_dict, model_type=match_model) or match_with_dict(item_f, job_areas_dict, model_type=match_model)):
                      function = item_f ## to replace with generative (completion) model
                      source = 'customFunction'
                      operator = any(exclude_kw in item_a or exclude_kw in item_f for exclude_kw in exclude_keywords)
                      # add row item
                      output.append([operator, level, area, function, source, ''])
                  # Option 0.2: customFunction through titles
                  for item_t in job_titles_raw:
                    if not (match_with_dict(item_t, job_levels_dict, model_type=match_model) or match_with_dict(item_t, job_areas_dict, model_type=match_model)):
                      function = item_t ## to replace with generative (completion) model
                      source = 'customFunction'
                      operator = any(exclude_kw in item_a or exclude_kw in item_f for exclude_kw in exclude_keywords)
                      # add row item
                      output.append([operator, level, area, function, source, ''])
                # Option 1.1: function through functions and Rule 2 again
                elif functions_contain_function:
                  for item_f in job_functions_raw:
                    matched_function = match_with_dict(item_f, job_areas_dict, area, model_type=match_model)
                    if matched_function not in ('NA', None):
                      job_function_is_matched = True
                      function = matched_function
                      source = 'function'
                      operator = any(exclude_kw in item_a or exclude_kw in item_f for exclude_kw in exclude_keywords)
                      # add row item
                      output.append([operator, level, area, function, source, ''])
                # Option 1.2: function through titles
                elif titles_contain_function:
                  for item_t in job_titles_raw:
                    matched_function = match_with_dict(item_t, job_areas_dict, area, model_type=match_model)
                    if matched_function not in ('NA', None):
                      job_function_is_matched = True
                      function = matched_function
                      source = 'function' # is it really 'function' when we map like this?
                      operator = any(exclude_kw in item_a or exclude_kw in item_t for exclude_kw in exclude_keywords)
                      # add row item
                      output.append([operator, level, area, function, source, ''])
                # Option 2.1: keywords are present in functions (for functions case)
                elif functions_contain_keywords:
                  for item_f in job_functions_raw:
                    if any([kw in item_f for kw in keywords]):
                      keyword = item_f
                      source = 'keywords'
                      operator = any(exclude_kw in item_a or exclude_kw in item_f for exclude_kw in exclude_keywords)
                      # add row item
                      output.append([operator, level, area, 'NA', source, keyword])
                # Option 2.2: keywords are present in titles (for functions case)
                else: # if titles_contain_keywords
                  for item_f in job_titles_raw:
                    if any([kw in item_t for kw in keywords]):
                      keyword = item_t
                      source = 'keywords'
                      operator = any(exclude_kw in item_a or exclude_kw in item_t for exclude_kw in exclude_keywords)
                      # add row item
                      output.append([operator, level, area, 'NA', source, keyword])
      # Area case 2: keywords
      elif functions_contain_keywords or titles_contain_keywords:
        # 1: keywords are present in functions
        if functions_contain_keywords:
          for item_f in job_functions_raw:
            if any([kw in item_f for kw in keywords]):
              keyword = item_f
              source = 'keywords'
              operator = any(exclude_kw in item_f for exclude_kw in exclude_keywords)
              # add row item
              output.append([operator, level, 'NA', 'NA', source, keyword])
        # 2: keywords are present in titles
        else: # if titles_contain_keywords
          for item_f in job_titles_raw:
            if any([kw in item_t for kw in keywords]):
              keyword = item_t
              source = 'keywords'
              operator = any(exclude_kw in item_t for exclude_kw in exclude_keywords)
              # add row item
              output.append([operator, level, 'NA', 'NA', source, keyword])
      # Area case 3: customFunctionAndArea
      else:
        source = 'customFunctionAndArea'
        # 1: customFunctionAndArea through functions
        for item_f in job_functions_raw:
          if not match_with_dict(item_f, job_levels_dict, model_type=match_model):
            function, area = item_f, item_f ## to replace with generative (completion) model
            operator = any(exclude_kw in item_a or exclude_kw in item_f for exclude_kw in exclude_keywords)
            # add row item
            output.append([operator, level, area, function, source, ''])
        # Option 0.2: customFunction through titles
        for item_t in job_titles_raw:
          if not match_with_dict(item_t, job_levels_dict, model_type=match_model):
            function, area = item_t, item_t ## to replace with generative (completion) model
            operator = any(exclude_kw in item_a or exclude_kw in item_f for exclude_kw in exclude_keywords)
            # add row item
            output.append([operator, level, area, function, source, ''])

    return output

In [34]:
# Convert to DataFrame for structured output
df = pd.DataFrame(process_items(input_text), columns=['Is_Exclude', 'Job Levels', 'Job Areas', 'Job Functions', 'Source', 'Keywords']) # not sure if to keep keywords separate in the output
df.to_csv('/content/drive/MyDrive/example_output.csv')
df

raw titles:  ['Any C-suite: CDO, CTO, CIO, CMO, CEO, CFO, CSO, COO', 'Data Team Manager+', 'Machine Learning (ML)', 'Head of Finance', 'VP in Technology', 'Cloud Architects', 'Data Engineers Data Analysts', 'Data insights Analytics titles', 'Artificial Intelligence (AI)', 'Data Science (DS) titles', 'VP+ in Marketing', 'Predictive Marketing', 'VP+ in IT', 'VP+ in Finance', 'Head of Audit', 'VP+ in Software', 'VP+ in Sales Operations', 'VP+ in Statistics', 'Data Scientists', 'Business Intelligence titles', 'Data Architects', 'Data/Digital Transformation titles', 'Data/Digital Innovation titles', 'Data Execs', 'Privacy, Risk, Governance, Compliance, Security & Audit titles']
raw levels:  ['VP+', 'Manager']
raw functions:  ['Engineering']


Unnamed: 0,Is_Exclude,Job Levels,Job Areas,Job Functions,Source,Keywords
0,False,C-Level,Engineering,Engineering,function,
1,False,C-Level,Biotech,Biotech,function,
2,False,C-Level,Finance / Accounting,Finance / Accounting,function,
3,False,C-Level,Sales,Sales,function,
4,False,C-Level,IT / Computers / Electronics,IT / Computers / Electronics,function,
...,...,...,...,...,...,...
319,False,Administrator,IT / Computers / Electronics,IT / Computers / Electronics,function,
320,False,Administrator,Human Resources,Human Resources,function,
321,False,Administrator,Marketing,Marketing,function,
322,False,Administrator,Security Services,Security Services,function,


In [39]:
input_text3 = {
    "geo": "UKI - 25%\nGermany - 20%\nFrance - 15%\nME (UAE+SA) - 10%\nNL - 7%\nNordics (Denmark, SW, FI, NO) - 6%\nSpain - 3%\nBelux - 10%",
    "revenue": "1 - 1M\n1M - 5M\n5M - 20M\n20M - 50M\n50M - 100M\n100M - 500M\n500M - 1B\n1B+",
    "sub_cid": "LC7481-1",
    "industry": "Accommodation and Food Services\nAdministrative and Support and Waste Management and Remediation Services\nAgriculture, Forestry, Fishing and Hunting\nArts, Entertainment, and Recreation\nConstruction\nFinance and Insurance\nHealth Care and Social Assistance\nInformation\nManagement of Companies and Enterprises\nManufacturing\nMining, Quarrying, and Oil and Gas Extraction\nOther Services (except Public Administration)\nProfessional, Scientific, and Technical Services\nPublic Administration\nReal Estate and Rental and Leasing\nRetail Trade\nTransportation and Warehousing\nUtilities\nWholesale Trade",
    "job_level": "Director+",
    "job_title": "",
    "company_size": "1 - 50\n51 - 100\n101 - 250\n251 - 500\n500 - 1000\n501 - 1000\n1001 - 2000\n1001 - 5000\n2001 - 5000\n5001 - 10000\n10000+\n10001+",
    "job_function": "Network"
}
# Convert to DataFrame for structured output
df = pd.DataFrame(columns=['Is_Exclude', 'Job Levels', 'Job Areas', 'Job Functions', 'Source', 'Keywords'])  # not sure if to keep keywords separate in the output
new_data = pd.DataFrame(process_items(input_text3), columns=['Is_Exclude', 'Job Levels', 'Job Areas', 'Job Functions', 'Source', 'Keywords'])
df = pd.concat([df, new_data], ignore_index=True)
df

raw titles:  ['']
raw levels:  ['Director+']
raw functions:  ['Network']


Unnamed: 0,Is_Exclude,Job Levels,Job Areas,Job Functions,Source,Keywords
0,False,Director,Marketing,Marketing,function,
1,False,C-Level,Marketing,Marketing,function,
2,False,Executive VP,Marketing,Marketing,function,
3,False,Senior VP,Marketing,Marketing,function,
4,False,VP,Marketing,Marketing,function,
5,False,Head,Marketing,Marketing,function,
6,False,Senior Director,Marketing,Marketing,function,


In [29]:
# from google.colab import drive
# drive.mount('/content/drive')


Mounted at /content/drive


In [48]:
null = ""
inputs = [{"geo": "AMER", "revenue": null, "sub_cid": "XR55058-1", "industry": null, "job_level": "manager+", "job_title": "CISO\nChief Information Security Officer\nChief Privacy Officer/IT Compliance Officer\nChief Security Officer\nChief Technology Officer\nNetwork Security Analyst\nSecurity Analyst T1\nSecurity Analyst T2\nSecurity Analyst T3\nSOC Manager\nSOC Director\nVice President Cyber Services\nVice President Head of Data Science -- Cyber Data Intelligence & Innovation\nVice President Information Security\nVice President Information Security\nVice President IT Auditor\nVice President of Cyber Defense Strategy\nVice President of Operations\nVice President Security\nVice President Security & Privacy\nVice President Security Engineering\n\n\n\nNetwork Operations Titles\nVP Network Ops\nDirector Network Ops\nDirector NOC\nVoice Operations Titles\nVP Voice Ops\nDirector Voice Ops\nVideo Operations/Content Delivery Network Titles\nVP Video Ops\nDirector Video Ops\nVP CDN Operations\nDirector CDN Operations\nField Operations Titles\nVP Field Ops\nVP Regional Ops\nDirector Field Ops\nCustomer Experience Titles\nSVP Customer Experience\nDirector Experience (CX)\nLoB Manager\nLoB Director\nCustomer Care Titles\nVP Customer Care\nVP Contact Center Ops\nDirector Consumer Ops\nDirector Customer Care\nDirector Contact Center Ops\nDigital Transformation Titles\nCDO\nCIO\nSVP Digital Transformation\nVP Digital Transformation\nDirector Data Science\nChief IT Architect\nDirector - Digital & Data Portfolio\nNetwork Transformation Titles\nCTO\nSVP Network Transformation\nVP Network Transformation\nDirector Innovation\nDirector Strategy and Transformation\nIT Operations Titles\nSVP IT Operations\nVP IT Operations\nDirector IT Operations\nSecurity Operations\nCISO\nSVP SecOps\nVP SecOps\nDirector SecOps\nSRE \nDevOps Engineers\nDevelopers", "company_size": null, "job_function": null}
  , {"geo": "United Kingdom, + Netherlands and Ireland.", "revenue": "For 3PL’s:  £1M-£20M\n comment: we can increase revenue", "sub_cid": "DA57302-2", "industry": "Regarding the industry, we are targeting only 3PLs (not e-commerce or retailers) for WMS with these contents, which can do both B2B and B2C, so for 3PLs we can target any industries except fresh food/beverage/medical and high-tech industries (such as military, etc.)", "job_level": "For 3PLS: \n\n-\tOwner/ Founder\n-\tOperations Director/ Manager\n-\tWarehouse Manager\n-\tHead of Supply Chain\n-\tCEO (Chief Executive Officer)\n-\tCOO (Chief Operating Officer)\n-\tLogistics Manager\n-\tHead of Logistic\n-\tFulfilment Director\n-\tIT Manager \n-\tHead of Information Technology\n-\tOwner\n-\tFounder\n-\tManaging director\n-\tGeneral Manager\n", "job_title": "see job level\n", "company_size": "", "job_function": null}
  , {"geo": "United Kingdom", "revenue": "5M to 100M", "sub_cid": "DA57302-1", "industry": "Cosmetics, Beauty, Personal Care, Consumer Electronics, Toys, Home Décor, fashion apparel & footwear, Household wellness / home improvement, food supplements", "job_level": "Manager and Above", "job_title": "\n\nManaging Director\nFounder\nCo-Founder\nCEO\nHead Of E-commerce \nEcommerce Director/Manager\nMarketplace Manager \nHead of Digital\nHead of Digital Transformation\nCIO \nBrand Manager\nOperation Manager\n", "company_size": "-We can get rid of this completely", "job_function": null}
  , {"geo": "US - All States except FL, ND, SD, MT, WY, HI, and AK", "revenue": null, "sub_cid": "DF57322-6", "industry": "\nControlled Environment Fruit and Vegetable Growers\n\nNAICS Codes: 111419\nSIC Codes: 0182\n", "job_level": null, "job_title": "\nControlled Environment Fruit and Vegetable Growers\nJob Titles: Owner, CEO, CFO\n", "company_size": null, "job_function": null}
  , {"geo": "US - All States except WY, HI, and AK", "revenue": null, "sub_cid": "DF57322-5", "industry": "Wholesale and Retail Florists\nNAICS Codes: 424930, 453110\nSIC Codes: 5193, 5992", "job_level": null, "job_title": "\nWholesale and Retail Florists\nJob Titles: CEO, CFO, Owner, President, General Manager, Head of Finance\n", "company_size": null, "job_function": null}
  , {"geo": "US - All States except FL, ND, SD, MT, WY, HI, and AK", "revenue": null, "sub_cid": "DF57322-4", "industry": "Garden Centers\n\nNAICS Codes: 444220\nSIC Codes: 5261", "job_level": null, "job_title": "\nGarden Centers\nJob Titles: Owner, CEO, CFO\n", "company_size": null, "job_function": null}
  , {"geo": "US - All States except FL, ND, SD, MT, WY, HI, and AK", "revenue": null, "sub_cid": "DF57322-3", "industry": "Greenhouse Growers\n\nNAICS Codes: 111421, 111422\nSIC Codes: 0181", "job_level": null, "job_title": "Greenhouse Growers\nJob Titles: CEO, CFO, Controller, Owner, Head Grower, Production\n\n", "company_size": null, "job_function": null}
  , {"geo": "70% Brazil/30% Mexico Split", "revenue": null, "sub_cid": "DF57322-2", "industry": "e-commerce\n", "job_level": "Manager +", "job_title": "IT & Marketing\n", "company_size": null, "job_function": "IT & Marketing"}
  , {"geo": "US - All States except FL, ND, SD, MT, WY, HI, and AK", "revenue": null, "sub_cid": "DF57322-1", "industry": "Nurseries\nNAICS Codes: 111421, 424930\n", "job_level": null, "job_title": "\nNurseries\nJob titles: CEO, CFO, Controller, Owner, General Manager, Territory Manager\n", "company_size": null, "job_function": null}
  , {"geo": "Bangladesh, Hong Kong, India, Indonesia, Japan, Korea, Malaysia, Singapore, Taiwan, Thailand, Vietnam, Australia, New Zealand\n comment: Please only target accounts in corresponding countries", "revenue": "See TAL", "sub_cid": "MG57788-1", "industry": "See TAL", "job_level": "Senior Management: Director, CXO, Owner, Partner, VP, Manager", "job_title": "Suppress attached job titles\n\nCan also target these General Business Management titles: Operations Director, COO, CEO, General Manager, Executive Director, President, Business Operations VP\n comment: Exclude job titles with the following keywords: Governance, Analytics", "company_size": "See TAL", "job_function": "Credit cards / payment cards\nCustomer experience and engagement, benefits \nCustomer acquisition and Sales\nLoyalty, rewards and partnerships\nMarketing\nProduct\nProcurement\n comment: The client would like to focus on less Marketing leads and more of the following:\n\nCredit cards / payment cards​​\nCustomer experience and engagement, benefits​​\nLoyalty, rewards and partnerships​​"}
  , {"geo": "US", "revenue": null, "sub_cid": "DC57996-1-1", "industry": null, "job_level": null, "job_title": null, "company_size": null, "job_function": null}
  , {"geo": "US + Canada. (US States: New York, Florida, Texas, Maryland, Virginia, Pennsylvania,\nCalifornia, Washington, Indiana, Arkansas, Minnesota); Canada States: (Ontario,\nBritish, Quebec, Alberta)\n\nUS Cities:\nAtlanta, Brooklyn, Los Angeles, Austin, Miami, San Antonio, Houston, Lafayette, Maryland, New York, Norfolk, Philadephia, San Carlos, San Francisco, Seattle, Chicago, Cleveland, corona, Ellenwood, Englewood, Fairfax, Georgia, Grapevine, Jacksonville, Kissimmee, Orlando, Poughkeep, Rogers, Saint Paul, Virginia Beach\n\nCanada Cities:\nToronto, Vancouver, Montreal, Waterloo, Ottawa, Calgary, Edmonton, Quebec City", "revenue": null, "sub_cid": "DC57996-1", "industry": "Life Style, Ecommerce, Real Estate, Lawyers or Law Firm,Gaming, Insurance or\nMortagage,Banking & Finance, B2B & B2C Marketplace, Fitness & Training,Logistics,\nDating, Social Networking, Social Commerce, Social Media, Education,\nHealthcare,Sports, Stock broker or companies,Media & Entertainment, Tourism and\ntravel,hospitality, Beauty, Art & Design,Event Management, Parenting, Day care, Errand Service Market, Fashion & Aparell, food & Beverage,Automotive, ride hailing\nservices.\n", "job_level": "C-Level, Director, VP, Senior Manager+", "job_title": "Board of Directors, Managing Director, Chairman, Chief Executive Officer, Chief\nOperating Officer, Vice-President, President, General Manager, and other Senior\nExecutives.", "company_size": "50-200 and 300 - 500", "job_function": null}
  , {"geo": "CA", "revenue": null, "sub_cid": "XU58298-7-1", "industry": "TAL", "job_level": "See title attachment", "job_title": "see attached \"ITOPs\" column", "company_size": "TAL", "job_function": null}
  , {"geo": "CA", "revenue": null, "sub_cid": "XU58298-6-1", "industry": "TAL", "job_level": "See title attachment", "job_title": "see attached \"Engineers/Architects\" column", "company_size": "TAL", "job_function": null}
  , {"geo": "CA", "revenue": null, "sub_cid": "XU58298-5-1", "industry": "TAL", "job_level": "See title attachment", "job_title": "see attached \"Data Scientist\" column", "company_size": "TAL", "job_function": null}
  , {"geo": "CA", "revenue": null, "sub_cid": "XU58298-4-1", "industry": "TAL", "job_level": "See title attachment", "job_title": "see attached \"Executive LOB\" column", "company_size": "TAL", "job_function": null}
  , {"geo": "CA", "revenue": null, "sub_cid": "XU58298-3-1", "industry": "TAL", "job_level": "See title attachment", "job_title": "see attached \"C-Level LOB\" column", "company_size": "TAL", "job_function": null}
  , {"geo": "CA", "revenue": null, "sub_cid": "XU58298-2-1", "industry": "TAL", "job_level": "See title attachment", "job_title": "see attached \"Executive ITOPs\" column", "company_size": "TAL", "job_function": null}
  , {"geo": "CA", "revenue": null, "sub_cid": "XU58298-1-1", "industry": "TAL", "job_level": "See title attachment", "job_title": "see attached \"C-Level IT\" Column", "company_size": "TAL", "job_function": null}
  , {"geo": "US", "revenue": null, "sub_cid": "XU58343-7", "industry": "TAL", "job_level": "See title attachment", "job_title": "see attached \"ITOPs\" column", "company_size": "TAL", "job_function": null}
  , {"geo": "US", "revenue": null, "sub_cid": "XU58343-6", "industry": "TAL", "job_level": "See title attachment", "job_title": "see attached \"Engineers/Architects\" column", "company_size": "TAL", "job_function": null}]

# Convert to DataFrame for structured output
df = pd.DataFrame(columns=['Input', 'Is_Exclude', 'Job Levels', 'Job Areas', 'Job Functions', 'Source', 'Keywords'])  # not sure if to keep keywords separate in the output
for input in inputs:
  new_data = pd.DataFrame(process_items(input), columns=['Is_Exclude', 'Job Levels', 'Job Areas', 'Job Functions', 'Source', 'Keywords'])
  new_data['Input'] = str(input)
  df = pd.concat([df, new_data[['Input', 'Is_Exclude', 'Job Levels', 'Job Areas', 'Job Functions', 'Source', 'Keywords']]], ignore_index=True)
df

raw titles:  ['CISO', 'Chief Information Security Officer', 'Chief Privacy Officer/IT Compliance Officer', 'Chief Security Officer', 'Chief Technology Officer', 'Network Security Analyst', 'Security Analyst T1', 'Security Analyst T2', 'Security Analyst T3', 'SOC Manager', 'SOC Director', 'Vice President Cyber Services', 'Vice President Head of Data Science -- Cyber Data Intelligence & Innovation', 'Vice President Information Security', 'Vice President Information Security', 'Vice President IT Auditor', 'Vice President of Cyber Defense Strategy', 'Vice President of Operations', 'Vice President Security', 'Vice President Security & Privacy', 'Vice President Security Engineering', '', '', '', 'Network Operations Titles', 'VP Network Ops', 'Director Network Ops', 'Director NOC', 'Voice Operations Titles', 'VP Voice Ops', 'Director Voice Ops', 'Video Operations/Content Delivery Network Titles', 'VP Video Ops', 'Director Video Ops', 'VP CDN Operations', 'Director CDN Operations', 'Field Oper

Unnamed: 0,Input,Is_Exclude,Job Levels,Job Areas,Job Functions,Source,Keywords
0,"{'geo': '70% Brazil/30% Mexico Split', 'revenu...",False,Manager,Marketing,Marketing,function,
1,"{'geo': '70% Brazil/30% Mexico Split', 'revenu...",False,Manager,Marketing,Marketing,function,
2,"{'geo': '70% Brazil/30% Mexico Split', 'revenu...",False,C-Level,Marketing,Marketing,function,
3,"{'geo': '70% Brazil/30% Mexico Split', 'revenu...",False,C-Level,Marketing,Marketing,function,
4,"{'geo': '70% Brazil/30% Mexico Split', 'revenu...",False,Executive VP,Marketing,Marketing,function,
...,...,...,...,...,...,...,...
388,"{'geo': 'Bangladesh, Hong Kong, India, Indones...",False,Senior Employee,Marketing,Marketing,function,
389,"{'geo': 'Bangladesh, Hong Kong, India, Indones...",False,Senior Employee,Marketing,Marketing,function,
390,"{'geo': 'Bangladesh, Hong Kong, India, Indones...",False,Senior Employee,Marketing,Marketing,function,
391,"{'geo': 'Bangladesh, Hong Kong, India, Indones...",False,Senior Employee,Executives,Executives,function,


In [49]:
df.to_csv('/content/drive/MyDrive/100_samples.csv')

In [None]:
 def generate_custom_suggestion(raw_input, list):
    prompt = f"Based on the input '{raw_input}', suggest an appropriate job function that does not exist in the provided '{list}'. Make it well formulated and relevant. Pls only provide the suggested function in your output."

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an assistant that provides custom job suggestions."},
            {"role": "user", "content": prompt}
        ]
        # max_tokens=100,
        # temperature=0.5  # Adjust temperature for creativity if needed
    )

    suggestion = response.choices[0].message.content
    return suggestion

generate_custom_suggestion('Credit cards', job_areas_dict.get('Finance / Accounting'))

'Credit Card Product Management'

In [None]:
def generate_custom_area_suggestion(raw_input):
    prompt = f"Based on the input '{raw_input}', suggest an appropriate job area. Pls only provide the suggested area in your output."

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an assistant that provides custom job suggestions."},
            {"role": "user", "content": prompt}
        ]
        # max_tokens=100,
        # temperature=0.5  # Adjust temperature for creativity if needed
    )

    suggestion = response.choices[0].message.content
    return suggestion

def generate_custom_function_suggestion(raw_input):
    prompt = f"Based on the input '{raw_input}', suggest an appropriate job function. Pls only provide the suggested function in your output."

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are an assistant that provides custom job suggestions."},
            {"role": "user", "content": prompt}
        ]
        # max_tokens=100,
        # temperature=0.5  # Adjust temperature for creativity if needed
    )

    suggestion = response.choices[0].message.content
    return suggestion

In [None]:
  generate_custom_area_suggestion('Trading')

'Finance'

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

def generate_custom_suggestion(raw_input, list):
    # Load the tokenizer and model
    model_name = "EleutherAI/gpt-neo-2.7B"  # Example using GPT-Neo, replace with other models like LLaMA
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Initialize the text generation pipeline
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=100)

    # Prepare the prompt
    prompt = f"Based on the input '{raw_input}', suggest an appropriate job function that does not exist in the provided '{list}'. Make it well formulated and relevant. Please only provide the suggested function in your output."

    # Generate the suggestion
    response = generator(prompt, num_return_sequences=1, temperature=0.5, max_new_tokens=50)

    # Extract the generated text
    suggestion = response[0]['generated_text'].strip()

    # Optionally post-process the suggestion to remove any extra text if needed
    return suggestion


In [None]:
generate_custom_suggestion('Credit cards', job_areas_dict.get('Finance / Accounting'))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=50) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


KeyboardInterrupt: 

In [None]:
input_text3 = {
    "geo": "UKI - 25%\nGermany - 20%\nFrance - 15%\nME (UAE+SA) - 10%\nNL - 7%\nNordics (Denmark, SW, FI, NO) - 6%\nSpain - 3%\nBelux - 10%",
    "revenue": "1 - 1M\n1M - 5M\n5M - 20M\n20M - 50M\n50M - 100M\n100M - 500M\n500M - 1B\n1B+",
    "sub_cid": "LC7481-1",
    "industry": "Accommodation and Food Services\nAdministrative and Support and Waste Management and Remediation Services\nAgriculture, Forestry, Fishing and Hunting\nArts, Entertainment, and Recreation\nConstruction\nFinance and Insurance\nHealth Care and Social Assistance\nInformation\nManagement of Companies and Enterprises\nManufacturing\nMining, Quarrying, and Oil and Gas Extraction\nOther Services (except Public Administration)\nProfessional, Scientific, and Technical Services\nPublic Administration\nReal Estate and Rental and Leasing\nRetail Trade\nTransportation and Warehousing\nUtilities\nWholesale Trade",
    "job_level": "Director",
    "job_title": ",
    "company_size": "1 - 50\n51 - 100\n101 - 250\n251 - 500\n500 - 1000\n501 - 1000\n1001 - 2000\n1001 - 5000\n2001 - 5000\n5001 - 10000\n10000+\n10001+",
    "job_function": "Network"
}



Unnamed: 0,Include/Exclude,Job Levels,Job Areas,Job Functions,Source
0,include,Director,Network,Network,customFunctionAndArea
1,include,Director,,,customFunctionAndArea


In [51]:
item = "Managrr"
example_dict = job_levels_dict

# Test different models
match_default = match_with_dict(item, example_dict)
match_gpt = match_with_dict(item, example_dict, model_type="gpt", threshold=0.5)
match_gpt2 = match_with_dict(item, example_dict, model_type="gpt", threshold=0.8)
match_bert = match_with_dict(item, example_dict, model_type="bert")
match_seq = match_with_dict(item, example_dict, model_type="sequence")

print(f"Matched item (default: TfIdf): {match_default}")
print(f"Matched item (GPT, 0.5 match): {match_gpt}")
print(f"Matched item (GPT, 0.8 match): {match_gpt2}")
print(f"Matched item (BERT): {match_bert}")
print(f"Matched item (sequencial (character) based model): {match_seq}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Matched item (default: TfIdf): Manager
Matched item (GPT, 0.5 match): Manager
Matched item (GPT, 0.8 match): Manager
Matched item (BERT): NA
Matched item (sequencial (character) based model): Manager


In [53]:
for item in "C Level, C-1, C-2".split(','):
  match_gpt = match_with_dict(item, example_dict, model_type="gpt", threshold=0.5)
  print(f"Matched item (GPT): {match_gpt}")

Matched item (GPT): C-Level
Matched item (GPT): C-Level
Matched item (GPT): C-Level


**text-embedding-ada-002**
Provider: OpenAI

Architecture: Based on the GPT architecture

Dimension: 1536-dimensional embeddings

Capabilities:

- High Performance: Offers state-of-the-art performance in capturing semantic relationships between texts.

- Versatility: Suitable for a wide range of tasks including search, clustering, classification, and anomaly detection.

- Cost-Effective: Designed to be computationally efficient, reducing costs for large-scale applications.

In [None]:
# # ! pip install sentence_transformers
# from sentence_transformers import SentenceTransformer, util

# # Load a pre-trained model
# model = SentenceTransformer('all-MiniLM-L6-v2')

# # Input and dictionary keys
# input_text = "managr"
# keys = [
#     'C-Level', 'Executive VP', 'Senior VP', 'VP', 'Head', 'Senior Director',
#     'Director', 'Senior Manager', 'Manager', 'Architect', 'Supervisor',
#     'Senior Employee', 'Coordinator', 'Analyst', 'Specialist', 'Administrator',
#     'Engineer', 'Developer', 'Consultant'
# ]

# # Encode the input and keys
# input_embedding = model.encode(input_text, convert_to_tensor=True)
# key_embeddings = model.encode(keys, convert_to_tensor=True)

# # Calculate cosine similarity
# cosine_scores = util.cos_sim(input_embedding, key_embeddings)

# # Find the index of the most similar key
# most_similar_index = cosine_scores.argmax().item()
# most_similar_key = keys[most_similar_index]

# print(f"The most similar key is: {most_similar_key}")


The most similar key is: Manager


In [None]:
# debadding
def process_items2(input_text, match_model="gpt"):
    # initialize the output
    output = []

    # Extract items from raw
    job_titles_raw = get_input_list(input_text, 'job_title')
    print('raw titles: ', job_titles_raw)
    job_levels_raw = get_input_list(input_text, 'job_level')
    print('raw levels: ', job_levels_raw)
    job_functions_raw = get_input_list(input_text, 'job_function')
    print('raw functions: ', job_functions_raw)

    # get job levels check
    titles_contain_level = any([match_with_dict(t_item, job_levels_dict, model_type=match_model) for t_item in job_titles_raw])
    functions_contain_level = any([match_with_dict(f_item, job_levels_dict, model_type=match_model) for f_item in job_functions_raw])

    ## Placeholder for paired cases implementation

    # Level - Case 1: Check if job_level block is not empty and job title/function contain higher job level
    if len(job_levels_raw) > 0 and (titles_contain_level or functions_contain_level):

      # Get highest levels to compare
      max_level = max(list(filter(lambda x: x is not None, [job_levels_dict.get(match_with_dict(clean_text(level), job_levels_dict, model_type=match_model)) for level in job_levels_raw]))+[-1])
      max_t_level = max(list(filter(lambda x: x is not None, [job_levels_dict.get(match_with_dict(t_item, job_levels_dict, model_type=match_model)) for t_item in job_titles_raw]))+[-1])
      max_f_level = max(list(filter(lambda x: x is not None, [job_levels_dict.get(match_with_dict(f_item, job_levels_dict, model_type=match_model)) for f_item in job_functions_raw]))+[-1])

      if max_level > max_t_level and max_level > max_f_level:
        levels = list(filter(lambda x: x is not None, [match_with_dict(clean_text(level), job_levels_dict, model_type=match_model) for level in job_levels_raw]))
      elif max_t_level > max_f_level:
        levels = list(filter(lambda x: x is not None, [match_with_dict(t_item, job_levels_dict, model_type=match_model) for t_item in job_titles_raw]))
      else:
        levels = list(filter(lambda x: x is not None, [match_with_dict(f_item, job_levels_dict, model_type=match_model) for f_item in job_functions_raw]))


    # Lavel - Case 2: levels are available only in levels block (apply to all areas and functions)
    elif len(job_levels_raw) > 0:
      levels = [match_with_dict(clean_text(level), job_levels_dict, model_type=match_model) for level in job_levels_raw]

    # Level - Case 3: a/ titles contain job levels
    elif titles_contain_level:
      levels = [job_levels_dict.get(match_with_dict(t_item, job_levels_dict, model_type=match_model)) for t_item in job_titles_raw]
    elif functions_contain_level:
      levels = [job_levels_dict.get(match_with_dict(f_item, job_levels_dict, model_type=match_model)) for f_item in job_functions_raw]

    else: # no such cases
      levels = []
    print('levels before melt: ', levels)

    # Rule 18: Handle the + case (melting) as the default
    min_level = min(list(filter(lambda x: x is not None, [job_levels_dict.get(match_with_dict(clean_text(level), job_levels_dict, model_type=match_model)) for level in levels]))+[100])
    levels = levels + [missing_level for missing_level in job_levels_dict.keys() if job_levels_dict.get(missing_level) > min_level and missing_level not in levels]
    ## Target audience output placeholder
    target_audince_level = list(job_levels_dict.keys())[list(job_levels_dict.values()).index(min_level)] + '+'
    ## Are there except cases for levels? Placeholder to handle
    print('levels after melt: ', levels)

    # get job areas check
    functions_area = list(set([match_with_dict(f_item, job_areas_dict, model_type=match_model) for f_item in job_functions_raw]))
    functions_contain_area = any(functions_area)
    titles_area = list(set([match_with_dict(t_item, job_areas_dict, model_type='gpt') for t_item in job_titles_raw]))
    titles_contain_area = any(titles_area)
    # print('areas in titles', [match_with_dict(t_item, job_areas_dict, model_type='gpt') for t_item in job_titles_raw])
    functions_contain_keywords = any([max([kw in item for kw in keywords]) for item in job_functions_raw])
    titles_contain_keywords = any([max([kw in item for kw in keywords]) for item in job_titles_raw])
    # Rule 2 implementation (functions higher priority over titles)
    search_areas = functions_area + titles_area
    print(search_areas)

    for level in levels:
      # Area case 1: area can be found
      if functions_contain_area or titles_contain_area:
        print('function case')
        for item_a in search_areas:
            matched_area = match_with_dict(clean_text(item_a), job_areas_dict)
            if matched_area not in ('NA', None):
                job_area_is_matched = True
                area = matched_area
                print('area: ', area)
                functions_contain_function = any([match_with_dict(f_item, job_areas_dict, area, model_type=match_model) for f_item in job_functions_raw])
                titles_contain_function = any([match_with_dict(t_item, job_areas_dict, area, model_type=match_model) for t_item in job_titles_raw])
                # Option 0: customFunction case - no mapping available and no keywords
                if not (functions_contain_function or titles_contain_function or functions_contain_keywords or titles_contain_keywords):
                  # Option 0.1: customFunction through functions
                  for item_f in job_functions_raw:
                    if not (match_with_dict(item_f, job_levels_dict, model_type=match_model) or match_with_dict(item_f, job_areas_dict, model_type=match_model)):
                      function = item_f ## to replace with generative (completion) model
                      source = 'customFunction'
                      operator = any(exclude_kw in item_a or exclude_kw in item_f for exclude_kw in exclude_keywords)
                      # add row item
                      output.append([operator, level, area, function, source, ''])
                      print(operator, level, area, function, source, '')
                  # Option 0.2: customFunction through titles
                  for item_t in job_titles_raw:
                    if not (match_with_dict(item_t, job_levels_dict, model_type=match_model) or match_with_dict(item_t, job_areas_dict, model_type=match_model)):
                      function = item_t ## to replace with generative (completion) model
                      source = 'customFunction'
                      operator = any(exclude_kw in item_a or exclude_kw in item_f for exclude_kw in exclude_keywords)
                      # add row item
                      output.append([operator, level, area, function, source, ''])
                      print(operator, level, area, function, source, '')
                # Option 1.1: function through functions and Rule 2 again
                elif functions_contain_function:
                  for item_f in job_functions_raw:
                    matched_function = match_with_dict(item_f, job_areas_dict, area, model_type=match_model)
                    if matched_function not in ('NA', None):
                      job_function_is_matched = True
                      function = matched_function
                      source = 'function'
                      operator = any(exclude_kw in item_a or exclude_kw in item_f for exclude_kw in exclude_keywords)
                      # add row item
                      output.append([operator, level, area, function, source, ''])
                      print(operator, level, area, function, source, '')
                # Option 1.2: function through titles
                elif titles_contain_function:
                  for item_t in job_titles_raw:
                    matched_function = match_with_dict(item_t, job_areas_dict, area, model_type=match_model)
                    if matched_function not in ('NA', None):
                      job_function_is_matched = True
                      function = matched_function
                      source = 'function' # is it really 'function' when we map like this?
                      operator = any(exclude_kw in item_a or exclude_kw in item_t for exclude_kw in exclude_keywords)
                      # add row item
                      output.append([operator, level, area, function, source, ''])
                      print(operator, level, area, function, source, '')
                # Option 2.1: keywords are present in functions (for functions case)
                elif functions_contain_keywords:
                  for item_f in job_functions_raw:
                    if any([kw in item_f for kw in keywords]):
                      keyword = item_f
                      source = 'keywords'
                      operator = any(exclude_kw in item_a or exclude_kw in item_f for exclude_kw in exclude_keywords)
                      # add row item
                      output.append([operator, level, area, 'NA', source, keyword])
                      print(operator, level, area, 'NA', source, keyword)
                # Option 2.2: keywords are present in titles (for functions case)
                else: # if titles_contain_keywords
                  for item_f in job_titles_raw:
                    if any([kw in item_t for kw in keywords]):
                      keyword = item_t
                      source = 'keywords'
                      operator = any(exclude_kw in item_a or exclude_kw in item_t for exclude_kw in exclude_keywords)
                      # add row item
                      output.append([operator, level, area, 'NA', source, keyword])
                      print(operator, level, area, 'NA', source, keyword)
      # Area case 2: keywords
      elif functions_contain_keywords or titles_contain_keywords:
        print('keywords case')
        # 1: keywords are present in functions
        if functions_contain_keywords:
          for item_f in job_functions_raw:
            if any([kw in item_f for kw in keywords]):
              keyword = item_f
              source = 'keywords'
              operator = any(exclude_kw in item_f for exclude_kw in exclude_keywords)
              # add row item
              output.append([operator, level, 'NA', 'NA', source, keyword])
              print(operator, level, 'NA', 'NA', source, keyword)
        # 2: keywords are present in titles
        else: # if titles_contain_keywords
          for item_f in job_titles_raw:
            if any([kw in item_t for kw in keywords]):
              keyword = item_t
              source = 'keywords'
              operator = any(exclude_kw in item_t for exclude_kw in exclude_keywords)
              # add row item
              output.append([operator, level, 'NA', 'NA', source, keyword])
              print(operator, level, 'NA', 'NA', source, keyword)
      # Area case 3: customFunctionAndArea
      else:
        print('customFunctionAndArea case')
        source = 'customFunctionAndArea'
        # 1: customFunctionAndArea through functions
        for item_f in job_functions_raw:
          if not match_with_dict(item_f, job_levels_dict, model_type=match_model):
            function, area = item_f, item_f ## to replace with generative (completion) model
            operator = any(exclude_kw in item_a or exclude_kw in item_f for exclude_kw in exclude_keywords)
            # add row item
            output.append([operator, level, area, function, source, ''])
            print(operator, level, area, function, source, '')
        # Option 0.2: customFunction through titles
        for item_t in job_titles_raw:
          if not match_with_dict(item_t, job_levels_dict, model_type=match_model):
            function, area = item_t, item_t ## to replace with generative (completion) model
            operator = any(exclude_kw in item_a or exclude_kw in item_f for exclude_kw in exclude_keywords)
            # add row item
            output.append([operator, level, area, function, source, ''])
            print(operator, level, area, function, source, '')

    return output

# Convert to DataFrame for structured output
df = pd.DataFrame(process_items2(input_text), columns=['Is_Exclude', 'Job Levels', 'Job Areas', 'Job Functions', 'Source', 'Keywords']) # not sure if to keep keywords separate in the output
df.to_csv('/content/drive/MyDrive/example_output.csv')
df

In [None]:
def get_embedding(text, model="text-embedding-ada-002"):
    response = client.embeddings.create(
        input=text,
        model=model
    )
    embedding = response.data[0].embedding
    return np.array(embedding)

# Function to match with dictionary using different models
def match_with_dict(item, dictionary, area=None, model_type="default", threshold=0.8):
    keys = dictionary.get(area) if area is not None else dictionary.keys()

    if model_type == "gpt":
        def cosine_similarity(a, b):
          return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
        # Get embeddings for input and keys
        input_embedding = get_embedding(item)
        key_embeddings = [get_embedding(key) for key in dictionary.keys()]

        # Calculate cosine similarities
        scores = [cosine_similarity(input_embedding, key_emb) for key_emb in key_embeddings]

        max_score = max(scores)
        match_id = np.argmax(np.array(scores))
        match = list(dictionary.keys())[match_id]

        if max_score >= threshold:
            return match
        else:
            return 'NA'

    elif model_type == "bert":
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        model = AutoModel.from_pretrained("bert-base-uncased")

        scores = [compute_llm_similarity(item, key, model, tokenizer) for key in dictionary.keys()]

        max_score = max(scores)
        match_id = np.argmax(np.array(scores))
        match = list(dictionary.keys())[match_id]

        if max_score >= threshold:
            return match
        else:
            return 'NA'

    elif model_type == "TFIDF":
        vectorizer = TfidfVectorizer()
        from sklearn.metrics.pairwise import cosine_similarity
        scores = [cosine_similarity(vectorizer.fit_transform([item, key]))[0][1] for key in dictionary.keys()]

        max_score = max(scores)
        match_id = np.argmax(np.array(scores))
        match = list(dictionary.keys())[match_id]

        if max_score >= threshold:
            return match
        else:
            return 'NA'

    else:
        # Default case: sequence similarity
        closest_match = get_close_matches(item, dictionary.keys(), n=1, cutoff=threshold)
        match = closest_match[0] if len(closest_match) > 0 else ''
        if match != '':
            return match
        else:
            'NA'
