
# Data Load

In [2]:
import pandas as pd

import json

# Load the raw talent/job/label records from disk.
# The encoding is pinned to UTF-8 (the standard encoding for JSON) instead of
# relying on the platform's locale default, which is not UTF-8 everywhere.
with open('../data/data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)


# Data understanding


## Data structure

In [3]:
# Peek at one record (the second one) to see the nested talent / job / label layout.
data[1]

{'talent': {'languages': [{'rating': 'C2', 'title': 'German'},
   {'rating': 'C2', 'title': 'English'},
   {'rating': 'A2', 'title': 'Spanish'},
   {'rating': 'A2', 'title': 'French'}],
  'job_roles': ['frontend-developer',
   'full-stack-developer',
   'c-c-developer',
   'mobile-developer'],
  'seniority': 'junior',
  'salary_expectation': 44000,
  'degree': 'master'},
 'job': {'languages': [{'title': 'German', 'rating': 'C1', 'must_have': True},
   {'title': 'English', 'rating': 'B2', 'must_have': True}],
  'job_roles': ['frontend-developer'],
  'seniorities': ['junior', 'midlevel'],
  'max_salary': 70000,
  'min_degree': 'none'},
 'label': True}

In [4]:
# Total number of records loaded.
len(data)

2000


## Universe analysis

In [5]:
import pandas as pd


# Collect the universe of categorical values seen on the talent side and on the
# job side, to know which vocabularies the features have to cover.
all_talent_seniorities = set()
all_talent_languages = set()
all_talent_job_roles = set()
all_talent_degrees = set()
all_job_languages = set()
all_job_roles = set()
all_job_seniorities = set()
all_job_degrees = set()
all_talent_language_ratings = set()
all_job_language_ratings = set()

for elem in data:
    talent = elem["talent"]
    job = elem["job"]

    # talent: job_roles is a list, seniority and degree are single values.
    # The sets are grown in place instead of being rebuilt with union() for
    # every single value.
    all_talent_job_roles.update(talent["job_roles"])
    all_talent_seniorities.add(talent["seniority"])
    all_talent_degrees.add(talent["degree"])
    for lang in talent["languages"]:
        all_talent_languages.add(lang["title"])
        all_talent_language_ratings.add(lang["rating"])

    # job: both job_roles and seniorities are lists, min_degree is a single value.
    all_job_roles.update(job["job_roles"])
    all_job_seniorities.update(job["seniorities"])
    all_job_degrees.add(job["min_degree"])
    for lang in job["languages"]:
        all_job_languages.add(lang["title"])
        all_job_language_ratings.add(lang["rating"])

print(f"Number of distinct talent languages: {len(all_talent_languages)}")
print(f"Number of distinct talent job roles: {len(all_talent_job_roles)}")
print(f"Number of distinct job languages: {len(all_job_languages)}")
print(f"Number of distinct job roles: {len(all_job_roles)}")
print(all_talent_seniorities)
print(all_job_seniorities)
print(all_talent_degrees)
print(all_job_degrees)
print(all_talent_language_ratings)
print(all_job_language_ratings)

Number of distinct talent languages: 35
Number of distinct talent job roles: 53
Number of distinct job languages: 2
Number of distinct job roles: 33
{'junior', 'senior', 'midlevel', 'none'}
{'junior', 'senior', 'midlevel', 'none'}
{'bachelor', 'none', 'master', 'doctorate', 'apprenticeship'}
{'bachelor', 'none', 'master', 'doctorate', 'apprenticeship'}
{'B1', 'A2', 'C2', 'C1', 'B2', 'A1'}
{'C2', 'B1', 'C1', 'B2'}



# Model prototyping


## Training data preparation

In [6]:
def _rank_levels(levels):
    """Map each level name to its 1-based position in ``levels`` (lowest first)."""
    return {level: rank for rank, level in enumerate(levels, start=1)}


# Ordinal encodings: a higher rank means more senior / higher degree / more fluent.
seniority_rank_mapping = _rank_levels(["none", "junior", "midlevel", "senior"])

degree_rank_mapping = _rank_levels(
    ["none", "apprenticeship", "bachelor", "master", "doctorate"]
)

language_rating_rank_mapping = _rank_levels(["A1", "A2", "B1", "B2", "C1", "C2"])

# Two-column lookup frames (name, rank) so the ranks can be attached with a merge.
pdf_seniority_rank_mapping = pd.DataFrame(
    {
        "seniority": list(seniority_rank_mapping.keys()),
        "seniority_rank": list(seniority_rank_mapping.values()),
    }
)
pdf_language_rating_rank_mapping = pd.DataFrame(
    {
        "rating": list(language_rating_rank_mapping.keys()),
        "rating_rank": list(language_rating_rank_mapping.values()),
    }
)

In [7]:
import pandas as pd
def _concat_or_empty(frames):
    """Stack the per-record frames once; return an empty frame if there are none."""
    return pd.concat(frames) if frames else pd.DataFrame()


# Flatten every record into four long-format frames keyed by ``id`` (the
# record's position in ``data``):
#   * pdf_talent_languages_all  - one row per talent language
#   * pdf_talent_job_roles_all  - one row per talent job role
#   * pdf_job_languages_all     - one row per job language
#   * pdf_job_job_roles_all     - one row per job role
# The per-record frames are collected in lists and concatenated once after the
# loop; concatenating onto a growing frame inside the loop re-copies all earlier
# rows on every iteration.
talent_language_frames = []
talent_job_role_frames = []
job_language_frames = []
job_job_role_frames = []
labels = list()

for index, elem in enumerate(data):
    talent = elem["talent"]
    job = elem["job"]

    # talent / languages: attach the numeric rank of each rating
    pdf_talent_languages = pd.DataFrame(talent["languages"]).merge(
        pdf_language_rating_rank_mapping, on="rating", how="inner"
    )
    pdf_talent_languages["id"] = index
    # Counted after the inner merge, i.e. only languages whose rating is in the mapping.
    pdf_talent_languages["number_of_languages_TALENT"] = len(pdf_talent_languages)
    talent_language_frames.append(pdf_talent_languages)

    # talent / job roles, with the talent's scalar attributes repeated on every row
    pdf_talent_job_roles = pd.DataFrame(talent["job_roles"], columns=["job_role"])
    pdf_talent_job_roles["id"] = index
    pdf_talent_job_roles["seniority_rank"] = seniority_rank_mapping.get(talent["seniority"])
    pdf_talent_job_roles["salary"] = talent["salary_expectation"]
    pdf_talent_job_roles["degree"] = degree_rank_mapping.get(talent["degree"])
    talent_job_role_frames.append(pdf_talent_job_roles)

    # job / languages: attach the numeric rank of each required rating
    pdf_job_languages = pd.DataFrame(job["languages"]).merge(
        pdf_language_rating_rank_mapping, on="rating", how="inner"
    )
    pdf_job_languages["id"] = index
    job_language_frames.append(pdf_job_languages)

    # job / job roles, with the job's scalar attributes repeated on every row
    pdf_job_job_roles = pd.DataFrame(job["job_roles"], columns=["job_role"])
    pdf_job_job_roles["id"] = index

    # A job accepts a list of seniorities; it is summarised as the mean rank.
    pdf_job_seniorities = pd.DataFrame(job["seniorities"], columns=["seniority"]).merge(
        pdf_seniority_rank_mapping, on="seniority", how="inner"
    )
    pdf_job_job_roles["seniority_rank"] = pdf_job_seniorities["seniority_rank"].mean()
    pdf_job_job_roles["salary"] = job["max_salary"]
    pdf_job_job_roles["degree"] = degree_rank_mapping.get(job["min_degree"])
    job_job_role_frames.append(pdf_job_job_roles)

    # label (boolean in the raw data) stored as 0/1
    labels.append(int(elem["label"]))

pdf_talent_languages_all = _concat_or_empty(talent_language_frames)
pdf_talent_job_roles_all = _concat_or_empty(talent_job_role_frames)
pdf_job_languages_all = _concat_or_empty(job_language_frames)
pdf_job_job_roles_all = _concat_or_empty(job_job_role_frames)
pdf_labels = pd.DataFrame(zip(range(len(data)), labels), columns=["id", "label"])

In [8]:
# Class balance: number of records per label value.
pdf_labels.groupby("label").count()

Unnamed: 0_level_0,id
label,Unnamed: 1_level_1
0,1000
1,1000



## Feature engineering

In [9]:
# Languages that get their own feature columns.
REQUIRED_LANGUAGES = ["English", "German"]


def _language_ranks_wide(pdf_languages, suffix):
    """Pivot long-format language rows into one row per ``id``.

    Only titles in ``REQUIRED_LANGUAGES`` are kept. Each becomes a column named
    ``<title>_<suffix>`` holding the rating rank, with 0 where a record does not
    list that language. Both columns are always present, even when no row in
    ``pdf_languages`` mentions one of the languages, so later feature code can
    rely on them. As with any ``pivot``, a duplicated (id, title) pair raises.
    """
    required_rows = pdf_languages[pdf_languages["title"].isin(REQUIRED_LANGUAGES)]
    return (
        required_rows
        .pivot(index="id", columns="title", values="rating_rank")
        .reindex(columns=REQUIRED_LANGUAGES)
        .fillna(0)
        .rename(columns={title: f"{title}_{suffix}" for title in REQUIRED_LANGUAGES})
        .rename_axis(columns=None)
        .reset_index()
    )


# Talent side: start from one row per talent with its language count, so that
# talents who speak neither required language keep their real count (they get
# rank 0 for both languages) instead of dropping out of the frame.
talent_language_counts = (
    pdf_talent_languages_all[["id", "number_of_languages_TALENT"]].drop_duplicates()
)
pdf_talent_languages_required_wide = (
    talent_language_counts
    .merge(_language_ranks_wide(pdf_talent_languages_all, "TALENT"), on="id", how="left")
    .fillna(0)
)

# Job side: must-have and should-have requirements are pivoted separately.
is_must_have = pdf_job_languages_all["must_have"].eq(True)
is_should_have = pdf_job_languages_all["must_have"].eq(False)

pdf_job_languages_must_have_wide = _language_ranks_wide(
    pdf_job_languages_all[is_must_have], "must_have_JOB"
)
pdf_job_languages_should_have_wide = _language_ranks_wide(
    pdf_job_languages_all[is_should_have], "should_have_JOB"
)

In [10]:
# Preview the long-format job frame (one row per job role of a record).
pdf_job_job_roles_all.head()

Unnamed: 0,job_role,id,seniority_rank,salary,degree
0,frontend-developer,0,2.5,70000,1
0,frontend-developer,1,2.5,70000,1
0,php-developer,2,3.5,65000,1
0,frontend-developer,3,2.5,70000,1
0,backend-developer,4,3.0,80000,3


In [11]:
# Preview the long-format talent frame (one row per job role of a talent).
pdf_talent_job_roles_all.head()

Unnamed: 0,job_role,id,seniority_rank,salary,degree
0,frontend-developer,0,2,48000,3
1,backend-developer,0,2,48000,3
2,full-stack-developer,0,2,48000,3
3,java-developer,0,2,48000,3
4,mobile-developer,0,2,48000,3


In [12]:
# Constant marker column; it supplies the cell value when the job roles are
# pivoted into one column per role.
for _roles_long in (pdf_job_job_roles_all, pdf_talent_job_roles_all):
    _roles_long["indicator"] = 1

In [13]:
# One row per record: the scalar attributes stay as leading columns and every
# job role becomes its own column (1 where the role is listed, 0 otherwise).
_role_pivot_index = ["id", "salary", "degree", "seniority_rank"]

pdf_job_job_roles_all_wide = (
    pdf_job_job_roles_all
    .pivot(index=_role_pivot_index, columns="job_role", values="indicator")
    .fillna(0)
    .reset_index()
)
pdf_talent_job_roles_all_wide = (
    pdf_talent_job_roles_all
    .pivot(index=_role_pivot_index, columns="job_role", values="indicator")
    .fillna(0)
    .reset_index()
)

In [14]:
# Keep only the columns that also exist on the job side (the leading attribute
# columns plus the job roles that are actually sought), in the job frame's
# order, to limit the number of variables.
# reindex() is used instead of plain column selection so that a sought role no
# talent offers becomes an all-zero column rather than raising a KeyError.
# TODO: also try the variant that keeps every talent job role.
pdf_talent_job_roles_all_wide = pdf_talent_job_roles_all_wide.reindex(
    columns=list(pdf_job_job_roles_all_wide), fill_value=0.0
)

In [15]:
# The first four columns are the pivot index (id, salary, degree,
# seniority_rank); everything after them is a job-role column.
rel_jobs = pdf_job_job_roles_all_wide.columns[4:].tolist()

In [16]:
# Sanity check: row count of the wide talent frame.
len(pdf_talent_job_roles_all_wide)

2000

In [17]:
# Sanity check: row count of the wide job frame (should match the talent frame).
len(pdf_job_job_roles_all_wide)

2000

In [18]:
# Put each record's talent row and job row side by side. Both frames carry the
# same column names, so every column except the key gets a _TALENT / _JOB suffix.
# merge(on="id") matches the two "id" columns explicitly. DataFrame.join(on="id")
# would instead match the talent "id" against the job frame's row index, which
# is only correct while ids happen to equal row positions.
# validate= makes the merge fail loudly if an id ever appears twice on a side.
pdf_features = pdf_talent_job_roles_all_wide.merge(
    pdf_job_job_roles_all_wide,
    on="id",
    how="inner",
    suffixes=("_TALENT", "_JOB"),
    validate="one_to_one",
)

In [19]:
# Display the combined talent/job frame (the rendering is truncated by pandas).
pdf_features

job_role,id,salary_TALENT,degree_TALENT,seniority_rank_TALENT,backend-developer_TALENT,business-analyst_TALENT,business-development-manager_TALENT,c-c-developer_TALENT,c-net-developer_TALENT,cloud-engineer_TALENT,...,product-owner_JOB,project-manager_JOB,qa-engineer_JOB,sales-manager_JOB,scrum-master-agile-coach_JOB,security_JOB,software-architect_JOB,system-administrator_JOB,system-engineer_JOB,tech-lead_JOB
0,0,48000,3,2,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,44000,4,2,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,40000,1,4,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,46000,2,2,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,75000,4,3,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1995,101250,2,3,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,1996,62639,3,1,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,1997,99220,3,2,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,1998,73440,3,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# List the job roles that occur on the job side.
rel_jobs

['backend-developer',
 'business-analyst',
 'business-development-manager',
 'c-c-developer',
 'c-net-developer',
 'cloud-engineer',
 'consulting',
 'content-marketing-manager',
 'customer-success-manager',
 'data-analyst',
 'data-engineer',
 'data-scientist',
 'devops-engineer',
 'engineering-manager',
 'frontend-developer',
 'full-stack-developer',
 'java-developer',
 'key-account-manager',
 'mobile-developer',
 'online-marketing-manager',
 'performance-marketing-manager',
 'php-developer',
 'presales-manager',
 'product-owner',
 'project-manager',
 'qa-engineer',
 'sales-manager',
 'scrum-master-agile-coach',
 'security',
 'software-architect',
 'system-administrator',
 'system-engineer',
 'tech-lead']

In [21]:
# Attach the language features. A record missing from one of the language
# frames gets NaN from the left merge, which is then filled with 0.
pdf_features_final = pdf_features
for language_frame in (
    pdf_talent_languages_required_wide,
    pdf_job_languages_must_have_wide,
    pdf_job_languages_should_have_wide,
):
    pdf_features_final = pdf_features_final.merge(language_frame, on="id", how="left")
pdf_features_final = pdf_features_final.fillna(0)

# Column names of the per-role flags on each side.
rel_jobs_talent = [f"{role}_TALENT" for role in rel_jobs]
rel_jobs_job = [f"{role}_JOB" for role in rel_jobs]

# Discrepancy features as (new column, minuend, subtrahend).
# Note the sign convention: salary / degree / seniority are talent minus job,
# whereas the language discrepancies are job requirement minus talent level.
discrepancy_specs = [
    ("salary_discrepancy", "salary_TALENT", "salary_JOB"),
    ("degree_discrepancy", "degree_TALENT", "degree_JOB"),
    ("seniority_rank_discrepancy", "seniority_rank_TALENT", "seniority_rank_JOB"),
    ("German_must_have_discrepancy", "German_must_have_JOB", "German_TALENT"),
    ("English_must_have_discrepancy", "English_must_have_JOB", "English_TALENT"),
    ("German_should_have_discrepancy", "German_should_have_JOB", "German_TALENT"),
    ("English_should_have_discrepancy", "English_should_have_JOB", "English_TALENT"),
]
for target, minuend, subtrahend in discrepancy_specs:
    pdf_features_final[target] = pdf_features_final[minuend] - pdf_features_final[subtrahend]

# Share of the job's roles that the talent also lists: the element-wise product
# of the two flag matrices is 1 only where both sides have the role.
talent_role_flags = pdf_features_final[rel_jobs_talent].to_numpy()
job_role_flags = pdf_features_final[rel_jobs_job].to_numpy()
role_match = pd.DataFrame(talent_role_flags * job_role_flags, columns=rel_jobs)
matched_role_count = role_match.sum(axis=1)
sought_role_count = pdf_features_final[rel_jobs_job].sum(axis=1)
pdf_features_final["p_role_match"] = matched_role_count / sought_role_count

pdf_features_final = pdf_features_final.merge(pdf_labels, on="id", how="inner")

# Model inputs, in the order the model will see them.
rel_features = [
    "salary_TALENT",
    "degree_TALENT",
    "seniority_rank_TALENT",
    "salary_JOB",
    "degree_JOB",
    "seniority_rank_JOB",
    "number_of_languages_TALENT",
    "English_TALENT",
    "German_TALENT",
    "English_must_have_JOB",
    "German_must_have_JOB",
    "English_should_have_JOB",
    "German_should_have_JOB",
    "salary_discrepancy",
    "degree_discrepancy",
    "seniority_rank_discrepancy",
    "German_must_have_discrepancy",
    "English_must_have_discrepancy",
    "German_should_have_discrepancy",
    "English_should_have_discrepancy",
    "p_role_match",
]


## Modeling

In [22]:
# Feature matrix (selected columns only) and binary target.
X = pdf_features_final.loc[:, rel_features]
y = pdf_features_final.loc[:, "label"]

In [23]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified on the label and seeded for reproducibility.
split_options = dict(test_size=0.20, random_state=42, stratify=y, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [24]:
X_train

Unnamed: 0,salary_TALENT,degree_TALENT,seniority_rank_TALENT,salary_JOB,degree_JOB,seniority_rank_JOB,number_of_languages_TALENT,English_TALENT,German_TALENT,English_must_have_JOB,...,English_should_have_JOB,German_should_have_JOB,salary_discrepancy,degree_discrepancy,seniority_rank_discrepancy,German_must_have_discrepancy,English_must_have_discrepancy,German_should_have_discrepancy,English_should_have_discrepancy,p_role_match
478,70000,2,4,100000,1,3.5,3,5.0,5.0,0.0,...,4.0,0.0,-30000,1,0.5,0.0,-5.0,-5.0,-1.0,0.5
488,46000,3,2,65000,3,2.0,3,4.0,6.0,0.0,...,5.0,0.0,-19000,0,0.0,0.0,-4.0,-6.0,1.0,0.5
1499,81900,1,4,70000,5,2.5,5,1.0,3.0,4.0,...,0.0,0.0,11900,-4,1.5,2.0,3.0,-3.0,-1.0,0.0
1605,65000,2,1,70000,5,3.0,2,3.0,1.0,0.0,...,0.0,0.0,-5000,-3,-2.0,5.0,-3.0,-1.0,-3.0,0.0
511,60000,4,2,82000,3,2.5,4,5.0,6.0,0.0,...,0.0,0.0,-22000,1,-0.5,-1.0,-5.0,-6.0,-5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,25000,4,4,80000,4,3.5,3,6.0,6.0,0.0,...,5.0,0.0,-55000,0,0.5,-1.0,-6.0,-6.0,-1.0,1.0
1452,87500,1,4,70000,5,2.5,3,4.0,5.0,5.0,...,0.0,0.0,17500,-4,1.5,1.0,1.0,-5.0,-4.0,1.0
248,55000,3,3,70000,1,2.5,4,5.0,6.0,4.0,...,0.0,0.0,-15000,2,0.5,-1.0,-1.0,-6.0,-5.0,1.0
303,55000,3,2,54000,1,1.5,3,6.0,6.0,0.0,...,0.0,0.0,1000,2,0.5,0.0,-6.0,-6.0,-6.0,0.5


In [25]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with a fixed seed, fitted on the training split only.
gbc_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 3,
    "random_state": 42,
}
model = GradientBoostingClassifier(**gbc_params).fit(X_train, y_train)

In [26]:
# Accuracy on the hold-out set (the 20% test split that was not used for fitting).
model.score(X_test, y_test)

0.9925

In [27]:
# Feature importances of the fitted model; the entries follow the column order
# of the training matrix, i.e. the order of rel_features.
model.feature_importances_

array([3.59189936e-03, 1.37821940e-04, 2.24942317e-04, 3.19742105e-03,
       5.76369497e-03, 2.85643715e-03, 0.00000000e+00, 4.13207357e-03,
       4.19738785e-04, 3.46469012e-04, 7.12842375e-04, 2.92083610e-05,
       0.00000000e+00, 1.79282333e-01, 2.33043469e-01, 4.96426390e-02,
       7.78178846e-02, 8.28434890e-03, 3.03652464e-03, 1.63755670e-03,
       4.25842695e-01])