###Libraries

In [17]:
#%pip install sentence-transformers scikit-learn pandas numpy
#%pip install -U spacy
!python3 -m spacy download pt_core_news_md

Defaulting to user installation because normal site-packages is not writeable
Collecting pt-core-news-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_md-3.8.0/pt_core_news_md-3.8.0-py3-none-any.whl (42.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pt-core-news-md
Successfully installed pt-core-news-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_md')


In [2]:
import json
import pandas as pd
import numpy as np
import spacy
from spacy.matcher import PhraseMatcher, Matcher
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


###Functions

In [3]:
#Function to read json files
def load_json(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

#Functions to clean the resume and job text
def clean_text(text):
    if pd.isnull(text): return ''
    text = re.sub(r'[\w\.-]+@[\w\.-]+', ' ', text)
    text = re.sub(r'[^a-zA-Z ]', ' ', text)
    return text.lower()


def preprocess_text(text):
  if not isinstance(text, str):
      return ""

  # 1. Remove e-mails e phones numbers
  text = re.sub(r'\S+@\S+', ' ', text)  # e-mail
  text = re.sub(r'\(?\d{2}\)?\s?\d{4,5}-?\d{4}', ' ', text)  # phones numbers

  # 2. Remove expressions like "32 anos", "age: 45 anos"
  text = re.sub(r'\b\d{1,2}\s?(anos|anos de idade)?\b', ' ', text, flags=re.IGNORECASE)
  text = re.sub(r'idade\s*[:\-]?\s*\d{1,2}', ' ', text, flags=re.IGNORECASE)

  # 3. Processing with spaCy
  doc = nlp(text)

  clean_tokens = []

  for token in doc:
      # Ignore names, localizations e irrelevants words
      if token.ent_type_ in ["PER", "LOC", "GPE"]:  # people, localization, city/state/country
          continue
      if token.is_stop or token.is_punct or not token.is_alpha:
          continue
      clean_tokens.append(token.lemma_.lower())

  return " ".join(clean_tokens)

###Loading the data

In [None]:
prospects = load_json('Datathon Decision/prospects.json')
vagas = load_json('Datathon Decision/vagas.json')
applicants = load_json('Datathon Decision/applicants.json')

In [5]:
#Create the applicants dataframe
applicants_list = []
for cand_id, a in applicants.items():
    basic_infos = a.get('infos_basicas', {})
    pro_infos = a.get('informacoes_profissionais', {})
    area = pro_infos.get('area_atuacao', '')
    skills = pro_infos.get('conhecimentos_tecnicos', '')
    certifications = pro_infos.get('certificacoes', '')
    resume_text = ' '.join([
            a.get('cv_pt', '') or '',
            skills or '',
            certifications or ''
        ])

    applicants_list.append({
        'candidate_id': cand_id.strip(),
        'candidate_name': basic_infos.get('nome', ''),
        'area': area,
        'skills': skills,
        'certifications': certifications,
        'resume_text': resume_text

    })
applicants_df = pd.DataFrame(applicants_list)

In [None]:
#Table shape: lines and columns
applicants_df.shape

(42482, 6)

In [None]:
applicants_df[['candidate_id']].duplicated().sum()

np.int64(0)

In [None]:
applicants_df[['resume_text']].duplicated().sum()

np.int64(13751)

In [None]:
applicants_df[['resume_text']].isnull().sum()

Unnamed: 0,0
resume_text,0


In [None]:
applicants_df[['candidate_id', 'resume_text']].duplicated().sum()

np.int64(0)

In [None]:
applicants_df[['candidate_name', 'resume_text']].duplicated().sum()

np.int64(682)

We probably have the same candidate applying with different IDs because we have duplicated resume text with the same candidate name.

In [None]:
applicants_df.head()

Unnamed: 0,candidate_id,candidate_name,area,skills,certifications,resume_text
0,31000,Carolina Aparecida,,,,assistente administrativo\n\n\nsantosbatista\n...
1,31001,Eduardo Rios,Administrativa,,,formação acadêmica\nensino médio (2º grau) em ...
2,31002,Pedro Henrique Carvalho,Administrativa,,"MS [77-418] MOS: Microsoft Office Word 2013, M...",objetivo: área administrativa | financeira\n\n...
3,31003,Thiago Barbosa,Administrativa,,,formação\nensino médio completo\ninformática i...
4,31004,Diogo das Neves,,,,última atualização em 09/11/2021\n­ sp\n\nensi...


In [None]:
applicants_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42482 entries, 0 to 42481
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   candidate_id    42482 non-null  object
 1   candidate_name  42482 non-null  object
 2   area            42482 non-null  object
 3   skills          42482 non-null  object
 4   certifications  42482 non-null  object
 5   resume_text     42482 non-null  object
dtypes: object(6)
memory usage: 1.9+ MB


In [6]:
#Create the jobs dataframe
vagas_list = []
for job_id, v in vagas.items():
    info = v.get('informacoes_basicas', {})
    profile = v.get('perfil_vaga', {})
    job_text = ' '.join([
        info.get('titulo_vaga', '') or '',
        info.get('objetivo_vaga', '') or '',
        profile.get('nivel profissional'),
        profile.get('areas_atuacao') or '',
        profile.get('principais_atividades') or '',
        profile.get('competencia_tecnicas_e_comportamentais') or '',
        profile.get('habilidades_comportamentais_necessarias') or '',
        profile.get('demais_observacoes') or ''

    ])
    vagas_list.append({'job_id': job_id.strip(),
                       'job_title': info.get('titulo_vaga', ''),
                       'job_text': job_text
                       })
vagas_df = pd.DataFrame(vagas_list)

In [None]:
vagas_df.shape

(14081, 3)

In [None]:
vagas_df.job_id.duplicated().sum()

np.int64(0)

In [None]:
vagas_df.head(20)

Unnamed: 0,job_id,job_title,job_text
0,5185,Operation Lead -,Operation Lead - Sênior TI - Sistemas e Ferra...
1,5184,Consultor PP/QM Sênior,Consultor PP/QM Sênior Contratação Sênior TI -...
2,5183,ANALISTA PL/JR C/ SQL,ANALISTA PL/JR C/ SQL RFP Analista TI - Sistem...
3,5182,Technical Architect - 11894809,Technical Architect - 11894809 Contratação Ana...
4,5181,Consultor SAP AUTHORIZATION (BCA) -Pleno / Sênior,Consultor SAP AUTHORIZATION (BCA) -Pleno / Sên...
5,5180,Desenvolvedor Web Pleno / Sênior,Desenvolvedor Web Pleno / Sênior Sênior TI - ...
6,5179,Consultor SAP HR Pleno,Consultor SAP HR Pleno Pleno TI - SAP- Experi...
7,5178,Consultor FI Pleno / Sênior,Consultor FI Pleno / Sênior Sênior TI - SAP- ...
8,5177,Consultor SAP CS / PM Pleno / Senior,Consultor SAP CS / PM Pleno / Senior Sênior T...
9,5176,Consultor SAP SD Sênior,Consultor SAP SD Sênior Sênior TI - SAP- Expe...


We have 14k uniques job_id in the table but looking at the job title I've noticed that there are some duplicates.

In [7]:
# All prospects candidates (hired or not)
pairs = []
for job_id, entry in prospects.items():
    for p in entry.get('prospects', []):
        label = 1 if p.get('situacao_candidado', '').lower().startswith('contratado') else 0
        pairs.append({
            'job_id': job_id.strip(),
            'candidate_id': p['codigo'].strip(),
            'label': label
        })
positives_df = pd.DataFrame(pairs)

In [None]:
positives_df['job_id'].duplicated().sum()

np.int64(42480)

In [None]:
positives_df['candidate_id'].duplicated().sum()

np.int64(24354)

In [None]:
#Verifying duplicated for the set candidate and job
positives_df[['job_id', 'candidate_id']].duplicated().sum()

np.int64(0)

In [None]:
positives_df.groupby(['candidate_id', 'label']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,job_id
candidate_id,label,Unnamed: 2_level_1
10,0,2
10002,0,1
10005,0,3
10009,0,2
10009,1,1
...,...,...
9956,0,2
9956,1,1
9969,0,1
9993,0,10


In [None]:
#Counting records per label
positives_df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,50775
1,2984


As we can see, we have more records in prospects table than in applicants table because the same candidate apply to more than 1 job. For example candidate_id 9993 has applied to 10 jobs and has been rejected in all of them. But it could be an application to the same job which has differents job_ids. Let's see if we have records that don't exist in applicants table

In [None]:
#Let's see the difference between the 2 tables
l = set(positives_df['candidate_id'])
m = set(applicants_df['candidate_id'])
len(m.difference(l))

19019

We have 19k candidates with no records in prospects. Almost the half of applicants

In [None]:
len(l.difference(m))

5942

We have 5942 candidates with no records in applicants

In [None]:
positives_df

Unnamed: 0,job_id,candidate_id,label
0,4530,25632,0
1,4530,25529,0
2,4531,25364,1
3,4531,25360,0
4,4533,26338,1
...,...,...,...
53754,14217,2018,0
53755,14218,40384,0
53756,14220,16828,0
53757,14220,15042,0


In [8]:
hired_df = applicants_df.merge(positives_df, on='candidate_id', how ='left')
hired_df = hired_df.dropna(subset=['label'])
full_hired_df = hired_df.merge(vagas_df, on='job_id', how ='left')
full_hired_df = full_hired_df.dropna(subset=['job_text'])
full_hired_df.label = full_hired_df.label.astype(int)

In [None]:
full_hired_df[['candidate_id', 'job_id']].duplicated().sum()

np.int64(0)

In [None]:
full_hired_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,42645
1,2426


In [None]:
full_hired_df.head()

Unnamed: 0,candidate_id,candidate_name,area,skills,certifications,resume_text,job_id,label,job_title,job_text
0,31000,Carolina Aparecida,,,,assistente administrativo\n\n\nsantosbatista\n...,7422,0,Assistente Administrativo/Operação - 12416476,Assistente Administrativo/Operação - 12416476 ...
1,31000,Carolina Aparecida,,,,assistente administrativo\n\n\nsantosbatista\n...,7423,0,Analista Administrativo/Operação - 12416481,Analista Administrativo/Operação - 12416481 Co...
2,31001,Eduardo Rios,Administrativa,,,formação acadêmica\nensino médio (2º grau) em ...,7423,0,Analista Administrativo/Operação - 12416481,Analista Administrativo/Operação - 12416481 Co...
3,31002,Pedro Henrique Carvalho,Administrativa,,"MS [77-418] MOS: Microsoft Office Word 2013, M...",objetivo: área administrativa | financeira\n\n...,7423,0,Analista Administrativo/Operação - 12416481,Analista Administrativo/Operação - 12416481 Co...
4,31003,Thiago Barbosa,Administrativa,,,formação\nensino médio completo\ninformática i...,7422,0,Assistente Administrativo/Operação - 12416476,Assistente Administrativo/Operação - 12416476 ...


In [9]:
df_completed = full_hired_df.drop_duplicates(subset=['candidate_id', 'candidate_name','resume_text', 'job_title'])

In [None]:
df_completed.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,42179
1,2390


###Embedding & Features

In [18]:
nlp = spacy.load("pt_core_news_md")

In [19]:
df_completed['resume_text_clean'] = df_completed['resume_text'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_completed['resume_text_clean'] = df_completed['resume_text'].apply(preprocess_text)


In [20]:
df_completed['job_text_clean'] = df_completed['job_text'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_completed['job_text_clean'] = df_completed['job_text'].apply(preprocess_text)


In [None]:
#Cleaning the texts
df_completed['resume_text_clean'] = df_completed['resume_text'].apply(preprocess_text)
df_completed['job_text_clean'] = df_completed['job_text'].apply(preprocess_text)

In [21]:
#Using a SentenceTransformer model to generate embeddings of text
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2') #('paraphrase-multilingual-MiniLM-L12-v2') all-MiniLM-L6-v2
resume_embeds = model.encode(df_completed['resume_text_clean'].tolist(), show_progress_bar=True, batch_size=64)
job_embeds = model.encode(df_completed['job_text_clean'].tolist(), show_progress_bar=True, batch_size=64)

#For each job, calculate "ideal profile" embedding (mean of all hired resumes)
job_ideal_embeds = {}
for job_id in df_completed['job_id'].unique():
    hired_texts = df_completed[(df_completed['job_id']==job_id) & (df_completed['label']==1)]['resume_text_clean']
    cleaned = [x for x in hired_texts]
    if len(cleaned) > 0:
        hired_embeds = model.encode(cleaned, batch_size=16)
        job_ideal_embeds[job_id] = np.mean(hired_embeds, axis=0)

Batches: 100%|██████████| 697/697 [03:52<00:00,  3.00it/s]
Batches: 100%|██████████| 697/697 [02:39<00:00,  4.37it/s]


In [22]:
job_ideal_embeds_serializable = {
    job_id: embedding.tolist()
    for job_id, embedding in job_ideal_embeds.items()
}

with open('job_ideal_embeddings.json', 'w') as f:
    json.dump(job_ideal_embeds_serializable, f)

print("job_ideal_embeddings.json criado com sucesso!")

job_ideal_embeddings.json criado com sucesso!


In [23]:
job_ids_seen = set()
jobs_ids_texts_embeddings = {}
for i, row in df_completed.iterrows():
    try:
      job_id = row['job_id']
      if job_id not in job_ids_seen:
          job_ids_seen.add(job_id)
          jobs_ids_texts_embeddings[job_id] = job_embeds[i]
    except:
      continue

In [26]:
job_embeds_serializable = {
    job_id: embedding.tolist()
    for job_id, embedding in jobs_ids_texts_embeddings.items()
}

with open('job_texts_embeddings.json', 'w') as f:
    json.dump(job_embeds_serializable, f)

print("job_texts_embeddings.json criado com sucesso!")

job_texts_embeddings.json criado com sucesso!


In [30]:
# Computing features -> cosine similarity between resume text and job description/hired resumes
cos_job = []
cos_ideal = []
for i, row in enumerate(df_completed.itertuples(index=False)):
  job_id = row.job_id
  e1 = resume_embeds[i].reshape(1, -1)
  e2 = job_embeds[i].reshape(1, -1)
  cos_job.append(cosine_similarity(e1, e2))
  if job_id in job_ideal_embeds:
    cos_ideal.append(cosine_similarity(e1, job_ideal_embeds[job_id].reshape(1, -1)))
  else:
    cos_ideal.append(0)

df_completed['cosine_to_job'] = cos_job
df_completed['cosine_to_ideal_employee'] = cos_ideal

X = df_completed[['cosine_to_job','cosine_to_ideal_employee']].values
y = df_completed['label'].values
X = StandardScaler().fit_transform(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_completed['cosine_to_job'] = cos_job
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_completed['cosine_to_ideal_employee'] = cos_ideal


###Model training and evaluation

In [None]:
#Train Model & Evaluate
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.33)
clf = RandomForestClassifier(n_estimators=120, random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
roc = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
print("Advanced Model Evaluation:", report)
print("ROC-AUC: {:.3f}".format(roc))

Advanced Model Evaluation:               precision    recall  f1-score   support

           0       0.99      0.97      0.98     23819
           1       0.50      0.77      0.61       985

    accuracy                           0.96     24804
   macro avg       0.75      0.87      0.79     24804
weighted avg       0.97      0.96      0.96     24804

ROC-AUC: 0.921


###Model export

In [None]:
# Save model & features for app/GenAI integration
#import pickle
#pickle.dump({'model': clf, 'scaler': StandardScaler().fit(X), 'embedder': model}, open('resume_fit_prediction_model.pkl', 'wb'))

In [None]:
import gzip
import pickle
# Save compressed
with gzip.open('resume_fit_prediction_model.pkl.gz', 'wb') as f:
    pickle.dump({'model': clf, 'scaler': scaler, 'embedder': embedder}, f)