In [4]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils import resample

import pandas as pd

# NOTE(review): pandas.read_pickle unpickles the file, which can execute
# arbitrary code; only load 'labeled_df_emb' when it comes from a trusted source.
df = pd.read_pickle('labeled_df_emb')

df_majority = df[df.label == 0]
df_minority = df[df.label == 1]

if df_minority.empty:
    raise ValueError("No rows with label == 1; cannot build a balanced sample.")

# Keep up to three label-0 rows per label-1 row. Sampling is without
# replacement, so the request is capped at the number of label-0 rows
# available; resample raises ValueError when asked for more than exist.
n_majority_keep = min(len(df_majority), len(df_minority) * 3)
df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=n_majority_keep,
                                   random_state=42)

df_balanced = pd.concat([df_majority_downsampled, df_minority])

X = df_balanced[["similarity_score"]]  # single feature for now; more can be added later
y = df_balanced["label"]

# Stratified 80/20 train-test split with a fixed seed.
# NOTE(review): the split is taken from the downsampled frame, so the test set
# carries the same roughly 3:1 class ratio as the training set rather than the
# ratio found in `df`; the reported metrics describe that resampled distribution.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# 1. Train logistic regression with balanced class weights
logreg = LogisticRegression(class_weight='balanced')
logreg.fit(X_train, y_train)

# 2. Train XGBoost with imbalance handling:
#    scale_pos_weight = (# label-0 training rows) / (# label-1 training rows).
class_counts = y_train.value_counts()
scale = class_counts.loc[0] / class_counts.loc[1]
# `use_label_encoder` is no longer passed: the installed XGBoost reports it as
# an unused parameter, so dropping it only removes the warning.
xgb = XGBClassifier(scale_pos_weight=scale, eval_metric='logloss')
xgb.fit(X_train, y_train)

# 5 (setup). Metric helpers used by the evaluation at the bottom of this cell
from sklearn.metrics import roc_auc_score, f1_score

# 3. Positive-class probability (column 1) from each fitted model on the test split
logreg_probs = logreg.predict_proba(X_test)[:, 1]
xgb_probs = xgb.predict_proba(X_test)[:, 1]

# 4. Soft voting: unweighted mean of the two probability vectors
ensemble_probs = (logreg_probs + xgb_probs) / 2

# 5. Evaluate: probabilities at or above the threshold become class 1
threshold = 0.5
ensemble_preds = (ensemble_probs >= threshold).astype(int)

print("📈 Ensemble Model Evaluation:")
print(classification_report(y_test, ensemble_preds))
print(f"ROC AUC Score: {roc_auc_score(y_test, ensemble_probs):.4f}")
print(f"F1 Score: {f1_score(y_test, ensemble_preds):.4f}")

📈 Ensemble Model Evaluation:
              precision    recall  f1-score   support

           0       0.85      0.70      0.77       134
           1       0.41      0.62      0.50        45

    accuracy                           0.68       179
   macro avg       0.63      0.66      0.63       179
weighted avg       0.74      0.68      0.70       179

ROC AUC Score: 0.7556
F1 Score: 0.4956


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [3]:
# Inspect the balanced frame: the downsampled label-0 rows followed by all label-1 rows.
df_balanced

Unnamed: 0,applicant_id,job_id,similarity_score,status,label,binary_label
230985,43379,14156,0.626962,,0,0
122924,13378,6893,1.000000,,0,0
140369,1882,4113,1.000000,,0,0
218992,16011,11692,0.728049,,0,0
92487,10196,6690,1.000000,,0,0
...,...,...,...,...,...,...
416013,43069,13737,0.667984,Encaminhado ao Requisitante,1,0
416019,43069,13718,0.639748,Encaminhado ao Requisitante,1,0
416550,43123,13969,0.786721,Encaminhado ao Requisitante,1,0
416724,43140,11619,0.709323,Encaminhado ao Requisitante,1,0


In [6]:
import joblib
# Persist both fitted classifiers next to the notebook (relative paths).
# The second dump is the cell's last expression, so its return value is what the cell echoes.
joblib.dump(logreg, "logistic_model.pkl")
joblib.dump(xgb, "xgboost_model.pkl")

['xgboost_model.pkl']

In [8]:
import json
import joblib
from sentence_transformers import SentenceTransformer
import tqdm as notebook_tqdm

# Stop-word sets already loaded from NLTK, keyed by language name.
_STOP_WORDS_CACHE = {}


def _stop_words_for(language):
    """Return the NLTK stop-word set for `language`, reading the corpus only once."""
    if language not in _STOP_WORDS_CACHE:
        from nltk.corpus import stopwords
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess(text):
    """Normalize free text before it is embedded.

    Steps, in order: lowercase, delete digit runs, delete ASCII punctuation
    (``string.punctuation``), tokenize with NLTK's ``word_tokenize`` using
    ``language="portuguese"``, then drop Portuguese stop words and every
    token of two characters or fewer.

    Args:
        text: Raw text. ``None`` or an empty string yields ``""``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Imports stay local so the function remains usable on its own,
    # independent of which notebook cells have already run.
    import re, string
    from nltk.tokenize import word_tokenize

    if not text:
        return ''

    # Cached: the stop-word list was previously re-read on every call.
    stop_words = _stop_words_for('portuguese')

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # Punctuation is deleted, not replaced by a space, so "a/b" becomes "ab".
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text, language="portuguese")
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return ' '.join(tokens)

def extract_job_requirements(job):
    """Build one lowercase requirements string for a job record.

    Reads ``competencia_tecnicas_e_comportamentais`` and
    ``principais_atividades`` from ``job["perfil_vaga"]`` and joins them with
    a single space.

    Args:
        job: Mapping for one job posting.

    Returns:
        ``"<skills> <activities>"`` in lowercase. A missing ``perfil_vaga``
        section, a missing field, or a field holding ``None`` (JSON null)
        contributes an empty string, so the result is ``" "`` when nothing
        is available.
    """
    # `or` also covers keys that are present but null, which .get(key, "") does not.
    profile = job.get("perfil_vaga") or {}
    skills = profile.get("competencia_tecnicas_e_comportamentais") or ""
    activities = profile.get("principais_atividades") or ""
    return skills.lower() + " " + activities.lower()


# Load every job posting; `jobs` maps job id -> posting record.
with open(r'vagas.json', encoding='utf-8') as f:
    jobs = json.load(f)

embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Ids and cleaned requirement texts, both in the dict's iteration order
job_ids = list(jobs)
job_texts = [
    preprocess(extract_job_requirements(posting))
    for posting in jobs.values()
]

# Encode all texts in one call (the slow step of this cell)
job_embeddings = embedding_model.encode(job_texts, show_progress_bar=True)

# Titles aligned with job_ids, kept only for display purposes
job_titles = [
    posting["informacoes_basicas"]["titulo_vaga"]
    for posting in jobs.values()
]

# Bundle ids, titles and embeddings into a single artifact on disk
joblib.dump(
    {
        "job_ids": job_ids,
        "job_titles": job_titles,
        "job_embeddings": job_embeddings,
    },
    "job_data.pkl",
)

Batches: 100%|██████████| 441/441 [13:09<00:00,  1.79s/it]


['job_data.pkl']

In [2]:
import json

# Parse the UTF-8 job-postings JSON into `vagas`.
with open("vagas.json", encoding="utf-8") as vagas_json:
    vagas = json.load(vagas_json)

In [3]:
import pickle

# Re-serialize the parsed postings as a pickle file (binary write mode).
with open("vagas.pkl", "wb") as vagas_pickle:
    pickle.dump(vagas, vagas_pickle)