# Model Notebook

En: This notebook was used to save the necessary data and models for the final stage.
It was built with the best metrics and steps from the datathon notebook.

Pt: Este notebook foi usado para salvar os dados e modelos necessários para o estágio final.
Ele foi construído com base nas melhores métricas e etapas do notebook datathon.

In [None]:
# Importing libs/ Importando libs
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.utils import resample
import pandas as pd
import pickle
import json
import joblib
from sentence_transformers import SentenceTransformer
import tqdm as notebook_tqdm

# Load the dataframe with the best data produced by the datathon notebook.
df = pd.read_pickle('labeled_df_emb')

# Rebalance the classes: keep every label-1 row and downsample the label-0
# rows (treated here as the majority class) to at most
# MAJORITY_TO_MINORITY_RATIO rows per label-1 row.
MAJORITY_TO_MINORITY_RATIO = 3

df_majority = df[df.label == 0]
df_minority = df[df.label == 1]

# resample(replace=False) raises ValueError when asked for more rows than are
# available, so cap the request at the size of the majority class.
n_majority_samples = min(len(df_majority),
                         len(df_minority) * MAJORITY_TO_MINORITY_RATIO)

df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=n_majority_samples,
                                   random_state=42)

df_balanced = pd.concat([df_majority_downsampled, df_minority])

# --- Machine-learning stage ---

# The only feature is the similarity score; the target is the `label` column.
X = df_balanced.loc[:, ["similarity_score"]]
y = df_balanced.loc[:, "label"]

# Hold out 20% of the rows for testing, stratified on the target so both
# partitions keep the same class proportions; the seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 1. Logistic regression trained with class_weight='balanced'.
logreg = LogisticRegression(class_weight='balanced').fit(X_train, y_train)

# 2. XGBoost with imbalance handling: scale_pos_weight is the ratio of
#    label-0 to label-1 rows in the training partition.
class_counts = y_train.value_counts()
scale = class_counts[0] / class_counts[1]
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases —
# confirm against the installed version whether it can be dropped.
xgb = XGBClassifier(
    scale_pos_weight=scale,
    use_label_encoder=False,
    eval_metric='logloss',
).fit(X_train, y_train)

from sklearn.metrics import f1_score, roc_auc_score

# 3. Column 1 of predict_proba (the label-1 probability) from each model.
logreg_probs, xgb_probs = (
    model.predict_proba(X_test)[:, 1] for model in (logreg, xgb)
)

# 4. Soft voting: the ensemble score is the plain average of both models.
ensemble_probs = (logreg_probs + xgb_probs) / 2

# 5. Evaluation: a row is predicted as label 1 when the averaged probability
#    reaches the threshold.
threshold = 0.5
ensemble_preds = (ensemble_probs >= threshold).astype(int)

print("📈 Ensemble Model Evaluation:")
print(classification_report(y_test, ensemble_preds))

# ROC AUC is computed on the probabilities, F1 on the thresholded predictions.
roc_auc = roc_auc_score(y_test, ensemble_probs)
print(f"ROC AUC Score: {roc_auc:.4f}")
f1 = f1_score(y_test, ensemble_preds)
print(f"F1 Score: {f1:.4f}")

In [None]:
# Bare expression: shows the balanced dataframe as the notebook cell output.
df_balanced

In [None]:
# Persist both fitted models with joblib so they can be loaded later.
joblib.dump(value=logreg, filename="logistic_model.pkl")
joblib.dump(value=xgb, filename="xgboost_model.pkl")

In [None]:
# Saving the job data embedding
# Salvando o embedding dos dados de vagas

_STOP_WORDS_CACHE = {}


def _stop_words_for(language):
    """Return the NLTK stop-word set for *language*, loading it only once."""
    if language not in _STOP_WORDS_CACHE:
        from nltk.corpus import stopwords
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess(text):
    """Normalise a Portuguese text into a space-separated string of tokens.

    Steps, in order: lower-case the text, remove every run of digits, strip
    ASCII punctuation (``string.punctuation``), tokenise with NLTK's
    Portuguese tokenizer, and drop Portuguese stop words as well as tokens of
    two characters or fewer.

    Args:
        text: The raw text to clean.

    Returns:
        The remaining tokens joined by single spaces (``""`` if none remain).
    """
    import re
    import string
    from nltk.tokenize import word_tokenize

    # The stop-word list never changes between calls, so it is cached instead
    # of being re-read from the corpus for every text that is processed.
    stop_words = _stop_words_for('portuguese')

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text, language="portuguese")
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return ' '.join(tokens)

def extract_job_requirements(job):
    """Build the lower-cased requirements text for a single job posting.

    Joins the technical/behavioural skills and the main activities stored
    under ``job["perfil_vaga"]`` with a single space.

    Args:
        job: Mapping for one job; must contain a ``"perfil_vaga"`` mapping.

    Returns:
        ``"<skills> <activities>"`` in lower case. A field that is missing,
        empty or ``None`` contributes an empty string.

    Raises:
        KeyError: If ``job`` has no ``"perfil_vaga"`` entry.
    """
    profile = job["perfil_vaga"]
    # ``.get(key, "")`` only covers a missing key; a JSON ``null`` is loaded
    # as ``None`` and would crash on ``.lower()``, so normalise it to "".
    skills = profile.get("competencia_tecnicas_e_comportamentais") or ""
    activities = profile.get("principais_atividades") or ""
    return skills.lower() + " " + activities.lower()


# Load the raw job postings (a JSON object keyed by job id).
with open('vagas.json', encoding='utf-8') as jobs_file:
    jobs = json.load(jobs_file)

embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Ids and cleaned requirement texts, both in the dict's iteration order so
# that position i in every list refers to the same job.
job_ids = list(jobs)
job_texts = [
    preprocess(extract_job_requirements(job)) for job in jobs.values()
]

# Encode all job texts in a single call (the expensive step).
job_embeddings = embedding_model.encode(job_texts, show_progress_bar=True)

# Titles are stored alongside the embeddings for display purposes.
job_titles = [
    job["informacoes_basicas"]["titulo_vaga"] for job in jobs.values()
]

# Bundle ids, titles and embeddings into one joblib file.
job_data = {
    "job_ids": job_ids,
    "job_titles": job_titles,
    "job_embeddings": job_embeddings,
}
joblib.dump(job_data, "job_data.pkl")

In [None]:
# Store the loaded jobs dict as a pickle file (intended to take less storage
# space than the original data).
with open("vagas.pkl", mode="wb") as pickle_file:
    pickle.dump(jobs, pickle_file)