In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import warnings
import fitz  # PyMuPDF
import pickle

# Suppress every Python warning for the rest of the session.
warnings.simplefilter("ignore")

# Job postings; predict_jobs_for_cv looks entries up here by job id.
# NOTE(review): pickle.load can execute arbitrary code while deserializing --
# only open vagas.pkl (and the joblib files below) when they come from a trusted source.
with open("vagas.pkl", "rb") as f:
    jobs = pickle.load(f)

# Load models
# Two classifiers whose predict_proba outputs are averaged in predict_jobs_for_cv.
logreg = joblib.load("logistic_model.pkl")
xgb = joblib.load("xgboost_model.pkl")
# Sentence-embedding model used to encode the cleaned CV text.
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Precomputed per-job data. predict_jobs_for_cv indexes job_ids with the row
# position of each similarity, so it relies on job_ids and job_embeddings
# being in the same order.
job_data = joblib.load("job_data.pkl")
job_ids = job_data["job_ids"]
# NOTE(review): job_titles is not referenced by any of the visible cells.
job_titles = job_data["job_titles"]
job_embeddings = job_data["job_embeddings"]

def preprocess(text):
    """Normalize free text into a space-separated string of filtered tokens.

    Steps, in order: lowercase the text, delete every run of digits, delete
    ASCII punctuation (``string.punctuation``), tokenize with NLTK's
    Portuguese tokenizer, then drop Portuguese stopwords and any token of
    two characters or fewer.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    import re
    import string
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    portuguese_stopwords = set(stopwords.words('portuguese'))
    punctuation_remover = str.maketrans('', '', string.punctuation)

    # Lowercase -> strip digits -> strip punctuation, as one pipeline.
    normalized = re.sub(r'\d+', '', text.lower()).translate(punctuation_remover)

    kept = []
    for token in word_tokenize(normalized, language="portuguese"):
        # Skip very short tokens and stopwords; everything else is kept in order.
        if len(token) <= 2 or token in portuguese_stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def predict_jobs_for_cv(cv_text, top_n=5):
    """Rank the known job openings for a CV, print the best matches and return them.

    The CV text is cleaned with ``preprocess``, embedded with
    ``embedding_model`` and compared with ``job_embeddings`` by cosine
    similarity. Each similarity is the single input feature passed to
    ``logreg`` and ``xgb``; the column-1 outputs of their ``predict_proba``
    are averaged into the "hire probability" used for ranking.

    Args:
        cv_text: Raw text of the candidate's CV.
        top_n: Maximum number of recommendations to print and return.

    Returns:
        A list of at most ``top_n`` dicts ordered by descending ``hire_prob``.
        Each dict has the keys ``job_id``, ``title``, ``area``, ``skills``,
        ``activities``, ``similarity`` and ``hire_prob``.
    """
    cleaned_cv = preprocess(cv_text)
    cv_vec = embedding_model.encode([cleaned_cv])

    sims = cosine_similarity(cv_vec, job_embeddings).flatten()

    # Score all jobs with one batched call per model instead of two
    # predict_proba calls per job; each row holds the single similarity feature.
    features = sims.reshape(-1, 1)
    logreg_probs = logreg.predict_proba(features)[:, 1]
    xgb_probs = xgb.predict_proba(features)[:, 1]
    ensemble_probs = (logreg_probs + xgb_probs) / 2

    # Stable descending order: jobs with equal probability keep their original
    # relative order, matching what sorted(..., reverse=True) produced before.
    ranking = (-ensemble_probs).argsort(kind="stable")[:top_n]

    # Only the selected jobs need a full record.
    top_jobs = []
    for position in ranking:
        i = int(position)
        job = jobs.get(job_ids[i], {})
        basics = job.get("informacoes_basicas", {})
        profile = job.get("perfil_vaga", {})
        top_jobs.append({
            "job_id": job_ids[i],
            "title": basics.get("titulo_vaga", "N/A"),
            "area": profile.get("areas_atuacao", "N/A"),
            "skills": profile.get("competencia_tecnicas_e_comportamentais", ""),
            "activities": profile.get("principais_atividades", ""),
            "similarity": sims[i],
            "hire_prob": ensemble_probs[i]
        })

    # Display
    for idx, job in enumerate(top_jobs, 1):
        print(f"\n🔹 Recommendation #{idx}")
        print(f"🏢 Job Title       : {job['title']}")
        print(f"📍 Area            : {job['area']}")
        print(f"📈 Similarity Score: {job['similarity']:.2f}")
        print(f"🤖 Hire Probability: {job['hire_prob']:.2%}")
        print(f"🔧 Skills Required : {job['skills'][:200]}...")
        print(f"📋 Activities      : {job['activities'][:200]}...")
        print("-" * 80)

    # Callers assign the result (e.g. `top_jobs = predict_jobs_for_cv(...)`),
    # so hand the ranked records back instead of returning None.
    return top_jobs


In [3]:
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path passed to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` result joined with no
        separator; an empty string when the document has no pages.
    """
    with fitz.open(file_path) as pdf:
        page_texts = [pg.get_text() for pg in pdf]
    return "".join(page_texts)

In [None]:
# Path to the candidate's CV in PDF form. It is an empty placeholder here:
# set it to a real file before running this cell.
cv_path = r"" 
# Raw CV text, consumed by predict_jobs_for_cv in the next cell.
cv_text = extract_text_from_pdf(cv_path)

In [11]:
# predict_jobs_for_cv prints the formatted top-5 recommendations itself;
# the print below additionally shows whatever value the call returned.
top_jobs = predict_jobs_for_cv(cv_text, top_n=5)
print(top_jobs)



🔹 Recommendation #1
🏢 Job Title       : Core Banking
📍 Area            : Financeira/Controladoria-
📈 Similarity Score: 0.65
🤖 Hire Probability: 79.61%
🔧 Skills Required : Core Banking-L1 (Mandatory)
As a Domain Consultant in one of the industry verticals, you are responsible for implementation of roadmaps for business process analysis, data analysis, diagnosis of gaps,...
📋 Activities      : Core Banking-L1 (Mandatory)
As a Domain Consultant in one of the industry verticals, you are responsible for implementation of roadmaps for business process analysis, data analysis, diagnosis of gaps,...
--------------------------------------------------------------------------------

🔹 Recommendation #2
🏢 Job Title       : SAP Basis Pl - 20208922208
📍 Area            : TI - SAP-
📈 Similarity Score: 0.65
🤖 Hire Probability: 79.60%
🔧 Skills Required : Buscar o aumento da eficiência dos serviços SAP;
Eliminar riscos e falhas no ambiente;
Monitorar as aplicações SAP, com análise e ajustes
sobre a cap