# Final notebook

En: This notebook builds the final version of the code that is used by the front-end app built with Streamlit.
Pt: Este notebook foi usado para criar a versão final do código que será usado no app front-end criado com Streamlit.

In [None]:
# Importing libs/ importando libs
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import warnings
import fitz
import pickle

# Suppress every warning for the rest of the session.
warnings.simplefilter("ignore")

# Read the job-openings data. `jobs` is later queried with `.get(job_id, {})`.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open("vagas.pkl", "rb") as f:
    jobs = pickle.load(f)

# Load the two persisted models (both are used through `predict_proba` below)
# and the multilingual sentence-embedding model used to encode the CV text.
logreg = joblib.load("logistic_model.pkl")
xgb = joblib.load("xgboost_model.pkl")
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Load the precomputed job data and unpack its ids, titles and embeddings.
# NOTE(review): presumably the three entries are index-aligned
# (job_ids[i] <-> job_embeddings[i]) -- confirm against the code that wrote job_data.pkl.
job_data = joblib.load("job_data.pkl")
job_ids = job_data["job_ids"]
job_titles = job_data["job_titles"]
job_embeddings = job_data["job_embeddings"]

# Function that will process the CV text/ Função que processará o texto do CV.
def preprocess(text):
    """Normalize a piece of text before it is embedded.

    The text is lower-cased, runs of digits are removed, ASCII punctuation
    (``string.punctuation``) is stripped, and the result is tokenized with
    NLTK's Portuguese tokenizer. Portuguese stop words and tokens of two
    characters or fewer are dropped.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    import re
    import string
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    portuguese_stopwords = set(stopwords.words('portuguese'))
    punctuation_stripper = str.maketrans('', '', string.punctuation)

    # Lower-case, drop digits, then delete punctuation characters outright
    # (they are removed, not replaced by spaces).
    normalized = re.sub(r'\d+', '', text.lower()).translate(punctuation_stripper)

    kept_tokens = []
    for token in word_tokenize(normalized, language="portuguese"):
        if token in portuguese_stopwords or len(token) <= 2:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


In [None]:
# This function will use the loaded models and embeddings to calculate similarity and predict the probability to be hired from one uploaded CV.
# Esta função usará os modelos carregados e os embeddings para calcular a similaridade e predizer a probabilidade de ser contratado a partir do texto de um CV.
def predict_jobs_for_cv(cv_text, top_n=5):
    """Rank the known job openings for one CV, print the best matches and return them.

    The CV text is cleaned with ``preprocess``, encoded with
    ``embedding_model`` and compared against every row of ``job_embeddings``
    using cosine similarity. Each similarity value is passed as the single
    feature to the logistic-regression and XGBoost models; the mean of their
    positive-class probabilities is used as the hire probability.

    Args:
        cv_text: Raw text of the CV.
        top_n: Maximum number of recommendations to print and return.

    Returns:
        A list of at most ``top_n`` dicts ordered by descending hire
        probability. Each dict has the keys ``job_id``, ``title``, ``area``,
        ``skills``, ``activities``, ``similarity`` and ``hire_prob``.
        Fields missing from ``jobs`` fall back to ``"N/A"`` (title, area) or
        an empty string (skills, activities).
    """
    # Clean the CV text and compute its embedding.
    cleaned_cv = preprocess(cv_text)
    cv_vec = embedding_model.encode([cleaned_cv])

    # One similarity value per job embedding.
    sims = cosine_similarity(cv_vec, job_embeddings).flatten()

    # Score all jobs with one batched call per model instead of two model
    # calls per job; column 1 holds the positive-class probability.
    features = sims.reshape(-1, 1)
    logreg_probs = logreg.predict_proba(features)[:, 1]
    xgb_probs = xgb.predict_proba(features)[:, 1]
    ensemble_probs = (logreg_probs + xgb_probs) / 2

    # sorted() is stable, so jobs with equal probability keep their original
    # relative order, exactly as when sorting the full result list.
    ranked_indices = sorted(
        range(len(sims)),
        key=lambda i: ensemble_probs[i],
        reverse=True,
    )[:top_n]

    # Only the jobs that will actually be reported need their details looked up.
    top_jobs = []
    for i in ranked_indices:
        job = jobs.get(job_ids[i], {})
        basic_info = job.get("informacoes_basicas", {})
        profile = job.get("perfil_vaga", {})
        top_jobs.append({
            "job_id": job_ids[i],
            "title": basic_info.get("titulo_vaga", "N/A"),
            "area": profile.get("areas_atuacao", "N/A"),
            "skills": profile.get("competencia_tecnicas_e_comportamentais", ""),
            "activities": profile.get("principais_atividades", ""),
            "similarity": sims[i],
            "hire_prob": ensemble_probs[i],
        })

    # Display the recommendations.
    for idx, rec in enumerate(top_jobs, 1):
        print(f"\n🔹 Recommendation #{idx}")
        print(f"🏢 Job Title       : {rec['title']}")
        print(f"📍 Area            : {rec['area']}")
        print(f"📈 Similarity Score: {rec['similarity']:.2f}")
        print(f"🤖 Hire Probability: {rec['hire_prob']:.2%}")
        print(f"🔧 Skills Required : {rec['skills'][:200]}...")
        print(f"📋 Activities      : {rec['activities'][:200]}...")
        print("-" * 80)

    # Callers assign and use the result, so hand the recommendations back
    # instead of implicitly returning None.
    return top_jobs


In [None]:
# Function that will convert PDF into TXT/ Função que converterá PDF em TXT
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file, opened with ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, with no
        separator inserted between pages.
    """
    with fitz.open(file_path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)

In [None]:
# Path of the CV PDF to analyse. It is an empty string here and has to be
# filled in before this cell is run.
cv_path = r"" 

# Extract the CV's text from the PDF.
cv_text = extract_text_from_pdf(cv_path)

In [None]:
# Rank the job openings for the extracted CV text; the function prints its
# top recommendations itself.
top_jobs = predict_jobs_for_cv(cv_text, top_n=5)
# Show the value returned by the call.
print(top_jobs)
