In [None]:
import json
import re
import string
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from xgboost import XGBClassifier

## Step 1: Load and Parse the Data

In [None]:
# Load JSON files
def _read_json(path):
    """Return the parsed content of the UTF-8 encoded JSON file at `path`."""
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


applicants = _read_json(r'applicants.json')
jobs = _read_json(r'vagas.json')
prospects = _read_json("prospects.json")


## Step 2: Extract Text (Skills & Descriptions)

In [None]:

# From applicants:
def extract_applicant_skills(applicant):
    """Build the free-text profile used to match an applicant against jobs.

    Joins the declared technical skills
    (``informacoes_profissionais.conhecimentos_tecnicos``) and the CV
    (``cv_pt``) with a single space and lower-cases the whole result.

    Args:
        applicant: Applicant record (dict) as loaded from the applicants JSON.

    Returns:
        str: Lower-cased ``"<skills> <cv>"``. A missing or null section/field
        contributes an empty string, so the minimum result is ``" "``.
    """
    professional_info = applicant.get("informacoes_profissionais") or {}
    skills = professional_info.get("conhecimentos_tecnicos") or ""
    cv = applicant.get("cv_pt") or ""
    # Lower-case the merged text, not just the CV: in the previous expression
    # `skills + " " + cv.lower()` the `.lower()` call applied to `cv` only.
    return f"{skills} {cv}".lower()

# From jobs
def extract_job_requirements(job):
    """Build the free-text requirement description of a job posting.

    Joins ``perfil_vaga.competencia_tecnicas_e_comportamentais`` and
    ``perfil_vaga.principais_atividades`` with a single space and lower-cases
    the result.

    Args:
        job: Job record (dict) as loaded from the jobs JSON.

    Returns:
        str: Lower-cased ``"<skills> <activities>"``. A missing or null
        ``perfil_vaga`` section or field contributes an empty string instead
        of raising ``KeyError`` / ``AttributeError``.
    """
    profile = job.get("perfil_vaga") or {}
    skills = profile.get("competencia_tecnicas_e_comportamentais") or ""
    activities = profile.get("principais_atividades") or ""
    return f"{skills} {activities}".lower()


## Step 3: Create Matching Dataset

In [None]:
# Download the NLTK resources this notebook relies on (tokenizer data and
# stop-word lists).
for nltk_resource in ('punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Portuguese stop words, kept in a set for fast membership tests in preprocess().
stop_words = set(stopwords.words('portuguese'))

def preprocess(text):
    """Normalize free text into a space-separated string of content tokens.

    Steps, in order: lower-case, drop digit runs, replace ASCII punctuation
    with spaces, tokenize with NLTK's Portuguese tokenizer, then discard
    stop words (module-level `stop_words`) and tokens of 2 characters or less.

    Args:
        text: Raw text (str).

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # Replace each punctuation mark with a space instead of deleting it.
    # Deleting fused neighbouring words whenever no space followed the mark,
    # e.g. "java/python" or "sql,excel" became "javapython" / "sqlexcel".
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(punctuation_to_space)
    tokens = word_tokenize(text, language='portuguese')
    kept = [token for token in tokens if len(token) > 2 and token not in stop_words]
    return ' '.join(kept)

In [None]:
# Freeze the key order of both collections: positions in these lists are used
# as the row (applicant) and column (job) indices of the similarity matrix.
applicant_ids = [*applicants.keys()]
job_ids = [*jobs.keys()]

In [None]:
# Extract and preprocess texts: one cleaned document per applicant and per
# job, in the same order as applicant_ids / job_ids.
applicant_texts = []
for aid in applicant_ids:
    raw_profile = extract_applicant_skills(applicants[aid])
    applicant_texts.append(preprocess(raw_profile))

job_texts = []
for jid in job_ids:
    raw_requirements = extract_job_requirements(jobs[jid])
    job_texts.append(preprocess(raw_requirements))

In [None]:
# Preview a few preprocessed applicant texts; displaying the whole list would
# dump every CV into the cell output.
applicant_texts[:5]

In [None]:
# Fit only on jobs: the vocabulary and IDF weights are learned from the job
# texts alone.
vectorizer = TfidfVectorizer()
job_vecs = vectorizer.fit_transform(raw_documents=job_texts)

# Applicants are projected into that same job-derived vocabulary; terms that
# never occur in a job text are ignored.
applicant_vecs = vectorizer.transform(raw_documents=applicant_texts)


In [None]:
# Pairwise cosine similarity: row i is an applicant (applicant_ids[i]),
# column j is a job (job_ids[j]).
similarity_matrix = cosine_similarity(X=applicant_vecs, Y=job_vecs)

In [None]:
#np.save('similarity_matrix', similarity_matrix)

## Step 4: Recommend Top Jobs for an Applicant

In [None]:
def show_recommendations_for_applicant(applicant_index, top_n=5):
    """Print an applicant's profile followed by their most similar jobs.

    Reads the notebook-level `applicant_ids`, `applicants`, `job_ids`, `jobs`
    and `similarity_matrix`.

    Args:
        applicant_index: Position of the applicant in `applicant_ids`, i.e. the
            row of `similarity_matrix` to rank.
        top_n: Maximum number of jobs to print, highest similarity first.

    Returns:
        None. Everything is written to stdout.

    Raises:
        IndexError: If `applicant_index` is out of range.
    """
    def field(record, section, key):
        # A JSON null (or empty value) for the section or the field is shown
        # as "N/A" instead of crashing on `.get` / slicing of None.
        return (record.get(section) or {}).get(key) or "N/A"

    applicant_id = applicant_ids[applicant_index]
    applicant = applicants.get(applicant_id) or {}

    # Extract applicant details
    name = field(applicant, "infos_basicas", "nome")
    area = field(applicant, "informacoes_profissionais", "area_atuacao")
    skills = field(applicant, "informacoes_profissionais", "conhecimentos_tecnicos")
    academic = field(applicant, "formacao_e_idiomas", "nivel_academico")
    english = field(applicant, "formacao_e_idiomas", "nivel_ingles")
    spanish = field(applicant, "formacao_e_idiomas", "nivel_espanhol")
    cv_excerpt = (applicant.get("cv_pt") or "").strip().replace("\n", " ")[:300] + "..."

    # Show applicant info once
    print(f"\n=== 🧑 Applicant: {name} (ID: {applicant_id}) ===")
    print(f"Área de Atuação: {area}")
    print(f"Conhecimentos Técnicos: {skills}")
    print(f"Formação: {academic} | Inglês: {english} | Espanhol: {spanish}")
    print(f"📄 CV (resumo): {cv_excerpt}\n")
    print('\n--------------------------------------------------------------------\n')

    # Job recommendations: column indices sorted by descending similarity.
    sim_scores = similarity_matrix[applicant_index]
    top_indices = sim_scores.argsort()[::-1][:top_n]

    for j in top_indices:
        job_id = job_ids[j]
        job = jobs.get(job_id) or {}
        job_title = field(job, "informacoes_basicas", "titulo_vaga")
        job_area = field(job, "perfil_vaga", "areas_atuacao")
        job_skills = field(job, "perfil_vaga", "competencia_tecnicas_e_comportamentais")
        job_activities = field(job, "perfil_vaga", "principais_atividades")

        print(f"🔹 Job Recommendation: {job_title} (ID: {job_id})")
        print(f"   Similarity Score: {sim_scores[j]:.2f}")
        print(f"   Área: {job_area}")
        print(f"   🔧 Competências: {job_skills[:250]}...")
        print(f"   📋 Atividades: {job_activities[:250]}...\n")
        print('--------------------------------------------------------------------')


In [None]:
# Example: top 3 jobs for the applicant at position 1 of applicant_ids.
show_recommendations_for_applicant(applicant_index=1, top_n=3)

## ML Step: Train a Match Classifier on Similarity Scores

In [None]:
# job_id → applicant_id → status
job_applicant_status = defaultdict(dict)

for job_id, job_data in prospects.items():
    status_by_applicant = {
        prospect["codigo"]: prospect.get("situacao_candidado", "")
        for prospect in job_data.get("prospects", [])
    }
    # Only touch the defaultdict when there is something to store, so jobs
    # without prospects do not get an empty entry.
    if status_by_applicant:
        job_applicant_status[job_id].update(status_by_applicant)

In [None]:
def label_from_status(status):
    """Turn a recruitment status string into a binary match label.

    Args:
        status: Status text (str); compared case-insensitively.

    Returns:
        int: 1 (match) if the lower-cased status contains "contratado",
        "fechado" or "encaminhado"; otherwise 0 (no match).
    """
    positive_markers = ("contratado", "fechado", "encaminhado")
    normalized = status.lower()
    return 1 if any(marker in normalized for marker in positive_markers) else 0

In [None]:
records = []
TOP_N = 10
PROGRESS_EVERY = 1000  # number of applicants between two progress messages

n_applicants = len(applicant_ids)

for i, applicant_id in enumerate(applicant_ids):
    # One progress line every PROGRESS_EVERY applicants. The previous version
    # printed inside the inner loop, i.e. TOP_N lines for every applicant.
    if i % PROGRESS_EVERY == 0:
        print(f"{i}/{n_applicants} applicants processed")

    # Indices of the TOP_N most similar jobs for this applicant, best first.
    sim_scores = similarity_matrix[i]
    top_indices = sim_scores.argsort()[::-1][:TOP_N]

    for j in top_indices:
        job_id = job_ids[j]
        sim = sim_scores[j]

        # Pairs with no entry in the prospects data get an empty status,
        # which label_from_status maps to 0.
        status = job_applicant_status.get(job_id, {}).get(applicant_id, "")
        label = label_from_status(status)

        records.append({
            "applicant_id": applicant_id,
            "job_id": job_id,
            "similarity_score": sim,
            "status": status,
            "label": label
        })

df = pd.DataFrame(records)


In [None]:
# How often each status occurs among the generated pairs, most frequent first.
df["status"].value_counts(sort=True, ascending=False)

In [None]:
#df.to_pickle('labeled_df')

In [None]:
# Optional: simplify to binary classification (predict Hired vs Not Hired)
# TODO: try regularization / resampling (original note said "ressembly";
# exact intent unclear)

# Mirror the label into the column used as the training target.
df["binary_label"] = df["label"]

# Single feature: the applicant-job similarity score.
X = df.loc[:, ["similarity_score"]]
y = df.loc[:, "binary_label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

model = LogisticRegression(class_weight='balanced').fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Weight positives by the negative/positive ratio actually present in the
# training split, instead of a hard-coded 80000 / 139 that silently goes stale
# whenever the data changes.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())

model = XGBClassifier(
    scale_pos_weight=n_negative / max(n_positive, 1),  # max() guards against division by zero
    use_label_encoder=False,
    eval_metric='logloss'
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
df_majority = df[df.label == 0]
df_minority = df[df.label == 1]

# Target 3 negatives per positive, but never request more rows than exist:
# resample(replace=False) raises ValueError when n_samples exceeds the number
# of available majority rows.
n_majority_keep = min(len(df_majority), len(df_minority) * 3)

df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=n_majority_keep,
                                   random_state=42)

df_balanced = pd.concat([df_majority_downsampled, df_minority])

In [None]:

# Mirror the label into the column used as the training target.
df_balanced["binary_label"] = df_balanced["label"]

# Single feature: the applicant-job similarity score.
X = df_balanced.loc[:, ["similarity_score"]]
y = df_balanced.loc[:, "binary_label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

model = LogisticRegression(class_weight='balanced').fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# The training split comes from the downsampled frame, so the class ratio is
# no longer the hard-coded 80000 / 139. Derive the weight from y_train itself.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())

model = XGBClassifier(
    scale_pos_weight=n_negative / max(n_positive, 1),  # max() guards against division by zero
    use_label_encoder=False,
    eval_metric='logloss'
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
def show_recommendations_for_applicant(applicant_index, top_n=5, model=None):
    """Print an applicant's profile, their most similar jobs and, optionally,
    a model-predicted hire probability for each job.

    Reads the notebook-level `applicant_ids`, `applicants`, `job_ids`, `jobs`
    and `similarity_matrix`.

    Args:
        applicant_index: Position of the applicant in `applicant_ids`, i.e. the
            row of `similarity_matrix` to rank.
        top_n: Maximum number of jobs to print, highest similarity first.
        model: Optional fitted classifier exposing `predict_proba` and taking
            the similarity score as its single feature. When None, no
            probability line is printed.

    Returns:
        None. Everything is written to stdout.

    Raises:
        IndexError: If `applicant_index` is out of range.
    """
    def field(record, section, key):
        # A JSON null (or empty value) for the section or the field is shown
        # as "N/A" instead of crashing on `.get` / slicing of None.
        return (record.get(section) or {}).get(key) or "N/A"

    applicant_id = applicant_ids[applicant_index]
    applicant = applicants.get(applicant_id) or {}

    # Extract applicant details
    name = field(applicant, "infos_basicas", "nome")
    area = field(applicant, "informacoes_profissionais", "area_atuacao")
    skills = field(applicant, "informacoes_profissionais", "conhecimentos_tecnicos")
    academic = field(applicant, "formacao_e_idiomas", "nivel_academico")
    english = field(applicant, "formacao_e_idiomas", "nivel_ingles")
    spanish = field(applicant, "formacao_e_idiomas", "nivel_espanhol")
    cv_excerpt = (applicant.get("cv_pt") or "").strip().replace("\n", " ")[:300] + "..."

    # Show applicant info once
    print(f"\n=== 🧑 Applicant: {name} (ID: {applicant_id}) ===")
    print(f"Área de Atuação: {area}")
    print(f"Conhecimentos Técnicos: {skills}")
    print(f"Formação: {academic} | Inglês: {english} | Espanhol: {spanish}")
    print(f"📄 CV (resumo): {cv_excerpt}\n")
    print('\n--------------------------------------------------------------------\n')

    # Job recommendations: column indices sorted by descending similarity.
    sim_scores = similarity_matrix[applicant_index]
    top_indices = sim_scores.argsort()[::-1][:top_n]

    # Score all selected jobs with a single predict_proba call.
    hire_probs = None
    if model is not None and len(top_indices) > 0:
        features = np.asarray(sim_scores[top_indices], dtype=float).reshape(-1, 1)
        # If the model recorded the column name it was fitted with, pass a
        # DataFrame carrying that name so the input matches the training data.
        feature_names = getattr(model, "feature_names_in_", None)
        if feature_names is not None and len(feature_names) == 1:
            features = pd.DataFrame(features, columns=list(feature_names))
        # Column 1 holds the probability of class 1 (hired).
        hire_probs = np.asarray(model.predict_proba(features))[:, 1]

    for rank, j in enumerate(top_indices):
        job_id = job_ids[j]
        job = jobs.get(job_id) or {}
        job_title = field(job, "informacoes_basicas", "titulo_vaga")
        job_area = field(job, "perfil_vaga", "areas_atuacao")
        job_skills = field(job, "perfil_vaga", "competencia_tecnicas_e_comportamentais")
        job_activities = field(job, "perfil_vaga", "principais_atividades")

        sim_score = sim_scores[j]

        print(f"🔹 Job Recommendation: {job_title} (ID: {job_id})")
        print(f"   Similarity Score: {sim_score:.2f}")
        if hire_probs is not None:
            print(f"   🤖 Predicted Hire Probability: {hire_probs[rank]:.2%}")
        print(f"   Área: {job_area}")
        print(f"   🔧 Competências: {job_skills[:250]}...")
        print(f"   📋 Atividades: {job_activities[:250]}...\n")
        print('--------------------------------------------------------------------')


In [None]:
# Once a classifier is trained, pass it in to also print a predicted hire
# probability for each recommended job.
show_recommendations_for_applicant(300, top_n=5, model=model)


In [None]:
# Embedding

## Embedding-Based Similarity (Sentence Transformers)

In [None]:
# Multilingual sentence-embedding model. It gets its own name so that it does
# not overwrite `model`, which holds the fitted classifier that is passed to
# show_recommendations_for_applicant(..., model=model).
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # good multilingual model

# Convert preprocessed text
applicant_embeddings = embedding_model.encode(applicant_texts, show_progress_bar=True)
job_embeddings = embedding_model.encode(job_texts, show_progress_bar=True)

# Similarity matrix
# NOTE: this rebinds `similarity_matrix`, replacing the TF-IDF scores; every
# later cell that reads it now works on embedding similarities.
similarity_matrix = cosine_similarity(applicant_embeddings, job_embeddings)

In [None]:
#np.save('similarity_matrix_emb', similarity_matrix)

In [None]:
# Same applicant as in the TF-IDF example, now ranked with whatever
# similarity_matrix currently holds.
show_recommendations_for_applicant(applicant_index=1, top_n=3)

In [None]:
records = []
TOP_N = 10
PROGRESS_EVERY = 1000  # number of applicants between two progress messages

n_applicants = len(applicant_ids)

for i, applicant_id in enumerate(applicant_ids):
    # One progress line every PROGRESS_EVERY applicants. The previous version
    # printed inside the inner loop, i.e. TOP_N lines for every applicant.
    if i % PROGRESS_EVERY == 0:
        print(f"{i}/{n_applicants} applicants processed")

    # Indices of the TOP_N most similar jobs for this applicant, best first.
    sim_scores = similarity_matrix[i]
    top_indices = sim_scores.argsort()[::-1][:TOP_N]

    for j in top_indices:
        job_id = job_ids[j]
        sim = sim_scores[j]

        # Pairs with no entry in the prospects data get an empty status,
        # which label_from_status maps to 0.
        status = job_applicant_status.get(job_id, {}).get(applicant_id, "")
        label = label_from_status(status)

        records.append({
            "applicant_id": applicant_id,
            "job_id": job_id,
            "similarity_score": sim,
            "status": status,
            "label": label
        })

df = pd.DataFrame(records)

In [None]:
#df.to_pickle('labeled_df_emb')

In [None]:
df_majority = df[df.label == 0]
df_minority = df[df.label == 1]

# Target 3 negatives per positive, but never request more rows than exist:
# resample(replace=False) raises ValueError when n_samples exceeds the number
# of available majority rows.
n_majority_keep = min(len(df_majority), len(df_minority) * 3)

df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   n_samples=n_majority_keep,
                                   random_state=42)

df_balanced = pd.concat([df_majority_downsampled, df_minority])

In [None]:

# Mirror the label into the column used as the training target.
df_balanced["binary_label"] = df_balanced["label"]

# Single feature: the applicant-job similarity score.
X = df_balanced.loc[:, ["similarity_score"]]
y = df_balanced.loc[:, "binary_label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

model = LogisticRegression(class_weight='balanced').fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# The training split comes from the downsampled frame, so the class ratio is
# no longer the hard-coded 80000 / 139. Derive the weight from y_train itself.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())

model = XGBClassifier(
    scale_pos_weight=n_negative / max(n_positive, 1),  # max() guards against division by zero
    use_label_encoder=False,
    eval_metric='logloss'
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Recommendations for the applicant at position 300, with hire probabilities
# from the most recently trained classifier.
show_recommendations_for_applicant(300, top_n=5, model=model)