# 01_Escolha_features_e_pipeline

## 01_import_bibliotecas

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
import numpy as np
import requests

## 02_Carga_de_dados

In [4]:
candidatos = "https://github.com/Kinrider/tech_challenge_5/raw/refs/heads/main/01_fontes/arquivos_decision/fontes_tratadas/candidatos.parquet"

In [5]:
# vagas
response_candidatos = requests.get(candidatos)
data_candidatos = response_vagas.content
df_candidatos = pd.read_parquet(candidatos)

## 03_seleção_e_tratamento_de_features

In [6]:

# === Definir data de referência (hoje)
hoje = pd.to_datetime("2025-07-15")

In [7]:

# === Converter colunas de data e calcular deltas
df_candidatos["data_atualizacao"] = pd.to_datetime(df_candidatos["data_atualizacao"], errors='coerce')
df_candidatos["data_criacao"] = pd.to_datetime(df_candidatos["data_criacao"], errors='coerce')


In [8]:

df_candidatos["dias_desde_atualizacao"] = (hoje - df_candidatos["data_atualizacao"]).dt.days
df_candidatos["dias_ativo"] = (hoje - df_candidatos["data_criacao"]).dt.days


In [9]:

# === Criar colunas binárias
df_candidatos["tem_outras_certificacoes"] = df_candidatos["outras_certificacoes"].notnull().astype(int)
df_candidatos["tem_outro_idioma"] = df_candidatos["outro_idioma"].notnull().astype(int)


In [10]:

# === Preencher vazios com string vazia para TF-IDF
df_candidatos["area_atuacao"] = df_candidatos["area_atuacao"].fillna("")
df_candidatos["certificacoes"] = df_candidatos["certificacoes"].fillna("")


In [11]:

# === TF-IDF para campos multivalorados separados por vírgula
tfidf_area = TfidfVectorizer(tokenizer=lambda x: x.split(","), lowercase=True)
tfidf_cert = TfidfVectorizer(tokenizer=lambda x: x.split(","), lowercase=True)


In [12]:

tfidf_area_matrix = tfidf_area.fit_transform(df_candidatos["area_atuacao"])
tfidf_cert_matrix = tfidf_cert.fit_transform(df_candidatos["certificacoes"])




In [13]:

# === OneHot Encoding para níveis de idioma e formação
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
onehot_matrix = encoder.fit_transform(df_candidatos[[
    "nivel_academico", "nivel_espanhol", "nivel_ingles"
]].fillna(""))




In [14]:

# === Combinar todas as variáveis em uma matriz final
final_matrix = hstack([
    tfidf_area_matrix,
    tfidf_cert_matrix,
    onehot_matrix,
    np.array(df_candidatos[[
        "dias_desde_atualizacao", "dias_ativo",
        "tem_outras_certificacoes", "tem_outro_idioma"
    ]])
])


In [15]:

# === Criar DataFrame final com nomes das colunas
col_names = (
    [f"area_{feat}" for feat in tfidf_area.get_feature_names_out()] +
    [f"cert_{feat}" for feat in tfidf_cert.get_feature_names_out()] +
    list(encoder.get_feature_names_out([
        "nivel_academico", "nivel_espanhol", "nivel_ingles"
    ])) +
    ["dias_desde_atualizacao", "dias_ativo",
     "tem_outras_certificacoes", "tem_outro_idioma"]
)



In [20]:
df_cluster_input = pd.DataFrame(final_matrix.toarray(), columns=col_names)

In [21]:
df_cluster_input.head()

Unnamed: 0,area_,area_ comercial,area_ financeira/controladoria,area_ gestão e alocação de recursos de ti,area_ jurídica,area_ marketing,area_ novos negócios e parcerias,area_ qualidade corporativa,area_ recursos humanos,area_ relacionamento técnico,...,nivel_ingles_,nivel_ingles_Avançado,nivel_ingles_Básico,nivel_ingles_Fluente,nivel_ingles_Intermediário,nivel_ingles_Nenhum,dias_desde_atualizacao,dias_ativo,tem_outras_certificacoes,tem_outro_idioma
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1372.0,1372.0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1341.0,1372.0,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1372.0,1372.0,1.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1372.0,1372.0,1.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1372.0,1372.0,1.0,1.0


## 04_Salvando_base_no_git

In [22]:
Caminho = "C:\\Users\\pedro\\Documents\\Área de Trabalho\\tech_challenge_5\\01_fontes\\arquivos_decision\\fontes_tratadas\\"

In [23]:
# === (Opcional) salvar a base tratada
df_cluster_input.to_parquet(Caminho + "02_cluster_input.parquet", index=False)