In [10]:
import fitz  # PyMuPDF
import re


def extract_text_from_pdf(pdf_path):
    # Abrir o documento PDF
    document = fitz.open(pdf_path)
    
    # Extração de texto de todas as páginas
    text = ""
    for page_num in range(len(document)):
        page = document.load_page(page_num)
        text += page.get_text("text")

    text = text.replace('\n', ' ')
    return text

def extract_skills(resume_text):
    # Procurar a seção "Skills" usando regex, incluindo variações como "Technical Skills"
    skills_pattern = re.compile(r'\bSkills\b(.*?)(?=\b[A-Z][a-z]*:|\Z)', re.DOTALL | re.IGNORECASE)
    skills_match = skills_pattern.search(resume_text)
    
    if skills_match:
        # Extrair as habilidades e separar por vírgulas
        skills_text = skills_match.group(1).strip()
        
        # Manter apenas as habilidades separadas por vírgulas ou novas linhas
        skills_list = re.split(r',\s*|\n', skills_text)
        
        # Filtrar itens vazios e remover espaços em branco adicionais
        skills_list = [skill.strip() for skill in skills_list if skill.strip()]

        skills_list = ' '.join(skills_list)
        
        return skills_list
    else:
        return ''

In [5]:
import pandas as pd
import os
import pdfplumber

In [17]:
def process_pdfs_in_folders(base_folder_path):
    data = []
    
    # Percorrer todas as subpastas e arquivos
    for root, dirs, files in os.walk(base_folder_path):
        for filename in files:
            if filename.endswith(".pdf"):
                file_path = os.path.join(root, filename)
                folder_name = os.path.basename(root)
                text = extract_text_from_pdf(file_path)
                skills = extract_skills(text)
                data.append({"text": text, "skills": skills, "job": folder_name, })
    
    return data


In [18]:
base_folder_path = "."

# Processar PDFs e coletar os dados
data = process_pdfs_in_folders(base_folder_path)

# Criar um DataFrame
df = pd.DataFrame(data)

In [19]:
df.to_csv("output.csv", index=False, encoding='utf-8')

In [20]:
df

Unnamed: 0,text,skills,job
0,DIRECTOR OF BUSINESS DEVELOPMENT Highlights Wo...,and extensive network by seeking new and expan...,BUSINESS-DEVELOPMENT
1,BUSINESS DEVELOPMENT REPRESENTATIVE Summary De...,FCA Kain Automotive training (3 steps to digit...,BUSINESS-DEVELOPMENT
2,,,BUSINESS-DEVELOPMENT
3,BUSINESS DEVELOPMENT REP Summary Results-orien...,competency versatility attention to detail as ...,BUSINESS-DEVELOPMENT
4,BUSINESS DEVELOPMENT MANAGER/PROGRAM DIRECTOR ...,.Complex project negotiator.International busi...,BUSINESS-DEVELOPMENT
...,...,...,...
2479,AS K-12 PRINCIPAL Professional Summary Committ...,and 10 years of experience. Builds trusting au...,ARTS
2480,BILINGUAL LANGUAGE ARTS SIXTH GRADE TEACHER Su...,and ability to partner across departments with...,ARTS
2481,CREATIVE ASSISTANT Professional Summary Self-m...,consistent quality work and a drive to innovat...,ARTS
2482,"DIRECTOR OF THEATER Highlights Edline, Google ...",academic photo Photoshop Approach arts basic b...,ARTS


In [30]:
# IMPORTAÇÃO DE BIBLIOTECAS
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import spacy
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
# Carregar o modelo spaCy para português
nlp = spacy.load('en_core_web_md')

# FUNÇÃO DE PRÉ-PROCESSAMENTO
def pre_processamento(texto):
    # Processar o texto com spaCy
    doc = nlp(texto)
    
    # Filtrar tokens: remover stopwords, pontuações e tokens que não são letras
    tokens = [token for token in doc if not token.is_stop and not token.is_punct and token.is_alpha]
    
    # Lematizar os tokens filtrados
    lematizados = [token.lemma_.lower() for token in tokens]
    
    # Juntar os tokens lematizados em um único texto
    texto_limpo = " ".join(lematizados)
    
    return texto_limpo

In [33]:
df.text = df.text.apply(pre_processamento)

In [34]:
df

Unnamed: 0,text,skills,job
0,director business development highlight word e...,and extensive network by seeking new and expan...,BUSINESS-DEVELOPMENT
1,business development representative summary de...,FCA Kain Automotive training (3 steps to digit...,BUSINESS-DEVELOPMENT
2,,,BUSINESS-DEVELOPMENT
3,business development rep summary ambitious mar...,competency versatility attention to detail as ...,BUSINESS-DEVELOPMENT
4,business development manager program director ...,.Complex project negotiator.International busi...,BUSINESS-DEVELOPMENT
...,...,...,...
2479,principal professional summary commit passiona...,and 10 years of experience. Builds trusting au...,ARTS
2480,bilingual language art sixth grade teacher sum...,and ability to partner across departments with...,ARTS
2481,creative assistant professional summary indivi...,consistent quality work and a drive to innovat...,ARTS
2482,director theater highlight edline google class...,academic photo Photoshop Approach arts basic b...,ARTS


In [35]:
df.skills = df.skills.apply(pre_processamento)

In [36]:
df

Unnamed: 0,text,skills,job
0,director business development highlight word e...,extensive network seek new expand opportunity ...,BUSINESS-DEVELOPMENT
1,business development representative summary de...,fca kain automotive training step digital succ...,BUSINESS-DEVELOPMENT
2,,,BUSINESS-DEVELOPMENT
3,business development rep summary ambitious mar...,competency versatility attention detail superv...,BUSINESS-DEVELOPMENT
4,business development manager program director ...,project negotiator international business mark...,BUSINESS-DEVELOPMENT
...,...,...,...
2479,principal professional summary commit passiona...,year experience builds trust authentic relatio...,ARTS
2480,bilingual language art sixth grade teacher sum...,ability partner department outside organizatio...,ARTS
2481,creative assistant professional summary indivi...,consistent quality work drive innovate benefit...,ARTS
2482,director theater highlight edline google class...,academic photo photoshop approach art basic br...,ARTS


In [44]:
import yake

def extract_top_keywords(text, language="en", max_ngram_size=2, deduplication_threshold=0.1, deduplication_algo='seqm', window_size=1, num_of_keywords=40):
    if not isinstance(text, str):
        raise ValueError("Input must be a string")

    try:
        custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=window_size, top=num_of_keywords, features=None)
    except ModuleNotFoundError:
        raise ModuleNotFoundError("YAKE library not installed")

    keywords = custom_kw_extractor.extract_keywords(text)

    top_keywords = [kw[0] for kw in keywords]

    return top_keywords

In [45]:
top_keywords_text = df.text.apply(extract_top_keywords)

In [47]:
df["top_keywords_text"] = top_keywords_text

In [48]:
df

Unnamed: 0,text,skills,job,top_keywords_text
0,director business development highlight word e...,extensive network seek new expand opportunity ...,BUSINESS-DEVELOPMENT,"[high praise, praise summa, company city, deve..."
1,business development representative summary de...,fca kain automotive training step digital succ...,BUSINESS-DEVELOPMENT,"[city state, service work, new employee, good ..."
2,,,BUSINESS-DEVELOPMENT,[]
3,business development rep summary ambitious mar...,competency versatility attention detail superv...,BUSINESS-DEVELOPMENT,"[business development, marketing business, dev..."
4,business development manager program director ...,project negotiator international business mark...,BUSINESS-DEVELOPMENT,"[city state, summa cum, abuse gang, new exist,..."
...,...,...,...,...
2479,principal professional summary commit passiona...,year experience builds trust authentic relatio...,ARTS,"[city state, governors university, middle high..."
2480,bilingual language art sixth grade teacher sum...,ability partner department outside organizatio...,ARTS,"[company city, grade teacher, atpe language, i..."
2481,creative assistant professional summary indivi...,consistent quality work drive innovate benefit...,ARTS,"[city state, student body, affair high, liaiso..."
2482,director theater highlight edline google class...,academic photo photoshop approach art basic br...,ARTS,"[city state, perform art, work individual, sch..."


In [49]:
df.to_csv("output2.csv", index=False, encoding='utf-8')

In [50]:
# FUNÇÃO DE CARREGAR AS PALAVRAS CHAVES
def extract_keywords_spacy(text, num_keywords=10):
    # Processar o texto com spaCy
    doc = nlp(text)
    
    # Extrair tokens relevantes: substantivos, adjetivos, verbos e advérbios
    tokens = [token.text for token in doc if token.pos_ in ['NOUN', 'ADJ', 'VERB', 'ADV'] and not token.is_stop]
    
    # Usar TfidfVectorizer para encontrar os termos mais importantes
    vectorizer = TfidfVectorizer(max_features=num_keywords)
    tfidf_matrix = vectorizer.fit_transform([" ".join(tokens)])
    
    # Obter os nomes das características (palavras)
    feature_names = vectorizer.get_feature_names_out()
    
    # Obter as pontuações TF-IDF para cada termo
    tfidf_scores = tfidf_matrix.toarray().flatten()
    
    # Criar um dicionário de termos e suas pontuações
    keywords_scores = dict(zip(feature_names, tfidf_scores))
    
    # Classificar os termos pelo valor TF-IDF e selecionar os mais importantes
    sorted_keywords = Counter(keywords_scores).most_common(num_keywords)
    
    # Extrair apenas as palavras-chave (ignorando as pontuações)
    keywords = [keyword for keyword, score in sorted_keywords]
    
    return keywords

In [51]:
top_keywords_skills = df.skills.apply(extract_top_keywords)

In [52]:
df["top_keywords_skills"] = top_keywords_skills

In [53]:
df

Unnamed: 0,text,skills,job,top_keywords_text,top_keywords_skills
0,director business development highlight word e...,extensive network seek new expand opportunity ...,BUSINESS-DEVELOPMENT,"[high praise, praise summa, company city, deve...","[executive director, work closely, new, camp, ..."
1,business development representative summary de...,fca kain automotive training step digital succ...,BUSINESS-DEVELOPMENT,"[city state, service work, new employee, good ...","[city state, manner oversaw, employeeâ crm, go..."
2,,,BUSINESS-DEVELOPMENT,[],[]
3,business development rep summary ambitious mar...,competency versatility attention detail superv...,BUSINESS-DEVELOPMENT,"[business development, marketing business, dev...","[city state, trend clear, service met, spyfu j..."
4,business development manager program director ...,project negotiator international business mark...,BUSINESS-DEVELOPMENT,"[city state, summa cum, abuse gang, new exist,...","[city state, new exist, march april, identify ..."
...,...,...,...,...,...
2479,principal professional summary commit passiona...,year experience builds trust authentic relatio...,ARTS,"[city state, governors university, middle high...","[city state, governors university, middle high..."
2480,bilingual language art sixth grade teacher sum...,ability partner department outside organizatio...,ARTS,"[company city, grade teacher, atpe language, i...","[ability partner, innovative lesson, new educa..."
2481,creative assistant professional summary indivi...,consistent quality work drive innovate benefit...,ARTS,"[city state, student body, affair high, liaiso...","[city state, label publish, student body, resu..."
2482,director theater highlight edline google class...,academic photo photoshop approach art basic br...,ARTS,"[city state, perform art, work individual, sch...","[individual student, student recognize, perfor..."


In [54]:
df.to_csv("output2.csv", index=False, encoding='utf-8')