In [29]:
# Pacote pandas - Documentação: https://pandas.pydata.org/pandas-docs/stable/index.html
import pandas as pd

# Pacote nltk - Documentação: https://www.nltk.org/
import nltk
nltk.download('punkt')
import nltk
nltk.download('stopwords')
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter

# Pacote re - Documentação: https://docs.python.org/3/library/re.html
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gusta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gusta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Carga de Dados

In [2]:
dados = pd.read_csv('Dados/postings.csv')

## Análise Exploratória

In [3]:
dados.head()

Unnamed: 0,job_title,company,job_location,job_link,first_seen,search_city,search_country,job level,job_type,job_summary,job_skills
0,Data Engineer 2,Cook Medical,"Bloomington, IN",https://www.linkedin.com/jobs/view/data-engine...,2023-12-17,Bloomington,United States,Mid senior,Onsite,"Overview\nThe Data Engineer develops, implemen...","Azure, SQL, NoSQL, SQL Server, Oracle, MongoDB..."
1,Staff Data Engineer,Recruiting from Scratch,"Bloomington, IN",https://www.linkedin.com/jobs/view/staff-data-...,2023-12-17,Bloomington,United States,Mid senior,Onsite,This is for a client of Recruiting from Scratc...,"Python, Snowflake, Airflow, Kubernetes, Docker..."
2,"Senior Data Engineer, Public Company",Recruiting from Scratch,"Bloomington, IN",https://www.linkedin.com/jobs/view/senior-data...,2023-12-17,Bloomington,United States,Mid senior,Onsite,This is for a client of Recruiting from Scratc...,"Python, SQL, Snowflake, Airflow, Kubernetes, D..."
3,"Senior Data Engineer, Public Company",Recruiting from Scratch,"Bloomington, IN",https://www.linkedin.com/jobs/view/senior-data...,2023-12-17,Bloomington,United States,Mid senior,Onsite,This is for a client of Recruiting from Scratc...,"TDD, Automation, Continuous delivery, Data eng..."
4,"Senior Systems Engineer, Azure Data Platform",Cook Medical,"Bloomington, IN",https://www.linkedin.com/jobs/view/senior-syst...,2023-12-17,Bloomington,United States,Mid senior,Hybrid,Overview\nWe are seeking a talented Azure Clou...,


## Tranformação e Limpeza

In [4]:
# Tratamento da coluna "job_skills"
# Aqui eu crio um dataframe com o nome da skill e um contador de quantas vezes ela aparece.
skills_list = []
skills_count = {}
for obs in dados['job_skills']:
    if type(obs) == str:
        aux = obs.split(',')
        for skill in aux:
            if skill not in skills_list:
                skills_list.append(skill)
                skills_count[skill] = 1  
            else:
                skills_count[skill] += 1
                
print(len(skills_list))
df_skills = pd.DataFrame(list(skills_count.items()), columns=['Job Skill', 'Count'])
df_skills.head()

18990


Unnamed: 0,Job Skill,Count
0,Azure,92
1,SQL,2692
2,NoSQL,378
3,SQL Server,174
4,Oracle,252


In [32]:
# Processamento de linguagem natural (NLP)

# Armazenar coluna 'job_title' em um novo dataframe 'df_titles'
df_titles = dados['job_title']

# Converter para minúsculas
df_titles = df_titles.str.lower()

# Remover caracteres indesejados
def clean_text(text):
    cleaned_text = re.sub(r'[-,()]', '', text)
    return cleaned_text

df_titles = df_titles.apply(clean_text)

# Tokenização
df_titles = df_titles.apply(nltk.word_tokenize)

# Remoção de Stop Words
stop_words = set(stopwords.words('english'))
df_titles = df_titles.apply(lambda x: [word for word in x if word not in stop_words])

# Stemming
stemmer = PorterStemmer()
df_titles = df_titles.apply(lambda x: [stemmer.stem(word) for word in x])

# Contagem de palavras com mais frequência
flattened_titles = [token for title in df_titles for token in title]
word_frequency = Counter(flattened_titles)
most_common_jobs = word_frequency.most_common(10)

# Visualização de palavras com mais frequência
print(most_common_jobs)

[('data', 5781), ('engin', 4064), ('senior', 1845), ('analyst', 1330), ('lead', 372), ('remot', 341), ('staff', 322), ('sr.', 297), ('databas', 230), ('scientist', 208)]


In [6]:
# Converter dados para CSV
# dados é transformado em CSV "newPostings" para manter o index
dados.to_csv('Dados/newPostings.csv')

# Transformar o df_skills em arquivo CSV.
df_skills.to_csv('Dados/Jobs.csv')