# 🧹 Tratamento e Preparação dos Dados sobre Mercado de Trabalho em AI

In [1]:

import pandas as pd
import numpy as np

# Leitura dos dados
df_jobs = pd.read_csv('ai_job_dataset.csv')

# Exibir primeiras linhas
df_jobs.head()


Unnamed: 0,job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,required_skills,education_required,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name
0,AI00001,AI Research Scientist,90376,USD,SE,CT,China,M,China,50,"Tableau, PyTorch, Kubernetes, Linux, NLP",Bachelor,9,Automotive,2024-10-18,2024-11-07,1076,5.9,Smart Analytics
1,AI00002,AI Software Engineer,61895,USD,EN,CT,Canada,M,Ireland,100,"Deep Learning, AWS, Mathematics, Python, Docker",Master,1,Media,2024-11-20,2025-01-11,1268,5.2,TechCorp Inc
2,AI00003,AI Specialist,152626,USD,MI,FL,Switzerland,L,South Korea,0,"Kubernetes, Deep Learning, Java, Hadoop, NLP",Associate,2,Education,2025-03-18,2025-04-07,1974,9.4,Autonomous Tech
3,AI00004,NLP Engineer,80215,USD,SE,FL,India,M,India,50,"Scala, SQL, Linux, Python",PhD,7,Consulting,2024-12-23,2025-02-24,1345,8.6,Future Systems
4,AI00005,AI Consultant,54624,EUR,EN,PT,France,S,Singapore,100,"MLOps, Java, Tableau, Python",Master,0,Media,2025-04-15,2025-06-23,1989,6.6,Advanced Robotics


## 1. Verificação e tratamento de valores ausentes

In [2]:

# Contagem de valores nulos
df_jobs.isnull().sum()


Unnamed: 0,0
job_id,0
job_title,0
salary_usd,0
salary_currency,0
experience_level,0
employment_type,0
company_location,0
company_size,0
employee_residence,0
remote_ratio,0


In [3]:

# Preenchimento de colunas numéricas com média
df_jobs['years_experience'].fillna(df_jobs['years_experience'].mean(), inplace=True)
df_jobs['benefits_score'].fillna(0, inplace=True)

# Preenchimento de texto com 'Não informado'
df_jobs['required_skills'].fillna('Não informado', inplace=True)
df_jobs['education_required'].fillna('Não informado', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_jobs['years_experience'].fillna(df_jobs['years_experience'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_jobs['benefits_score'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedi

## 2. Padronização de datas

In [4]:

df_jobs['posting_date'] = pd.to_datetime(df_jobs['posting_date'], errors='coerce')
df_jobs['application_deadline'] = pd.to_datetime(df_jobs['application_deadline'], errors='coerce')


## 3. Criação de colunas derivadas

In [5]:

# Salário por ano de experiência
df_jobs['salario_por_ano_experiencia'] = df_jobs['salary_usd'] / df_jobs['years_experience'].replace(0, np.nan)

# Classificação do tipo de trabalho
df_jobs['tipo_trabalho'] = df_jobs['remote_ratio'].apply(
    lambda x: 'Remoto' if x == 100 else ('Presencial' if x == 0 else 'Híbrido'))

# Nível de experiência mais amigável
mapa_experiencia = {
    'EN': 'Iniciante',
    'MI': 'Pleno',
    'SE': 'Sênior',
    'EX': 'Executivo'
}
df_jobs['nivel_experiencia_label'] = df_jobs['experience_level'].map(mapa_experiencia)


## 4. Conferência final dos dados tratados

In [6]:
df_jobs.head()

Unnamed: 0,job_id,job_title,salary_usd,salary_currency,experience_level,employment_type,company_location,company_size,employee_residence,remote_ratio,...,years_experience,industry,posting_date,application_deadline,job_description_length,benefits_score,company_name,salario_por_ano_experiencia,tipo_trabalho,nivel_experiencia_label
0,AI00001,AI Research Scientist,90376,USD,SE,CT,China,M,China,50,...,9,Automotive,2024-10-18,2024-11-07,1076,5.9,Smart Analytics,10041.777778,Híbrido,Sênior
1,AI00002,AI Software Engineer,61895,USD,EN,CT,Canada,M,Ireland,100,...,1,Media,2024-11-20,2025-01-11,1268,5.2,TechCorp Inc,61895.0,Remoto,Iniciante
2,AI00003,AI Specialist,152626,USD,MI,FL,Switzerland,L,South Korea,0,...,2,Education,2025-03-18,2025-04-07,1974,9.4,Autonomous Tech,76313.0,Presencial,Pleno
3,AI00004,NLP Engineer,80215,USD,SE,FL,India,M,India,50,...,7,Consulting,2024-12-23,2025-02-24,1345,8.6,Future Systems,11459.285714,Híbrido,Sênior
4,AI00005,AI Consultant,54624,EUR,EN,PT,France,S,Singapore,100,...,0,Media,2025-04-15,2025-06-23,1989,6.6,Advanced Robotics,,Remoto,Iniciante


In [7]:
# Salvar
df_jobs.to_csv('ai_job_dataset_limpo.csv', index=False)

# Baixar
from google.colab import files
files.download('ai_job_dataset_limpo.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>