# 🧹 Tratamento dos Dados Complementares

Tratamento e preparação dos dados dos arquivos `salaries.csv` e `ai_job_market_insights.csv`.

In [1]:

import pandas as pd
import numpy as np

# Leitura dos arquivos
df_salaries = pd.read_csv('salaries.csv')
df_market = pd.read_csv('ai_job_market_insights.csv')

df_salaries.head()


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2025,SE,FT,Data Product Owner,170000,USD,170000,US,0,US,M
1,2025,SE,FT,Data Product Owner,110000,USD,110000,US,0,US,M
2,2025,SE,FT,Data Product Owner,170000,USD,170000,US,0,US,M
3,2025,SE,FT,Data Product Owner,110000,USD,110000,US,0,US,M
4,2025,SE,FT,Engineer,143000,USD,143000,US,0,US,M


## 1. Tratamento do arquivo `salaries.csv`

In [2]:

# Verificando valores ausentes
df_salaries.isnull().sum()


Unnamed: 0,0
work_year,0
experience_level,0
employment_type,0
job_title,0
salary,0
salary_currency,0
salary_in_usd,0
employee_residence,0
remote_ratio,0
company_location,0


In [3]:

# Garantir tipos corretos
df_salaries['work_year'] = df_salaries['work_year'].astype(int)

# Criar coluna de tipo de trabalho a partir do remote_ratio
df_salaries['tipo_trabalho'] = df_salaries['remote_ratio'].apply(
    lambda x: 'Remoto' if x == 100 else ('Presencial' if x == 0 else 'Híbrido'))

# Mapeamento de nível de experiência
mapa_exp = {'EN': 'Iniciante', 'MI': 'Pleno', 'SE': 'Sênior', 'EX': 'Executivo'}
df_salaries['nivel_experiencia_label'] = df_salaries['experience_level'].map(mapa_exp)

# Criar coluna de salário anual estimado
df_salaries['salario_mensal_estimado'] = df_salaries['salary_in_usd'] / 12


## 2. Tratamento do arquivo `ai_job_market_insights.csv`

In [4]:

# Verificando valores ausentes
df_market.isnull().sum()


Unnamed: 0,0
Job_Title,0
Industry,0
Company_Size,0
Location,0
AI_Adoption_Level,0
Automation_Risk,0
Required_Skills,0
Salary_USD,0
Remote_Friendly,0
Job_Growth_Projection,0


In [7]:

# Preencher valores nulos
df_market['Required_Skills'] = df_market['Required_Skills'].fillna('Não informado')
df_market['Job_Growth_Projection'] = df_market['Job_Growth_Projection'].fillna('Não informado')


df_market['Automation_Risk'] = pd.to_numeric(df_market['Automation_Risk'], errors='coerce')

df_market['Risco_Automacao_Label'] = pd.cut(
    df_market['Automation_Risk'],
    bins=[0, 0.3, 0.7, 1.0],
    labels=['Baixo', 'Médio', 'Alto']
)



## 3. Conferência final

In [8]:

df_salaries.head()


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,tipo_trabalho,nivel_experiencia_label,salario_mensal_estimado
0,2025,SE,FT,Data Product Owner,170000,USD,170000,US,0,US,M,Presencial,Sênior,14166.666667
1,2025,SE,FT,Data Product Owner,110000,USD,110000,US,0,US,M,Presencial,Sênior,9166.666667
2,2025,SE,FT,Data Product Owner,170000,USD,170000,US,0,US,M,Presencial,Sênior,14166.666667
3,2025,SE,FT,Data Product Owner,110000,USD,110000,US,0,US,M,Presencial,Sênior,9166.666667
4,2025,SE,FT,Engineer,143000,USD,143000,US,0,US,M,Presencial,Sênior,11916.666667


In [9]:

df_market.head()


Unnamed: 0,Job_Title,Industry,Company_Size,Location,AI_Adoption_Level,Automation_Risk,Required_Skills,Salary_USD,Remote_Friendly,Job_Growth_Projection,Risco_Automacao_Label
0,Cybersecurity Analyst,Entertainment,Small,Dubai,Medium,,UX/UI Design,111392.165243,Yes,Growth,
1,Marketing Specialist,Technology,Large,Singapore,Medium,,Marketing,93792.562466,No,Decline,
2,AI Researcher,Technology,Large,Singapore,Medium,,UX/UI Design,107170.263069,Yes,Growth,
3,Sales Manager,Retail,Small,Berlin,Low,,Project Management,93027.953758,No,Growth,
4,Cybersecurity Analyst,Entertainment,Small,Tokyo,Low,,JavaScript,87752.922171,Yes,Decline,


In [12]:
from google.colab import files

df_salaries.to_csv('salaries_limpo.csv', index=False)
files.download('salaries_limpo.csv')




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
from google.colab import files
import time

df_salaries.to_csv('salaries_limpo.csv', index=False)

# Esperar um momento antes de iniciar o download
time.sleep(2)
files.download('salaries_limpo.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>