# Bibliotecas necessárias

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pycountry

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 250)
pd.set_option("display.precision", 3)
pd.set_option("expand_frame_repr", False)

df = pd.read_csv("ds_salaries.csv", index_col=0)

In [2]:
def check_data(df):
    print(f"Infos:\n{df.info()}\n")

    for col in df.columns.values:
        print(f"Coluna {col}:\n{df.loc[:,col].unique()}\n")

    print(f"Primeiras instâncias:\n{df.head()}")

check_data(df)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 607 entries, 0 to 606
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           607 non-null    int64 
 1   experience_level    607 non-null    object
 2   employment_type     607 non-null    object
 3   job_title           607 non-null    object
 4   salary              607 non-null    int64 
 5   salary_currency     607 non-null    object
 6   salary_in_usd       607 non-null    int64 
 7   employee_residence  607 non-null    object
 8   remote_ratio        607 non-null    int64 
 9   company_location    607 non-null    object
 10  company_size        607 non-null    object
dtypes: int64(4), object(7)
memory usage: 56.9+ KB
Infos:
None

Coluna work_year:
[2020 2021 2022]

Coluna experience_level:
['MI' 'SE' 'EN' 'EX']

Coluna employment_type:
['FT' 'CT' 'PT' 'FL']

Coluna job_title:
['Data Scientist' 'Machine Learning Scientist' 'Big Data E

# Preparação dos dados para análise

Algumas colunas possuem valores abreviados através de siglas, dificultando o entendimento e análise dos dados. Então vamos expandir as siglas para seu texto original, visando facilitar a nossa análise.

In [3]:
def expand_experience_level(acronym):
    if acronym == "EN":
        return "Entry-level / Junior"
    
    if acronym == "MI":
        return "Mid-level / Intermediate"
    
    if acronym == "SE":
        return "Senior-level / Expert"

    if acronym == "EX":
        return "Executive-level / Director"

    return None

def expand_employment_type(acronym):
    if acronym == "PT":
        return "Part-time"
    
    if acronym == "FT":
        return "Full-time"
    
    if acronym == "CT":
        return "Contract"
    
    if acronym == "FL":
        return "Freelance"
    
    return None

def expand_country(acronym):
    country = pycountry.countries.get(alpha_2=acronym)

    if country and hasattr(country, "name"):
        return country.name
    
    return None

def expand_currency(acronym):
    currency = pycountry.currencies.get(alpha_3=acronym)

    if currency and hasattr(currency, "name"):
        return currency
    
    return None

def expand_company_size(acronym):
    if acronym == "s":
        return "Small"
    
    if acronym == "M":
        return "Medium"
    
    if acronym == "L":
        return "Large"
    
    return None

def expand_acronyms(df):
    df = df.copy()
    df["experience_level"] = df["experience_level"].map(expand_experience_level)
    df["employment_type"] = df["employment_type"].map(expand_employment_type)
    df["salary_currency"] = df["salary_currency"].map(expand_currency)
    df["employee_residence"] = df["employee_residence"].map(expand_country)
    df["company_location"] = df["company_location"].map(expand_country)
    df["company_size"] = df["company_size"].map(expand_company_size)

    return df

df_exp = expand_acronyms(df)

In [4]:
df_exp.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,Mid-level / Intermediate,Full-time,Data Scientist,70000,"Currency(alpha_3='EUR', name='Euro', numeric='...",79833,Germany,0,Germany,Large
1,2020,Senior-level / Expert,Full-time,Machine Learning Scientist,260000,"Currency(alpha_3='USD', name='US Dollar', nume...",260000,Japan,0,Japan,
2,2020,Senior-level / Expert,Full-time,Big Data Engineer,85000,"Currency(alpha_3='GBP', name='Pound Sterling',...",109024,United Kingdom,50,United Kingdom,Medium
3,2020,Mid-level / Intermediate,Full-time,Product Data Analyst,20000,"Currency(alpha_3='USD', name='US Dollar', nume...",20000,Honduras,0,Honduras,
4,2020,Senior-level / Expert,Full-time,Machine Learning Engineer,150000,"Currency(alpha_3='USD', name='US Dollar', nume...",150000,United States,50,United States,Large
