In [42]:
# Importando o pandas e o altair
import pandas as pd
import altair as alt

%load_ext lab_black

In [3]:
# Load the salaries dataset. The path is relative, so it is resolved against
# the kernel's current working directory.
DATASET_PATH = "ds_salaries.csv"
df = pd.read_csv(DATASET_PATH)

In [4]:
# Drop the leftover "Unnamed: 0" column (presumably a row index that was saved
# into the CSV -- TODO confirm against the file).
# Rebinding `df` with errors="ignore" instead of using inplace=True keeps this
# cell re-runnable: a second execution no longer raises KeyError once the
# column is already gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")
df.head()

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [178]:
# Line chart of the total salary (in USD) per work year.

# Aggregate only the salary column. A bare `.sum()` over the whole frame also
# aggregates the text columns on recent pandas (string concatenation), which is
# meaningless here and bloats the data embedded in the chart.
salary_by_year = (
    df.groupby("work_year")["salary_in_usd"]
    .sum()
    .reset_index()
    .sort_values("work_year", ascending=False)
)

alt.Chart(salary_by_year).mark_line().encode(
    y="salary_in_usd", x="work_year:N"
).properties(width=800, height=500)

In [56]:
# Bar chart of the 10 job titles with the highest mean salary (in USD).

# Average only the salary column: calling `.mean()` on every column raises a
# TypeError on pandas >= 2.0 because the frame also contains text columns.
top_paid_titles = (
    df.groupby("job_title")["salary_in_usd"]
    .mean()
    .reset_index()
    .sort_values("salary_in_usd", ascending=False)
    .head(10)
)

# sort="-y" orders the bars by descending bar height.
alt.Chart(top_paid_titles).mark_bar(color="green").encode(
    alt.X("job_title", sort="-y"), y="salary_in_usd"
).properties(width=800, height=500)

In [146]:
# Horizontal bar chart of the 10 job titles with the highest mean salary (USD).

# Average only the salary column: calling `.mean()` on every column raises a
# TypeError on pandas >= 2.0 because the frame also contains text columns.
top_paid_titles = (
    df.groupby("job_title")["salary_in_usd"]
    .mean()
    .reset_index()
    .sort_values("salary_in_usd", ascending=False)
    .head(10)
)

# sort="-x" orders the bars by descending bar length.
alt.Chart(top_paid_titles).mark_bar(color="green").encode(
    alt.Y("job_title", sort="-x"), x="salary_in_usd"
).properties(width=800, height=500)

In [159]:
# Histogram of salaries (in USD) for rows whose company_size is "M",
# using bins that are 25,000 wide.

size_m_rows = df.loc[df["company_size"].eq("M")]
salary_bins = alt.X("salary_in_usd:Q", bin=alt.Bin(step=25000))

alt.Chart(size_m_rows).mark_bar().encode(salary_bins, y="count()").properties(
    width=1000, height=500
)

In [165]:
# Histogram of salaries for company_size "M", overlaid with vertical rules at
# the mean (red) and the median (yellow) salary.

# One shared base chart so the three layers are built from the same filtered rows.
base_m = alt.Chart(df.loc[df["company_size"].eq("M")])

hist = (
    base_m.mark_bar()
    .encode(x=alt.X("salary_in_usd:Q", bin=alt.Bin(step=25000)), y="count()")
    .properties(width=1000, height=500)
)

# size=alt.value(4) sets a constant rule thickness.
media = base_m.mark_rule(color="red").encode(
    x="mean(salary_in_usd):Q", size=alt.value(4)
)

mediana = base_m.mark_rule(color="yellow").encode(
    x="median(salary_in_usd):Q", size=alt.value(4)
)

hist + media + mediana

In [171]:
# Box plot of salary (in USD) for each employment type.

employment_axis = alt.X("employment_type", type="ordinal")
salary_axis = alt.Y("salary_in_usd", type="quantitative")

alt.Chart(df).mark_boxplot(size=100).encode(
    x=employment_axis, y=salary_axis
).properties(width=1000, height=600)

In [145]:
# Scatter plot of salaries restricted to the 'Data Scientist' job title.

# Cast remote_ratio to text on the shared frame; the colour sort order below
# is written with string values ("0", "50", "100").
df["remote_ratio"] = df["remote_ratio"].astype(str)

# Two reset_index() calls on purpose: the first moves the original row labels
# into an "index" column, the second adds a fresh 0..n-1 counter that pandas
# names "level_0" because "index" is already taken. "level_0" is the x position.
data_scientists = df[df["job_title"] == "Data Scientist"].reset_index().reset_index()

sample_axis = alt.X(
    "level_0",
    axis=alt.Axis(title="Amostras"),
    scale=alt.Scale(domain=[0, 145], nice=False),
)
salary_axis = alt.Y(
    "salary_in_usd", axis=alt.Axis(format="$", title="Salário em dolar")
)
remote_colour = alt.Color(
    "remote_ratio",
    scale=alt.Scale(scheme="dark2"),
    sort=["0", "50", "100"],
    legend=alt.Legend(
        title="Trabalho remoto", orient="left", labelFontSize=10, titleFontSize=15
    ),
)

alt.Chart(data_scientists).mark_point(filled=True, size=100).encode(
    sample_axis, salary_axis, remote_colour
).properties(width=1000, height=600)

In [273]:
# Interactive scatter plot of 'Data Scientist' salaries: the same encodings as
# a plain scatter, plus tooltips and pan/zoom via .interactive().

# Build the plotting frame locally instead of relying on some other cell having
# already cast remote_ratio to text on the shared `df`. astype(str) leaves
# values that are already strings unchanged, so this works in either state.
# The two reset_index() calls create a 0..n-1 "level_0" column (pandas picks
# that name because "index" is already taken) that is used as the x position.
data_scientists = (
    df.query("job_title == 'Data Scientist'")
    .assign(remote_ratio=lambda frame: frame["remote_ratio"].astype(str))
    .reset_index()
    .reset_index()
)

alt.Chart(data_scientists).mark_point(filled=True, size=100).encode(
    alt.X(
        "level_0",
        axis=alt.Axis(title="Amostras"),
        scale=alt.Scale(domain=[0, 145], nice=False),
    ),
    alt.Y("salary_in_usd", axis=alt.Axis(format="$", title="Salário em dolar")),
    alt.Color(
        "remote_ratio",
        scale=alt.Scale(scheme="dark2"),
        # String values: they must match the text-cast remote_ratio column.
        sort=["0", "50", "100"],
        legend=alt.Legend(
            title="Trabalho remoto", orient="left", labelFontSize=10, titleFontSize=15
        ),
    ),
    tooltip=[alt.Tooltip("experience_level:N"), alt.Tooltip("employment_type:N")],
).properties(
    width=1000, height=600
).interactive()

In [267]:
# Two rows of faceted salary histograms (by remote ratio and by company size)
# that share a single work-year dropdown filter.

# Plain, sorted Python ints: the dropdown is ordered and the options serialise
# without depending on how numpy scalars are converted.
year_options = sorted(df["work_year"].unique().tolist())

select_year = alt.selection_single(
    name="Select",
    fields=["work_year"],
    bind=alt.binding_select(options=year_options),
)

# Cast remote_ratio to text locally so the string sort order used below works
# whether or not another cell has already cast it on the shared `df`.
salaries = df.assign(remote_ratio=df["remote_ratio"].astype(str))


def _salary_histogram(data, facet):
    """Build a salary histogram split into facet columns and filtered by year.

    Args:
        data: DataFrame holding `salary_in_usd`, `work_year` and the facet field.
        facet: `alt.Column` definition used to split the chart into columns.

    Returns:
        A chart with `select_year` attached and applied as a filter.
    """
    return (
        alt.Chart(data)
        .mark_bar()
        .encode(
            x=alt.X(
                "salary_in_usd",
                type="quantitative",
                bin=alt.Bin(maxbins=20),
                axis=alt.Axis(format="$", title="Salário em dolar"),
            ),
            y="count()",
            column=facet,
        )
        .properties(width=300, height=300)
        .add_selection(select_year)
        .transform_filter(select_year)
    )


tam_emp = _salary_histogram(
    salaries,
    alt.Column("company_size:N", header=alt.Header(orient="bottom")),
)

trab_rem = _salary_histogram(
    salaries,
    alt.Column(
        "remote_ratio:N",
        header=alt.Header(orient="bottom"),
        sort=["0", "50", "100"],
    ),
)

# `&` stacks the two faceted charts vertically.
trab_rem & tam_emp