## **What is the optimal skill for Data Analysts to learn?**

In [1]:
# Importing Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import plotly.express as px

# Loading Data
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
DATA_PATH = "C:/Users/user/Desktop/packages/data_jobs.csv"
df = pd.read_csv(DATA_PATH)

# Data Cleanup
# Parse the posting timestamps once, then derive the abbreviated month name
# (strftime "%b") from the parsed values.
posted_at = pd.to_datetime(df["job_posted_date"])
df["job_posted_date"] = posted_at
df["job_posted_month"] = posted_at.dt.strftime("%b")
def cleanup(skill_list):
    """Turn a stringified skill list into a real Python list.

    Parameters
    ----------
    skill_list : object
        A single cell value from the ``job_skills`` column. Strings are
        parsed as Python literals; any other value is left untouched.

    Returns
    -------
    object
        The result of ``ast.literal_eval`` for string input, otherwise the
        input value itself (missing values and already-parsed lists pass
        through unchanged).

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string is not a valid
        Python literal.
    """
    # Only strings need parsing. Checking the type (rather than pd.notna)
    # keeps this safe to re-run on a column that was already converted:
    # pd.notna on a list returns an element-wise array, which cannot be used
    # as an `if` condition for multi-element lists.
    if isinstance(skill_list, str):
        return ast.literal_eval(skill_list)
    return skill_list
# Run cleanup over every value of the job_skills column and store the result
# back in the same column.
parsed_skills = df["job_skills"].apply(cleanup)
df["job_skills"] = parsed_skills

In [34]:
# Keep United States "Data Analyst" postings that report a yearly salary,
# then expand job_skills so each listed skill gets its own row.
is_us_analyst = (df["job_title_short"] == "Data Analyst") & (df["job_country"] == "United States")
df_US = (
    df.loc[is_us_analyst]
    .dropna(subset="salary_year_avg")
    .explode("job_skills")
)

# Per skill: number of salaried postings and their median salary,
# limited to the ten skills with the highest posting count.
skills = (
    df_US.groupby("job_skills")["salary_year_avg"]
    .agg(skill_count="count", median_salary="median")
    .sort_values(by="skill_count", ascending=False)
    .head(10)
    .reset_index()
)
skills

Unnamed: 0,job_skills,skill_count,median_salary
0,sql,2508,91000.0
1,excel,1808,84392.0
2,python,1431,97500.0
3,tableau,1364,92875.0
4,sas,926,90000.0
5,r,893,92500.0
6,power bi,838,90000.0
7,powerpoint,462,85000.0
8,word,461,81194.75
9,sql server,286,92500.0


In [35]:
# Scatter of median salary (x) against posting count (y), with each point
# labelled by its skill name.
axis_labels = {
    "median_salary": "Median Salary ($USD)",
    "skill_count": "Count",
}
fig = px.scatter(
    skills,
    x="median_salary",
    y="skill_count",
    text="job_skills",
    labels=axis_labels,
    title="Most Optimal Skills for Data Analysts in the US",
)
# Draw each text label above its marker.
fig.update_traces(textposition="top center")
fig.show()