In [1]:
# Importing Libraries
import ast
import pandas as pd
import seaborn as sns
from datasets import load_dataset
import matplotlib.pyplot as plt  

# Loading Data
# Fetch the dataset and materialise its 'train' split as a pandas DataFrame.
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()


# Data Cleanup
def _parse_skills(raw):
    """Evaluate a stringified Python literal, passing missing values through.

    Presumably each non-missing cell holds the text form of a list of skill
    names (the column is exploded later) -- TODO confirm against the dataset.
    """
    if pd.isna(raw):
        return raw
    return ast.literal_eval(raw)


# Parse the posting date into a datetime column so the `.dt` accessor works.
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])
# Turn the stringified skills into real Python objects (missing values kept).
df['job_skills'] = df['job_skills'].apply(_parse_skills)

In [2]:
## Skill count per month for Data Scientist postings in France
# NOTE(review): this is an exact match on the full `job_title` string; if the
# dataset also carries a normalised title column (e.g. `job_title_short`),
# confirm which one was intended.
is_data_scientist = df['job_title'] == 'Data Scientist'
is_in_france = df['job_country'] == 'France'

# .copy() so the column added below does not write into a view of `df`.
df_DS_FR = df.loc[is_data_scientist & is_in_france].copy()

# Month number of the posting date, used as the grouping key downstream.
df_DS_FR['job_posted_month_no'] = df_DS_FR['job_posted_date'].dt.month

# One row per (posting, skill): list-like `job_skills` cells are expanded.
df_DS_FR_explode = df_DS_FR.explode('job_skills')

In [14]:
# Count how often each skill appears per posting month
# (rows: month number, columns: skill, missing combinations filled with 0).
skill_counts_by_month = df_DS_FR_explode.pivot_table(
    index='job_posted_month_no',
    columns='job_skills',
    aggfunc='size',
    fill_value=0,
)

# Order the skill columns from most to least frequent over all months.
# The ordering is derived from the column sums directly instead of inserting
# (and later dropping) a string-labelled 'Total' row, which would coerce the
# numeric month index into a mixed-type object index.
skills_by_total = skill_counts_by_month.sum().sort_values(ascending=False).index
df_DS_FR_pivot = skill_counts_by_month[skills_by_total]


In [16]:
# Number of rows in df_DS_FR per posting month (taken before the skills are
# exploded); used as the denominator when converting counts to percentages.
DS_totals = (
    df_DS_FR
    .groupby(by='job_posted_month_no')
    .size()
)

In [None]:
# Convert monthly skill counts into a percentage of that month's postings and
# relabel the rows with abbreviated month names.
# NOTE(review): the variable says "DA" although it is built from the
# Data Scientist pivot; the name is kept because the plotting cell reads it.
df_DA_FR_percent = (
    df_DS_FR_pivot
    # Dividing by totals/100 yields values on a 0-100 scale.
    .div(DS_totals / 100, axis=0)
    .reset_index()
    # Month number -> month abbreviation: '%m' parses the number, '%b' formats it.
    .assign(
        job_posted_month=lambda frame: frame['job_posted_month_no'].apply(
            lambda month_no: pd.to_datetime(month_no, format='%m').strftime('%b')
        )
    )
    .set_index('job_posted_month')
    .drop(columns='job_posted_month_no')
)

df_DA_FR_percent

In [None]:
from matplotlib.ticker import PercentFormatter

# Plot the monthly share of postings for the leading skills as a line chart.

# How many of the leading skill columns to draw.
top_n = 5
df_plot = df_DA_FR_percent.iloc[:, :top_n]

# The theme must be set BEFORE drawing: it only affects figures created
# afterwards, so setting it after the plot makes a fresh-kernel run look
# different from a re-run of this cell.
sns.set_theme(style='ticks')

fig, ax = plt.subplots()
# legend=False: the lines are labelled directly at their right-hand end below.
sns.lineplot(data=df_plot, dashes=False, palette='tab10', legend=False, ax=ax)
sns.despine(ax=ax)  # remove top and right spines

ax.set_title('Trending Top Skills for Data Scientist in France')
ax.set_ylabel('Likelihood in Job Posting')
ax.set_xlabel('2023')

# Values are already on a 0-100 scale, so format the ticks as whole percents.
ax.yaxis.set_major_formatter(PercentFormatter(decimals=0))

# Label each line just to the right of its last point. The x position is
# derived from the number of plotted rows (11.3 when all 12 months are
# present), and the loop follows the columns actually available, so fewer
# than `top_n` skills or fewer than 12 months do not break or misplace labels.
label_x = len(df_plot) - 1 + 0.3
for col_pos, skill in enumerate(df_plot.columns):
    ax.text(label_x, df_plot.iloc[-1, col_pos], skill)
