In [30]:
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_excel('public_use-industry-skills-needs.xlsx',  sheet_name="Industry Skills Needs") 

data

Unnamed: 0,year,isic_section_index,isic_section_name,industry_name,skill_group_category,skill_group_name,skill_group_rank
0,2015,B,Mining and quarrying,Mining & Metals,Specialized Industry Skills,Mining,1
1,2015,B,Mining and quarrying,Mining & Metals,Soft Skills,Negotiation,2
2,2015,B,Mining and quarrying,Mining & Metals,Business Skills,Project Management,3
3,2015,B,Mining and quarrying,Mining & Metals,Business Skills,Business Management,4
4,2015,B,Mining and quarrying,Mining & Metals,Specialized Industry Skills,Earth Science,5
...,...,...,...,...,...,...,...
3495,2019,R,"Arts, entertainment and recreation",Animation,Tech Skills,Social Media,6
3496,2019,R,"Arts, entertainment and recreation",Animation,Tech Skills,Digital Literacy,7
3497,2019,R,"Arts, entertainment and recreation",Animation,Soft Skills,Teamwork,8
3498,2019,R,"Arts, entertainment and recreation",Animation,Specialized Industry Skills,Editing,9


In [31]:
len(data['isic_section_name'].unique())

6

In [2]:
# Remove unnecessary columns
data = data.drop(columns=['isic_section_index', 'isic_section_name', 'skill_group_category'])

In [3]:
# Clean up industry name
data['industry_name'] = data['industry_name'].str.strip().str.lower()




In [4]:
data = data.rename(columns={'skill_group_name': 'skill_name'})

In [5]:
data = data.rename(columns={'skill_group_rank': 'skill_rank'})

In [6]:
display(data)

Unnamed: 0,year,industry_name,skill_name,skill_rank
0,2015,mining & metals,Mining,1
1,2015,mining & metals,Negotiation,2
2,2015,mining & metals,Project Management,3
3,2015,mining & metals,Business Management,4
4,2015,mining & metals,Earth Science,5
...,...,...,...,...
3495,2019,animation,Social Media,6
3496,2019,animation,Digital Literacy,7
3497,2019,animation,Teamwork,8
3498,2019,animation,Editing,9


In [7]:
# Clean up skill group names
data['skill_name'] = data['skill_name'].str.strip().str.lower()

In [8]:
# Remove duplicates
data = data.drop_duplicates()



In [None]:
display(data)

Unnamed: 0_level_0,industry_name,skill_name,skill_rank
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015,mining & metals,mining,1
2015,mining & metals,negotiation,2
2015,mining & metals,project management,3
2015,mining & metals,business management,4
2015,mining & metals,earth science,5
...,...,...,...
2019,animation,foreign languages,5
2019,animation,social media,6
2019,animation,digital literacy,7
2019,animation,teamwork,8


In [9]:
data['skill_tier'] = pd.cut(data['skill_rank'], bins=[0, 3, 6,10], labels=['top', 'mid', 'low'])

In [10]:
data['industry_name'] = data['industry_name'].str.replace('&','and')

In [None]:
display(data)

Unnamed: 0_level_0,industry_name,skill_name,skill_rank,skill_group
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015,mining and metals,mining,1,top
2015,mining and metals,negotiation,2,top
2015,mining and metals,project management,3,top
2015,mining and metals,business management,4,mid
2015,mining and metals,earth science,5,mid
...,...,...,...,...
2019,animation,foreign languages,5,mid
2019,animation,social media,6,mid
2019,animation,digital literacy,7,low
2019,animation,teamwork,8,low


In [None]:
# Group by skill group category and compute summary statistics
category_summary = data.groupby(['industry_name','year']).agg({'skill_rank': ['mean', 'median', 'min', 'max']})
category_summary.columns = ['rank_mean', 'rank_median', 'rank_min', 'rank_max']

In [None]:
display(category_summary)

Unnamed: 0_level_0,Unnamed: 1_level_0,rank_mean,rank_median,rank_min,rank_max
industry_name,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
accounting,2015,5.500000,5.5,1,10
accounting,2016,4.200000,5.0,1,7
accounting,2017,7.000000,7.0,4,10
accounting,2018,8.000000,8.0,8,8
accounting,2019,5.000000,4.0,2,10
...,...,...,...,...,...
writing and editing,2015,5.500000,5.5,1,10
writing and editing,2016,8.000000,8.0,6,10
writing and editing,2017,7.666667,8.0,5,10
writing and editing,2018,6.500000,6.5,6,7


Preparing the industry dimensions

In [19]:
# Filter the DataFrame to only keep the top skills for each year and industry
industry_dimension = data[data['skill_tier'] == 'top'].groupby(['industry_name'])['skill_name'].apply(list).reset_index()

# Rename the skill_name column to top_skills
industry_dimension = industry_dimension.rename(columns={'skill_name': 'top_skills'})

In [22]:
industry_dimension['surrogate keys'] = range(1,len(industry_dimension)+1)

In [23]:
display(industry_dimension)

Unnamed: 0,industry_name,top_skills,surrogate keys
0,accounting,"[auditing, financial accounting, tax accountin...",1
1,animation,"[animation, graphic design, video, animation, ...",2
2,architecture and planning,"[architecture, urban planning, interior design...",3
3,arts and crafts,"[visual arts, art history, graphic design, vis...",4
4,automotive,"[automotive, negotiation, leadership, automoti...",5
...,...,...,...
65,textiles,"[apparel, product development, negotiation, ap...",66
66,translation and localization,"[translation, linguistics, editing, translatio...",67
67,venture capital and private equity,"[growth strategies, entrepreneurship, capital ...",68
68,veterinary,"[veterinary medicine, zoology, agricultural pr...",69


In [32]:
industry_dimension.to_csv("industry_dimension_skills.csv")