In [1]:
import pandas as pd
import sqlalchemy 
import sql_functions as sf

In [2]:
# load jobs & skill data

schema = 'capstone_datacvpro'

jobs_20 = sf.get_dataframe(f' SELECT * FROM {schema}.analysts_20')
skills_20 = sf.get_dataframe(f' SELECT * FROM {schema}.skills_20')

In [3]:
# drop skills with low count
df_skills_top = skills_20.query('count >= 40').copy()
df_skills_top

Unnamed: 0,skill,count,type
0,Analytics,3385,Hard Skill
1,Management,1949,Soft Skill
2,Communications,1899,Soft Skill
3,SQL (Programming Language),1780,Hard Skill
4,Tooling,1769,Hard Skill
...,...,...,...
538,Employee Assistance Programs,40,Hard Skill
539,Modulation,40,Hard Skill
540,Scrum (Software Development),40,Hard Skill
541,Medicaid,40,Hard Skill


In [4]:
# drop entries that are not really skills

df_skills_top= df_skills_top.drop([0, 1, 4, 7, 9, 12, 13, 16, 20, 23, 44, 54, 104]).reset_index(drop=True)
# dropped skills: analytics(0), management(1), tooling(4), operations(7), disabilities(9), levelling(12),  equalization(13), activism(20), industrialization(16), job descriptions(23), additives (44), governance (54), Hostile Work Environment(104) 

# delete count column (shows count of every occurence of skill, also multiple times per description)
df_skills_top.drop(['count'], axis=1, inplace=True)

In [5]:
# remove everything in () so skills can be found in description

# split at ( and save in new column
df_skills_top['skill_clean'] = df_skills_top['skill'].str.split('(').str[0]
df_skills_top.head()

Unnamed: 0,skill,type,skill_clean
0,Communications,Soft Skill,Communications
1,SQL (Programming Language),Hard Skill,SQL
2,Positivity,Soft Skill,Positivity
3,Maintainability,Hard Skill,Maintainability
4,Collaboration,Soft Skill,Collaboration


In [7]:
# use stemming to reduce skills to base

from nltk.stem import PorterStemmer
ps = PorterStemmer()

#make a list of all skills
skill_list_clean = df_skills_top.skill_clean.tolist()
  
for w in skill_list_clean:
    print(w, " : ", ps.stem(w))

Communications  :  commun
SQL   :  sql 
Positivity  :  posit
Maintainability  :  maintain
Collaboration  :  collabor
Research  :  research
Data Quality  :  data qu
Presentations  :  present
Statistics  :  statist
Dashboard  :  dashboard
Planning  :  plan
Tableau   :  tableau 
Python   :  python 
Component Object Model   :  component object model 
Integration  :  integr
Computer Science  :  computer sci
E   :  E 
Decisiveness  :  decis
Innovation  :  innov
Writing  :  write
Personalization  :  person
Source Data  :  source data
Verbal Communication Skills  :  verbal communication skil
Consulting  :  consult
Data Visualization  :  data visu
Professionalism  :  profession
Data Analysis  :  data analysi
Accessioning  :  access
Minimum Data Set  :  minimum data set
Coloring  :  color
Executable  :  execut
Exploratory Data Analysis  :  exploratory data analysi
Problem Solving  :  problem solv
Automation  :  autom
Data Management  :  data manag
Data Mapper Patterns  :  data mapper pattern
Dat

In [10]:
# save stemmed words in new list

skill_stem = []

for skill in skill_list_clean:
    skill_stem.append(ps.stem(skill))

len(skill_stem)

529

In [11]:
# check for duplicates

unique = [] # empty list to hold unique elements from the list
duplist = [] # empty list to hold the duplicate elements from the list
for i in skill_stem:
    if i not in unique:
        unique.append(i)
    else:
        duplist.append(i)

duplist

['account', 'data integr', 'construct']

In [None]:
# check in Data Viewer what skills are doubled while stemming
# account = Accounting(46), Accountability(144)
# data integr = Data Integrity(123), Data Integration(211)
# construct = Construction(427), Constructability(528)

In [12]:
# delete from list & rerun stemming code

double = [46, 144, 123, 211, 427, 528]

# Sort indices in reverse order to avoid index shifting
double.sort(reverse=True)

for index in double:
    del skill_list_clean[index]

In [17]:
# rerun stemming code & save in list

skill_stem1 = []

for skill in skill_list_clean:
    skill_stem1.append(ps.stem(skill))

len(skill_stem1)

523

In [18]:
# append previously deleted skills with unique stem
append_list = ['accounti', 'accounta', 'data integri', 'data integra']
skill_stem1.extend(append_list)

len(skill_stem1)

527

In [19]:
# count each skill only once per job description to count how many job postings require specific skill

# create new columns for each skill that show 1 if skill is mentioned in job_description
for skill in skill_stem1:
    jobs_20[skill] = jobs_20['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()

jobs_20.head()

  jobs_20[skill] = jobs_20['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20[skill] = jobs_20['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20[skill] = jobs_20['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20[skill] = jobs_20['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20[skill] = jobs_20['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20[skill] = jobs_20['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20[skill] = jobs_20['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20[skill] = jobs_20['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20[skill] = jobs_20['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20[skill] = 

Unnamed: 0,job_title,salary_estimate,job_description,company_name,location,industry,commun,sql,posit,maintain,...,imag,mobile app,employee assistance program,modul,scrum,medicaid,accounti,accounta,data integri,data integra
0,"Data Analyst, Center on Immigration and Justic...",$37K-$66K (Glassdoor est.),Are you eager to roll up your sleeves and harn...,Vera Institute of Justice\n3.2,"New York, NY",Social Assistance,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Quality Data Analyst,$37K-$66K (Glassdoor est.),Overview\n\nProvides analytical and technical ...,Visiting Nurse Service of New York\n3.8,"New York, NY",Health Care Services & Hospitals,1,1,0,1,...,0,0,0,0,0,0,0,0,1,0
2,"Senior Data Analyst, Insights & Analytics Team...",$37K-$66K (Glassdoor est.),We’re looking for a Senior Data Analyst who ha...,Squarespace\n3.4,"New York, NY",Internet,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Data Analyst,$37K-$66K (Glassdoor est.),Requisition NumberRR-0001939\nRemote:Yes\nWe c...,Celerity\n4.1,"New York, NY",IT Services,1,1,1,0,...,0,0,0,0,0,0,0,1,0,0
4,Reporting Data Analyst,$37K-$66K (Glassdoor est.),ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,FanDuel\n3.9,"New York, NY",Sports & Recreation,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# create df with only skills to get sum by column

jobs_20_skillcount = jobs_20.drop(columns=['job_title', 'salary_estimate', 'job_description', 'company_name', 'location', 'industry'])

In [22]:
sum_skills = []

for skill_stem in jobs_20_skillcount.columns:
    sum = jobs_20_skillcount[skill_stem].sum()
    sum_skills.append({'skill_stem': skill_stem, 'count' : sum})

skills_count_stem = pd.DataFrame(sum_skills)
skills_count_stem.head()

Unnamed: 0,skill_stem,count
0,commun,1577
1,sql,990
2,posit,1100
3,maintain,901
4,collabor,827


In [23]:
# create copy of df_skills_top to modify
df_onlyskill = df_skills_top.copy()

In [27]:
# add column with not stemmed skills by concating with df_onlyskill
# since we deleted doubles & appended them at the end of the list, we need to repeat same process for df_onlyskill

df_onlyskill.drop(['skill_clean'], axis=1, inplace=True)
df_onlyskill= df_onlyskill.drop([46, 144, 123, 211, 427, 528]).reset_index(drop=True)

In [29]:
# append deleted skills with same index as skills_count_stem to merge them later

to_append = ['Accounting', 'Accountability', 'Data Integrity', 'Data Integration']
# convert to dataframe
df_to_append = pd.DataFrame({'skill' : to_append, 'type' : 'Hard Skill'})

# concat to df_onlyskill
df_onlyskill = pd.concat([df_onlyskill, df_to_append], ignore_index=True)

df_onlyskill

Unnamed: 0,skill,type
0,Communications,Soft Skill
1,SQL (Programming Language),Hard Skill
2,Positivity,Soft Skill
3,Maintainability,Hard Skill
4,Collaboration,Soft Skill
...,...,...
522,Medicaid,Hard Skill
523,Accounting,Hard Skill
524,Accountability,Hard Skill
525,Data Integrity,Hard Skill


In [30]:
# change type of Accountability to Soft Skill
df_onlyskill['type'][524] = 'Soft Skill'

In [31]:
# merge skills_count & df_onlyskill to create df with count of stemmed skills and the matching full skill name

skills_count = pd.merge(skills_count_stem, df_onlyskill, left_index=True, right_index=True)

skills_count.sort_values('count', ascending=False).head(40)

Unnamed: 0,skill_stem,count,skill,type
16,E,2248,E (Programming Language),Hard Skill
37,R,2233,R (Programming Language),Hard Skill
152,M,2110,M (Programming Language),Hard Skill
258,act,1872,Acting,Hard Skill
192,C,1639,C (Programming Language),Hard Skill
0,commun,1577,Communications,Soft Skill
114,B,1500,B (Programming Language),Hard Skill
281,informat,1248,Informatics,Hard Skill
2,posit,1100,Positivity,Soft Skill
116,script,1071,Scripting,Hard Skill
