In [5]:
import pandas as pd
import sqlalchemy 
import sql_functions as sf

In [6]:
# load jobs & skill data

schema = 'capstone_datacvpro'

jobs_20 = sf.get_dataframe(f' SELECT * FROM {schema}.analysts_20')
skills_20 = sf.get_dataframe(f' SELECT * FROM {schema}.skills_20')

In [7]:
# drop skills with low count
df_skills_top = skills_20.query('count >= 40').copy()
df_skills_top

Unnamed: 0,skill,count,type
0,Analytics,3385,Hard Skill
1,Management,1949,Soft Skill
2,Communications,1899,Soft Skill
3,SQL (Programming Language),1780,Hard Skill
4,Tooling,1769,Hard Skill
...,...,...,...
538,Employee Assistance Programs,40,Hard Skill
539,Modulation,40,Hard Skill
540,Scrum (Software Development),40,Hard Skill
541,Medicaid,40,Hard Skill


In [8]:
# drop entries that are not really skills

df_skills_top= df_skills_top.drop([0, 1, 4, 7, 9, 12, 13, 16, 20, 23, 44, 54, 104]).reset_index(drop=True)
# dropped skills: analytics(0), management(1), tooling(4), operations(7), disabilities(9), levelling(12),  equalization(13), activism(20), industrialization(16), job descriptions(23), additives (44), governance (54), Hostile Work Environment(104) 

# delete count column (shows count of every occurence of skill, also multiple times per description)
df_skills_top.drop(['count'], axis=1, inplace=True)

In [9]:
# remove everything in () so skills can be found in description

# split at ( and save in new column
df_skills_top['skill_clean'] = df_skills_top['skill'].str.split('(').str[0]
df_skills_top.head()

Unnamed: 0,skill,type,skill_clean
0,Communications,Soft Skill,Communications
1,SQL (Programming Language),Hard Skill,SQL
2,Positivity,Soft Skill,Positivity
3,Maintainability,Hard Skill,Maintainability
4,Collaboration,Soft Skill,Collaboration


In [10]:
# convert to lower case

df_skills_top['skill_clean'] = df_skills_top['skill_clean'].apply(lambda x: x.lower())

# change communications to communication
df_skills_top['skill_clean'][0] = 'communication'

df_skills_top.head()

Unnamed: 0,skill,type,skill_clean
0,Communications,Soft Skill,communication
1,SQL (Programming Language),Hard Skill,sql
2,Positivity,Soft Skill,positivity
3,Maintainability,Hard Skill,maintainability
4,Collaboration,Soft Skill,collaboration


In [11]:
# save cleaned skills in list

skill_clean = df_skills_top.skill_clean.to_list()

In [12]:
# create copy of jobs_20 to modify

jobs_20_skills = jobs_20.copy()

In [13]:
# count each skill only once per job description to count how many job postings require specific skill

# create new columns for each skill that show 1 if skill is mentioned in job_description
for skill in skill_clean:
    jobs_20_skills[skill] = jobs_20_skills['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()

jobs_20_skills.head()

  jobs_20_skills[skill] = jobs_20_skills['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20_skills[skill] = jobs_20_skills['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20_skills[skill] = jobs_20_skills['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20_skills[skill] = jobs_20_skills['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20_skills[skill] = jobs_20_skills['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20_skills[skill] = jobs_20_skills['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20_skills[skill] = jobs_20_skills['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20_skills[skill] = jobs_20_skills['job_description'].apply(lambda x: 1 if skill.lower() in x.lower() else 0).copy()
  jobs_20_skills

Unnamed: 0,job_title,salary_estimate,job_description,company_name,location,industry,communication,sql,positivity,maintainability,...,financial data management,chartered financial analyst,systems implementations,imaging,mobile app,employee assistance programs,modulation,scrum,medicaid,constructability
0,"Data Analyst, Center on Immigration and Justic...",$37K-$66K (Glassdoor est.),Are you eager to roll up your sleeves and harn...,Vera Institute of Justice\n3.2,"New York, NY",Social Assistance,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Quality Data Analyst,$37K-$66K (Glassdoor est.),Overview\n\nProvides analytical and technical ...,Visiting Nurse Service of New York\n3.8,"New York, NY",Health Care Services & Hospitals,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Senior Data Analyst, Insights & Analytics Team...",$37K-$66K (Glassdoor est.),We’re looking for a Senior Data Analyst who ha...,Squarespace\n3.4,"New York, NY",Internet,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Data Analyst,$37K-$66K (Glassdoor est.),Requisition NumberRR-0001939\nRemote:Yes\nWe c...,Celerity\n4.1,"New York, NY",IT Services,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Reporting Data Analyst,$37K-$66K (Glassdoor est.),ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,FanDuel\n3.9,"New York, NY",Sports & Recreation,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# create df with only skills to sum up the columns

jobs_20_skills = jobs_20_skills.drop(columns=['job_title', 'salary_estimate', 'job_description', 'company_name', 'location', 'industry'])

In [15]:
# create df with the columns skill and count, where skills are summed up by column

sum_skills = []

for skill_clean in jobs_20_skills.columns:
    sum = jobs_20_skills[skill_clean].sum()
    sum_skills.append({'skill_clean': skill_clean, 'count' : sum})

skills_count_clean = pd.DataFrame(sum_skills)
skills_count_clean.head()

Unnamed: 0,skill_clean,count
0,communication,1154
1,sql,990
2,positivity,5
3,maintainability,4
4,collaboration,245


In [16]:
# add type by merging with df_skills_top
# can be merged using index

skills_count = pd.merge(skills_count_clean, df_skills_top, left_index=True, right_index=True)
skills_count

Unnamed: 0,skill_clean_x,count,skill,type,skill_clean_y
0,communication,1154,Communications,Soft Skill,communication
1,sql,990,SQL (Programming Language),Hard Skill,sql
2,positivity,5,Positivity,Soft Skill,positivity
3,maintainability,4,Maintainability,Hard Skill,maintainability
4,collaboration,245,Collaboration,Soft Skill,collaboration
...,...,...,...,...,...
524,employee assistance programs,1,Employee Assistance Programs,Hard Skill,employee assistance programs
525,modulation,0,Modulation,Hard Skill,modulation
526,scrum,27,Scrum (Software Development),Hard Skill,scrum
527,medicaid,25,Medicaid,Hard Skill,medicaid


In [17]:
# delete duplicated column
skills_count.drop(['skill_clean_x'], axis=1, inplace=True)

# rename
skills_count.rename(columns={'skill_clean_y' : 'skill_clean'}, inplace=True)
skills_count.head(1)

Unnamed: 0,count,skill,type,skill_clean
0,1154,Communications,Soft Skill,communication


In [18]:
# rearrange column order
new_column_order = ['skill', 'skill_clean', 'count', 'type']

# reassign dataframe with new column order
skills_count = skills_count[new_column_order]

# sort values by count
skills_count = skills_count.sort_values('count', ascending=False)

skills_count

Unnamed: 0,skill,skill_clean,count,type
16,E (Programming Language),e,2248,Hard Skill
37,R (Programming Language),r,2233,Hard Skill
155,M (Programming Language),m,2110,Hard Skill
195,C (Programming Language),c,1639,Hard Skill
115,B (Programming Language),b,1500,Hard Skill
...,...,...,...,...
322,Mobile Application Software,mobile application software,0,Hard Skill
329,Medical Insurance Claims,medical insurance claims,0,Hard Skill
330,Post-Hoc Analysis,post-hoc analysis,0,Hard Skill
332,Patient Care Technician,patient care technician,0,Soft Skill


In [19]:
# convert job_description to lower

jobs_20['job_description'] = jobs_20['job_description'].str.lower()

In [20]:
# Programming languages like R are counted in each word that contains an R
# to avoid that we extract standalone characters that are programming languages r,e,m,c,b

def has_standalone_r_e_m_c_b(text):
    words = text.split()
    return {'R': 'r' in words, 'E': 'e' in words, 'M': 'm' in words, 'C': 'c' in words, 'B': 'b' in words}

# apply the function to the "job_description" column and expand the result into separate columns
jobs_20[['R', 'E', 'M', 'C', 'B']] = jobs_20['job_description'].apply(has_standalone_r_e_m_c_b).apply(pd.Series)

# counting based on boolean mask
jobs_20[jobs_20["R"]]
print(len(jobs_20[jobs_20["R"]]), len(jobs_20[jobs_20["E"]]), len(jobs_20[jobs_20["M"]]), len(jobs_20[jobs_20["C"]]), len(jobs_20[jobs_20["B"]]))

# making a small table 
jobs_20[["R", "E", "M", "C", "B"]].sum(axis=0).sort_values(ascending = False)

158 7 1 15 11


R    158
C     15
B     11
E      7
M      1
dtype: int64

In [21]:
skills_count.head(6)

Unnamed: 0,skill,skill_clean,count,type
16,E (Programming Language),e,2248,Hard Skill
37,R (Programming Language),r,2233,Hard Skill
155,M (Programming Language),m,2110,Hard Skill
195,C (Programming Language),c,1639,Hard Skill
115,B (Programming Language),b,1500,Hard Skill
0,Communications,communication,1154,Soft Skill


In [22]:
# replace count of r,e,m,c,b in skills_count table

skills_count['count'][16] = 7
skills_count['count'][37] = 158
skills_count['count'][155] = 1
skills_count['count'][195] = 15
skills_count['count'][115] = 11

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  skills_count['count'][16] = 7
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  skills_count['count'][37] = 158
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  skills_count['count'][155] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  skills_count['count'][195] = 15
A value is trying to be set on a copy of a 

In [28]:
skills_count = skills_count.sort_values('count', ascending=False).reset_index(drop=True)
skills_count

Unnamed: 0,skill,skill_clean,count,type,frequency_%
0,Communications,communication,1154,Soft Skill,2.181474
1,SQL (Programming Language),sql,990,Hard Skill,1.871456
2,Data Analysis,data analysis,717,Hard Skill,1.355388
3,Computer Science,computer science,622,Hard Skill,1.175803
4,Statistics,statistics,563,Hard Skill,1.064272
...,...,...,...,...,...
524,Medical Insurance Claims,medical insurance claims,0,Hard Skill,0.000000
525,Post-Hoc Analysis,post-hoc analysis,0,Hard Skill,0.000000
526,Patient Care Technician,patient care technician,0,Soft Skill,0.000000
527,Storage Area Network (SAN),storage area network,0,Hard Skill,0.000000


In [29]:
skills_count.query('count == 0')

Unnamed: 0,skill,skill_clean,count,type,frequency_%
397,Project-Based Solutions,project-based solutions,0,Hard Skill,0.0
398,Transferable Skills Analysis,transferable skills analysis,0,Hard Skill,0.0
399,Design Verification Test,design verification test,0,Hard Skill,0.0
400,Chartered Financial Analyst,chartered financial analyst,0,Soft Skill,0.0
401,Amazon Marketplace,amazon marketplace,0,Hard Skill,0.0
...,...,...,...,...,...
524,Medical Insurance Claims,medical insurance claims,0,Hard Skill,0.0
525,Post-Hoc Analysis,post-hoc analysis,0,Hard Skill,0.0
526,Patient Care Technician,patient care technician,0,Soft Skill,0.0
527,Storage Area Network (SAN),storage area network,0,Hard Skill,0.0


In [31]:
jobs_20['job_description'].count()

2253

In [36]:
# add column with %

skills_count['frequency_%'] = skills_count['count'].apply(lambda x: x/(jobs_20['job_description'].count())*100).round(2)
skills_count = skills_count.sort_values('frequency_%', ascending=False).reset_index(drop=True)
skills_count.head()

Unnamed: 0,skill,skill_clean,count,type,frequency_%
0,Communications,communication,1154,Soft Skill,51.22
1,SQL (Programming Language),sql,990,Hard Skill,43.94
2,Data Analysis,data analysis,717,Hard Skill,31.82
3,Computer Science,computer science,622,Hard Skill,27.61
4,Statistics,statistics,563,Hard Skill,24.99


In [37]:
# load to database

from dotenv import load_dotenv
load_dotenv()

# write dataset into database

# Import get_engine from sql_functions.py. You will need to restart your kernel and rerun at this point since we changed the module since we first imported it.
from sql_functions import get_engine
# create a variable called engine using the get_engine function
engine = get_engine()

import psycopg2

table_name = 'skills_count_20'
schema = 'capstone_datacvpro'

# Write records stored in a dataframe to SQL database
if engine!=None:
    try:
        skills_count.to_sql(name=table_name, # Name of SQL table variable
                        con=engine, # Engine or connection
                        schema=schema, # your class schema variable
                        if_exists='replace', # Drop the table before inserting new values 
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None
else:
    print('No engine')

The skills_count_20 table was imported successfully.
