In [51]:
import pandas as pd
import sqlalchemy 
import sql_functions as sf

In [58]:
# Pull the aggregated skill counts from the project schema into a DataFrame.

schema = 'capstone_datacvpro'

skills_query = f' SELECT * FROM {schema}.skills_count_total'
skills = sf.get_dataframe(skills_query)

- delete:
    - Data Analysis (row index 3), Business Intelligence (row index 18)

- limitations:
    - "writing" is ambiguous: it can refer to actual writing skills or, e.g., to writing code

In [59]:
# Preview the first 20 rows of the loaded skills table.
skills.head(n=20)

Unnamed: 0,skill,skill_clean,type,count_20,frequency_%_20,count_current,frequency_%_current
0,SQL (Programming Language),sql,Hard Skill,1389,61.65,18351,57.31
1,Communications,communication,Soft Skill,1154,51.22,16024,50.04
2,Statistics,statistics,Hard Skill,954,42.34,12223,38.17
3,Data Analysis,data analysis,Hard Skill,717,31.82,13207,41.24
4,Python (Programming Language),python,Hard Skill,637,28.27,9553,29.83
5,Computer Science,computer science,Hard Skill,622,27.61,8160,25.48
6,Tableau (Business Intelligence Software),tableau,Hard Skill,620,27.52,10113,31.58
7,Research,research,Soft Skill,559,24.81,9400,29.35
8,Dashboard,dashboard,Hard Skill,533,23.66,9736,30.4
9,Visualization,visualization,Hard Skill,517,22.95,11400,35.6


In [60]:
# Drop the skills "Data Analysis" and "Business Intelligence".
#
# Rows are selected by skill name rather than by index label ([3, 18]):
# because the index is reset afterwards, a label-based drop would silently
# remove two *different* rows every time this cell is re-run.
# NOTE(review): assumes the `skill` value of the former row 18 reads
# "Business Intelligence" (optionally followed by a "(...)" qualifier) —
# confirm against the full table.

generic_skills = {'data analysis', 'business intelligence'}

# Compare on the text in front of any "(...)" qualifier, case-insensitively,
# so that e.g. "Tableau (Business Intelligence Software)" is NOT matched.
skill_base_name = skills['skill'].str.split('(').str[0].str.strip().str.lower()

# Boolean filtering returns a new frame (no inplace mutation); the index is
# renumbered so it is contiguous again.
skills = skills[~skill_base_name.isin(generic_skills)].reset_index(drop=True)

In [61]:
# Rank skills by their current frequency (highest first) and keep the first 20 rows.

ranked_by_current = skills.sort_values(by='frequency_%_current', ascending=False)
skills = ranked_by_current.iloc[:20]

In [62]:
# Display the current `skills` frame as the cell output.
skills

Unnamed: 0,skill,skill_clean,type,count_20,frequency_%_20,count_current,frequency_%_current
0,SQL (Programming Language),sql,Hard Skill,1389,61.65,18351,57.31
1,Communications,communication,Soft Skill,1154,51.22,16024,50.04
2,Statistics,statistics,Hard Skill,954,42.34,12223,38.17
8,Visualization,visualization,Hard Skill,517,22.95,11400,35.6
5,Tableau (Business Intelligence Software),tableau,Hard Skill,620,27.52,10113,31.58
7,Dashboard,dashboard,Hard Skill,533,23.66,9736,30.4
3,Python (Programming Language),python,Hard Skill,637,28.27,9553,29.83
6,Research,research,Soft Skill,559,24.81,9400,29.35
4,Computer Science,computer science,Hard Skill,622,27.61,8160,25.48
16,Leadership,leadership,Soft Skill,388,17.22,7871,24.58


In [63]:
# Remove the parenthesised qualifier from each skill name,
# e.g. "SQL (Programming Language)" -> "SQL", so the plain name can be
# found in description text.

# Keep only the text in front of the first "(" and strip the blank that
# precedes it (a plain split would leave "SQL " with a trailing space).
# The cleaned name overwrites the existing `skill` column; `assign` returns a
# new frame instead of writing into one that was derived via `.head()`.
skills = skills.assign(skill=skills['skill'].str.split('(').str[0].str.strip())
skills.head()

Unnamed: 0,skill,skill_clean,type,count_20,frequency_%_20,count_current,frequency_%_current
0,SQL,sql,Hard Skill,1389,61.65,18351,57.31
1,Communications,communication,Soft Skill,1154,51.22,16024,50.04
2,Statistics,statistics,Hard Skill,954,42.34,12223,38.17
8,Visualization,visualization,Hard Skill,517,22.95,11400,35.6
5,Tableau,tableau,Hard Skill,620,27.52,10113,31.58


In [64]:
# Write the final skills list to the database.

# Read variables from a .env file into the process environment
# (presumably the database credentials used by get_engine — confirm).
from dotenv import load_dotenv
load_dotenv()

# get_engine comes from the local sql_functions module. If that module was
# edited after it was first imported, restart the kernel and rerun so the
# current version is picked up.
from sql_functions import get_engine
engine = get_engine()

import psycopg2

table_name = 'skills_list_final'
schema = 'capstone_datacvpro'

# Write the records stored in the dataframe to the SQL database.
# `is not None` is the identity check; `!= None` can be overridden by a
# class's comparison operators.
if engine is not None:
    try:
        skills.to_sql(name=table_name,      # name of the target SQL table
                      con=engine,           # engine or connection
                      schema=schema,        # target schema
                      if_exists='replace',  # drop an existing table before inserting
                      index=False,          # do NOT write the DataFrame index as a column
                      chunksize=5000,       # number of rows written per batch
                      method='multi')       # pass multiple rows in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Best-effort error handling: report the problem and invalidate the engine.
    # psycopg2.DatabaseError is already covered by Exception; it is listed
    # explicitly only to show which driver error is expected here.
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None
else:
    print('No engine')

The skills_list_final table was imported successfully.
