In [25]:
import pandas as pd
import sqlalchemy 
import sql_functions as sf

In [26]:
# Load the jobs_20 table from the project schema and show it.

schema = 'capstone_datacvpro'
jobs_query = f' SELECT * FROM {schema}.jobs_20'

jobs_20 = sf.get_dataframe(jobs_query)
display(jobs_20)

Unnamed: 0,job_title,salary_estimate,job_description,company_name,location,industry
0,"Data Analyst, Center on Immigration and Justic...",$37K-$66K (Glassdoor est.),Are you eager to roll up your sleeves and harn...,Vera Institute of Justice\n3.2,"New York, NY",Social Assistance
1,Quality Data Analyst,$37K-$66K (Glassdoor est.),Overview\n\nProvides analytical and technical ...,Visiting Nurse Service of New York\n3.8,"New York, NY",Health Care Services & Hospitals
2,"Senior Data Analyst, Insights & Analytics Team...",$37K-$66K (Glassdoor est.),We’re looking for a Senior Data Analyst who ha...,Squarespace\n3.4,"New York, NY",Internet
3,Data Analyst,$37K-$66K (Glassdoor est.),Requisition NumberRR-0001939\nRemote:Yes\nWe c...,Celerity\n4.1,"New York, NY",IT Services
4,Reporting Data Analyst,$37K-$66K (Glassdoor est.),ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,FanDuel\n3.9,"New York, NY",Sports & Recreation
...,...,...,...,...,...,...
6157,AWS Data Engineer,$55K-$112K (Glassdoor est.),About Us\n\nTachyon Technologies is a Digital ...,Tachyon Technologies\n4.4,"Dublin, OH",IT Services
6158,Data Analyst â Junior,$55K-$112K (Glassdoor est.),"Job description\nInterpret data, analyze resul...","Staffigo Technical Services, LLC\n5.0","Columbus, OH",IT Services
6159,Security Analytics Data Engineer,$55K-$112K (Glassdoor est.),Job DescriptionThe Security Analytics Data Eng...,"PDS Tech, Inc.\n3.8","Dublin, OH",Staffing & Outsourcing
6160,Security Analytics Data Engineer,$55K-$112K (Glassdoor est.),The Security Analytics Data Engineer will inte...,Data Resource Technologies\n4.0,"Dublin, OH",Accounting


In [27]:
import re
import nltk
import spacy

In [31]:
# Steps in NLP:

# Tokenizing: split text into words / sentences
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

# Stop words: filter out words that are not needed for context (in, is, and, ...)
nltk.download("stopwords")
from nltk.corpus import stopwords

# Stemming: reduce words to their root
from nltk.stem import PorterStemmer

# Tagging parts of speech: label words (noun, pronoun, verb, ...)

# Lemmatizing: reduce words to their core meaning
from nltk.stem import WordNetLemmatizer

# Chunking: identify phrases; words need to be tagged before chunking
# (relies on word_tokenize, which is already imported in the tokenizing step)

# Named entity recognition: identify named entities, e.g. locations, people, organizations, ...

[nltk_data] Downloading package punkt to /Users/kgolzev/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kgolzev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [77]:
# count words in description column

def count_words(text):
    """Return the number of tokens that word_tokenize produces for `text`."""
    return len(word_tokenize(text))

# Token count of every job description, stored as a new column.
jobs_20['desc_words'] = jobs_20['job_description'].map(count_words)

In [53]:
# count sentences

def count_sent(text):
    """Return the number of sentences that sent_tokenize finds in `text`."""
    return len(sent_tokenize(text))

# Sentence count of every job description, stored as a new column.
jobs_20['desc_sent'] = jobs_20['job_description'].map(count_sent)

In [54]:
# Preview the first rows, including the desc_words and desc_sent columns added above.
jobs_20.head()

Unnamed: 0,job_title,salary_estimate,job_description,company_name,location,industry,desc_words,desc_sent
0,"Data Analyst, Center on Immigration and Justic...",$37K-$66K (Glassdoor est.),Are you eager to roll up your sleeves and harn...,Vera Institute of Justice\n3.2,"New York, NY",Social Assistance,1162,48
1,Quality Data Analyst,$37K-$66K (Glassdoor est.),Overview\n\nProvides analytical and technical ...,Visiting Nurse Service of New York\n3.8,"New York, NY",Health Care Services & Hospitals,709,48
2,"Senior Data Analyst, Insights & Analytics Team...",$37K-$66K (Glassdoor est.),We’re looking for a Senior Data Analyst who ha...,Squarespace\n3.4,"New York, NY",Internet,802,24
3,Data Analyst,$37K-$66K (Glassdoor est.),Requisition NumberRR-0001939\nRemote:Yes\nWe c...,Celerity\n4.1,"New York, NY",IT Services,888,28
4,Reporting Data Analyst,$37K-$66K (Glassdoor est.),ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,FanDuel\n3.9,"New York, NY",Sports & Recreation,573,13


In [55]:
# Summary statistics of the numeric columns (the word and sentence counts).
jobs_20.describe()

Unnamed: 0,desc_words,desc_sent
count,6162.0,6162.0
mean,552.812074,18.711782
std,321.742581,13.699206
min,3.0,1.0
25%,320.0,9.0
50%,510.0,16.0
75%,726.0,26.0
max,3420.0,106.0


In [79]:
# remove stopwords

def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        # Build the set a single time; the function is applied to every row,
        # so rebuilding it per call repeats the same work thousands of times.
        cached = frozenset(stopwords.words("english"))
        _english_stop_words._cache = cached
    return cached


def no_stopwords(text):
    """Tokenize `text` and drop English stop words.

    Tokens are compared case-insensitively (via ``casefold``) against NLTK's
    English stop-word list, but are returned with their original casing.
    Only stop words are removed; any other token that word_tokenize emits
    (punctuation included) is kept.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    stop_words = _english_stop_words()
    return [word for word in word_tokenize(text) if word.casefold() not in stop_words]

# Token list of every job description with English stop words removed.
jobs_20['desc_cleaned'] = jobs_20['job_description'].map(no_stopwords)

In [80]:
# NOTE(review): the saved output of this cell already contains a desc_words_clean
# column (6162 in every row), although that column is only assigned in a later cell.
# This looks like leftover kernel state from an out-of-order run; confirm with
# Restart Kernel -> Run All.
jobs_20.head()

Unnamed: 0,job_title,salary_estimate,job_description,company_name,location,industry,desc_words,desc_sent,desc_cleaned,desc_words_clean
0,"Data Analyst, Center on Immigration and Justic...",$37K-$66K (Glassdoor est.),Are you eager to roll up your sleeves and harn...,Vera Institute of Justice\n3.2,"New York, NY",Social Assistance,1162,48,"[eager, roll, sleeves, harness, data, drive, p...",6162
1,Quality Data Analyst,$37K-$66K (Glassdoor est.),Overview\n\nProvides analytical and technical ...,Visiting Nurse Service of New York\n3.8,"New York, NY",Health Care Services & Hospitals,709,48,"[Overview, Provides, analytical, technical, su...",6162
2,"Senior Data Analyst, Insights & Analytics Team...",$37K-$66K (Glassdoor est.),We’re looking for a Senior Data Analyst who ha...,Squarespace\n3.4,"New York, NY",Internet,802,24,"[’, looking, Senior, Data, Analyst, love, ment...",6162
3,Data Analyst,$37K-$66K (Glassdoor est.),Requisition NumberRR-0001939\nRemote:Yes\nWe c...,Celerity\n4.1,"New York, NY",IT Services,888,28,"[Requisition, NumberRR-0001939, Remote, :, Yes...",6162
4,Reporting Data Analyst,$37K-$66K (Glassdoor est.),ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,FanDuel\n3.9,"New York, NY",Sports & Recreation,573,13,"[FANDUEL, GROUP, FanDuel, Group, world-class, ...",6162


In [73]:
# Column dtypes; as the last expression of the cell the notebook displays the result itself.
jobs_20.dtypes

job_title           object
salary_estimate     object
job_description     object
company_name        object
location            object
industry            object
desc_words           int64
desc_sent            int64
desc_cleaned        object
desc_words_clean     int64
dtype: object


In [81]:
# count words in cleaned description

def count_words_list(list):
    """Return how many items the given token list contains.

    The parameter is named `list` (shadowing the builtin inside this function);
    the name is kept so existing callers are unaffected.
    """
    n_tokens = len(list)
    return n_tokens

# Number of tokens left in each description after stop-word removal.
jobs_20['desc_words_clean'] = jobs_20['desc_cleaned'].map(count_words_list)

In [82]:
# Preview the first rows, now with the per-row desc_words_clean counts.
jobs_20.head()

Unnamed: 0,job_title,salary_estimate,job_description,company_name,location,industry,desc_words,desc_sent,desc_cleaned,desc_words_clean
0,"Data Analyst, Center on Immigration and Justic...",$37K-$66K (Glassdoor est.),Are you eager to roll up your sleeves and harn...,Vera Institute of Justice\n3.2,"New York, NY",Social Assistance,1162,48,"[eager, roll, sleeves, harness, data, drive, p...",796
1,Quality Data Analyst,$37K-$66K (Glassdoor est.),Overview\n\nProvides analytical and technical ...,Visiting Nurse Service of New York\n3.8,"New York, NY",Health Care Services & Hospitals,709,48,"[Overview, Provides, analytical, technical, su...",548
2,"Senior Data Analyst, Insights & Analytics Team...",$37K-$66K (Glassdoor est.),We’re looking for a Senior Data Analyst who ha...,Squarespace\n3.4,"New York, NY",Internet,802,24,"[’, looking, Senior, Data, Analyst, love, ment...",509
3,Data Analyst,$37K-$66K (Glassdoor est.),Requisition NumberRR-0001939\nRemote:Yes\nWe c...,Celerity\n4.1,"New York, NY",IT Services,888,28,"[Requisition, NumberRR-0001939, Remote, :, Yes...",609
4,Reporting Data Analyst,$37K-$66K (Glassdoor est.),ABOUT FANDUEL GROUP\n\nFanDuel Group is a worl...,FanDuel\n3.9,"New York, NY",Sports & Recreation,573,13,"[FANDUEL, GROUP, FanDuel, Group, world-class, ...",375


In [83]:
# Summary statistics of the numeric columns, to compare counts before and after stop-word removal.
jobs_20.describe()

Unnamed: 0,desc_words,desc_sent,desc_words_clean
count,6162.0,6162.0,6162.0
mean,552.812074,18.711782,400.83171
std,321.742581,13.699206,228.758551
min,3.0,1.0,3.0
25%,320.0,9.0,235.0
50%,510.0,16.0,371.0
75%,726.0,26.0,521.0
max,3420.0,106.0,2162.0
