# Web Scraping - Job Postings - Ver4


Web scraper to identify new words/skills/keywords through NLP - first instance, threshold, time-series.
100 jobs per city sorted by date. Or jobs posted over past month in each city sorted by date. 

In [1]:
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


In [3]:
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [4]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import json
from time import sleep, time
import lxml
import re
from urllib.parse import urljoin
from textblob import TextBlob

In [5]:
cit = input('Please, enter a city:\n')
stat = input('Please, enter a state:\n')
city = str(cit.replace(' ', '+'))
state = str(stat.replace(' ','+'))
location = city+"%2C+"+state+"&radius=50&sort=date"

print(f'Searching {city},{state}. Please wait...')

Please, enter a city:
Atlanta
Please, enter a state:
GA
Searching Atlanta,GA. Please wait...


In [6]:
data = {'title': [],
        'company': [], 
        'location': [],
        'description': [],}

In [7]:
search_url = "https://www.indeed.com/jobs?q=title%3A(%22data+scientist%22+OR+%22data+science%22+OR+%22data+analyst%22)&l="+location

In [8]:
start = time()

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    clean = re.sub(cleanr, ' ', str(raw_html))
    cleaner = clean.strip()
    cleantext = re.sub('\n',' ', cleaner)
    return cleantext

def export_table(data):
    table = pd.DataFrame(data, columns=['title', 'company', 'location', 'description'])
    table.index = table.index + 1
    table.to_csv('/users/dmauger/Flatiron/Projects/Module5-Project-DM/' +
                 'job_scrape6.csv', mode='a', encoding='utf-8', index=False)
    
    desc = pd.DataFrame(data['description'])
    desc.index = desc.index + 1
    desc.to_csv('/users/dmauger/Flatiron/Projects/Module5-Project-DM/' + 'job_desc.csv', mode='a', encoding='utf-8', index=False)
    print('Scraping done. Here are the results:')
    print(table.info())
    print(desc.info())

def job_details(job):

    r = requests.get(job)
    r.encoding = 'utf-8'
    sleep(1)

    html_content = r.text
    soup = BeautifulSoup(html_content, 'html.parser')

    try:
        title = soup.find('h3', class_='jobsearch-JobInfoHeader-title').text
    except:
        title = 'NaN'
    
    try:
        company = soup.find_all('div', class_="jobsearch-InlineCompanyRating")[-1].find_all('div')[0].text
    except:
        company = 'NaN'
        
    try:   
        location = soup.find_all('div', class_="jobsearch-InlineCompanyRating")[-1].find_all('div')[-1].text
    except:
        location = 'NaN'
        
    try:
        description = soup.find_all('div', class_="jobsearch-JobComponent-description")
    except:
        description = 'NaN'

    data['title'].append(title)
    data['company'].append(company)
    data['location'].append(location)
    data['description'].append(cleanhtml(description))
    
def extract_title(search_url):
    
    sleep(1)
    page = requests.get(search_url)
    bs = BeautifulSoup(page.content, 'html.parser')
        
    links = []
    for div in bs.find_all('div', class_="title"):
        for a in div.find_all('a',href=True):
            links.append(urljoin('https://indeed.com', a['href']))
        
    for job in links:
        job_details(job)

    next_page_text = bs.find('div', class_="pagination").find_all('a')
    next_page = [link.get('href') for link in next_page_text][-1]
                
    if '&start=50' not in next_page:
        next_page_url = (urljoin('https://indeed.com', cleanhtml(next_page)))
        print(next_page_url)
        extract_title(next_page_url)
    else:
        export_table(data)
        
end = time()
print('Time Taken: ', end - start)

Time Taken:  0.0015077590942382812


In [9]:
start = time()
extract_title(search_url)
end = time()
print('Time Taken: ', end - start)

https://indeed.com/jobs?q=title%3A%28%22data+scientist%22+OR+%22data+science%22+OR+%22data+analyst%22%29&l=Atlanta%2C+GA&radius=50&sort=date&start=10
https://indeed.com/jobs?q=title%3A%28%22data+scientist%22+OR+%22data+science%22+OR+%22data+analyst%22%29&l=Atlanta%2C+GA&radius=50&sort=date&start=20
https://indeed.com/jobs?q=title%3A%28%22data+scientist%22+OR+%22data+science%22+OR+%22data+analyst%22%29&l=Atlanta%2C+GA&radius=50&sort=date&start=30
https://indeed.com/jobs?q=title%3A%28%22data+scientist%22+OR+%22data+science%22+OR+%22data+analyst%22%29&l=Atlanta%2C+GA&radius=50&sort=date&start=40
Scraping done. Here are the results:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 1 to 72
Data columns (total 4 columns):
title          72 non-null object
company        72 non-null object
location       72 non-null object
description    72 non-null object
dtypes: object(4)
memory usage: 2.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 1 to 72
Data colum

In [10]:
df = pd.read_csv('/users/dmauger/Flatiron/Projects/Module5-Project-DM/job_scrape6.csv')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537 entries, 0 to 536
Data columns (total 4 columns):
title          537 non-null object
company        520 non-null object
location       520 non-null object
description    537 non-null object
dtypes: object(4)
memory usage: 16.9+ KB


In [10]:
with open('/users/dmauger/Flatiron/Projects/Module5-Project-DM/job_desc.csv') as f:
    data = f.read()
print(data)

description
"[     Foot Locker is seeking a Data Analyst to join the Data Team within Foot Locker’s Information Systems &amp; Technology (IS&amp;T) Department. Incumbent will be responsible for data analytics, within an agile data science team, to improve customer experience and inform business strategy.       Our global house-of-brands inspires and empowers youth culture. Relentlessly committed to fuel a shared passion for self-expression, we create unrivaled experiences at the heart of the sport and sneaker communities through the power of our people. If you want to be a part of something bigger than you can imagine, you’ve come to the right place. To learn more about the incredible impact we’re making on both our local and global communities, Click Here!    RESPONSIBILITIES     Understand business inquiries for medium to high complexity analytics and develop innovative analytic solutions, aligning both data science team and business clients on approach   Working with the product own

In [None]:
# data = df.description

In [11]:
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
data = nltk.regexp_tokenize(data, pattern)

In [12]:
stopwords_list = stopwords.words('english') + list(string.punctuation)
stopwords_list += ["''", '""', '...', '``','--']

In [None]:
lemmatizer = WordNetLemmatizer()

lemmatizer.lemmatize('analysts')


In [13]:
def process_desc(description):
    tokens = nltk.word_tokenize(description)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    return stopwords_removed    

In [14]:
processed_data = list(map(process_desc, data))

In [15]:
processed_data[:10]

[['description'],
 ['foot'],
 ['locker'],
 [],
 ['seeking'],
 [],
 ['data'],
 ['analyst'],
 [],
 ['join']]

In [16]:
total_vocab = set()
for comment in processed_data:
    total_vocab.update(comment)
len(total_vocab)

3333

In [17]:
desc_concat = []
for desc in processed_data:
    desc_concat += desc

In [18]:
desc_least = FreqDist(list(desc_concat))
desc_dict = FreqDist(dict(desc_least.most_common()[-10:]))
desc_dict


FreqDist({'utilization': 1, 'timeliness': 1, 'resource': 1, 'heads': 1, 'tends': 1, 'advise': 1, 'affordable': 1, 'liability': 1, 'chcp': 1, 'section': 1})

In [19]:
desc_freqdist = FreqDist(desc_concat)
desc_freqdist.most_common(200)

[('data', 726),
 ('experience', 363),
 ('business', 226),
 ('work', 200),
 ('skills', 154),
 ('team', 146),
 ('science', 138),
 ('analysis', 135),
 ('ability', 133),
 ('knowledge', 126),
 ('analytics', 119),
 ('required', 115),
 ('learning', 111),
 ('years', 109),
 ('management', 99),
 ('including', 99),
 ('job', 98),
 ('degree', 97),
 ('solutions', 92),
 ('development', 89),
 ('research', 83),
 ('etc', 81),
 ('machine', 81),
 ('time', 81),
 ('tools', 80),
 ('preferred', 78),
 ('amp', 77),
 ('information', 76),
 ('technical', 75),
 ('computer', 73),
 ('opportunity', 72),
 ('working', 71),
 ('new', 71),
 ('process', 70),
 ('statistical', 69),
 ('techniques', 69),
 ('use', 67),
 ('e', 67),
 ('using', 65),
 ('requirements', 64),
 ('us', 64),
 ('problems', 63),
 ('analytical', 63),
 ('reports', 63),
 ('sql', 62),
 ('insurance', 62),
 ('statistics', 61),
 ('services', 61),
 ('clients', 60),
 ('provide', 60),
 ('support', 60),
 ('environment', 60),
 ('complex', 60),
 ('quality', 59),
 ('syst

In [20]:
len(desc_freqdist)

3333

In [21]:
total_word_count = sum(desc_freqdist.values())
desc_top_50 = desc_freqdist.most_common(50)
print("Word\t\t\tNormalized Frequency")
for word in desc_top_50:
    normalized_frequency = word[1] / total_word_count
    print("{} \t\t\t {:.4}".format(word[0], normalized_frequency))

Word			Normalized Frequency
data 			 0.02924
experience 			 0.01462
business 			 0.009103
work 			 0.008056
skills 			 0.006203
team 			 0.005881
science 			 0.005559
analysis 			 0.005438
ability 			 0.005357
knowledge 			 0.005075
analytics 			 0.004793
required 			 0.004632
learning 			 0.004471
years 			 0.004391
management 			 0.003988
including 			 0.003988
job 			 0.003947
degree 			 0.003907
solutions 			 0.003706
development 			 0.003585
research 			 0.003343
etc 			 0.003263
machine 			 0.003263
time 			 0.003263
tools 			 0.003222
preferred 			 0.003142
amp 			 0.003102
information 			 0.003061
technical 			 0.003021
computer 			 0.00294
opportunity 			 0.0029
working 			 0.00286
new 			 0.00286
process 			 0.00282
statistical 			 0.002779
techniques 			 0.002779
use 			 0.002699
e 			 0.002699
using 			 0.002618
requirements 			 0.002578
us 			 0.002578
problems 			 0.002538
analytical 			 0.002538
reports 			 0.002538
sql 			 0.002497
insurance 			 0.002497
statistics 			 

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tf_idf_data_train = vectorizer.fit_transform(data)
# tf_idf_data_test = vectorizer.transform(newsgroups_test.data)

In [23]:
tf_idf_data_train.shape

(36100, 3420)

In [24]:
non_zero_cols = tf_idf_data_train.nnz / float(tf_idf_data_train.shape[0])
print("Average Number of Non-Zero Elements in Vectorized Articles: {}".format(non_zero_cols))

percent_sparse = 1 - (non_zero_cols / float(tf_idf_data_train.shape[1]))
print('Percentage of columns containing 0: {}'.format(percent_sparse))

Average Number of Non-Zero Elements in Vectorized Articles: 0.97398891966759
Percentage of columns containing 0: 0.9997152079182259


In [25]:
def count_vectorize(data, vocab=None):
    if vocab:
        unique_words = vocab
    else:
        unique_words = list(set(data))
    
    data_dict = {i:0 for i in unique_words}
    
    for word in data:
        data_dict[word] += 1
    
    return data_dict

test_vectorized = count_vectorize(desc_freqdist)
print(test_vectorized)



In [None]:
wordcloud = WordCloud().generate()

In [None]:
for word in words:
    if word not in sw:
        words_ns.append(word)

In [None]:
# driver = webdriver.Chrome(executable_path='chromedriver')
# driver.get('https://www.lazada.sg/#')


In [None]:
# description = str(df)

In [None]:
# tokens = [t for t in df.split()]
# print(tokens)

In [None]:
# from nltk.corpus import stopwords
# sr= stopwords.words('english')
# clean_tokens = tokens[:]
# for token in tokens:
#     if token in stopwords.words('english'):
        
#         clean_tokens.remove(token)
# freq = nltk.FreqDist(clean_tokens)
# for key,val in freq.items():
#     print(str(key) + ':' + str(val))
# freq.plot(20, cumulative=False);

In [None]:
# list_of_cities = {"l=New+York%2C+NY",
#                   "l=Los+Angeles%2C+CA",
#                   "l=Chicago%2C+IL",
#                   "l=Dallas%2C+TX",
#                   "l=Houston%2C+TX",
#                   "l=Washington%2C+DC",
#                   "l=Miami%2C+FL",
#                   "l=Philadelphia%2C+PA",
#                   "l=Atlanta%2C+GA",
#                   "l=Boston%2C+MA",
#                   "l=Phoenix%2C+AZ",
#                   "l=San+Francisco%2C+CA",
#                   "l=Seattle%2C+WA",
#                   "l=Denver%2C+CO",
#                   "l=Trenton%2C+NJ",
#                   "l=Austin%2C+TX",
#                   "l=Raleigh-Durham%2C+NC",
#                   "l=Charlotte%2C+NC",
#                   "l=Boulder%2C+CO",
#                   "l=San+Jose%2C+CA"}