In [8]:
import datetime as dt
import pandas as pd

import nltk
import re
import requests
import string

from bs4 import BeautifulSoup, SoupStrainer
from nltk.tokenize import sent_tokenize # Sentence Tokenizer
from nltk.tokenize import word_tokenize # Word Tokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import FreqDist

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# 1) (optional) Scrape 100 Job Listings that contain the title "Data Scientist" from indeed.com

At a minimum, your final DataFrame of job listings should contain:
- Job Title
- Job Description

In [24]:
def query_generator(keyword, location, radius=50):
    '''Build the Indeed advanced-search URL for a keyword query around a location.

    Args:
        keyword: Search keyword(s). Spaces are replaced with '+' so that
            multi-word queries produce a URL without raw spaces.
        location: City name or postal code to search around. Spaces are
            replaced with '+' as well.
        radius: Value sent as the ``radius`` query parameter (default 50).

    Returns:
        The query URL as a string.
    '''
    # str.replace returns a new string, so the result must be re-bound;
    # calling it without assignment leaves the original text untouched.
    keyword = str(keyword).replace(' ', '+')
    location = str(location).replace(' ', '+')

    string1 = f"https://www.indeed.com/jobs?as_and={keyword}&as_phr=&as_any=&as_not=&as_ttl=&as_cmp=&jt=all&st=&"
    # The radius argument is interpolated here instead of a hard-coded 50.
    string2 = f"sr=directhire&as_src=&salary=&radius={radius}&l={location}&fromage=any&limit=10&sort=&psf=advsrch"

    return string1 + string2

# Build a sample query URL; later cells reuse `test_query` as their input.
test_query = query_generator("tensorflow", "21044")
print(test_query)

https://www.indeed.com/jobs?as_and=tensorflow&as_phr=&as_any=&as_not=&as_ttl=&as_cmp=&jt=all&st=&sr=directhire&as_src=&salary=&radius=50&l=21044&fromage=any&limit=10&sort=&psf=advsrch


In [7]:


def num_search_results(query):
    '''Fetch a search page and return the number of results it reports.

    Only the element with id "searchCount" is parsed; the last number found
    in its text is taken as the total.

    Args:
        query: Search query URL to request.

    Returns:
        The last integer in the search-count text, or 0 when that text
        contains no number (for example when the element is missing).
    '''
    only_search_count = SoupStrainer(id="searchCount")
    page = requests.get(query)
    soup = BeautifulSoup(page.text, "html.parser", parse_only=only_search_count)

    # NOTE(review): assumes the count text looks like "Page 1 of 1,234 jobs" -
    # confirm against the live markup.
    counts = []
    # split() with no argument also splits on newlines/tabs, so a number
    # followed by a line break is still recognised.
    for token in soup.get_text().split():
        # Drop thousands separators: "1,234".isdigit() is False, which would
        # otherwise make the page number win over the real total.
        digits = token.replace(',', '')
        if digits.isdigit():
            counts.append(int(digits))

    # An empty list used to raise a bare IndexError here.
    return counts[-1] if counts else 0

# Ask the site how many results the sample query has; this performs a network
# request and the value is used below to build the list of result-page URLs.
test_num_search_results = num_search_results(test_query)
# print(test_num_search_results)

def soup_generator(query, parser='html.parser'):
    '''Download the page at ``query`` and return it parsed as a BeautifulSoup object.

    ``parser`` names the BeautifulSoup parser to use and defaults to
    'html.parser'.
    '''
    response = requests.get(query)
    return BeautifulSoup(response.text, parser)

# Parse the sample query's results page (network request).
# NOTE(review): `test_soup` is not referenced again in the visible cells.
test_soup = soup_generator(test_query)




def search_page_generator(query, num_search_results, limit=50):
    '''Build the list of paginated result-page URLs for a query.

    Args:
        query: Base query URL.
        num_search_results: Number of search results corresponding to that query.
        limit: Number of results requested per page (default 50). It drives
            the ``limit`` parameter, the ``start`` offsets and the page count.

    Returns:
        A list of URLs to be scraped, one per page. At least one URL (the
        first page) is always returned, even for zero results.

    Raises:
        ValueError: If ``limit`` is not positive.
    '''
    if limit <= 0:
        raise ValueError("limit must be a positive integer")

    # Ceiling division: an exact multiple of `limit` must not produce an extra
    # page whose start offset lies past the last result.
    page_count = max(1, -(-int(num_search_results) // limit))

    return [
        f'{query}&limit={limit}&start={page_number * limit}'
        for page_number in range(page_count)
    ]

# Expand the sample query into one URL per results page.
urls_to_scrape = search_page_generator(test_query, test_num_search_results)

# Plug each of the items in urls_to_scrape into soup_generator()
def extract_job_postings(url_list):
    '''Collect the job-posting elements found on every page in ``url_list``.

    Each URL is fetched and parsed with soup_generator(); every <div> whose
    data-tn-component attribute is "organicJob" is gathered, in page order,
    into a single list of BeautifulSoup objects that is returned.
    '''
    organic_job = {'data-tn-component': 'organicJob'}

    job_postings = []
    for url in url_list:
        page_soup = soup_generator(url)
        job_postings.extend(page_soup.find_all('div', attrs=organic_job))
    return job_postings

# Scrape every results page, then pick one posting to prototype the field
# extraction on in the statements below.
jobs = extract_job_postings(urls_to_scrape)
print(len(jobs))

# Stop with an explicit message instead of a bare IndexError when nothing was
# scraped (for example because the page markup changed or the request was blocked).
if not jobs:
    raise RuntimeError("No job postings were scraped; check the query URL and the page markup.")

# Index 200 is the sample posting originally inspected; clamp it so the cell
# still runs when the query returns fewer than 201 postings.
SAMPLE_JOB_INDEX = 200
job = jobs[min(SAMPLE_JOB_INDEX, len(jobs) - 1)]


# extract all html elements we want to scrape

def extract_elements(job):
    '''Placeholder for turning a scraped job posting into a dict of its elements.

    The intended result is a dict with one entry per element (job_title,
    company, location, etc). TODO: not implemented yet - the function
    currently ignores ``job`` and returns None.
    '''
    return None

# * job_id

# The id is read from the `id` attribute of the <h2 class="jobtitle"> element.
# NOTE(review): find() returns None when no such element exists, in which case
# the subscript raises TypeError - confirm every posting has this heading.
job_id = job.find('h2', attrs={"class": "jobtitle"})['id']
print(job_id)

# * timestamp (date & time scraped - UTC)

# Naive datetime holding the current UTC time. datetime.utcnow() is deprecated
# since Python 3.12; this produces the same kind of value without the warning.
timestamp = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)

# "%-H" (hour without zero padding) is a platform-specific strftime extension
# that fails on Windows, so the hour is formatted by hand instead. This also
# avoids binding a global named `format`, which shadowed the builtin.
timestamp_formatted = f"{timestamp:%d-%m-%Y} {timestamp.hour}:{timestamp:%M}"
print(timestamp_formatted)
# * job_title

# Text of the <a data-tn-element="jobTitle"> link, whitespace-stripped.
# str.capitalize() upper-cases the first character and lower-cases the rest,
# so a title such as "Data Scientist" is printed as "Data scientist".
job_title = job.find('a', attrs={'data-tn-element':"jobTitle"}).text.strip().capitalize()

print(job_title)

# * company

# Text of the <span class="company"> element, whitespace-stripped.
company = job.find('span', class_='company').text.strip()

print(company)

# * location

# Text of the <span class="location"> element (not stripped).
location = job.find('span', class_='location').get_text()
print(location)
# * date_posting

# Raw relative posting date text from the <span class="date"> element; it is
# converted to an absolute date by format_post_date() further down.
# NOTE(review): each find() above returns None when the element is missing,
# which makes the chained attribute access raise AttributeError.
post_date = job.find('span', class_='date').get_text()


def format_post_date(post_date, timestamp):
    '''Convert a relative posting-date string into an absolute date string.

    Args:
        post_date: Relative date text scraped from a posting, such as
            "Publiée il y a 2 jours", "Aujourd'hui", "3 days ago" or
            "Just posted". Matching is case-insensitive.
        timestamp: datetime indicating when the job posting was scraped.

    Returns:
        The posting date formatted as "%d-%m-%Y" (e.g. "03-05-2018"), or the
        string "30+ days" when the text mentions "30+" or is not recognised.
    '''
    tokens = post_date.lower().split()

    # First purely numeric token, e.g. the "2" in "il y a 2 jours" / "2 days ago".
    day_count = next((int(token) for token in tokens if token.isdigit()), None)

    if any(token in ("l'instant", "aujourd'hui", "today", "just") for token in tokens):
        days_ago = 0
    elif "jour" in tokens or "day" in tokens:
        days_ago = 1
    elif ("jours" in tokens or "days" in tokens) and "30+" not in tokens:
        days_ago = day_count
    else:
        days_ago = None

    # Compare against None explicitly: 0 is a valid value (posted today) and
    # must not fall into the "30+ days" fallback.
    if days_ago is None:
        return "30+ days"

    date_scraped = timestamp.date()
    date_posted = date_scraped - dt.timedelta(days=days_ago)
    return date_posted.strftime("%d-%m-%Y")


# Turn the relative posting date scraped above into an absolute date string,
# using the scrape timestamp as the reference point.
formatted_post_date = format_post_date(post_date, timestamp)
print(formatted_post_date)

# * salary_range

# Postings without a salary element make find() return None. Bind `salary` in
# both cases so later cells never see an undefined name or a stale value left
# over from a previously inspected posting.
salary_tag = job.find('span', class_='salary no-wrap')
salary = salary_tag.text.strip() if salary_tag is not None else None
print(salary if salary is not None else 'NA')

# * job_summary

# Text of the <span class="summary"> element, whitespace-stripped.
summary = job.find('span', class_='summary').text.strip()
print(summary)

# * job_link

# The href of the link inside <h2 class="jobtitle"> is appended to the site
# root to form the posting URL.
# NOTE(review): assumes the href is a site-relative path starting with "/" - confirm.
job_link = "https://www.indeed.com" + job.find('h2', attrs={"class": "jobtitle"}).find('a')['href']
print(job_link)

## 2) Use NLTK to tokenize / clean the listings 

In [0]:
##### Your Code Here #####

# 3) Use Scikit-Learn's CountVectorizer to get word counts for each listing.

In [0]:
##### Your Code Here #####

# 4) Visualize the most common word counts

In [0]:
##### Your Code Here #####

# 5) Use Scikit-Learn's TfidfVectorizer to get a TF-IDF feature matrix

In [0]:
##### Your Code Here #####

## Stretch Goals

 - Scrape Job Listings for the job title "Data Analyst". How do these differ from Data Scientist Job Listings?
 - Try to identify the requirements for experience with specific technologies that are asked for in the job listings. How are those distributed among the job listings?
 - Use a clustering algorithm to cluster documents by their most important terms. Do the clusters reveal any common themes?
  - **Hint:** K-means might not be the best algorithm for this. Do a little bit of research to see what might be good for this. Also, remember that algorithms that depend on Euclidean distance break down with high dimensional data.