In [1]:
from splinter import Browser
from bs4 import BeautifulSoup
import nltk
import matplotlib.pyplot as plt
import os
import pymongo
import pandas as pd
import re
import time

In [None]:
# Connect to the local MongoDB server that stores the scraped job data.
# NOTE(review): the database is named after LinkedIn although this notebook
# visits Glassdoor -- confirm the name is still the intended one.
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
db = client["linkedin_db"]

In [3]:
# Launch a visible (non-headless) Chrome window through splinter and open
# the Glassdoor landing page.
# "chromedriver.exe" is given as a relative name -- presumably it has to be
# reachable from the working directory or PATH; verify on your machine.
executable_path = dict(executable_path="chromedriver.exe")
url = "https://www.glassdoor.ca/index.htm"
browser = Browser('chrome', headless=False, **executable_path)
browser.visit(url)

In [4]:
# Ask for the job title and type it into the element with id "KeywordSearch".
print("What job are you looking for?")
job = input()
job_type = browser.find_by_id("KeywordSearch")
job_type.fill(job)

# Ask for the location and type it into the element with id "LocationSearch".
# Note: `location` holds the form element here; a later cell rebinds the same
# name to a list of scraped locations.
print("Where do you want to find your job?")
job_location = input()
location = browser.find_by_id("LocationSearch")
location.fill(job_location)

# Submit the search by clicking the element with id "HeroSearchButton".
browser.find_by_id("HeroSearchButton").click()

What job are you looking for?
Data Scientist
Where do you want to find your job?
Singapore


In [None]:
# ------------- loop through pages ------------
# Parse the paging widget of the current result page and click through the
# numbered page links it contains.

html = browser.html
soup = BeautifulSoup(html, "html.parser")

# soup.find() returns None when no matching <div> exists; treat that as
# "nothing to page through" instead of failing on `.ul`.
paging = soup.find("div", class_="pagingControls")
result = paging.ul if paging is not None else None
pages = result.find_all("li") if result is not None else []

print(pages)

for page in pages:
    # Entries without an <a> are not clickable, so there is nothing to follow.
    if not page.a:
        continue

    # `page` is the <li> itself, so the marker of the "next" button has to be
    # looked up in its own class list; searching for a nested <li> (as the
    # previous version did) never matches and let the button be clicked too.
    page_classes = page.get("class") or []
    if any(css_class.lower() == "next" for css_class in page_classes):
        continue

    try:
        browser.click_link_by_href(page.a['href'])

        # --------- call scrape data function here ---------

    except Exception:
        # Narrower than a bare `except:` so the loop can still be interrupted
        # from the keyboard.
        print("This is the last page")
     

In [5]:
# Module-level accumulators, one independent empty list per scraped field.
position, exp_level, company = [], [], []
employment_type, location, job_desc = [], [], []

# ------------ Scraping data for each page ------------
def scrape(wait_seconds=3):
    """Scrape every job card on the result page currently shown in ``browser``.

    For each ``<li class="jl">`` card the title, company and location are read
    from the listing, the card's job link is clicked, and the description is
    read from the ``<div class="desc">`` of the refreshed page. One entry per
    card is appended to the module-level lists ``position``, ``company``,
    ``location`` and ``job_desc``.

    Parameters
    ----------
    wait_seconds : float, optional
        Pause between clicking a job link and reading the page HTML, so the
        description that is read belongs to the job just clicked. Defaults
        to 3.

    Returns
    -------
    None
    """
    listing_soup = BeautifulSoup(browser.html, "html.parser")

    for job in listing_soup.find_all("li", class_="jl"):
        title = job.find("div", class_="jobTitle").a.text

        # ex: "Tommy – Singapore" (company, en dash, location).
        comp_loc = job.find("div", class_="empLoc").div.text
        if "–" in comp_loc:
            # Split on the last dash only, so a dash inside the company name
            # does not break the two-way unpacking.
            comp, loc = comp_loc.rsplit("–", 1)
        else:
            # No separator at all: keep the text as the company name.
            comp, loc = comp_loc, ""

        browser.click_link_by_href(job.find("a", class_="jobLink")["href"])

        # Wait BEFORE reading the HTML: reading immediately after the click
        # can return the previously shown description.
        time.sleep(wait_seconds)

        # ------------- Scrape Job descriptions within a page -----------
        # Re-read the HTML because clicking a posting renders new content.
        detail_soup = BeautifulSoup(browser.html, "html.parser")
        desc = detail_soup.find("div", class_="desc")

        # Append all fields together so the lists stay the same length even
        # when the description block is missing.
        position.append(title)
        company.append(comp.strip())
        location.append(loc.strip())
        job_desc.append(desc.text if desc is not None else "")
        
        
# Scrape the result page currently open in the browser; the results are
# appended to the module-level lists (position, company, location, job_desc).
scrape()

In [None]:
# Duplicate check: print how many descriptions were scraped, then how many
# of them are distinct. (Original note: 30 jobs in total on each page.)
print(len(job_desc), len(set(job_desc)), sep="\n")

# To drop duplicates, round-trip through a set and back to a list:
# job_desc = list(set(job_desc))

In [None]:
# ------------- Text classification to classify technical skill words --------------
# `stopwords` must be imported here: it was only present as a commented-out
# line, so this cell raised NameError on a fresh kernel.
from nltk.corpus import stopwords

# Some words are connected with "/" (ex: "sql/database"); turn each slash
# into ", " so the tokenizer sees separate words.
job_desc = [job.replace('/', ', ') for job in job_desc]

# Each item is the list of lower-cased tokens of one job description.
tok = [nltk.word_tokenize(job.lower()) for job in job_desc]

# English stop words; read by stopword_deleter() and by later cells.
stop = stopwords.words('english')
def stopword_deleter(tokenized_job_desc, stop_words=None):
    """Flatten tokenized descriptions into one list of meaningful words.

    Backslashes are stripped from every token first; the cleaned token is
    then kept only when it is longer than two characters and is not a stop
    word. Filtering after the clean-up means tokens such as ``the\\`` or
    ``ab\\`` no longer slip through.

    Parameters
    ----------
    tokenized_job_desc : iterable of iterable of str
        One token sequence per job description.
    stop_words : iterable of str, optional
        Words to discard. Defaults to the module-level ``stop`` list.

    Returns
    -------
    list of str
        All surviving words, in their original order, in a single flat list.
    """
    if stop_words is None:
        stop_words = stop
    # A set makes the membership test constant-time per token.
    excluded = set(stop_words)

    final_word_list = []
    for tokens in tokenized_job_desc:
        for item in tokens:
            # Some words have \ at the end, remove them before filtering.
            word = item.replace("\\", "")
            if len(word) > 2 and word not in excluded:
                final_word_list.append(word)
    return final_word_list
# ------------ Lemmatize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
cleaned_list = stopword_deleter(tok)
# Every cleaned word is lemmatized as a verb (pos="v").
lemmatized_list = list(
    map(lambda token: lemmatizer.lemmatize(token, pos="v"), cleaned_list)
)
print(lemmatized_list)

In [None]:
# Taking a look at POS tags
pos_tag = nltk.pos_tag(lemmatized_list)
pos_df = pd.DataFrame(pos_tag, columns=["Word", "POS"])

# Number of words per POS tag, most common tag first. sort_values() returns
# a new frame, so the result has to be kept -- it used to be discarded.
pos_sum = pos_df.groupby("POS").count().sort_values("Word", ascending=False)

# Show the compact summary rather than the full list of tagged words.
pos_sum

In [None]:
# Keep only the (word, tag) pairs whose tag is one of NN, NNS, NNP, NNPS.
filtered_pos_tag = [
    pair for pair in pos_tag
    if pair[1] in ("NN", "NNS", "NNP", "NNPS")
]
filtered_pos_tag

In [None]:
# Count how often each (word, POS) pair occurs among the noun-tagged tokens
# and keep the 100 most common pairs.
# Note: `freq` and `most_freq_words` are reassigned by the following cell,
# which counts plain words instead of (word, POS) pairs.
freq = nltk.FreqDist(filtered_pos_tag)
most_freq_words = freq.most_common(100)


most_freq_words

In [None]:
# Find 100 most frequent words
# Counts every lemmatized word (not only nouns); the result is a list of
# (word, count) pairs that replaces any earlier `most_freq_words`.
freq = nltk.FreqDist(lemmatized_list)
most_freq_words = freq.most_common(100)


most_freq_words

In [None]:
# Tabulate the most frequent entries: one row per (word, count) pair.
df = pd.DataFrame(most_freq_words, columns=["Words", "Count"])
df.head()

In [None]:
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt


def _to_word_counts(pairs):
    """Turn (key, count) pairs into a {word: count} mapping.

    Keys are plain words when the pairs come from the lemmatized word list
    and (word, POS) tuples when they come from the noun-filtered list; in the
    latter case the counts of identical words are added up.
    """
    counts = {}
    for key, count in pairs:
        word = key if isinstance(key, str) else key[0]
        counts[word] = counts.get(word, 0) + count
    return counts


# Feed the real frequencies to the cloud. Passing str(most_freq_words) to
# generate() made the cloud re-tokenize the printed list, which discards the
# counts and gives every word the same weight.
wordcloud = WordCloud(
                          background_color='white',
                          stopwords=stop,
                          max_words=100,
                          max_font_size=50,
                          random_state=42
                         ).generate_from_frequencies(_to_word_counts(most_freq_words))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Build a bag-of-words vocabulary over the lemmatized words.
from sklearn.feature_extraction.text import CountVectorizer
import re

# max_df=0.8 is passed to drop terms by document frequency, stop_words reuses
# the NLTK list, and ngram_range=(1,2) requests unigrams and bigrams.
vect = CountVectorizer(max_df=0.8, stop_words=stop, ngram_range=(1,2))

# NOTE(review): every element of lemmatized_list is handed over as its own
# document; if those elements are single words, bigrams cannot span two of
# them -- confirm this is the intended input.
x = vect.fit_transform(lemmatized_list)
# Display the learned vocabulary terms (the keys of vocabulary_).
list(vect.vocabulary_)

In [None]:
def get_top_n_words(cleaned_corpus, n=None):
    """Return the most frequent terms of ``cleaned_corpus``.

    A default ``CountVectorizer`` is fitted on the corpus, the per-document
    counts are summed per term, and the terms are ranked by that total.

    Parameters
    ----------
    cleaned_corpus : iterable of str
        Documents to count terms in.
    n : int, optional
        Number of leading entries to return; ``None`` returns all of them.

    Returns
    -------
    list of (str, count) tuples
        Terms with their total count, highest count first.
    """
    vectorizer = CountVectorizer().fit(cleaned_corpus)
    totals = vectorizer.transform(cleaned_corpus).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Visualize
import seaborn as sns

# Compute the ranking once (it used to be computed twice, fitting the
# vectorizer two times for the same result).
top_words = get_top_n_words(lemmatized_list, n=20)
# Naming the columns at construction time also works for an empty ranking.
top_df = pd.DataFrame(top_words, columns=["Word", "Freq"])

sns.set(rc={'figure.figsize':(13,8)})
g = sns.barplot(x="Word", y="Freq", data=top_df)
g.set_title("1-gram words")
# Trailing semicolon suppresses the list of tick-label objects in the output.
g.set_xticklabels(g.get_xticklabels(), rotation=30);

In [None]:
# For 2-grams
def get_top_n2_words(corpus, n=None):
    """Return the most frequent bi-grams of ``corpus``.

    A ``CountVectorizer`` limited to 2-grams (``ngram_range=(2,2)``) and to
    2000 features is fitted on the corpus; counts are summed per bi-gram and
    ranked by that total.

    Parameters
    ----------
    corpus : iterable of str
        Documents to extract bi-grams from.
    n : int, optional
        Number of leading entries to return; ``None`` returns all of them.

    Returns
    -------
    list of (str, count) tuples
        Bi-grams with their total count, highest count first.
    """
    bigram_vectorizer = CountVectorizer(ngram_range=(2,2), max_features=2000)
    bigram_vectorizer = bigram_vectorizer.fit(corpus)
    column_totals = bigram_vectorizer.transform(corpus).sum(axis=0)

    pairs = []
    for bigram, column in bigram_vectorizer.vocabulary_.items():
        pairs.append((bigram, column_totals[0, column]))
    pairs.sort(key=lambda entry: entry[1], reverse=True)
    return pairs[:n]
# CountVectorizer builds n-grams inside each document, and lemmatized_list
# holds one word per element, so passing the list directly can never produce
# a bi-gram that spans two words. Join the words back into one running text
# and pass it as a single document instead.
top2_words = get_top_n2_words([" ".join(lemmatized_list)], n=20)
# Naming the columns at construction time also works for an empty ranking.
top2_df = pd.DataFrame(top2_words, columns=["Bi-gram", "Freq"])
top2_df