In [1]:

# Necessary Imports for LinkedIn Scraping

In [2]:


import logging
from linkedin_jobs_scraper import LinkedinScraper
from linkedin_jobs_scraper.events import Events,EventData,EventMetrics
from linkedin_jobs_scraper.query import Query,QueryOptions,QueryFilters
from linkedin_jobs_scraper.filters import RelevanceFilters, TimeFilters,TypeFilters,ExperienceLevelFilters, RemoteFilters


# Necessary Imports for Extracting Common Words

In [3]:


import collections
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

In [4]:


logging.basicConfig(level = logging.INFO)

In [5]:


job_postings = []

In [6]:


def on_data(data: EventData):
    job_postings.append([data.job_id,data.location,data.title,data.company,data.date,data.link,data.description])

In [7]:


def on_error(error):
    print('[ON_ERROR]', error)

In [8]:


def on_end():
    print('[ON_END]')

In [9]:


chrome_driver_path = '/Users/marcosespinosa/Downloads/chromedriver'

In [10]:


scraper = LinkedinScraper(
    chrome_executable_path=chrome_driver_path,
    chrome_options=None,
    headless=True,
    max_workers=1,
    slow_mo=1.3,
    page_load_timeout=20)

INFO:li:scraper:('Using strategy AnonymousStrategy',)


In [11]:


scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

In [12]:


queries = [
    Query(
        query='Python',
        options=QueryOptions(
            locations=['United States','Tampa,FL'],
            apply_link = True,
            limit = 27,
            filters=QueryFilters(
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.MONTH,
                type=[TypeFilters.FULL_TIME],
                experience=None,
            )
        )
    ),
]

In [13]:


scraper.run(queries)


# In[27]: Create Pandas Dataframe from ON_DATE results


df = pd.DataFrame(job_postings,columns=['Job_ID','Location','Title','Company','Date','Link','Description'])

INFO:li:scraper:('Starting new query', "Query(query=Python options=QueryOptions(limit=27 locations=['United States', 'Tampa,FL'] filters=QueryFilters(relevance=RelevanceFilters.RECENT time=TimeFilters.MONTH type=[<TypeFilters.FULL_TIME: 'F'>]) optimize=False apply_link=True))")
INFO:li:scraper:('Chrome debugger url', 'http://localhost:58859')
INFO:li:scraper:('[Python][United States]', 'Opening https://www.linkedin.com/jobs/search?keywords=Python&location=United+States&sortBy=DD&f_TPR=r2592000&f_JT=F&start=0')
INFO:li:scraper:('[Python][United States]', 'Trying first selectors set')
INFO:li:scraper:('[Python][United States]', 'Trying second selectors set')
INFO:li:scraper:('[Python][United States]', 'OK')
INFO:li:scraper:('[Python][United States]', 'Starting pagination loop')
INFO:li:scraper:('[Python][United States]', 'Found 24 jobs')
INFO:li:scraper:('[Python][United States][1]', 'Processed')
INFO:li:scraper:('[Python][United States][2]', 'Processed')
INFO:li:scraper:('[Python][Unite

[ON_ERROR] Message: javascript error: Cannot read properties of null (reading 'scrollIntoView')
  (Session info: headless chrome=105.0.5195.102)

Traceback (most recent call last):
  File "/Users/marcosespinosa/opt/anaconda3/lib/python3.9/site-packages/linkedin_jobs_scraper/strategies/anonymous_strategy.py", line 267, in run
    job_id, job_link, job_title, job_company, job_place, job_date = driver.execute_script(
  File "/Users/marcosespinosa/opt/anaconda3/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py", line 634, in execute_script
    return self.execute(command, {
  File "/Users/marcosespinosa/opt/anaconda3/lib/python3.9/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "/Users/marcosespinosa/opt/anaconda3/lib/python3.9/site-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exception

INFO:li:scraper:('[Python][United States][6]', 'Processed')
ERROR:li:scraper:('[Python][United States][7]', 'Timeout on loading job details')
NoneType: None
INFO:li:scraper:('[Python][United States][7]', 'Processed')
INFO:li:scraper:('[Python][United States][8]', 'Processed')
INFO:li:scraper:('[Python][United States][9]', 'Processed')
INFO:li:scraper:('[Python][United States][10]', 'Processed')
ERROR:li:scraper:('[Python][United States][11]', 'Timeout on loading job details')
NoneType: None
ERROR:li:scraper:('[Python][United States][11]', 'Timeout on loading job details')
NoneType: None
INFO:li:scraper:('[Python][United States][11]', 'Processed')
INFO:li:scraper:('[Python][United States][12]', 'Processed')
INFO:li:scraper:('[Python][United States][13]', 'Processed')
ERROR:li:scraper:('[Python][United States][14]', 'Timeout on loading job details')
NoneType: None
ERROR:li:scraper:('[Python][United States][14]', 'Timeout on loading job details')
NoneType: None
ERROR:li:scraper:('[Python]

INFO:li:scraper:('[Python][Tampa,FL][25]', 'Processed')
ERROR:li:scraper:('[Python][Tampa,FL][26]', 'Timeout on loading job details')
NoneType: None
INFO:li:scraper:('[Python][Tampa,FL][26]', 'Processed')
INFO:li:scraper:('[Python][Tampa,FL][27]', 'Processed')


[ON_END]


In [14]:
df.head()

Unnamed: 0,Job_ID,Location,Title,Company,Date,Link,Description
0,3253034666,United States,Python Developer,"Supertek, LLC",2022-09-05,https://www.linkedin.com/jobs/view/python-deve...,Python Engineer\nJob Description\n\nJob Type: ...
1,3253051563,United States,Data Scientist,Alkymi,2022-09-05,https://www.linkedin.com/jobs/view/data-scient...,"At Alkymi, we’re on a mission to supercharge h..."
2,3245362833,United States,Python Developer,Polymer SaaS DLP,2022-08-30,https://www.linkedin.com/jobs/view/python-deve...,About Polymer\n\n\n\n\nPolymer is a No-Code Da...
3,3254820222,United States,"Software Engineer, University Graduate (Busine...",TikTok,2022-09-05,https://www.linkedin.com/jobs/view/software-en...,Responsibilities\n\nTikTok is the leading dest...
4,3242829720,United States,Python Backend Engineer,Geomagical Labs,2022-08-29,https://www.linkedin.com/jobs/view/python-back...,Geomagical Labs is crafting 3D AI experiences ...


In [18]:
wordcount ={}

In [44]:
stopwords = ['and','to','the']

In [45]:
for text in df['Description']:
    for word in text.lower().split():
        word = word.replace(".","")
        word = word.replace(",","")
        word = word.replace(":","")
        word = word.replace("\"","")
        word = word.replace("!","")
        word = word.replace("â€œ","")
        word = word.replace("â€˜","")
        word = word.replace("*","")
        if word not in stopwords:
            if word not in wordcount:
                wordcount[word] = 1
            else:
                wordcount[word] += 1

In [46]:
word_counter = collections.Counter(wordcount)

In [47]:
lst = word_counter.most_common(100)

In [48]:
lst_df = pd.DataFrame(lst, columns = ['Word','Count'])

In [49]:
lst_df

Unnamed: 0,Word,Count
0,and,884
1,of,680
2,in,582
3,a,528
4,to,516
...,...,...
95,benefits,48
96,join,48
97,able,48
98,can,48
