In [None]:
# Importing Libraries
import os
import time
from urllib.parse import quote, urlencode

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait as Wait

In [None]:
# Search configuration: keywords and location used to build the LinkedIn job-search URL
query = 'Mechanical Engineer'  # change to the keyword you want
location_org = 'United States' # change to the location you want

# getting the path of the chrome driver
chromedriver_path = '' # path to your chrome driver exe file (example: C:\Users\username\Downloads\chromedriver_win32\chromedriver.exe)
# LinkedIn username and password.
# They are read from the LINKEDIN_USERNAME / LINKEDIN_PASSWORD environment variables so that
# real credentials never have to be typed into (and saved with) the notebook.
# When a variable is not set the value falls back to an empty string, as before.
UserName = os.environ.get('LINKEDIN_USERNAME', '')
Password = os.environ.get('LINKEDIN_PASSWORD', '')


In [None]:
# Launch a maximized Chrome window controlled by Selenium
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")

# The `executable_path=` keyword of webdriver.Chrome was deprecated in Selenium 4 and later
# removed; the driver location is handed over through a Service object instead.
# When no path is configured, Service() is created without one so Selenium can locate the
# driver on its own rather than failing on an empty path.
service = Service(executable_path=chromedriver_path) if chromedriver_path else Service()
driver = webdriver.Chrome(service=service, options=options)

# Element lookups retry for up to 10 s; `wait` is an explicit-wait helper with a 5 s timeout.
# NOTE(review): the Selenium documentation advises against mixing implicit and explicit waits
# because the effective timeout becomes hard to predict - left unchanged here.
driver.implicitly_wait(10)
wait = Wait(driver, 5)

In [None]:
# Open the LinkedIn sign-in page and submit the configured credentials
driver.get('https://www.linkedin.com/login')

# Locate both form fields before typing anything
email_input, password_input = (
    driver.find_element(By.ID, field_id) for field_id in ('username', 'password')
)

email_input.send_keys(UserName)
# Type the password and press ENTER to submit the form
password_input.send_keys(Password)
password_input.send_keys(Keys.ENTER)

# Fixed pause after submitting - presumably to let the post-login page finish loading
time.sleep(10)

In [None]:
# Scrape the LinkedIn job-search result pages and collect the postings in `df`
# (columns: job_title, company_name, job_desc, job_id).
total_pages = 40        # number of result pages to visit (pages 1..total_pages inclusive)
results_per_page = 25   # step of the `start` URL parameter between consecutive pages

# scrolling down the page and collecting the data - these scrolling steps are based on the screen
# size and selected randomly; they are not optimized and can be changed based on the screen size
steps_scrolling = [500, 1000, 1400, 1800, 2200, 2400, 2700, 3000, 3300, 3500]

job_columns = ['job_title', 'company_name', 'job_desc', 'job_id']

# Exact `class` attribute values of the elements read from the result list
_CARD_CLASS = 'ember-view jobs-search-results__list-item occludable-update p0 relative scaffold-layout__list-item'
_TITLE_CLASS = 'full-width artdeco-entity-lockup__title ember-view'
_COMPANY_CLASS = 'job-card-container__primary-description'


def _search_url(start, job_id=None):
    """Build the job-search URL for result offset `start`, optionally focused on `job_id`.

    Query and location are percent-encoded so spaces or characters such as `&` in the
    search terms cannot corrupt the query string.
    """
    params = {}
    if job_id is not None:
        params['currentJobId'] = job_id
    params.update({'keywords': query, 'location': location_org, 'start': start})
    return 'https://www.linkedin.com/jobs/search/?' + urlencode(params, quote_via=quote)


def _text_or_none(tag):
    """Return the stripped text of a BeautifulSoup tag, or None when the tag is missing."""
    return tag.get_text().strip() if tag is not None else None


def _parse_job_cards(soup):
    """Return a list of (job_id, title, company) tuples, one per job card found in `soup`.

    Title and company are looked up inside each card, so they always belong to the job id
    they are returned with; either one is None when the card does not contain it.
    NOTE(review): assumes the title/company elements are nested inside the card's <li> -
    confirm against the live page markup.
    """
    cards = []
    for card in soup.find_all('li', {'class': _CARD_CLASS}):
        job_id = card.get('data-occludable-job-id')
        if not job_id:
            # a list item without a job id cannot be fetched or de-duplicated - skip it
            continue
        cards.append((
            job_id,
            _text_or_none(card.find('div', {'class': _TITLE_CLASS})),
            _text_or_none(card.find('span', {'class': _COMPANY_CLASS})),
        ))
    return cards


def _fetch_job_description(job_id, start):
    """Open one posting in the search view and return its description text.

    Best effort: returns None when the page cannot be loaded or the description
    element is not found.
    """
    try:
        driver.get(_search_url(start, job_id=job_id))
        time.sleep(5)  # wait 5 seconds for the details pane
        soup_job = BeautifulSoup(driver.page_source, 'html.parser')
        container = soup_job.find('div', {'class': 'jobs-search__job-details--container'})
        if container is None:
            return None
        return _text_or_none(container.find('article', class_='jobs-description__container m4'))
    except Exception:
        # `Exception` rather than a bare `except:` so Ctrl+C / kernel interrupt still stops the run
        return None


page_frames = []
try:
    # +1 so that the last page (page number == total_pages) is visited as well
    for page_num in range(1, total_pages + 1):
        print(f'page {page_num} ...')
        start = results_per_page * (page_num - 1)
        url = _search_url(start)

        # one record per job id seen on this result page
        page_records = {}

        for step in steps_scrolling:
            # reload the result list (the description fetches below navigate away from it)
            driver.get(url)
            time.sleep(10)  # wait 10 seconds for the result list
            element = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="main"]/div/div[1]/div')))
            driver.execute_script(f"arguments[0].scroll(0, {step});", element)

            time.sleep(10)  # wait 10 seconds after scrolling before reading the page
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            new_job_ids = []
            for job_id, title, company in _parse_job_cards(soup):
                record = page_records.get(job_id)
                if record is None:
                    page_records[job_id] = {'job_title': title,
                                            'company_name': company,
                                            'job_desc': None,
                                            'job_id': job_id}
                    new_job_ids.append(job_id)
                else:
                    # a card seen earlier without its text may expose it at a later scroll step
                    if record['job_title'] is None:
                        record['job_title'] = title
                    if record['company_name'] is None:
                        record['company_name'] = company

            # collect the description once per job id, the first time the id is seen
            for job_id in new_job_ids:
                page_records[job_id]['job_desc'] = _fetch_job_description(job_id, start)

        # creating a dataframe from the records of this page
        df_tmp = pd.DataFrame(list(page_records.values()), columns=job_columns)

        # dropping the duplicates based on the job title and company name; rows where both are
        # missing are kept so that their descriptions are not collapsed into a single row
        has_card_text = df_tmp[['job_title', 'company_name']].notna().any(axis=1)
        is_repeat = df_tmp.duplicated(subset=['job_title', 'company_name'])
        df_tmp = df_tmp[~(has_card_text & is_repeat)].reset_index(drop=True)

        if not df_tmp.empty:
            page_frames.append(df_tmp)
finally:
    # Build `df` even when the run is interrupted or fails part-way, so the pages
    # collected so far are not lost.
    if page_frames:
        df = pd.concat(page_frames, axis=0, ignore_index=True)
    else:
        df = pd.DataFrame(columns=job_columns)


In [None]:
# Write the collected postings to a CSV file named after the search terms (row index omitted)
csv_file_name = f'{query}_{location_org}_jobs.csv'
df.to_csv(csv_file_name, index=False)

In [None]:
def _plot_top_counts(values, n_top, xlabel):
    """Draw one bar chart with the `n_top` most frequent entries of the Series `values`."""
    # value_counts() is sorted by descending frequency, so the first n_top rows are the top ones
    top_counts = values.value_counts().iloc[:n_top]

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x=top_counts.index, y=top_counts.values, color='b', ax=ax)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.grid(True)
    ax.tick_params(axis='x', labelrotation=75)


def generate_plots(df_input, n_top = 10):
    """Plot the most frequent job titles and hiring companies as two bar charts.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame with a 'job_title' and a 'company_name' column.
    n_top : int, default 10
        Number of most frequent values shown in each chart.

    Returns
    -------
    None
        Two new matplotlib figures are created as a side effect; the global
        matplotlib font size is set to 15.
    """
    plt.rcParams.update({'font.size': 15})

    # plotting n_top number of jobs
    _plot_top_counts(df_input['job_title'], n_top, 'Job Title')

    # plotting n_top number of companies
    _plot_top_counts(df_input['company_name'], n_top, 'Hiring Institution Name')

    

def text_analyzer(df_input, other_words_excluded, colum_name ,top_N=10): 
    """Print and plot the `top_N` most frequent words of a text column.

    The column is lower-cased, tokenized with NLTK, and filtered against the English
    NLTK stop words plus the caller-supplied exclusion list before counting.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame holding the text column.
    other_words_excluded : iterable of str
        Additional tokens (words or punctuation) to leave out of the count.
    colum_name : str
        Name of the text column in `df_input`.
    top_N : int, default 10
        Number of most frequent words to report.

    Returns
    -------
    None
        The frequency table is printed and a bar chart is drawn on a new figure.

    Notes
    -----
    Uses `nltk.tokenize.word_tokenize` and `nltk.corpus.stopwords`; the corresponding
    NLTK data packages must already be available.
    """
    # Replace line breaks and tabs with spaces in a single pass, lower-case the text,
    # and join all rows into one string
    txt = (
        df_input[colum_name]
        .str.replace(r'[\n\r\t]', ' ', regex=True)
        .str.lower()
        .str.cat(sep=' ')
    )
    words = nltk.tokenize.word_tokenize(txt)

    # excluding the stop words and the extra words; a set gives constant-time
    # membership tests instead of scanning two lists for every token
    excluded = set(nltk.corpus.stopwords.words('english'))
    excluded.update(other_words_excluded)
    words_except_stop = [w for w in words if w not in excluded]

    words_except_stop_dist = nltk.FreqDist(words_except_stop)

    print('All frequencies, excluding Stopwords & Pubcuations:')
    print('=' * 60)
    rslt = pd.DataFrame(words_except_stop_dist.most_common(top_N), columns=['Word', 'Frequency'])
    print(rslt)
    print('=' * 60)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=rslt, x="Word", y="Frequency", color='b', ax=ax)
    ax.tick_params(axis='x', labelrotation=80)
    ax.set_title(f"{top_N} most frequent words used")


In [None]:
# Bar charts of the 10 most frequent job titles and hiring companies in the scraped data
generate_plots(df, n_top=10)

In [None]:
# Tokens left out of the word-frequency analysis: punctuation plus generic job-posting vocabulary
others = [
    ",", "mechanical", "engineering", "engineer", "'", "about", ":", ";", ")", "(", "’", ".",
    "work", "job", "&", "including", "years", "skills", "team", "ability",
    "development", "requirement", "requirements", "company", "required", "new",
    "knowledge", "degree", "position", "related", "experience", "projects", "products", "system",
    "working", "status", "employment", "opportunity", "opportunities", "role", "benefits",
    "test", "$", "develop", "-", "'s", "designs", "engineers", "information", "and/or", "may",
    "provide", "must", "qualifications", "industry", "environment",
    "apply", "time", "processes", "disability", "!",
]
# Report the 20 most frequent remaining words of the job descriptions
text_analyzer(
    df_input=df,
    other_words_excluded=others,
    colum_name='job_desc',
    top_N=20,
)