In [None]:
"""
Author: Naresh Kumar Chava
Date: 02/10/2021

Desc: Scrape LinkedIn hiring posts that match specific job keywords and output a file containing:

1. Link to post
2. Post Description
3. Link to Job (if any)
4. Link to Author's profile
5. Author's location (for filtering purposes)


"""

In [4]:
#Import LI scraping custom utility
from liUtils import *


#Web interactions
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

#Web scraping
import bs4
#Username & password prompts
import getpass
#Clipboard operations
import pyperclip

import time 
import pandas as pd


In [5]:
#Open a new Chrome window controlled by Selenium.
#The path is a raw string: it contains backslashes ("\S", "\c") that are invalid
#escape sequences in a normal string literal and produce a warning on newer Pythons.
#NOTE(review): passing the driver path positionally relies on the older Selenium
#constructor signature; newer Selenium releases expect a Service object - confirm
#against the installed Selenium version.
browser=webdriver.Chrome(r"C:\Storage\Softwares\chromedriver.exe")

#Prompt for the credentials without echoing them, so they never end up in the
#notebook source or its saved output
username=getpass.getpass(prompt='Linkedin User Name: ')
password=getpass.getpass(prompt='Linkedin Password: ')

#Log into Linkedin (helper provided by the liUtils star-import)
linkedin_login(browser,username,password)

In [None]:
# Keyword(s) typed into the LinkedIn search box
search_term = "senior data scientist"

# Recency window applied to the search results - change as required.
# Accepted values: 'past-24h', 'past-week', 'past-month'
date_filter = "past-week"

# Excel workbook the scraped posts are written to
out_file = f"LI_posts_{search_term}_{date_filter}_.xlsx"

In [None]:
#Switch to the first window in case multiple windows are open
browser.switch_to.window(browser.window_handles[0])

#Open the search box. The "start a search" button is not always present
#(presumably only when the search bar is collapsed - TODO confirm), so a failure
#here is tolerated; the explicit wait below decides whether the input is usable.
try:
    browser.find_elements(By.XPATH, "//button[@aria-label='Click to start a search']")[0].click()
except Exception:
    pass
time.sleep(1)
search = WebDriverWait(browser, 20).until(
    EC.element_to_be_clickable((By.XPATH, "//input[@aria-label='Search']")))
#Clear whatever is in the search box before typing the new term
search.send_keys(Keys.CONTROL + "a")
search.send_keys(Keys.DELETE)
search.send_keys(search_term)
search.send_keys(Keys.RETURN)
#Wait 2 seconds for the page to load
time.sleep(2)

#Click on the Posts filter; retry once after a short pause if the first attempt fails
posts_xpath="//button[@aria-label='Posts']"
try:
    browser.find_elements(By.XPATH, posts_xpath)[0].send_keys(Keys.RETURN)
except Exception:
    time.sleep(2)
    browser.find_elements(By.XPATH, posts_xpath)[0].send_keys(Keys.RETURN)
time.sleep(3)

# browser.switch_to.window(browser.window_handles[-1])

#Open the "Date posted" filter and select the option given by date_filter
browser.find_elements(By.XPATH, "//button[@aria-label='Date posted filter. Clicking this button displays all Date posted filter options.']")[0].click()
date_options=browser.find_elements(By.XPATH, f"//input[@type='radio' and @value='{date_filter}']")
if not date_options:
    #Fail with a readable message instead of an IndexError on an unknown filter value
    raise ValueError(f"No 'Date posted' option found for date_filter={date_filter!r}")
filterElement=date_options[0]
#Click through JavaScript rather than a native click on the radio input
browser.execute_script("arguments[0].click();", filterElement)

#Click apply button: look up its id in the parsed page, then click it by that id
src = browser.page_source
soup = bs4.BeautifulSoup(src, 'lxml')
filter_button_id=soup.find("div",{"id":'hoverable-outlet-date-posted-filter-value'})\
    .find("button",{'aria-label':'Apply current filter to show results'}).get('id')
browser.find_elements(By.XPATH, f"//button[@id='{filter_button_id}']")[0].click()

time.sleep(2)

#Get number of pages (scroll to the bottom first so the pagination bar is in the page source)
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(1)
pagination_src = browser.page_source
pagination_soup = bs4.BeautifulSoup(pagination_src, 'lxml')
page_buttons=pagination_soup.find_all("li",{'class':'artdeco-pagination__indicator artdeco-pagination__indicator--number ember-view'})
if page_buttons:
    #The last numbered button carries the highest page number in its aria-label
    n_pages=int(page_buttons[-1].find('button')["aria-label"].replace('Page','').strip())
else:
    #No numbered pagination buttons found (e.g. results fit on one page):
    #scrape the current page only instead of failing with an IndexError
    n_pages=1
# n_pages=1

header=['Name','Title','Location','Profile_link','Post_link','Post_content','Job_link']
#Number of leading columns filled from the author profile (everything before Post_link)
n_profile_cols=header.index('Post_link')
output=[header]
#Zoom out to fit all posts
# browser.execute_script("document.body.style.zoom='70%'")
for page in range(1,n_pages+1):
    if page!=1:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        #Click on next page
        browser.find_elements(By.XPATH, f"//button[@aria-label='Page {page}']")[0].send_keys(Keys.RETURN)
        time.sleep(2)
    #Get new page source code
    src = browser.page_source
    soup = bs4.BeautifulSoup(src, 'lxml')

    #Posts
    post_containers=soup.find_all("li",{"class":'reusable-search__result-container artdeco-card search-results__hide-divider mb2'})
    #Iterate over post containers and extract required info
    browser.execute_script("window.scrollTo(0, -document.body.scrollHeight);")
    scroll=0.25
    for post_container in post_containers:
        #Post link: last anchor of the container, else fall back to the liUtils helper
        try:
            lnk=post_container.find_all("a", href=True)[-1]['href']
        except Exception:
            lnk=post_link(browser,post_container)

        #Post content; a failed fetch leaves it empty, which makes the post get skipped below
        try:
            content=post_content(lnk)
        except Exception:
            content=[]

        #Match the hiring keywords case-insensitively so "Hiring"/"Looking" also count
        post_text=content[0] if len(content)>0 else ''
        if isinstance(post_text,str):
            post_text=post_text.lower()
        hiring_flag=any(key in post_text for key in ['hiring','looking'])

        #Author details: first anchor of the container; a post without one is skipped
        author_anchor=post_container.find("a", href=True)
        authour_ln_link=author_anchor['href'] if author_anchor is not None else ''
        person_profile=bool(authour_ln_link) and 'company' not in authour_ln_link

        #Skip if posted by a company page
        if person_profile and hiring_flag:
            try:
                profile=author_profile(authour_ln_link)
            except Exception:
                #Keep the row aligned with the header when the profile lookup fails
                profile=[None]*n_profile_cols
            row=profile+[lnk]+content
            #Pad short rows so every value stays under its own column
            row+=[None]*(len(header)-len(row))
            output.append(row)
        #Increase scroll
        browser.execute_script(f"window.scrollTo(0, document.body.scrollHeight*{scroll});")
        scroll=scroll+0.25

table=output[1:]
out_df=pd.DataFrame(table,columns=header)
#Close browser
browser.close()
#Write output to an excel file
out_df.to_excel(out_file)

In [None]:
#Other browser operations (optional; uncomment as needed)
# browser.execute_script("document.body.style.zoom='80%'")
# browser.maximize_window()
# browser.set_window_size(1500, 1800)
# browser.switch_to.window(browser.window_handles[0])
