In [2]:
from requests_html import HTML,HTMLSession
from bs4 import BeautifulSoup as bs
import datetime
import pandas as pd
import requests
import pygsheets
import logging
import gspread
from gspread_dataframe import set_with_dataframe
from google.oauth2.service_account import Credentials
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
scopes = ['https://www.googleapis.com/auth/spreadsheets',
          'https://www.googleapis.com/auth/drive']
logger = logging.getLogger(__name__)

In [3]:
gc = "googledrive/searchconsole-364317-ee5fb100ebef.json"
credentials = Credentials.from_service_account_file(gc, scopes=scopes)
gc1 = gspread.authorize(credentials)
gauth = GoogleAuth()
drive = GoogleDrive(gauth)
# open a google sheet
gs = gc1.open('WebResults')
def write_df(df,gs,sheet_index):
    try:
        wks = gs.get_worksheet(sheet_index)
        # check if there are existing data and shape
        existing_data = get_existing_jobs(wks)
        if len(existing_data) != 0:
            df_values = df.values.tolist()
            gs.values_append('joblisting', {'valueInputOption': 'RAW'}, {'values': df_values})
        else:
            wks.set_dataframe(df,f'A1')
            set_with_dataframe(worksheet=wks, dataframe=df, include_index=False,include_column_header=True)
    except Exception as e:
        print(e)
        
def get_existing_jobs(gs,sheet_index):
    try:
        wks = gs.get_worksheet(sheet_index)
        available = [i for i in wks.col_values(4) if i != "job_id"]
        return available
    except Exception as e:
        return None

files = get_existing_jobs(gs,2)

In [175]:
class LinkedInScrap:
    def __init__(self,existing_jobs):
        self.existing_jobs = existing_jobs
    
    def check_exising_jobs(self,jobid):
        if jobid in self.existing_jobs:
            raise Exception(f"Job with ID '{jobid}' already exists,skipping...")

    @staticmethod
    def cleanupcriteria(df_json):
        result = {}
        for d in df_json:
            result.update(d)
        return result
    def external_link(self, job_link):
        """this function will return the external link if the job has been posted.


        Args:
            job_link (string): link to job to be scrapped.

        Returns:
            _type_: exact link redirect
        """
        resp = requests.get(job_link)
        soup = bs(resp.text, "html.parser")
        apply_btn_link = (
            soup.find(class_="sign-up-modal__direct-apply-on-company-site")
            .find("a")
            .get("href")
        )
        apply_btn_link = requests.get(apply_btn_link).url
        return apply_btn_link

    def searchlinkedinjobs(self, start=0):
        """this function will return the search result linkedin job search


        Args:
            start (int, optional): . Defaults to 0.

        Returns:
            response:
        """
        params = {
            "keywords": "medical services jobs",
            "location": "Kenya",
            # "geoId": "100710459",
            # "trk": "public_jobs_jobs-search-bar_search-submit",
            "start": start,
        }
        response = requests.get(
            "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search",
            params=params,
        )
        return response

    def jobdetails_info(self, job_api_details):
        
        resp = requests.get(job_api_details)
        jobdesc = bs(resp.text, "html.parser")
        job_desc_details = jobdesc.find(
            class_="show-more-less-html__markup show-more-less-html__markup--clamp-after-5"
        )
        job_desc_details = str(job_desc_details).replace('show-more-less-html__markup show-more-less-html__markup--clamp-after-5','job-desc')
        jobcriterialist = [
            {
                i.find(class_="description__job-criteria-subheader")
                .text.strip(): i.find(class_="description__job-criteria-subheader")
                .find_next_sibling()
                .text.strip()
            }
            for i in jobdesc.find(
                "ul", class_="description__job-criteria-list"
            ).find_all("li")
        ]
        jobcriterialist = LinkedInScrap.cleanupcriteria(jobcriterialist)

        out = {"jobcriterialist": jobcriterialist, "job_desc_details": job_desc_details}
        return out

    def jobdetails(self, listings):
        try:
            job_title = listings.find(class_="base-search-card__title").text.strip()
            company = listings.find(class_="base-search-card__subtitle").text.strip()
            # location     = listings.find(class_ = 'job-search-card__location').text.strip()
            job_link = listings.find(class_="base-card__full-link")
            if job_link:
                job_link = job_link.get("href")
                jobid = job_link.split("?")[0].split("-")[-1]
                self.check_exising_jobs(jobid)
            
                job_api_details = (
                    f"https://ke.linkedin.com/jobs-guest/jobs/api/jobPosting/{jobid}"
                )
                # get link to application
                print("Getting application details")
                application_link = self.external_link(job_link)
                logger.debug("Getting Job details")
                details = self.jobdetails_info(job_api_details)
            company_link = listings.find(class_="hidden-nested-link")
            if company_link:
                company_link = company_link.get("href")
            company_logo = (
                listings.find("div", class_="search-entity-media")
                .find("img")
                .get("data-delayed-url")
            )
            location = listings.find(class_="job-search-card__location").text.strip()
            posting_time = listings.time.get("datetime")
            res = {
                "job_title": job_title,
                "company": company,
                "job_link": job_link,
                "job_id": jobid,
                "application_link": application_link,
                "job_api_details": job_api_details,
                "company_logo": company_logo,
                "location": location,
                "posting_time": posting_time,
                "crawl_time": str(datetime.datetime.now()),
                # "url": response.url,
                "details": details,
            }
            
            return res
        except Exception as e:
            print(e)

    def post_data(self,res):
        try:
            post_data = {
                'companylogo'      :  res['company_logo'], 
                'companyname'      :  res['company'] ,
                'location'         :  res['location'] ,
                'postingdate'      :  res['posting_date'] ,
                'job_desc'         :  f"""{res['details']}""",
                'applicationlinks' :  res['application_link'] 
            }
            return post_data
        except Exception as e:
            return None
        
    def main(self,end):
        start = 0
        jd = []
        while True:
            logger.debug("getting links")
            response = self.searchlinkedinjobs(start)
            soup = bs(response.text, 'html.parser').find_all("li")
            logger.debug("Parsing getting links")
            for i in range(len(soup)):
                logger.debug(f'{i} of {len(soup)}')
                
                new_jd = self.jobdetails(soup[i])
                jd.append(new_jd)
            start = start + 25
            logger.debug('current start', start,end = "\r")
            if start >= end:
                break  
        return jd 

init = LinkedInScrap(existing_jobs=files)
jd = init.main(end=50)

Getting application details
'NoneType' object has no attribute 'find'
local variable 'jobid' referenced before assignment
Getting application details
Getting application details
'NoneType' object has no attribute 'find'
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
local variable 'jobid' referenced before assignment
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details
Getting application details


In [224]:
pd.json_normalize(list(filter(None,jd)))['details.jobcriterialist.Industries'].value_counts()

Hospitals and Health Care                                                      9
Government Administration                                                      8
Wellness and Fitness Services                                                  4
Financial Services                                                             3
Banking and Financial Services                                                 2
Non-profit Organization Management                                             2
Software Development                                                           2
Research Services, Biotechnology Research, and Pharmaceutical Manufacturing    1
Travel Arrangements                                                            1
Personal Care Product Manufacturing and Manufacturing                          1
Medical Equipment Manufacturing, Oil and Gas, and Hospitals and Health Care    1
International Affairs                                                          1
Biotechnology Research and P

In [206]:
import re


def cleanupindustry(label):
    if re.search("Hospitals|Health|Care", label.lower()):
        label = "Hospitalsandhealthcare"
    elif re.search("government ", label.lower()):
        label = "government"
    elif re.search("business|finance|banking ", label.lower()):
        label = "business"
    elif re.search("non-profit|non-government|ngo|private", label.lower()):
        label = "ngo"
    elif re.search("computer|technology|software", label.lower()):
        label = "ict"
    else:
        label = "Other"
    
    return label


#



['Manegarial', 'ict']

In [226]:
import re 
def cleanuplabel(label):
    label = label.lower()
    categories = {
        "sales|sale|marketing": "sales&marketing",
        "management": "manegarial",
        "engineering": "engineering",
        "ict|information|technology": "ict",
        "human resources|human|resources": "human resources",
        "purchasing and supply chain|purchasing|supply": "purchasing & supply chain",
        "finance|banking|account": "finance&accounting",
        "administrative|admin|administrator": "administrative",
        "consult": "consulting",
        "research": "research",
        "analyst": "analyst",
        "business|development": "business development",
        "data": "datascience",
        "education|training": "education",
        "design|creative": "graphicdesign",
        "health|care|provider|doctor|hospital": "medical&healthcare",
        "building|appliances|electrical|electronics|manufacturing": "building&construction"
    }
    for pattern, category in categories.items():
        if re.search(pattern, label):
            return category
    return "Other"
list(map(lambda x: x if len(x) > 0 else ['Other'],set([i for i in list(map(cleanuplabel,"Other, Information Technology, and Management".split(" ")+["Other, Information Technology, and Management"])) if i != "Other"])))

['manegarial', 'ict']

In [228]:
# labels = "Other, Information Technology, and Management".split(", ")
# result = ['Other' if len(label) == 0 else label for label in set(cleanuplabel(label) for label in "Other, Information Technology, and Management".split(", ")) if label != 'Other']
list(filter(None,jd))[0]['details']['jobcriterialist']['Job function']

'Other'

In [None]:
import logging
import gspread
from gspread_dataframe import set_with_dataframe
from google.oauth2.service_account import Credentials
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

class DriveTools:
    scopes = ['https://www.googleapis.com/auth/spreadsheets',
              'https://www.googleapis.com/auth/drive']

    def __init__(self, creds_file, workbook):
        self.creds_file = creds_file
        self.gauth = GoogleAuth()
        credentials = Credentials.from_service_account_file(self.creds_file, scopes=self.scopes)
        self.gc1 = gspread.authorize(credentials)
        self.gs = self.gc1.open(workbook)
        
       
    def get_existing_jobs(self, sheet_index):
        try:
            wks = self.gs.get_worksheet(sheet_index)
            available = [i for i in wks.col_values(4) if i != "job_id"]
            return available
        except Exception as e:
            return None
    
    def write_df(self, df, sheet_index):
        try:
            wks = self.gs.get_worksheet(sheet_index)
            # check if there are existing data and shape
            existing_data = self.get_existing_jobs(sheet_index)
            if len(existing_data) != 0:
                df_values = df.values.tolist()
                self.gs.values_append('joblisting', {'valueInputOption': 'RAW'}, {'values': df_values})
            else:
                # set_with_dataframe(wks,df, 'A1',include_index=False,)
                set_with_dataframe(worksheet=wks, dataframe=df, include_index=False,include_column_header=True, resize=True)
        except Exception as e:
            print(e)
     
gd = DriveTools(credsfile,workbook="WebResults")
# gd.write_df(df.head(1),2)

In [None]:
from googledrive.drivetools import DriveTools
from jobsites.linkedin import LinkedInScrap
from blogspot.postingengine import *


credsfile = "googledrive/searchconsole-364317-ee5fb100ebef.json"
gd = DriveTools(credsfile, workbook="WebResults")
existing_jobs = gd.get_existing_jobs(2)
jobs = LinkedInScrap(existing_jobs)
joblisting = jobs.main(end=2)
top50 = list(filter(None, joblisting))

In [None]:
top50[0]["details"]["jobcriterialist"]

In [None]:
post_ = post_body(
    company_logo=top50[0]["company_logo"],
    company=top50[0]["company"],
    location=top50[0]["location"],
    posting_time=top50[0]["posting_time"],
    details=top50[0]['details']['job_desc_details'],
    application_link=top50[0]["application_link"],
)

In [None]:
post_