In [113]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime, timedelta, timezone
import time
import pandas as pd

In [114]:
class IndeedClient:
    """Base class that owns a headless Chrome session and the scraped records.

    Subclasses call ``create_driver()`` before scraping, append their results
    to ``self.output`` and call ``close()`` when the session is finished.
    """

    def __init__(self):
        # Records accumulated by the subclass scrapers.
        self.output = []
        # Set by create_driver(); initialised here so close() is safe to call
        # even when no driver has been created yet.
        self.driver = None
        self.wait = None

    def create_driver(self):
        """Start a headless Chrome session and store it on ``self.driver``.

        Also creates ``self.wait``, a 5 second ``WebDriverWait`` bound to the
        new driver. Any session still held by this client is shut down first
        so repeated calls do not leave browsers behind.
        """
        self.close()

        chrome_options = Options()
        chrome_options.add_argument('--headless=new')
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36')

        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.implicitly_wait(5)
        self.wait = WebDriverWait(self.driver, 5)

    def get_scraped_items(self):
        """Return the list of records scraped so far (the live list, not a copy)."""
        return self.output

    def close(self):
        """Shut down the browser session, if there is one.

        ``quit()`` is used instead of ``close()``: ``close()`` only closes the
        current window, while ``quit()`` ends the whole WebDriver session.
        Calling this method more than once is harmless.
        """
        driver, self.driver = self.driver, None
        self.wait = None
        if driver is not None:
            driver.quit()

In [115]:
class IndeedJobClient(IndeedClient):
    """Scrapes job postings from sg.indeed.com search result pages.

    Call ``scrape_job_listings`` with a list of search terms; the scraped
    postings accumulate in ``self.output`` (see ``get_scraped_items``).
    """

    @staticmethod
    def _extract_shorthand(url):
        """Return the last ``/``-separated segment of ``url``."""
        return url.split("/")[-1]

    @staticmethod
    def _create_review_url(url):
        """Return ``url`` with ``/reviews`` appended."""
        return f"{url}/reviews"

    def _scrape_all_pages(self, job):
        """Scrape up to 10 result pages for ``job`` and return the postings.

        Each page is attempted exactly once. If a page fails, the error is
        printed, whatever was scraped from that page is kept, and the loop
        moves on to the next page.

        Returns:
            list[dict]: one dict per successfully scraped posting.
        """
        all_scraped_information = []
        has_next_page: bool = True
        page_number = 0

        # At most 10 pages per job; 15 postings are kept per page (18 list
        # items minus the 3 skipped slots), i.e. a maximum of 150 postings.
        while has_next_page and page_number < 10:
            print(f"scraping page {page_number + 1} of {job}")

            curr_url = self.generate_job_listing_url(job, page_number)
            page_number += 1
            collection = []

            try:
                self.driver.get(curr_url)
                job_postings_div = self.driver.find_element(By.ID, 'mosaic-provider-jobcards')
                job_postings_list = job_postings_div.find_element(By.TAG_NAME, "ul").find_elements(By.XPATH, './li')

                # A page with anything other than 18 list items is treated as
                # the last page of results.
                if len(job_postings_list) != 18:
                    has_next_page = False

                self._scrape_one_page(job, job_postings_list, collection)
            except Exception as e:
                print(e)

            # Extend outside the try so postings scraped before a failure
            # are not thrown away.
            all_scraped_information.extend(collection)

            time.sleep(5)

        print(f"scraping for {job} is completed")
        return all_scraped_information

    def _scrape_one_page(self, default_job, job_postings_list, collection):
        """Scrape every posting on one page and append the results to ``collection``.

        Postings that cannot be scraped (``_scrape_one_posting`` returned
        ``None`` or raised) are skipped so ``collection`` only holds dicts.
        """
        for index, posting in enumerate(job_postings_list):
            # Positions 5, 11 and 17 are skipped.
            # NOTE(review): presumably non-job filler items in the list - confirm.
            if index in (5, 11, 17):
                continue

            try:
                scraped = self._scrape_one_posting(default_job, posting)
            except Exception as e:
                # One broken card should not cost the rest of the page.
                print(e)
                continue

            if scraped is not None:
                collection.append(scraped)

    def _scrape_one_posting(self, default_job, posting):
        """Open one posting and return its details.

        Returns:
            dict | None: the posting details, or ``None`` when the company
            link or the job description cannot be read.
        """
        # Scroll to where the posting exists
        self.driver.execute_script("arguments[0].scrollIntoView(true);", posting)
        time.sleep(2)

        # Show the job posting information on the right side of the screen
        posting.click()
        posting_description = self.wait.until(EC.presence_of_element_located((By.ID, "jobsearch-ViewjobPaneWrapper")))

        # Company name and URL both come from the first link in the pane.
        try:
            company_link = posting_description.find_element(By.TAG_NAME, 'a')
            company_name = company_link.text
            company_url = company_link.get_attribute("href").split("?")[0]
        except Exception:
            return None

        # Job Title (falls back to the search term)
        try:
            job_title = posting_description.find_element(By.XPATH, "//h2[@data-testid='jobsearch-JobInfoHeader-title']").find_element(By.TAG_NAME, 'span').text.split('\n')[0]
        except Exception:
            job_title = default_job

        # Apply Now URL (optional)
        try:
            apply_container = posting_description.find_element(By.ID, 'applyButtonLinkContainer')
            apply_now_url = apply_container.find_element(By.TAG_NAME, 'button').get_attribute('href')
        except Exception:
            apply_now_url = None

        # Job description (required)
        try:
            description = posting_description.find_element(By.ID, 'jobDescriptionText').text
        except Exception:
            return None

        return {
            'mainJob': default_job,
            'companyName': company_name,
            'companyUrl': company_url,
            'companyReviewUrl': self._create_review_url(company_url),
            'companyShorthand': self._extract_shorthand(company_url),
            'jobTitle': job_title,
            'applyNowUrl': apply_now_url,
            'jobDescription': description,
        }

    def scrape_job_listings(self, lst_of_jobs: list[str]):
        """Scrape every search term in ``lst_of_jobs`` into ``self.output``.

        A fresh browser session is used per search term and is always shut
        down again, even when scraping that term raises. After all terms are
        processed every record in ``self.output`` gets an ``expiration_date``.
        """
        for job in lst_of_jobs:
            self.create_driver()
            try:
                self.perform_initial_job_cleanups()
                self.output.extend(self._scrape_all_pages(job))
            finally:
                self.close()

            time.sleep(5)

        self.add_expiration_date()

    def perform_initial_job_cleanups(self):
        """Load a first search page and refresh it before the real scraping starts."""
        self.driver.get(self.generate_job_listing_url('software engineer', 0))
        time.sleep(3)
        self.clear_popups()

    def clear_popups(self):
        """Refresh the current page and wait 3 seconds.

        NOTE(review): presumably the refresh dismisses first-visit popups - confirm.
        """
        self.driver.refresh()
        time.sleep(3)

    def generate_job_listing_url(self, job, page_number):
        """Build the search URL for ``job`` at the zero-based ``page_number``.

        Whitespace in ``job`` is collapsed and the term is URL-encoded, so
        characters such as ``+`` or ``&`` stay part of the query instead of
        being read as URL syntax.
        """
        from urllib.parse import quote_plus

        query = quote_plus(" ".join(job.split()))
        base_url = f'https://sg.indeed.com/jobs?q={query}&l=Singapore&radius=10&fromage=1&start={page_number * 10}'
        return base_url

    def add_expiration_date(self):
        """Set ``expiration_date`` (now + 30 days, UTC) on every record in ``self.output``."""
        # Computed once so all records of a run carry the same timestamp.
        expiration_date = datetime.now(timezone.utc) + timedelta(days=30)
        for dic in self.output:
            dic['expiration_date'] = expiration_date

In [124]:
class IndeedCompanyClient(IndeedClient):
    """Scrapes rating statistics from sg.indeed.com company review pages.

    Call ``scrape_companies_stats`` with a list of company dicts; one record
    per company is appended to ``self.output``.
    """

    def _is_correct_url(self, url):
        """Return True when ``url`` is a string containing ``sg.indeed.com``."""
        return isinstance(url, str) and "sg.indeed.com" in url

    def _is_float(self, s):
        """Return True when ``float(s)`` succeeds."""
        try:
            float(s)
            return True
        except (TypeError, ValueError):
            return False

    @staticmethod
    def _parse_count(text):
        """Convert a count such as ``"7"``, ``"1,234"`` or ``"1.2K"`` to an int."""
        cleaned = text.strip().replace(",", "")
        if "K" in cleaned:
            # round() rather than int(): float products like 1.005 * 1000 can
            # land just below the intended integer.
            return round(float(cleaned.split("K")[0]) * 1000)
        return int(cleaned)

    def _get_ratings_by_category(self, categories_block, company_info_output):
        """Store one ``company<Category>Rating`` entry per category element.

        The element text is read as ``<rating>\\n<category>``; spaces and
        ``/`` are removed from the category to build the key. A rating that is
        not a number is stored as ``None``. Elements without both lines are
        skipped.
        """
        categorical_elements = categories_block.find_elements(By.XPATH, './child::div')
        for item in categorical_elements:
            list_of_items = item.text.split("\n")
            if len(list_of_items) < 2:
                continue

            rating = list_of_items[0]
            category = list_of_items[1].replace(" ", "").replace("/", "")

            company_info_output[f"company{category}Rating"] = None if not self._is_float(rating) else float(rating)

    def _get_overall_rating(self, overall_rating_block, company_info_output):
        """Store ``companyOverallRating`` and ``companyReviewCounts``.

        The block text is read as ``<rating>\\n... Based on <count> ...``.

        Raises:
            ValueError / IndexError: when the text does not have that shape.
        """
        rating, review_total_string = overall_rating_block.text.split('\n')
        counts_string = review_total_string.split("Based on ")[1].split(" ")[0]

        company_info_output['companyOverallRating'] = float(rating)
        company_info_output['companyReviewCounts'] = int("".join(counts_string.split(",")))

    def _get_histogram_rating(self, histogram_block, company_info_output):
        """Store one ``companyTotal<N>Star`` count per histogram row.

        Each row's text is read as ``<stars>\\n<count>``.
        """
        histogram_elements = histogram_block.find_element(By.TAG_NAME, 'div').find_elements(By.XPATH, "./child::div")

        for elem in histogram_elements:
            rating_value, rating_counts = elem.text.split("\n")
            company_info_output[f"companyTotal{rating_value}Star"] = self._parse_count(rating_counts)

    def _get_company_name(self, company_info_output, company_shorthand):
        """Store ``companyName``, falling back to ``company_shorthand``."""
        try:
            company_name = self.driver.find_element(By.CSS_SELECTOR, ".css-19rjr9w.e1wnkr790").text
            company_info_output['companyName'] = company_name
        except Exception:
            company_info_output['companyName'] = company_shorthand

    def _scrape_company_stats(self, company_url, company_shorthand, output):
        """Load ``company_url`` and fill ``output`` with the name and ratings.

        Scraping is best-effort: if the rating block is missing the error is
        printed and ``output`` keeps only what was already collected. The
        three rating groups are read independently, so one unreadable group
        does not discard the others.
        """
        self.driver.get(company_url)
        time.sleep(1)
        self._get_company_name(output, company_shorthand)

        try:
            rating_block = self.driver.find_element(By.XPATH, "//div[@data-tn-component='rating-histogram']")
            overall_rating_block, histogram_block, category_ratings_block = rating_block.find_elements(By.XPATH, "./child::div")
        except Exception as e:
            print(e)
            return

        extractors = (
            (self._get_overall_rating, overall_rating_block),
            (self._get_histogram_rating, histogram_block),
            (self._get_ratings_by_category, category_ratings_block),
        )
        for extract, block in extractors:
            try:
                extract(block, output)
            except Exception as e:
                print(e)

    def scrape_companies_stats(self, company_urls: list[dict], batch_size: int = 20):
        """Scrape review statistics for every company in ``company_urls``.

        Args:
            company_urls: dicts with ``companyReviewUrl``, ``companyUrl`` and
                ``companyShorthand`` keys. Entries whose review URL is missing
                or not on sg.indeed.com are skipped.
            batch_size: number of companies handled per browser session; a
                new driver is started for each batch.

        The driver of a batch is always shut down, even when scraping raises.
        """
        for i in range(0, len(company_urls), batch_size):
            self.create_driver()

            try:
                for company_dict in company_urls[i:i + batch_size]:

                    review_url = company_dict.get('companyReviewUrl')
                    company_url = company_dict.get('companyUrl')
                    company_shorthand = company_dict.get('companyShorthand')

                    if not self._is_correct_url(review_url):
                        continue

                    company_info = {
                        'companyShorthand': company_shorthand,
                        'companyUrl': company_url,
                        'companyReviewUrl': review_url,
                    }

                    self._scrape_company_stats(review_url, company_shorthand, company_info)
                    self.output.append(company_info)
                    print(company_info)
                    time.sleep(2)
            finally:
                self.close()
            

In [63]:
import os

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# The connection string carries the database username and password, so it is
# read from the MONGODB_URI environment variable instead of being stored in
# the notebook. A missing variable fails immediately with KeyError.
uri = os.environ["MONGODB_URI"]

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi("1"))

# connecting to indeed db
db = client["indeed"]
collection = db["jobDescriptions"]

# collection.insert_many()
# TTL index: with expireAfterSeconds=0 MongoDB removes each document once the
# time stored in its own 'expiration_date' field has passed. The option name
# is plural; the singular spelling is not a TTL option.
# NOTE(review): if an index on 'expiration_date' already exists with other
# options, it may have to be dropped before this call succeeds - confirm.
collection.create_index('expiration_date', expireAfterSeconds=0)

In [92]:
# Distinct values of the 'companyReviewUrl' field in the current `collection`.
review_urls = collection.distinct('companyReviewUrl')

In [126]:
# Scrape review statistics for every company dict in `documents`; each scraped
# record is printed as it is collected.
# NOTE(review): in the cells shown here `documents` is only assigned further
# down (the aggregation cells), so this cell depends on out-of-order execution.
test_driver_company = IndeedCompanyClient()
test_driver_company.scrape_companies_stats(documents)

{'companyShorthand': 'Singhealth', 'companyUrl': 'https://sg.indeed.com/cmp/Singhealth', 'companyReviewUrl': 'https://sg.indeed.com/cmp/Singhealth/reviews', 'companyName': 'SingHealth', 'companyOverallRating': 3.4, 'companyReviewCounts': 19, 'companyTotal5Star': 2, 'companyTotal4Star': 7, 'companyTotal3Star': 7, 'companyTotal2Star': 2, 'companyTotal1Star': 1, 'companyWorkLifeBalanceRating': 3.2, 'companySalaryBenefitsRating': 3.4, 'companyJobsecurityadvancementRating': 3.0, 'companyManagementRating': 2.9, 'companyCultureRating': 2.9}
{'companyShorthand': 'Agency-For-Science,-Technology-and-Research-(a*star)', 'companyUrl': 'https://sg.indeed.com/cmp/Agency-For-Science,-Technology-and-Research-(a*star)', 'companyReviewUrl': 'https://sg.indeed.com/cmp/Agency-For-Science,-Technology-and-Research-(a*star)/reviews', 'companyName': 'Agency for Science, Technology and Research (A*STAR)', 'companyOverallRating': 3.7, 'companyReviewCounts': 81, 'companyTotal5Star': 20, 'companyTotal4Star': 30, 

In [None]:
from pymongo import UpdateOne

collection = db['companyStats']

# One upsert per scraped company, matched on 'companyShorthand': an existing
# document gets its fields overwritten, a missing one is inserted.
# '_id' is left out of the $set payload: the same dicts are also passed to
# insert_many elsewhere in this notebook, which stamps an '_id' onto them, and
# '_id' cannot be changed on an existing document.
bulk_operations = [
    UpdateOne(
        {'companyShorthand': doc['companyShorthand']},
        {'$set': {field: value for field, value in doc.items() if field != '_id'}},
        upsert=True,
    )
    for doc in test_driver_company.get_scraped_items()
]

# Execute bulk write operations. bulk_write rejects an empty request list, so
# nothing is sent (and `result` is None) when there is nothing to write.
result = collection.bulk_write(bulk_operations) if bulk_operations else None



In [None]:
# Display every record collected by `test_driver_company` so far.
test_driver_company.get_scraped_items()

In [117]:
# Fields that together identify one company in the aggregation below.
_COMPANY_KEYS = ("companyShorthand", "companyUrl", "companyReviewUrl")

# Stage 1 groups documents by the three company fields (counting members);
# stage 2 lifts those fields back to the top level and drops `_id`.
pipeline = [
    {
        "$group": {
            "_id": {key: f"${key}" for key in _COMPANY_KEYS},
            "count": {"$sum": 1},
        }
    },
    {
        "$project": {
            **{key: f"$_id.{key}" for key in _COMPANY_KEYS},
            "_id": 0,
        }
    },
]

documents = list(collection.aggregate(pipeline))

In [118]:
# Separate client instance, so its scraped records are kept apart from
# `test_driver_company`; scrapes review statistics for every entry in `documents`.
test_new_driver_company = IndeedCompanyClient()

test_new_driver_company.scrape_companies_stats(
    documents
)

Message: no such element: Unable to locate element: {"method":"xpath","selector":"//div[@data-tn-component='rating-histogram']"}
  (Session info: chrome=123.0.6312.58); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x000000010126c3e8 chromedriver + 4326376
1   chromedriver                        0x00000001012648b0 chromedriver + 4294832
2   chromedriver                        0x0000000100e90088 chromedriver + 278664
3   chromedriver                        0x0000000100ed2a80 chromedriver + 551552
4   chromedriver                        0x0000000100f0b4f8 chromedriver + 783608
5   chromedriver                        0x0000000100ec74e4 chromedriver + 505060
6   chromedriver                        0x0000000100ec7f5c chromedriver + 507740
7   chromedriver                        0x000000010122f984 chromedriver + 4077956
8   chromedriver       

In [121]:
# Number of company dicts currently held in `documents`.
print(len(documents))

80


In [119]:
# Display every record collected by `test_new_driver_company`.
test_new_driver_company.get_scraped_items()

[{'companyShorthand': 'Singhealth',
  'companyUrl': 'https://sg.indeed.com/cmp/Singhealth',
  'companyReviewUrl': 'https://sg.indeed.com/cmp/Singhealth/reviews',
  'companyName': 'SingHealth',
  'companyOverallRating': 3.4,
  'companyReviewCounts': 19,
  'companyTotal5Star': 2,
  'companyTotal4Star': 7,
  'companyTotal3Star': 7,
  'companyTotal2Star': 2,
  'companyTotal1Star': 1,
  'companyWorkLifeBalanceRating': 3.2,
  'companySalaryBenefitsRating': 3.4,
  'companyJobsecurityadvancementRating': 3.0,
  'companyManagementRating': 2.9,
  'companyCultureRating': 2.9},
 {'companyShorthand': 'Agency-For-Science,-Technology-and-Research-(a*star)',
  'companyUrl': 'https://sg.indeed.com/cmp/Agency-For-Science,-Technology-and-Research-(a*star)',
  'companyReviewUrl': 'https://sg.indeed.com/cmp/Agency-For-Science,-Technology-and-Research-(a*star)/reviews',
  'companyName': 'Agency for Science, Technology and Research (A*STAR)',
  'companyOverallRating': 3.7,
  'companyReviewCounts': 81,
  'comp

In [120]:
# Rebind `collection` to 'companyStats' and insert the scraped company records.
# NOTE(review): this inserts the records of `test_driver_company`, not of
# `test_new_driver_company` - confirm which client was intended.
# NOTE(review): a plain insert adds the records again on every run; the
# bulk-upsert cell keyed on 'companyShorthand' looks like the re-runnable
# alternative - confirm.
collection = db["companyStats"]
collection.insert_many(test_driver_company.get_scraped_items())

InsertManyResult([ObjectId('65ff1af5bd3940efc69d6edc'), ObjectId('65ff1af5bd3940efc69d6edd'), ObjectId('65ff1af5bd3940efc69d6ede'), ObjectId('65ff1af5bd3940efc69d6edf'), ObjectId('65ff1af5bd3940efc69d6ee0'), ObjectId('65ff1af5bd3940efc69d6ee1'), ObjectId('65ff1af5bd3940efc69d6ee2'), ObjectId('65ff1af5bd3940efc69d6ee3'), ObjectId('65ff1af5bd3940efc69d6ee4'), ObjectId('65ff1af5bd3940efc69d6ee5'), ObjectId('65ff1af5bd3940efc69d6ee6'), ObjectId('65ff1af5bd3940efc69d6ee7'), ObjectId('65ff1af5bd3940efc69d6ee8'), ObjectId('65ff1af5bd3940efc69d6ee9'), ObjectId('65ff1af5bd3940efc69d6eea'), ObjectId('65ff1af5bd3940efc69d6eeb'), ObjectId('65ff1af5bd3940efc69d6eec'), ObjectId('65ff1af5bd3940efc69d6eed'), ObjectId('65ff1af5bd3940efc69d6eee'), ObjectId('65ff1af5bd3940efc69d6eef'), ObjectId('65ff1af5bd3940efc69d6ef0'), ObjectId('65ff1af5bd3940efc69d6ef1'), ObjectId('65ff1af5bd3940efc69d6ef2'), ObjectId('65ff1af5bd3940efc69d6ef3'), ObjectId('65ff1af5bd3940efc69d6ef4'), ObjectId('65ff1af5bd3940efc69d6e

In [50]:
import os

from pymongo import UpdateOne
from datetime import datetime, timezone, timedelta
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# The connection string carries the database username and password, so it is
# read from the MONGODB_URI environment variable instead of being stored in
# the notebook. A missing variable fails immediately with KeyError.
uri = os.environ["MONGODB_URI"]

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi("1"))
db = client['indeed']

collection = db["jobDescriptions"]

date_scraped = datetime.now(timezone.utc)
time_filter = date_scraped + timedelta(days=29)

print(time_filter)

# Select the job documents with one specific expiration timestamp, keep the
# first document per 'companyUrl', and reduce it to the three company fields.
# NOTE(review): `time_filter` is computed and printed above but the $match
# uses a hard-coded timestamp instead - presumably the filter was meant to be
# derived from `time_filter`; confirm before relying on this cell.
pipeline = [
    {"$match": {"expiration_date": datetime(2024, 4, 22, 17, 40, 1, 239000)}},
    {"$group": {"_id": "$companyUrl", "document": {"$first": "$$ROOT"}}},
    {"$replaceRoot": {"newRoot": "$document"}},
    {
        "$project": {
            "_id": 0,
            "companyShorthand": "$companyShorthand",
            "companyUrl": "$companyUrl",
            "companyReviewUrl": "$companyReviewUrl",
        }
    },
]

documents = list(collection.aggregate(pipeline))
documents

2024-04-22 07:46:14.374642+00:00


[{'companyShorthand': 'Nodeflair',
  'companyUrl': 'https://sg.indeed.com/cmp/Nodeflair',
  'companyReviewUrl': 'https://sg.indeed.com/cmp/Nodeflair/reviews'},
 {'companyShorthand': 'Combuilder-Pte-Ltd',
  'companyUrl': 'https://sg.indeed.com/cmp/Combuilder-Pte-Ltd',
  'companyReviewUrl': 'https://sg.indeed.com/cmp/Combuilder-Pte-Ltd/reviews'},
 {'companyShorthand': 'Shopee',
  'companyUrl': 'https://sg.indeed.com/cmp/Shopee',
  'companyReviewUrl': 'https://sg.indeed.com/cmp/Shopee/reviews'},
 {'companyShorthand': 'JPMorgan-Chase-&-Co-7555c073',
  'companyUrl': 'https://sg.indeed.com/cmp/JPMorgan-Chase-&-Co-7555c073',
  'companyReviewUrl': 'https://sg.indeed.com/cmp/JPMorgan-Chase-&-Co-7555c073/reviews'},
 {'companyShorthand': 'Apple',
  'companyUrl': 'https://sg.indeed.com/cmp/Apple',
  'companyReviewUrl': 'https://sg.indeed.com/cmp/Apple/reviews'},
 {'companyShorthand': 'Nodeflair',
  'companyUrl': 'https://sg.indeed.com/cmp/Nodeflair',
  'companyReviewUrl': 'https://sg.indeed.com/cm

In [17]:
# Scratch check: today's date (no timezone passed to now()) plus 30 days.
datetime.now().date() + timedelta(days=30)

datetime.date(2024, 4, 23)

In [20]:
# Scratch check: today's date as a 'YYYY-MM-DD' string.
datetime.now().date().isoformat()

'2024-03-24'

In [40]:
# Query every document in `collection`, projecting only 'expiration_date'
# ('_id' is excluded). This returns a cursor.
results = collection.find({}, {"_id": 0, 'expiration_date': 1})

In [41]:
# Displays the cursor object itself, not the documents it yields.
results

<pymongo.cursor.Cursor at 0x1093b8dc0>

In [42]:
# Materialise the cursor into a list to see the projected documents.
list(results)

[{'expiration_date': datetime.datetime(2024, 4, 22, 17, 40, 1, 239000)},
 {'expiration_date': datetime.datetime(2024, 4, 22, 17, 40, 1, 239000)},
 {'expiration_date': datetime.datetime(2024, 4, 22, 17, 40, 1, 239000)},
 {'expiration_date': datetime.datetime(2024, 4, 22, 17, 40, 1, 239000)},
 {'expiration_date': datetime.datetime(2024, 4, 22, 17, 40, 1, 239000)},
 {'expiration_date': datetime.datetime(2024, 4, 22, 17, 40, 1, 239000)},
 {'expiration_date': datetime.datetime(2024, 4, 22, 17, 40, 1, 239000)},
 {'expiration_date': datetime.datetime(2024, 4, 22, 17, 40, 1, 239000)},
 {'expiration_date': datetime.datetime(2024, 4, 22, 17, 40, 1, 239000)},
 {'expiration_date': datetime.datetime(2024, 4, 22, 17, 40, 1, 239000)},
 {'expiration_date': datetime.datetime(2024, 4, 22, 17, 40, 1, 239000)},
 {'expiration_date': datetime.datetime(2024, 4, 22, 17, 40, 1, 239000)},
 {'expiration_date': datetime.datetime(2024, 4, 22, 17, 40, 1, 239000)},
 {'expiration_date': datetime.datetime(2024, 4, 22,

In [43]:
# Fetch all documents whose 'expiration_date' equals this exact timestamp
# (the value shown in the listing above) and display them in full.
query = {'expiration_date': datetime(2024, 4, 22, 17, 40, 1, 239000)}
results = collection.find(query)
list(results)

[{'_id': ObjectId('65ff100abd3940efc69d6e38'),
  'mainJob': 'cloud engineer',
  'companyName': 'NodeFlair',
  'companyUrl': 'https://sg.indeed.com/cmp/Nodeflair',
  'companyReviewUrl': 'https://sg.indeed.com/cmp/Nodeflair/reviews',
  'companyShorthand': 'Nodeflair',
  'jobTitle': 'Cloud Engineer',
  'applyNowUrl': 'https://sg.indeed.com/applystart?jk=3c1f447fc7391df6&from=vj&pos=bottom&mvj=0&jobsearchTk=1hpm4p950jqkk80l&spon=0&xkcb=SoCN67M3D2Bq_IRgxp0KbzkdCdPP&xfps=20a3fa05-22cf-498f-93a8-2d58e8c6d57a&sjdu=b7vh_jdkj9G0GZidtKO9loDDplm6SwWjHcxyoDIphKlAmwB258hAhRWDZcDfNfj-JHRWqlf_tKKQjTgk0_LdE3LJgYxexivXnr43_lGgVs4NV0mZKVmHQDdnFKYuOv83DNtKXk3XbvHjG-WGOGb0rzXiSprGlgQ7pe4pnH0LNzgBJbC3wZ1uNll9AhkGI2p1w9_XINGINv7OnqX9-Wv1cg&vjfrom=vjs&astse=20c6155037c090a0&assa=2556',
  'jobDescription': "Job Summary\n\nSalary\nS$7,500 - S$9,000 / Monthly\n\nJob Type\n\nSeniority\nSenior\n\nYears of Experience\nAt least 6 years\n\nTech Stacks\nPowershell AWS Terraform Windows Server Vault IIS Puppet Azure An