diff --git a/.github/workflows/flake8.yaml b/.github/workflows/flake8.yaml
new file mode 100644
index 0000000..8eec5fc
--- /dev/null
+++ b/.github/workflows/flake8.yaml
@@ -0,0 +1,33 @@
+name: Flake8 Lint
+
+on:
+  push:
+    branches:
+      - main
+      - beta
+  pull_request:
+    branches:
+      - main
+      - beta
+
+jobs:
+  flake8-lint:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v3
+
+      - name: Set up Python environment
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.9.16'
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install flake8
+
+      - name: Run flake8
+        run: flake8 .
diff --git a/worker/main.py b/worker/main.py
index 316c1ea..3c51a2b 100644
--- a/worker/main.py
+++ b/worker/main.py
@@ -1,139 +1,15 @@
 import asyncio
 import random
-from playwright.async_api import async_playwright
-import pytz
-import loguru
-
+from scraper import linkedin
 import helpers
-import xpaths
-import connection
-import constants
-
-
-async def scrape_linkedin(
-    worker_id: int, info=None,
-    only_popular=False, headless=False, *args, **kwargs
-):
-    """
-    Scrape LinkedIn job postings for different countries.
-
-    :param worker_id: ID of the worker executing the scraping.
-    :param info: Cached info, if you wish to repeat the process.
-    :param only_popular: Only use popular countries.
-    """
-    try:
-        async with async_playwright() as driver:
-            if info is None:
-                info = helpers.get_country_and_job(only_popular)
-
-            loguru.logger.info(
-                f"[WORKER {worker_id}] This round is: {info}"
-            )
-            country, job, job_mode = info
-            browser = await driver.firefox.launch(
-                headless=headless,
-                args=[
-                    '--start-maximized',
-                    '--foreground',
-                    '--disable-backgrounding-occluded-windows'
-                ],
-                firefox_user_prefs=constants.FIREFOX_SETTINGS
-            )
-
-            timezone_id = random.choice(pytz.all_timezones)
-            context = await browser.new_context(
-                timezone_id=timezone_id,
-                accept_downloads=True,
-                is_mobile=False,
-                has_touch=False,
-                proxy=helpers.get_random_proxy()
-            )
-
-            page = await context.new_page()
-            await page.bring_to_front()
-            await page.set_viewport_size(
-                {
-                    "width": 1920,
-                    "height": 1080
-                }
-            )
-
-            await page.add_init_script(
-                constants.SPOOF_FINGERPRINT % helpers.generate_device_specs()
-            )
-            await page.goto(helpers.get_url(
-                location=country, job=job, mode=job_mode
-            ))
-
-            if await helpers.does_element_exists(page, xpaths.NEED_LOGIN):
-                loguru.logger.info(f"[WORKER {worker_id}] Login Required!")
-                return await scrape_linkedin(
-                    worker_id=worker_id, only_popular=only_popular,
-                    headless=headless, info=info
-                )
-
-            all_ads = await page.locator(xpaths.JOB_LI).all()
-            loguru.logger.info(
-                f"[WORKER {worker_id}] Found {len(all_ads)} Advertisements"
-            )
-            exists = 0
-            for index, div in enumerate(all_ads):
-                await asyncio.sleep(2)
-                if index == 100 or exists == 7:
-                    break
-                await div.click(
-                    timeout=5000
-                )
-                title_a_tag = page.locator(xpaths.JOB_ID_A_TAG)
-                ads_id = await title_a_tag.get_attribute('href')
-                ads_id = ads_id.split("?refId")[0].split("-")[-1]
-                if not helpers.does_ads_exists(ads_id):
-                    company_name = await helpers.safe_get_element_text(
-                        page, xpaths.COMPANY_NAME, timeout=5000
-                    )
-                    location = await helpers.safe_get_element_text(
-                        page, xpaths.LOCATION, timeout=5000
-                    )
-                    title = await helpers.safe_get_element_text(
-                        page, xpaths.TITLE, timeout=5000
-                    )
-                    await page.locator(xpaths.SHOW_MORE).click(timeout=5000)
-                    info = await helpers.get_element_text(
-                        page, xpaths.BODY_INFO, False, timeout=5000
-                    )
-                    await connection.create_ads(
-                        ads_id, location, info.strip(), company_name,
-                        title, 1, employement_type="", level="",
-                        country=country, job_mode=job_mode
-                    )
-                    loguru.logger.info(
-                        f"[WORKER {worker_id}] Finished {ads_id}"
-                    )
-
-                else:
-                    loguru.logger.info(
-                        f"[WORKER {worker_id}] {ads_id} Already exists"
-                    )
-                    exists += 1
-
-            return
-    except helpers.PlayWrightTimeOutError:
-        pass
-    except Exception as e:
-        loguru.logger.error(e)
-    finally:
-        return await scrape_linkedin(
-            worker_id=worker_id, only_popular=only_popular,
-            headless=headless
-        )
 
 
 async def run_scrapers(workers: int = 1, only_popular=False, headless=True):
     while True:
         tasks = []
         for i in range(workers):
-            tasks.append(asyncio.create_task(scrape_linkedin(
+            tasks.append(asyncio.create_task(linkedin.scrape_linkedin(
                 worker_id=i+1, only_popular=only_popular, headless=headless
             )))
             await asyncio.sleep(random.randint(1, 3))  # Overhead of browsers
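Review note on the new `run_scrapers` hunk: nothing shown here ever awaits the `tasks` batch, so if no `asyncio.gather` follows below the hunk, each pass of the `while True` loop stacks a fresh set of workers on top of the last. A minimal sketch of one way to close that loop; the `gather` call is my assumption, not code from this PR:

```python
# Hypothetical sketch (not part of the diff): await each batch before the
# next iteration so tasks don't accumulate unboundedly across loop passes.
import asyncio
import random

from scraper import linkedin


async def run_scrapers(workers: int = 1, only_popular=False, headless=True):
    while True:
        tasks = []
        for i in range(workers):
            tasks.append(asyncio.create_task(linkedin.scrape_linkedin(
                worker_id=i + 1, only_popular=only_popular, headless=headless
            )))
            await asyncio.sleep(random.randint(1, 3))  # Stagger browser startup
        # Wait for the whole batch before spawning the next one.
        await asyncio.gather(*tasks, return_exceptions=True)
```

`return_exceptions=True` keeps a single crashed worker from cancelling its siblings mid-batch.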
diff --git a/worker/scraper/__init__.py b/worker/scraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/worker/scraper/instagram.py b/worker/scraper/instagram.py
new file mode 100644
index 0000000..50a4b65
--- /dev/null
+++ b/worker/scraper/instagram.py
@@ -0,0 +1,133 @@
+
+from playwright.async_api import async_playwright
+import helpers
+import loguru
+import constants
+import random
+import pytz
+import asyncio
+
+
+async def scrape_instagram(
+    worker_id: int, info=None,
+    only_popular=False, headless=False, *args, **kwargs
+):
+    """
+    Prototype Instagram scraper (work in progress).
+
+    :param worker_id: ID of the worker executing the scraping.
+    :param info: Cached info, if you wish to repeat the process.
+    :param only_popular: Only use popular countries.
+    """
+    try:
+        async with async_playwright() as driver:
+            if info is None:
+                info = helpers.get_country_and_job(only_popular)
+
+            loguru.logger.info(
+                f"[WORKER {worker_id}] This round is: {info}"
+            )
+            country, job, job_mode = info
+            browser = await driver.chromium.launch(
+                headless=headless,
+                args=[
+                    '--start-maximized',
+                    '--foreground',
+                    '--disable-backgrounding-occluded-windows'
+                ],
+                # firefox_user_prefs=constants.FIREFOX_SETTINGS
+            )
+
+            timezone_id = random.choice(pytz.all_timezones)
+            context = await browser.new_context(
+                timezone_id=timezone_id,
+                accept_downloads=True,
+                is_mobile=False,
+                has_touch=False,
+            )
+
+            page = await context.new_page()
+            await page.bring_to_front()
+            await page.set_viewport_size(
+                {
+                    "width": 1920,
+                    "height": 1080
+                }
+            )
+
+            await page.add_init_script(
+                constants.SPOOF_FINGERPRINT % helpers.generate_device_specs()
+            )
+            await page.goto("https://www.instagram.com/mozaffar__m42")
+            input("")  # NOTE: blocks the event loop; dev-only manual pause
+
+            # if await helpers.does_element_exists(page, xpaths.NEED_LOGIN):
+            #     loguru.logger.info(f"[WORKER {worker_id}] Login Required!")
+            #     return await scrape_linkedin(
+            #         worker_id=worker_id, only_popular=only_popular,
+            #         headless=headless, info=info
+            #     )
+
+            # all_ads = await page.locator(xpaths.JOB_LI).all()
+            # loguru.logger.info(
+            #     f"[WORKER {worker_id}] Found {len(all_ads)} Advertisements"
+            # )
+            # exists = 0
+            # for index, div in enumerate(all_ads):
+            #     await asyncio.sleep(2)
+            #     if index == 100 or exists == 7:
+            #         break
+            #     await div.click(
+            #         timeout=5000
+            #     )
+            #     title_a_tag = page.locator(xpaths.JOB_ID_A_TAG)
+            #     ads_id = await title_a_tag.get_attribute('href')
+            #     ads_id = ads_id.split("?refId")[0].split("-")[-1]
+            #     if not helpers.does_ads_exists(ads_id):
+            #         company_name = await helpers.safe_get_element_text(
+            #             page, xpaths.COMPANY_NAME, timeout=5000
+            #         )
+            #         location = await helpers.safe_get_element_text(
+            #             page, xpaths.LOCATION, timeout=5000
+            #         )
+            #         title = await helpers.safe_get_element_text(
+            #             page, xpaths.TITLE, timeout=5000
+            #         )
+            #         await page.locator(xpaths.SHOW_MORE).click(timeout=5000)
+            #         info = await helpers.get_element_text(
+            #             page, xpaths.BODY_INFO, False, timeout=5000
+            #         )
+            #         await connection.create_ads(
+            #             ads_id, location, info.strip(), company_name,
+            #             title, 1, employement_type="", level="",
+            #             country=country, job_mode=job_mode
+            #         )
+            #         loguru.logger.info(
+            #             f"[WORKER {worker_id}] Finished {ads_id}"
+            #         )
+
+            #     else:
+            #         loguru.logger.info(
+            #             f"[WORKER {worker_id}] {ads_id} Already exists"
+            #         )
+            #         exists += 1
+
+            return
+    except helpers.PlayWrightTimeOutError:
+        pass
+    except Exception as e:
+        loguru.logger.error(e)
+    finally:
+        return await scrape_instagram(
+            worker_id=worker_id, only_popular=only_popular,
+            headless=headless
+        )
+
+
+async def run_scrapers():
+    await scrape_instagram(worker_id=1, only_popular=False, headless=False)
+
+
+if __name__ == "__main__":
+    args = helpers.parse_arguments()
+    asyncio.run(run_scrapers())
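Review note on the `input("")` call above: it runs synchronously inside a coroutine, so it freezes the entire event loop rather than just this worker. That is acceptable for a one-off manual inspection, but a thread-backed pause stays out of the loop's way. A hedged sketch (the helper name is hypothetical; `asyncio.to_thread` needs Python 3.9+, which the workflow's 3.9.16 pin satisfies):

```python
import asyncio


async def wait_for_operator(prompt: str = "Press Enter to continue... ") -> None:
    # input() is synchronous; running it in a worker thread lets the event
    # loop keep servicing any other scraper tasks in the meantime.
    await asyncio.to_thread(input, prompt)
```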
diff --git a/worker/scraper/linkedin.py b/worker/scraper/linkedin.py
new file mode 100644
index 0000000..df215f5
--- /dev/null
+++ b/worker/scraper/linkedin.py
@@ -0,0 +1,129 @@
+import asyncio
+import random
+
+from playwright.async_api import async_playwright
+import pytz
+import loguru
+
+import helpers
+import xpaths
+import connection
+import constants
+
+
+async def scrape_linkedin(
+    worker_id: int, info=None,
+    only_popular=False, headless=False, *args, **kwargs
+):
+    """
+    Scrape LinkedIn job postings for different countries.
+
+    :param worker_id: ID of the worker executing the scraping.
+    :param info: Cached info, if you wish to repeat the process.
+    :param only_popular: Only use popular countries.
+    """
+    try:
+        async with async_playwright() as driver:
+            if info is None:
+                info = helpers.get_country_and_job(only_popular)
+
+            loguru.logger.info(
+                f"[WORKER {worker_id}] This round is: {info}"
+            )
+            country, job, job_mode = info
+            browser = await driver.firefox.launch(
+                headless=headless,
+                args=[
+                    '--start-maximized',
+                    '--foreground',
+                    '--disable-backgrounding-occluded-windows'
+                ],
+                firefox_user_prefs=constants.FIREFOX_SETTINGS
+            )
+
+            timezone_id = random.choice(pytz.all_timezones)
+            context = await browser.new_context(
+                timezone_id=timezone_id,
+                accept_downloads=True,
+                is_mobile=False,
+                has_touch=False,
+                proxy=helpers.get_random_proxy()
+            )
+
+            page = await context.new_page()
+            await page.bring_to_front()
+            await page.set_viewport_size(
+                {
+                    "width": 1920,
+                    "height": 1080
+                }
+            )
+
+            await page.add_init_script(
+                constants.SPOOF_FINGERPRINT % helpers.generate_device_specs()
+            )
+            await page.goto(helpers.get_url(
+                location=country, job=job, mode=job_mode
+            ))
+
+            if await helpers.does_element_exists(page, xpaths.NEED_LOGIN):
+                loguru.logger.info(f"[WORKER {worker_id}] Login Required!")
+                return await scrape_linkedin(
+                    worker_id=worker_id, only_popular=only_popular,
+                    headless=headless, info=info
+                )
+
+            all_ads = await page.locator(xpaths.JOB_LI).all()
+            loguru.logger.info(
+                f"[WORKER {worker_id}] Found {len(all_ads)} Advertisements"
+            )
+            exists = 0
+            for index, div in enumerate(all_ads):
+                await asyncio.sleep(2)
+                if index == 100 or exists == 7:
+                    break
+                await div.click(
+                    timeout=5000
+                )
+                title_a_tag = page.locator(xpaths.JOB_ID_A_TAG)
+                ads_id = await title_a_tag.get_attribute('href')
+                ads_id = ads_id.split("?refId")[0].split("-")[-1]
+                if not helpers.does_ads_exists(ads_id):
+                    company_name = await helpers.safe_get_element_text(
+                        page, xpaths.COMPANY_NAME, timeout=5000
+                    )
+                    location = await helpers.safe_get_element_text(
+                        page, xpaths.LOCATION, timeout=5000
+                    )
+                    title = await helpers.safe_get_element_text(
+                        page, xpaths.TITLE, timeout=5000
+                    )
+                    await page.locator(xpaths.SHOW_MORE).click(timeout=5000)
+                    info = await helpers.get_element_text(
+                        page, xpaths.BODY_INFO, False, timeout=5000
+                    )
+                    await connection.create_ads(
+                        ads_id, location, info.strip(), company_name,
+                        title, 1, employement_type="", level="",
+                        country=country, job_mode=job_mode
+                    )
+                    loguru.logger.info(
+                        f"[WORKER {worker_id}] Finished {ads_id}"
+                    )
+
+                else:
+                    loguru.logger.info(
+                        f"[WORKER {worker_id}] {ads_id} Already exists"
+                    )
+                    exists += 1
+
+            return
+    except helpers.PlayWrightTimeOutError:
+        pass
+    except Exception as e:
+        loguru.logger.error(e)
+    finally:
+        return await scrape_linkedin(
+            worker_id=worker_id, only_popular=only_popular,
+            headless=headless
+        )
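Review note on the id parsing in `worker/scraper/linkedin.py`: `ads_id.split("?refId")[0].split("-")[-1]` only works while the href ends in `-<digits>` followed by a `?refId` query. A sturdier sketch; the URL shape is an assumption inferred from that split chain, and `extract_ads_id` is a hypothetical helper, not part of this PR:

```python
import re
from typing import Optional
from urllib.parse import urlparse


def extract_ads_id(href: str) -> Optional[str]:
    """Pull the trailing numeric id out of a LinkedIn job href (assumed shape)."""
    path = urlparse(href).path            # drops ?refId=... and any other query
    match = re.search(r"(\d+)/?$", path)  # trailing run of digits in the path
    return match.group(1) if match else None
```

For a href like `.../jobs/view/senior-engineer-3712345678?refId=abc` it returns `3712345678`, and it returns `None` instead of a garbage fragment when the pattern is missing.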
+ """ + try: + async with async_playwright() as driver: + if info is None: + info = helpers.get_country_and_job(only_popular) + + loguru.logger.info( + f"[WORKER {worker_id}] This round is: {info}" + ) + country, job, job_mode = info + browser = await driver.firefox.launch( + headless=headless, + args=[ + '--start-maximized', + '--foreground', + '--disable-backgrounding-occluded-windows' + ], + firefox_user_prefs=constants.FIREFOX_SETTINGS + ) + + timezone_id = random.choice(pytz.all_timezones) + context = await browser.new_context( + timezone_id=timezone_id, + accept_downloads=True, + is_mobile=False, + has_touch=False, + proxy=helpers.get_random_proxy() + ) + + page = await context.new_page() + await page.bring_to_front() + await page.set_viewport_size( + { + "width": 1920, + "height": 1080 + } + ) + + await page.add_init_script( + constants.SPOOF_FINGERPRINT % helpers.generate_device_specs() + ) + await page.goto(helpers.get_url( + location=country, job=job, mode=job_mode + )) + + if await helpers.does_element_exists(page, xpaths.NEED_LOGIN): + loguru.logger.info(f"[WORKER {worker_id}] Login Required!") + return await scrape_linkedin( + worker_id=worker_id, only_popular=only_popular, + headless=headless, info=info + ) + + all_ads = await page.locator(xpaths.JOB_LI).all() + loguru.logger.info( + f"[WORKER {worker_id}] Found {len(all_ads)} Advertisements" + ) + exists = 0 + for index, div in enumerate(all_ads): + await asyncio.sleep(2) + if index == 100 or exists == 7: + break + await div.click( + timeout=5000 + ) + title_a_tag = page.locator(xpaths.JOB_ID_A_TAG) + ads_id = await title_a_tag.get_attribute('href') + ads_id = ads_id.split("?refId")[0].split("-")[-1] + if not helpers.does_ads_exists(ads_id): + company_name = await helpers.safe_get_element_text( + page, xpaths.COMPANY_NAME, timeout=5000 + ) + location = await helpers.safe_get_element_text( + page, xpaths.LOCATION, timeout=5000 + ) + title = await helpers.safe_get_element_text( + page, xpaths.TITLE, timeout=5000 + ) + await page.locator(xpaths.SHOW_MORE).click(timeout=5000) + info = await helpers.get_element_text( + page, xpaths.BODY_INFO, False, timeout=5000 + ) + await connection.create_ads( + ads_id, location, info.strip(), company_name, + title, 1, employement_type="", level="", + country=country, job_mode=job_mode + ) + loguru.logger.info( + f"[WORKER {worker_id}] Finished {ads_id}" + ) + + else: + loguru.logger.info( + f"[WORKER {worker_id}] {ads_id} Already exists" + ) + exists += 1 + + return + except helpers.PlayWrightTimeOutError: + pass + except Exception as e: + loguru.logger.error(e) + finally: + return await scrape_linkedin( + worker_id=worker_id, only_popular=only_popular, + headless=headless + )