fixed: typo bugs
ManiMozaffar committed May 30, 2023
1 parent 436ec47 commit 72c340e
Showing 4 changed files with 66 additions and 19 deletions.
9 changes: 9 additions & 0 deletions worker/crawlers/linkedin/controller.py
@@ -1,4 +1,8 @@
 from itertools import product
+import random
+import asyncio

 from loguru import logger


 from crawlers.linkedin.repository import LinkedinRepository
@@ -29,14 +33,19 @@ def __init__(
             )
             for job in job_urls
         ]
+        random.shuffle(self.job_models)

     async def process(self):
         for index in range(0, len(self.job_models), self.batch_size):
             batch = self.job_models[index:index+self.batch_size]
+            logger.success(f"Processing batch of {len(batch)} jobs")
             ads_urls = await self.repository.get_urls(batch)
+            logger.success(f"Fetched {len(ads_urls)} links")
             all_ads = await self.repository.process_advertisements(ads_urls)
+            logger.success(f"Processed {len(all_ads)} results")
             await self.repository.save_all_data(
                 all_data=all_ads,
                 endpoint="/api/ads/",
                 method=HttpMethod.POST,
             )
+            await asyncio.sleep(6)  # 6-second cooldown between batches
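For reference, the shuffle → batch → cooldown flow added above can be sketched standalone; `process_in_batches` and `process_batch` below are illustrative names, not part of the repository:

import asyncio
import random


async def process_batch(batch: list) -> None:
    # stand-in for the real get_urls -> process_advertisements -> save_all_data pipeline
    await asyncio.sleep(0)


async def process_in_batches(items: list, batch_size: int, cooldown: float = 6.0) -> None:
    random.shuffle(items)  # randomize order so each run spreads load differently
    for index in range(0, len(items), batch_size):
        await process_batch(items[index:index + batch_size])
        await asyncio.sleep(cooldown)  # pause between batches to ease rate limits


# asyncio.run(process_in_batches(list(range(10)), batch_size=3))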
11 changes: 9 additions & 2 deletions worker/crawlers/linkedin/gateway.py
@@ -1,10 +1,10 @@
 import json
 from typing import Union, List
-import logging
 import asyncio

 from playwright.async_api import Page, Response, TimeoutError
 import httpx
+from loguru import logger


 from crawlers.linkedin import xpaths, prompt
@@ -95,8 +95,15 @@ async def process_adds_info(
         )
         if text:
             self.data.body = text
+        else:
+            self.data = None  # Data's body failed validation
+            return None

         if hashtags:
             self.data.keywords = hashtags
             return None

+        self.data = None  # Data failed validation
+        return None

     @async_timeout(120)
     async def get_response_from_theb_ai(self, chatgpt_page: Page) -> dict:
@@ -110,7 +117,7 @@ async def get_response_from_theb_ai(self, chatgpt_page: Page) -> dict:
             lines = list(filter(None, result.split('\n')))
             return json.loads(lines[-1])
         except TimeoutError:
-            logging.error("Timeout exceed in get_response_from_theb_ai")
+            logger.error("Timeout exceeded in get_response_from_theb_ai")


 class AdsURLGateway(PlayWrightCrawler):
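The `@async_timeout(120)` decorator applied above is defined elsewhere in the repo and not shown in this diff. A hypothetical sketch of a decorator with that shape, assuming it cancels the coroutine after the given number of seconds and returns None on expiry:

import asyncio
import functools


def async_timeout(seconds: float):
    # Hypothetical implementation: cancel the wrapped coroutine after `seconds`.
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            try:
                return await asyncio.wait_for(func(*args, **kwargs), timeout=seconds)
            except asyncio.TimeoutError:
                return None  # give up quietly, matching the log-and-continue style above
        return wrapper
    return decorator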
41 changes: 24 additions & 17 deletions worker/crawlers/linkedin/repository.py
@@ -1,8 +1,8 @@
-import requests
+import aiohttp
 import asyncio
 from typing import List
 import json
-import logging
+from loguru import logger

 from core.enums import HttpMethod
 from crawlers.linkedin.gateway import AdsDataGateway, AdsURLGateway
@@ -63,23 +63,30 @@ async def process_advertisements(
         ]

     async def save_data_to_url(
-        self, data: LinkedinData, endpoint: str, method: HttpMethod
+        self, session: aiohttp.ClientSession,
+        data: LinkedinData, endpoint: str, method: HttpMethod
     ) -> dict:
-        response = requests.request(
-            method=method.value.upper(),
-            url=self.base_url + endpoint,
-            json=json.loads(data.json())
-        )
-        if response.status_code == 200:
-            logging.info("Saved data to URL")
-        return response
+        try:
+            async with session.request(
+                method=method.value.upper(),
+                url=self.base_url + endpoint,
+                json=json.loads(data.json())
+            ) as response:
+                if response.status == 200:
+                    logger.info("Saved data to URL")
+                    return await response.json()
+        except Exception as error:
+            logger.error(f"Error saving data to URL: {error}")

     async def save_all_data(
         self, all_data: List[LinkedinData], endpoint: str, method: HttpMethod
     ):
-        await asyncio.gather(
-            *[
-                self.save_data_to_url(data, endpoint, method)
-                for data in all_data
-            ]
-        )
+        async with aiohttp.ClientSession() as session:
+            await asyncio.gather(
+                *[
+                    asyncio.ensure_future(
+                        self.save_data_to_url(session, data, endpoint, method)
+                    )
+                    for data in all_data
+                ]
+            )
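The core of this change is replacing blocking `requests` calls with one shared `aiohttp.ClientSession`, so all the gathered requests reuse a single connection pool instead of stalling the event loop one call at a time. (The `asyncio.ensure_future` wrapper is redundant, incidentally: `asyncio.gather` accepts bare coroutines.) A self-contained sketch of the session-sharing pattern, with a placeholder URL and payloads:

import asyncio

import aiohttp


async def post_one(session: aiohttp.ClientSession, url: str, payload: dict):
    # every task shares the caller's session and its connection pool
    try:
        async with session.post(url, json=payload) as response:
            if response.status == 200:
                return await response.json()
    except aiohttp.ClientError as error:
        print(f"request failed: {error}")
    return None


async def post_all(url: str, payloads: list) -> list:
    async with aiohttp.ClientSession() as session:  # one session for all requests
        return await asyncio.gather(
            *(post_one(session, url, payload) for payload in payloads)
        )


# asyncio.run(post_all("http://localhost:8000/api/ads/", [{"example": 1}]))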
24 changes: 24 additions & 0 deletions worker/tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# import asyncio

# from crawlers.linkedin.repository import LinkedinRepository
# from crawlers.linkedin.models import LinkedinData
# from core.enums import HttpMethod



# async def test():
# data = LinkedinData(
# ads_id='3601257801',
# location='\n Stockholm, Stockholm County, Sweden\n',
# country='Sweden',
# body="Here's the simplified job advertisement:\n\nJob Title: ES Test Engineer\n\nLocation: Stockholm\n\nVisa Sponsor: #Yes\n\nRequirements:\n\n• 3+ years of experience in Embedded Systems Testing/Validation.\n\n• Proficient in C/C++ and Python.\n\n• Knowledge of CAN protocols, ECU's, and Microcontrollers.\n\n• Experience in Automotive, Energy or Electrical industries.\n\n• Understanding of development processes and agile mindset.\n\n• Master’s or Bachelor’s degree in Software Engineering, Electrical Engineering or equivalent.\n\nNote: QRIOS has not provided any information about its sponsorship status in the job advertisement.\n #embedded_systems #validation #ecu #can_protocols #testing #c_cpp #on_site #microcontrollers #python #yes #Sweden\n ",
# company_name='\n QRIOS\n ',
# title='Embedded Systems Test Engineers to QRIOS Technology',
# source=1,
# employement_type='on_site',
# level='Entry level',
# keywords=["embedded_systems", "validation", "ecu", "can_protocols", "testing", "c_cpp", "on_site", "microcontrollers", "python", "yes", "Sweden"]
# )
# await LinkedinRepository(5).save_data_to_url(data=data, endpoint="/api/ads/", method=HttpMethod.POST)

# asyncio.run(test())
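Note that this commented-out snippet still uses the pre-commit signature of `save_data_to_url`; after this change the method also expects an `aiohttp.ClientSession`. If revived, the call would presumably need to look something like this (a sketch, assuming the new signature):

# import aiohttp
#
# async def test():
#     data = LinkedinData(...)  # built as above
#     async with aiohttp.ClientSession() as session:
#         await LinkedinRepository(5).save_data_to_url(
#             session=session, data=data, endpoint="/api/ads/", method=HttpMethod.POST
#         )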
