In [None]:
import asyncio
import nest_asyncio
nest_asyncio.apply()
loop = asyncio.get_event_loop()

In [None]:
from motor.motor_asyncio import AsyncIOMotorClient


def create_client(host: str, username: str,
                  password: str, port: int,
                  db_name: str) -> AsyncIOMotorClient:
    return AsyncIOMotorClient(
            f"mongodb://{username}:{password}@{host}:{port}/{db_name}?authSource=admin")


In [None]:
from typing import Any, List


class AsyncCRUDBase(object):

    @staticmethod
    async def get(db: Any, query: Any, **kwargs):
        return NotImplemented

    @staticmethod
    async def delete(db: Any, query: Any, **kwargs):
        return NotImplemented

    @staticmethod
    async def insert_many(db: Any, data: Any, **kwargs):
        return NotImplemented

    async def save(self, db, collection):
        return NotImplemented


class AsyncMongoCRUDBase(AsyncCRUDBase):
    """ Provides minimal support for writing to MongoDB
    """
    
    @staticmethod
    async def get(collection: Any,  query: Any, **kwargs) -> List[object]:
        result = [data async for data in collection.find(query)]
        return result

    @staticmethod
    async def delete(collection: Any, query: Any, **kwargs):
        return NotImplemented

    @staticmethod
    async def insert_many(collection: Any, data: Any, **kwargs):
        await collection.insert_many(data)

    async def save(self, collection):
        return NotImplemented


In [None]:
from pydantic import BaseModel
from bson import ObjectId
from datetime import datetime, date
from typing import Optional, List

class MongoModel(BaseModel, AsyncMongoCRUDBase):

    class Config:
        allow_population_by_field_name = True
        json_encoders = {
            datetime: lambda dt: dt.isoformat(),
            ObjectId: lambda oid: str(oid),
        }

    @classmethod
    def from_mongo(cls, data: dict):
        """We must convert _id into "id". """
        if not data:
            return data
        id = data.pop('_id', None)
        return cls(**dict(data, id=id))

    def mongo(self, **kwargs):
        exclude_unset = kwargs.pop('exclude_unset', True)
        by_alias = kwargs.pop('by_alias', True)

        parsed = self.dict(
            exclude_unset=exclude_unset,
            by_alias=by_alias,
            **kwargs,
        )

        # Mongo uses `_id` as default key. We should stick to that as well.
        if '_id' not in parsed and 'id' in parsed:
            parsed['_id'] = parsed.pop('id')

        return parsed
    
    @staticmethod
    async def insert_many(collection: Any, data: List[AsyncMongoCRUDBase], **kwargs):
        await collection.insert_many([d.mongo() for d in data])

    @classmethod
    async def get(cls, collection: Any,  query: Any, **kwargs) -> List[object]:
        result = [cls.from_mongo(data) async for data in collection.find(query)]
        return result

    async def save(self, db, collection_name:str):
        try:
            await db[collection_name].insert_one(self.mongo())
        except Exception as e:
            print(e)


In [None]:
import re

domain_pattern = re.compile("^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/?\n]+)")

In [None]:
from enum import Enum


class JobState(str, Enum):
    PENDING= 'pending'
    DONE = 'done'
    WORKING = 'working'
    FAILED = 'failed'


class ContentType(str, Enum):
    WEBPAGE: str = 'webpage'
    IMAGE: str = 'image'
    AUDIO: str = 'audio'
    VIDEO: str = 'video'


class JobType(str, Enum):
    """ Job types supported by spiders

    BASIC_PAGE_SCRAPING: only scrape the provided urls and return the html of those urls,
    SEARCH_RESULT_AGGREGATION: perform searches on search engines or general search page and retrieve their results,
    WEB_CRAWLING: Start from seed urls, follow all links available.
    """
    BASIC_PAGE_SCRAPING: str = 'basic_page_scraping'
    SEARCH_RESULT_AGGREGATION: str = 'search_result_aggregation'
    # WEB_CRAWLING: str = 'web_crawling'

In [None]:
class KeywordRules(BaseModel):
    include: List[str] = []
    exclude: List[str] = []


class SizeLimit(BaseModel):
    max_pages: Optional[int]
    max_size: Optional[int]


class TimeRange(BaseModel):
    past_days: Optional[int]
    date_before: Optional[date]
    date_after: Optional[date]


class RegexPattern(BaseModel):
    patterns: Optional[List[str]] = []


class ScrapeRules(BaseModel):
    """ Describes rules a spider should follow

    Fields:
        keywords: Optional[KeywordRules]
        size_limit: Optional[SizeLimit]
        time_range: Optional[TimeRange]
        regular_expressions: Optional[RegexPattern]
        max_retry: Optional[int] = 1  
    """
    keywords: Optional[KeywordRules]
    size_limit: Optional[SizeLimit]
    time_range: Optional[TimeRange]
    regular_expressions: Optional[RegexPattern]
    max_retry: Optional[int] = 1



class JobSpecification(BaseModel):
    """ Describes what kind of task a spider should perform

    Fields:
        urls: List[str]
        job_type: JobType
        scrape_rules: ScrapeRules
        data_collection: str = 'test'
    """
    urls: List[str]
    job_type: JobType
    scrape_rules: ScrapeRules
    data_collection: str = 'test'
    job_collection: str = "jobs"

In [None]:
from typing import Optional, List, Any
from datetime import datetime, timedelta

class JobStatus(MongoModel):
    job_id: str
    create_dt: datetime
    page_count: int = 0
    time_consumed: Optional[timedelta]
    current_state: JobState
    specification: JobSpecification


In [None]:
job_spec = JobSpecification(
    urls=['http://www.qq.com',
          "http://www.taobao.com",
          "http://www.baidu.com",
          'http://www.guancha.cn',
          'http://www.sina.com.cn']*5,
    job_type=JobType.BASIC_PAGE_SCRAPING,
    scrape_rules=ScrapeRules(
        sizelimit=SizeLimit(max_pages=10)
    )
)
job_spec

In [None]:
from uuid import uuid4
job_status = JobStatus(
            job_id=str(uuid4()),
            create_dt=datetime.now(),
            page_count=0,
            specification=job_spec,
            current_state=JobState.PENDING,
            time_consumed=timedelta(seconds=0))
job_status

In [None]:
import asyncio
import time


def timeit(func):
    async def process(func, *args, **params):
        if asyncio.iscoroutinefunction(func):
            print('this function is a coroutine: {}'.format(func.__name__))
            return await func(*args, **params)
        else:
            print('this is not a coroutine')
            return func(*args, **params)

    async def helper(*args, **params):
        print('{}.time'.format(func.__name__))
        start = time.time()
        result = await process(func, *args, **params)

        # Test normal function route...
        # result = await process(lambda *a, **p: print(*a, **p), *args, **params)

        print('>>>', time.time() - start)
        return result

    return helper


In [None]:
from typing import Optional, List
from datetime import datetime

class URL(BaseModel):
    """ Holds an url and its domain name.

    If domain name is not specified, it will be guessed from the url

    Fields:
        url: str
        domain: Optional[str]
    """
    url: str
    domain: Optional[str] = None

    def __init__(self, **data: Any) -> None:
        super().__init__(**data)
        parsed_domain = domain_pattern.findall(self.url)

        if self.domain is None and len(parsed_domain):
            # auto fills domain name if not provided
            self.domain = parsed_domain[0]


class HTMLData(MongoModel):
    """ Builds a html data representation

    Fields:
        url: URL
        html: str
        create_dt: datetime
        job_id: Optional[str]
        keywords: Optional[List[str]] = []
    """
    url: URL
    html: str
    create_dt: datetime
    job_id: Optional[str]
    keywords: Optional[List[str]] = []


In [None]:
def periodic(period):
    def scheduler(fcn):

        async def wrapper(*args, **kwargs):

            while True:
                asyncio.create_task(fcn(*args, **kwargs))
                await asyncio.sleep(period)

        return wrapper

    return scheduler

In [None]:
from abc import ABC
from typing import Callable
import aiohttp

class AsyncIterator:
    def __init__(self, seq):
        self.iter = iter(seq)

    def __aiter__(self):
        return self

    async def __anext__(self):
        try:
            return next(self.iter)
        except StopIteration:
            raise StopAsyncIteration


class BaseSpiderService(ABC):
    """ Defines common interface for spider services.
    """

    def get(self, data_src: URL) -> Any:
        return NotImplemented

    def get_many(self, data_src: List[URL], rules: Any) -> Any:
        return NotImplemented

class HTMLSpiderService(BaseSpiderService):

    def __init__(self, session: aiohttp.ClientSession, job_id: str = None):
        BaseSpiderService.__init__(self)
        self.session = session
        self.html_data: List[HTMLData] = []
        self.job_id = job_id
        self.page_count = 0

    async def get(self, data_src: URL) -> None:
        async with self.session.get(data_src.url) as response:
            html = await response.text()
            return html

    async def get_many(self, data_src: List[str], rules: ScrapeRules,
                       async_db_action: Callable = None, async_in_progress_callback: Callable = None,
                       async_job_done_callback: Callable = None, execute_job_callback_interval: int = 1,
                       **kwargs) -> None:
        """ Get html data given the data source
        
        Pass callback coroutines to this method when using a BackgroundTask scheduler.

        Args: 
            data_src: List[str]
            rules: ScrapeRules
            async_in_progress_callback: corountine for handling job status during scraping
            async_job_done_callback: corountine for handling job status after
            async_db_action: coroutine for handling database operations
            execute_job_callback_interval: time interval for executing in_progress_callback
            kwargs: arguments for callbacks
        """
        self.html_data = []
        self.page_count = 0
        
        loop = asyncio.get_running_loop()
        
        async def scrape(url):
            target_url = URL(url=url)
            html = await self.get(target_url)
            html_data = HTMLData(url=target_url, html=html,
                                 create_dt=datetime.now(),
                                 job_id=self.job_id)
            self.page_count += 1
            self.html_data.append(html_data)
            
        @periodic(1)
        async def tick():
#             if async_in_progress_callback:
            await asyncio.sleep(1)
            print("tik tok")
        
        def unblock_periodic_task(task):
            return lambda task: task.cancel()
            
        async def execute_db_action_after(tasks, periodic_task):
            await asyncio.gather(*tasks)
            if async_db_action:
                await async_db_action(data=self.html_data, **kwargs)
                periodic_task.cancel()
            print("done")
        
        tasks = [scrape(url) for url in data_src]
        periodic_in_progress_task = loop.create_task(tick())
#         loop.call_later(1, unblock_periodic_task(periodic_in_progress_task))
        db_task = asyncio.create_task(execute_db_action_after(tasks, periodic_in_progress_task))
        
#         await periodic_in_progress_task
        await db_task
        
            
        return self.html_data


In [None]:
session = aiohttp.ClientSession()

In [None]:
spider = HTMLSpiderService(session, job_id=str(uuid4()))

In [None]:
job_spec = JobSpecification(
    urls=[
          "http://www.taobao.com",
          "http://www.baidu.com",
          'http://www.guancha.cn',
          'http://www.sina.com.cn'],
    job_type=JobType.BASIC_PAGE_SCRAPING,
    scrape_rules=ScrapeRules(
        sizelimit=SizeLimit(max_pages=10)
    )
)

In [None]:
client = create_client(username="admin", password="root", host="localhost", port=27017, db_name="spiderDB")
test_collection = client.spiderDB.test

In [None]:
start_time = time.time()
data = asyncio.run(spider.get_many(job_spec.urls, job_spec.scrape_rules, async_db_action=HTMLData.insert_many, collection=test_collection))
used_time = time.time() - start_time
print(f"used {used_time} seconds")
print(f"Collected {len(data)} entries")

In [None]:
@timeit
async def test_scrape(spider):
    data = await spider.get_many(job_spec.urls, job_spec.scrape_rules, async_db_action=HTMLData.insert_many, collection=test_collection)
    return data

In [None]:
time.time()

In [None]:
import asyncio

async def factorial(name, number):
    f = 1
    for i in range(2, number + 1):
        print(f"Task {name}: Compute factorial({number}), currently i={i}...")
        await asyncio.sleep(1)
        f *= i
    print(f"Task {name}: factorial({number}) = {f}")
    return f

async def main():
    # Schedule three calls *concurrently*:
    L = await asyncio.gather(
        factorial("A", 2),
        factorial("B", 3),
        factorial("C", 4),
    )
    print(L)

asyncio.run(main())


In [None]:
async def find_all(test_collection):
    print(test_collection)
    data = [HTMLData(**d) async for d in test_collection.find({})]
    return data

In [None]:
test_data = HTMLData(url=URL(url='http://www.bbc.com'), html='<p>news</p>', create_dt=datetime(1976, 5, 28, 4, 21, 11, 901000), job_id='1', keywords=[])

In [None]:
asyncio.run(test_data.save(client.spiderDB, 'test'))

In [None]:
asyncio.run(HTMLData.get(client.spiderDB.test, {}))

In [None]:
from uuid import uuid4

In [None]:
test_set = [HTMLData(url=URL(url=f'http://www.{s}.com'), html=f'<p>{s}</p>', create_dt=datetime.now(), job_id=str(uuid4()), keywords=[]) for s in "abcde" ]

In [None]:
test_set

In [None]:
asyncio.run(HTMLData.insert_many(client.spiderDB.test, test_set))

In [None]:
from enum import Enum

In [None]:
class RequestStatus(str, Enum):
    """ Maps common HTTP status codes to their corresponding meanings
    """
    CREATED = 'created'
    WAITING = 'waiting'
    SUCCESS = 'success'
    TIMEOUT = 'timeout'
    CLIENT_ERROR = 'client_error'
    SERVER_ERROR = 'server_error'
    BAD_REQUEST = 'bad_request'
    UNAUTHORIZED = 'unauthorized'
    FORBIDDEN = 'forbidden'
    NOT_FOUND = 'not_found'
    INTERNAL_SERVER_ERROR = 'internal_server_error'
    TOO_MANY_REQUESTS = 'too_many_requests'
    REDIRECTED = 'redirected'

    @classmethod
    def from_status_code(cls, status_code: int):
        """ Convert a status code to its string representation """
        # checks whether status code is between 200 and 206, but
        # range is exclusive on the right hand side
        if 200 <= status_code <= 206:
            return cls.SUCCESS
        elif 300 <= status_code <= 309:
            return cls.REDIRECTED
        elif status_code == 400:
            return cls.BAD_REQUEST
        elif status_code == 401:
            return cls.UNAUTHORIZED
        elif status_code == 403:
            return cls.FORBIDDEN
        elif status_code == 404:
            return cls.NOT_FOUND
        elif status_code == 429:
            return cls.TOO_MANY_REQUESTS
        elif 405 <= status_code <= 452:
            return cls.CLIENT_ERROR
        elif status_code == 429:
            return cls.TOO_MANY_REQUESTS
        elif status_code == 500:
            return cls.INTERNAL_SERVER_ERROR
        elif 501 <= status_code <= 511:
            return cls.SERVER_ERROR

In [None]:
RequestStatus.from_status_code(403)

In [None]:
from pydantic import BaseModel
from typing import Optional, List, Any, Union

class ParseRuleType(str, Enum):
    """ Parse rule types supported by parsers 
    
    One of:
        XPATH,
        CSS_SELECTOR,
        REGEX
    """
    XPATH: str = 'xpath'
    CSS_SELECTOR: str = 'css_selector'
    REGEX: str = 'regex'

        
class ParseRule(BaseModel):
    """ Defines the parse rule for a parser
    
    Fields:
        field_name: str
        field_value: str
        rule: str
        rule_type: ParseRuleType        
    """
    field_name: str
    field_value: str
    rule: str
    rule_type: ParseRuleType


class ParseResult(BaseModel):
    """ Defines the parse result from a parser
    
    Fields:
        field_name: str
        field_value: str  
    """
    name: str
    value: str

        
class URL(BaseModel):
    """ Holds an url and its domain name.

    If domain name is not specified, it will be guessed from the url

    Fields:
        url: str
        domain: Optional[str]
    """
    url: str
    domain: Optional[str] = None

    def __init__(self, **data: Any) -> None:
        super().__init__(**data)
        parsed_domain = domain_pattern.findall(self.url)

        if self.domain is None and len(parsed_domain):
            # auto fills domain name if not provided
            self.domain = parsed_domain[0]


In [None]:
from bs4 import BeautifulSoup
from abc import ABC
from typing import Any, List, Dict
# from ..models.data_models import ParseRule, ParseResult, URL



class BaseParser(ABC):

    def parse(self, text: str, rules: List[ParseRule]) -> List[ParseResult]:
        return NotImplemented


class LinkParser(BaseParser):
    _parser = BeautifulSoup

    def parse(self, text: str) -> List[URL]:
        return NotImplemented

In [None]:
dir(LinkParser)

In [None]:
import requests

In [None]:
url = "https://cuiqingcai.com/1319.html"  


In [None]:
test_page = requests.get(url).text

In [None]:
page = BeautifulSoup(test_page, 'lxml')

In [None]:
page

In [None]:
contents = page.select("*:is(p)")

In [None]:
contents

In [None]:
text = [content.text.strip() for content in contents if len(content.text.strip()) > 0]

In [None]:
text

In [None]:
import lxml

In [None]:
from lxml.html import fromstring

In [None]:
# html_tree = lxml.etree.HTML(test_page)
html_tree = fromstring(test_page)

In [None]:
html_tree

In [None]:
contents = html_tree.xpath('//article/div')

In [None]:
str.strip??

In [None]:
for c in contents:
    print(c.text_content().strip('\n '))

In [None]:
link = links[1]

In [None]:
hasattr(link, "href")
link.text_content()

In [None]:
css_select = getattr(page, 'select')

In [None]:
xpath_select = html_tree.xpath

In [None]:
css_select("a[href^=http]")

In [None]:
xpath_select('(//body//a)')

In [None]:
for link in links:
    print(link.get("href"))

In [None]:
class URL(BaseModel):
    """ Holds an url and its domain name.

    If domain name is not specified, it will be guessed from the url

    Fields:
        url: str
        domain: Optional[str]
    """
    url: str
    domain: Optional[str] = None

    def __init__(self, domain_pattern = re.compile("^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/?\n]+)"), **data: Any) -> None:
        super().__init__(**data)
        parsed_domain = domain_pattern.findall(self.url)

        if self.domain is None and len(parsed_domain):
            # auto fills domain name if not provided
            self.domain = parsed_domain[0]

    def __hash__(self):
        return hash(self.__repr__())


In [None]:
import re

domain_pattern = re.compile("^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/?\n]+)")

In [None]:
url = URL(url="https://www.google.com.hk/")
url2 = URL(url="https://www.google.com.hk/")

In [None]:
url

In [None]:
url_set = set()

In [None]:
url_set.add(url)

In [None]:
url_set.add(url2)

In [None]:
url_set

In [None]:
u = URL(url="/s?wd=%E6%9D%8E%E5%8D%8E%E6%98%8E&rsv_idx=2&tn=baiduhome_pg&ie=utf-8&rsv_cq=beautifulsoup&rsv_dl=0_right_recommends_merge_20826&rsv_pq=faaa1a190003f124&oq=beautifulsoup&rsv_t=bbf8OkBM1xh84iXdDDdaz41CQON4kTR7nMiUX74PVrfLJY0TjYocHz6G%2BUASur4Iv%2B6Y&euri=ac2c31061cbe4a86b95a0086cc39f6e1")

In [None]:
u

In [None]:
BeautifulSoup.select(BeautifulSoup(test_page), 'a')

In [None]:
getattr(page, 'select')

In [None]:
from lxml import etree

In [None]:
getattr(etree.HTML(test_page),'xpath')

In [None]:
html_tree = etree.HTML(test_page)

In [None]:
html_tree.xpath()

In [None]:
dir(etree.HTML)

In [None]:
html_tree.cssselect('a[href^=http]')

In [None]:
l = html_tree.cssselect('a[href^=http]')[0]

In [None]:
dir(l)

In [None]:
l.attrib

In [None]:
from functools import partial

In [None]:
select = partial(BeautifulSoup.select, etree.HTML(test_page))

In [None]:
select('a')

In [None]:
type(page)

In [295]:
urls = [
#     "https://www.google.com/search?q=scrapy&sxsrf=ALeKk03bcpnii8K22lvJxH--rR2KqJXLbw:1623390101257&ei=lffCYLukD5Hl-gTVmL3ADQ&start=10&sa=N&ved=2ahUKEwj7wsSy747xAhWRsp4KHVVMD9gQ8tMDegQIARA7", 
#     "https://www.google.com/search?q=scrapy&sxsrf=ALeKk03bcpnii8K22lvJxH--rR2KqJXLbw:1623390101257&ei=lffCYLukD5Hl-gTVmL3ADQ&start=40&sa=N&ved=2ahUKEwj7wsSy747xAhWRsp4KHVVMD9gQ8tMDegQIARBB",
    
    "https://www.baidu.com/s?wd=scrapy&pn=20&oq=scrapy&tn=baiduhome_pg&ie=utf-8&usm=4&rsv_idx=2&rsv_pq=9e4bcc9400012ea1&rsv_t=2516n0hjetaZMGZKNUuRvN1VMSf30%2B5WZ%2FXekpewX2ta1xYQC1ywaqoTETrYz2WZFvgK&gpc=stf&tfflag=0&rsv_page=1",
    "https://www.baidu.com/s?wd=scrapy&pn=10&oq=scrapy&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&rsv_pq=c145f364000074cf&rsv_t=1060Jw36QPVOtvTjzEupgZ1u1SF1HXn%2BLIKV%2BqbBDYiKJE5kEB7m4%2BeL8wCauWXjWOR5&gpc=stf%3D1622785207%2C1623390007%7Cstftype%3D1&tfflag=1&rsv_page=1",
]

In [296]:
param1, param2 = [set(u.split("?")[-1].split("&")) for u in urls]

In [303]:
param1,param2

({'gpc=stf',
  'ie=utf-8',
  'oq=scrapy',
  'pn=20',
  'rsv_idx=2',
  'rsv_page=1',
  'rsv_pq=9e4bcc9400012ea1',
  'rsv_t=2516n0hjetaZMGZKNUuRvN1VMSf30%2B5WZ%2FXekpewX2ta1xYQC1ywaqoTETrYz2WZFvgK',
  'tfflag=0',
  'tn=baiduhome_pg',
  'usm=4',
  'wd=scrapy'},
 {'gpc=stf%3D1622785207%2C1623390007%7Cstftype%3D1',
  'ie=utf-8',
  'oq=scrapy',
  'pn=10',
  'rsv_idx=2',
  'rsv_page=1',
  'rsv_pq=c145f364000074cf',
  'rsv_t=1060Jw36QPVOtvTjzEupgZ1u1SF1HXn%2BLIKV%2BqbBDYiKJE5kEB7m4%2BeL8wCauWXjWOR5',
  'tfflag=1',
  'tn=baiduhome_pg',
  'wd=scrapy'})

In [292]:
import re

In [301]:
paging_param_pattern = re.compile("^(start|page|p|pn|\w+)=\d{1,3}$")

In [304]:
for s1, s2 in (param1, param2):
    if paging_param_pattern.match(s1) and paging_param_pattern.match(s2):
        print(s1, s2)

ValueError: too many values to unpack (expected 2)

In [20]:
import re
from datetime import datetime, timedelta

cn_time_string_extractors = {
            re.compile('\d{1,2}秒前'):
                lambda now, time_str: now -
                timedelta(seconds=int(re.search('\d+', time_str).group(0))),
            re.compile('\d{1,2}分钟前'):
                lambda now, time_str: now -
                timedelta(minutes=int(re.search('\d+', time_str).group(0))),
            re.compile('\d{1,2}小时前'):
                lambda now, time_str: now -
                timedelta(hours=int(re.search('\d+', time_str).group(0))),
            re.compile('\d{1,2}天前'):
                lambda now, time_str: now -
                timedelta(days=int(re.search('\d+', time_str).group(0))),
            re.compile('昨天\d{1,2}:\d{1,2}'):
                lambda now, time_str: datetime(
                    now.year, now.month, now.day-1,
                    int(re.findall('\d+', time_str)[0]),
                    int(re.findall('\d+', time_str)[1])
                    
            ),
            re.compile('\d{1,2}月\d{1,2}日'):
                lambda now, time_str: datetime(
                    now.year,
                    int(re.findall('\d+', time_str)[0]),
                        int(re.findall('\d+', time_str)[1])),
            
            re.compile('\d{1,2}年\d{1,2}月\d{1,2}日'):
                lambda now, time_str: datetime(
                    *(re.findall('\d+', time_str))
                )
        }

In [22]:
ts = ["58分钟前", '1小时前', '昨天13:15', '6月5日', '5天前']

In [23]:
now = datetime.now()
for time_str in ts:
    for pattern in cn_time_string_extractors:
        if pattern.match(time_str):
            converted = cn_time_string_extractors[pattern](now, time_str)
            print(converted)

2021-06-16 18:41:43.457528
2021-06-16 18:39:43.457528
2021-06-15 13:15:00
2021-06-05 00:00:00
2021-06-11 19:39:43.457528


In [15]:
datetime(2021,1,1)

datetime.datetime(2021, 1, 1, 0, 0)

In [None]:
re.compile('\d{1,2}秒前'):\
                lambda now, time_str: now -
                timedelta(seconds=int(re.search('\d+', time_str).group(0))),
            re.compile('\d{1,2}分钟前'):\
                lambda now, time_str: now -
                timedelta(minutes=int(re.search('\d+', time_str).group(0))),
            re.compile('\d{1,2}小时前'):\
                lambda now, time_str: now -
                timedelta(hours=int(re.search('\d+', time_str).group(0))),
            re.compile('\d{1,2}天前'):\
                lambda now, time_str: now -
                timedelta(days=int(re.search('\d+', time_str).group(0))),
            re.compile('\d{1,2}月\d{1,2}日'):\
                lambda now, time_str: datetime.date(
                    now.year,
                    int(re.findall('\d+', time_str)[0],
                        int(re.findall('\d+', time_str)[1]),

In [44]:
paragraph = ["""
据“中国载人航天”微信公众号消息，""",
"""中国载人航天工程办公室：""",
"""执行神舟十二号载人航天飞行任务的载人飞船及长征二号F遥十二运载火箭完成出厂前所有研制工作，"""
]

In [54]:
exclude_pattern = re.compile("(?!(执行))*")

In [57]:
re.search(exclude_pattern, paragraph[2])

<re.Match object; span=(0, 0), match=''>

In [81]:
s = '''据“中国载人航天”微信公众号消息，
中国载人航天工程办公室：
执行神舟十二号载人航天飞行任务的载人飞船及长征二号F遥十二运载火箭完成出厂前所有研制工作，
日前，已分批安全运抵酒泉卫星发射中心，开展发射场区总装和测试工作。
目前，
发射场设施设备状态良好，
参试各系统正在有序开展各项任务准备，
执行本次载人航天飞行任务的航天员乘组正在进行强化训练。'''
for match in re.finditer('^((?!微信公众号|任务).)*$', s, flags=re.M):
    print(match)

<re.Match object; span=(18, 30), match='中国载人航天工程办公室：'>
<re.Match object; span=(77, 110), match='日前，已分批安全运抵酒泉卫星发射中心，开展发射场区总装和测试工作。'>
<re.Match object; span=(111, 114), match='目前，'>
<re.Match object; span=(115, 127), match='发射场设施设备状态良好，'>


In [73]:
pattern = re.compile('^((?!计算机).)*$')

In [77]:
re.search(pattern, s)

In [78]:
re.finditer?

[0;31mSignature:[0m [0mre[0m[0;34m.[0m[0mfinditer[0m[0;34m([0m[0mpattern[0m[0;34m,[0m [0mstring[0m[0;34m,[0m [0mflags[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return an iterator over all non-overlapping matches in the
string.  For each match, the iterator returns a Match object.

Empty matches are included in the result.
[0;31mFile:[0m      ~/anaconda3/lib/python3.7/re.py
[0;31mType:[0m      function


In [85]:
import requests
from lxml.html import fromstring

In [97]:
url = "http://www.baidu.com/s?wd=asyncio&pn=20"
page = requests.get(url).text
page



In [98]:
parsed_page = fromstring(page)

In [110]:
link_xpath = "//h3/a"
title_xpath = "//h3/a[not (@class)]/descendant-or-self::*/text()"
title_link_xpath = "//h3/a[not (@class)]/@href"
abstract_xpath = "//div[contains(@class, 'c-abstract')]/descendant-or-self::*/text()"
datetime_xpath = "//span[contains(@class, '_3wnyfua')]/descendant-or-self::*/text()"
image_src_xpath = "//span[contains(@class, 'c-img-border')]//preceding-sibling::img/@src"

In [103]:
search_result_xpath = "//div[contains(@class, 'result') and contains(@class, 'new-pmd')]"

In [114]:
search_divs = parsed_page.xpath(title_link_xpath)

In [116]:
search_divs

['http://www.baidu.com/link?url=EqwgVpg9laQ94l3eAMhNXczPU86oWPkEDnuSOWU50TkxWnR6s38ECnV_2564Sh4JQHm-eEUtLiijcqynL_r-cK',
 'http://www.baidu.com/link?url=RBGcSFajas52tJ3UFWLiqyL80PAGvPWbtwG1traHY48j1EGCmd2iZo2RHTkvOJBv5e8lRmy630mY4OqaxD1mDq',
 'http://www.baidu.com/link?url=ZneqwMo_njrhuKUF1CPP7EjMuzKLEwxt7CxNVqx8xmmc9FNABYu2B8sdOEbkbDz5S3s_ODo3vyUNxSus3PaLJmL_5LUlbRey0xT2rW4s-8m',
 'http://www.baidu.com/link?url=Cq2iPohbVzTvJ_PRHY2bivhGqGTaUZ3H-RfS3V9zNYVV-ZSeh_b30if6Y-4hCTAsq6_wu11d55GJFFMacZih3a',
 'http://www.baidu.com/link?url=dla2yO1U3IhfDNYBO3Pk9nJRSeFc0WNy_thcX0uLnISzWIQpdcOLDnYEJdXSvsj-ocusRoT_c7Aq5nVZAu_vvK',
 'http://www.baidu.com/link?url=9GeKMD6NtRz-C6_cxoTy3HBAVnMDyjRhzpHWwa7Aym6zIcr_p2N00yApcubyKnE6Wx45quX06VBDe4k4eHZykK',
 'http://www.baidu.com/link?url=XKFHI7kbApl3act2Juetlro3bgpWHxhiS9d-JWfSbz6dDbX2UdeU5OURFMkiX2Hht0U_1aSxXV9vjoEl3tmWR1ZqklEIEaPjav1fybvUE4u',
 'http://www.baidu.com/link?url=1-fGYI9sVMYkGtKf1s4C4Pk55EXd3EqYJcbby9Q0-98-FiLPRA2lv1qnjq7mSw-j',
 'http://www

In [108]:
search_divs[0].xpath(title_xpath)

['asyncio',
 ' — Asynchronous I/O — Python 3.9.5 documentat...',
 'asyncio',
 ' - 百鬼之主 - 博客园',
 'asyncio',
 '_十年学会编程-CSDN博客',
 '怎么掌握',
 'asyncio',
 '? - 知乎',
 'asyncio',
 '_python协程系列-CSDN下载',
 'Python 的异步 IO:',
 'Asyncio',
 ' 简介_loop',
 'python3 ',
 'asyncio',
 '官方文档中文版.pdf',
 'asyncio',
 ': ',
 'asyncio',
 ' 是 Python 3.3 的 ',
 'asyncio',
 ' 模块',
 'asyncio',
 '_',
 'asyncio',
 '是什么、最新动态_服务器之家',
 'asyncio',
 '异步IO--协程(Coroutine)与任务(Task)详解_慕课手记']

In [109]:
search_divs[1].xpath(title_xpath)

['asyncio',
 ' — Asynchronous I/O — Python 3.9.5 documentat...',
 'asyncio',
 ' - 百鬼之主 - 博客园',
 'asyncio',
 '_十年学会编程-CSDN博客',
 '怎么掌握',
 'asyncio',
 '? - 知乎',
 'asyncio',
 '_python协程系列-CSDN下载',
 'Python 的异步 IO:',
 'Asyncio',
 ' 简介_loop',
 'python3 ',
 'asyncio',
 '官方文档中文版.pdf',
 'asyncio',
 ': ',
 'asyncio',
 ' 是 Python 3.3 的 ',
 'asyncio',
 ' 模块',
 'asyncio',
 '_',
 'asyncio',
 '是什么、最新动态_服务器之家',
 'asyncio',
 '异步IO--协程(Coroutine)与任务(Task)详解_慕课手记']

In [163]:
def gen(value):
    for v in [value,value,value]:
        yield v

gens = []

for i in range(5):
    gens.append(gen(i))
    

In [164]:
gens

[<generator object gen at 0x10d242228>,
 <generator object gen at 0x10d2422a0>,
 <generator object gen at 0x10d242318>,
 <generator object gen at 0x10d242390>,
 <generator object gen at 0x10d242408>]

In [165]:
# a, b, c = gens

ValueError: too many values to unpack (expected 3)

In [158]:
a.__next__()

0

In [153]:
from itertools import zip_longest

In [166]:
list(zip_longest(*gens))

[(0, 1, 2, 3, 4), (0, 1, 2, 3, 4), (0, 1, 2, 3, 4)]

In [167]:
for a,b in zip((1,2,3),[4,5,6]):
    print(a,b)

1 4
2 5
3 6


In [5]:
from pydantic import BaseModel
from typing import Optional, List
from datetime import datetime, date

In [6]:
class TimeRange(BaseModel):
    past_days: Optional[int]
    date_before: Optional[date]
    date_after: Optional[date]

In [7]:
date?

[0;31mInit signature:[0m [0mdate[0m[0;34m([0m[0mself[0m[0;34m,[0m [0;34m/[0m[0;34m,[0m [0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m      date(year, month, day) --> date object
[0;31mFile:[0m           ~/anaconda3/lib/python3.7/datetime.py
[0;31mType:[0m           type
[0;31mSubclasses:[0m     datetime


In [14]:
tr_dict = {
    'date_after': '2021-01-01',
    'date_before': '2021-06-01',
}

In [15]:
TimeRange.parse_obj(tr_dict)

TimeRange(past_days=None, date_before=datetime.date(2021, 6, 1), date_after=datetime.date(2021, 1, 1))

In [16]:
from dateutil.parser import parse

In [17]:
parse("201203")

datetime.datetime(2003, 12, 20, 0, 0)

In [18]:
date(2012,1,2)>parse("201203")

TypeError: can't compare datetime.datetime to datetime.date

In [34]:
url = "http://www.tianqihoubao.com/lishi/wuhan/20170201.html"

In [20]:
import re

In [36]:
date_str = re.findall("\d{6,8}", url)[0]
date_str

'20170201'

In [41]:
date_str[4:6]

'02'

In [26]:
date_pattern = re.compile("\d{6}")

In [28]:
date_pattern.findall('a')

[]

In [29]:
date(2012,1)

TypeError: function missing required argument 'day' (pos 3)

In [33]:
datetime(int(date_str[:4]), int(date_str[-1]),1) < datetime.now()

True

In [37]:
int('01')

1

In [42]:
"武汉历史天气预报"[0:1]

'武'

In [43]:
p = re.compile('\d{6,8}')

In [44]:
p.findall(url)

['20170201']

In [45]:
d1 = datetime(2020,12,1)
d2 = datetime(2021,2,1)
d3 = datetime.now()

In [47]:
d1 <= d2 <= d3

True

In [48]:
s = "暴雨\r\n                                        /中雨"

In [52]:
s.replace("\r\n ", "").replace(" ", "")

'暴雨/中雨'

In [53]:
def relative_url_distance(a, b):
    pass

In [87]:
u1 = "http://www.tianqihoubao.com/lishi/"
u2 = "http://www.tianqihoubao.com/lishi//wuhan/month//201603.html"

In [59]:
u2.startswith(u1)

True

In [84]:
pu1 = re.compile(u1)

In [89]:
match = pu1.search(u2)
match

<re.Match object; span=(0, 34), match='http://www.tianqihoubao.com/lishi/'>

In [90]:
start, end = match.span()

In [97]:
u2[end:]

'/wuhan/month//201603.html'

In [96]:
u2[end:].split("/")

['', 'wuhan', 'month', '', '201603.html']

In [100]:
re.split("/", u2[end:])

['', 'wuhan', 'month', '', '201603.html']

In [76]:
0 < float('inf')

True

In [104]:
len([s for s in u2[end:].split("/") if len(s)])

3

In [111]:
len(list(filter(len, u2[end:].split("/"))))

3

In [112]:
len([s for s in u2[end:].split("/") if len(s) > 0])

3

In [126]:
u3 = "'http://www.tianqihoubao.com/lishi/jingzhou.html'"

In [124]:
def calculate_depth(url, start_url_pattern) -> float:
        """ Calculate depth relative to the start url """
        common_root_matched = start_url_pattern.search(url)
        print(common_root_matched)
        if common_root_matched is None:
            # current url has no common root with the start url
            return float('inf')
        else:
            _, end = common_root_matched.span()
            relative_url = url[end:]
            depth = len([s for s in relative_url.split("/")
                         if len(s) > 0])
            return depth

In [127]:
calculate_depth(u3, pu1)

<re.Match object; span=(1, 35), match='http://www.tianqihoubao.com/lishi/'>


1

In [115]:
u2

'http://www.tianqihoubao.com/lishi//wuhan/month//201603.html'