Skip to content

Commit

Permalink
fix(class): Fix #17 issue
Browse files Browse the repository at this point in the history
  • Loading branch information
Jack authored and Jack committed Nov 10, 2022
1 parent 3e94153 commit 21e7dc9
Show file tree
Hide file tree
Showing 5 changed files with 455 additions and 521 deletions.
91 changes: 5 additions & 86 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,94 +30,10 @@ poetry install ruia-peewee-async[all]

## Usage

A complete example is like below.
```python
# -*- coding: utf-8 -*-
from peewee import CharField
from ruia import AttrField, Item, Response, TextField

from ruia_peewee_async import (
RuiaPeeweeInsert,
RuiaPeeweeUpdate,
Spider,
TargetDB,
after_start,
)

class DoubanItem(Item):
target_item = TextField(css_select="tr.item")
title = AttrField(css_select="a.nbg", attr="title")
url = AttrField(css_select="a.nbg", attr="href")

async def clean_title(self, value):
return value.strip()

class DoubanSpider(Spider):
start_urls = ["https://movie.douban.com/chart"]
# aiohttp_kwargs = {"proxy": "http://127.0.0.1:7890"}

async def parse(self, response: Response):
async for item in DoubanItem.get_items(html=await response.text()):
yield RuiaPeeweeInsert(item.results) # default is MySQL
# yield RuiaPeeweeInsert(item.results, filters="url") # use url field(column) to deduplicate, avoid unnecessary insert query executed.
# yield RuiaPeeweeInsert(item.results, database=TargetDB.POSTGRES) # save to Postgresql
# yield RuiaPeeweeInsert(item.results, database=TargetDB.BOTH) # save to both MySQL and Postgresql

class DoubanUpdateSpider(Spider):
start_urls = ["https://movie.douban.com/chart"]

async def parse(self, response: Response):
async for item in DoubanItem.get_items(html=await response.text()):
res = {}
res["title"] = item.results["title"]
res["url"] = "http://whatever.youwanttoupdate.com"
yield RuiaPeeweeUpdate(
res,
{"title": res["title"]},
database=TargetDB.POSTGRES, # default is MySQL
)

# Args for RuiaPeeweeUpdate
# data: A dict that's going to be updated in the database.
# query: A peewee's query or a dict to search for the target data in database.
# database: The target database type.
# filters: A str or List[str] of columns used to detect duplicate data and avoid executing unnecessary queries.
# create_when_not_exists: Default is True. If True, a record is created when the query can't find one.
# not_update_when_exists: Default is True. If True and the record exists, the existing record is not updated.
# only: A list or tuple of fields; when given, only these fields are updated.
mysql = {
"host": "127.0.0.1",
"port": 3306,
"user": "ruiamysql",
"password": "abc123",
"database": "ruiamysql",
"model": {
"table_name": "ruia_mysql",
"title": CharField(),
"url": CharField(),
},
}
postgres = {
"host": "127.0.0.1",
"port": 5432,
"user": "ruiapostgres",
"password": "abc123",
"database": "ruiapostgres",
"model": {
"table_name": "ruia_postgres",
"title": CharField(),
"url": CharField(),
},
}

if __name__ == "__main__":
DoubanSpider.start(after_start=after_start(mysql=mysql))
# DoubanSpider.start(after_start=after_start(postgres=postgres))
# DoubanSpider.start(after_start=after_start(mysql=mysql, postgres=postgres))
# DoubanUpdateSpider.start(after_start=after_start(mysql=mysql))
```
A complete example is in [the example directory](./examples/douban.py).

There's a `create_model` method to create the Peewee model based on database configuration.
You can use the `create_model` method to manipulate tables before starting the spider.
```python
from ruia_peewee_async import create_model

Expand Down Expand Up @@ -167,6 +83,9 @@ poetry install && poetry install -E aiomysql -E aiopg
```
to install all dependencies.

macOS users have to run `brew install postgresql` to install PostgreSQL and add the directory containing `pg_config` to the PATH,
so that the `psycopg2` dependency can be installed successfully with pip.

- Use `poetry shell` to enter the virtual environment.
Or open your favorite editor and select the virtual environment to start coding.
- Use `pytest` to run the unit tests under the `tests` folder.
Expand Down
10 changes: 5 additions & 5 deletions examples/douban.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
Spider,
TargetDB,
after_start,
before_stop
)


class DoubanItem(Item):
target_item = TextField(css_select="tr.item")
title = AttrField(css_select="a.nbg", attr="title")
Expand All @@ -19,7 +19,6 @@ class DoubanItem(Item):
async def clean_title(self, value):
return value.strip()


class DoubanSpider(Spider):
start_urls = ["https://movie.douban.com/chart"]
# aiohttp_kwargs = {"proxy": "http://127.0.0.1:7890"}
Expand All @@ -33,7 +32,6 @@ async def parse(self, response: Response):
# yield RuiaPeeweeInsert(item.results, database=TargetDB.POSTGRES) # save to Postgresql
# yield RuiaPeeweeInsert(item.results, database=TargetDB.BOTH) # save to both MySQL and Postgresql


class DoubanUpdateSpider(Spider):
start_urls = ["https://movie.douban.com/chart"]

Expand All @@ -57,7 +55,6 @@ async def parse(self, response: Response):
# not_update_when_exists: Default is True. If True and the record exists, the existing record is not updated.
# only: A list or tuple of fields; when given, only these fields are updated.


mysql = {
"host": "127.0.0.1",
"port": 3306,
Expand All @@ -84,7 +81,10 @@ async def parse(self, response: Response):
}

if __name__ == "__main__":
DoubanSpider.start(after_start=after_start(mysql=mysql))
spider = DoubanSpider.start(
after_start=after_start(mysql=mysql),
before_stop=before_stop
)
# DoubanSpider.start(after_start=after_start(postgres=postgres))
# DoubanSpider.start(after_start=after_start(mysql=mysql, postgres=postgres))
# DoubanUpdateSpider.start(after_start=after_start(mysql=mysql))
Loading

0 comments on commit 21e7dc9

Please sign in to comment.