In [1]:
import requests
from aiohttp import ClientSession
import asyncio
from bs4 import BeautifulSoup
import pandas as pd

import time
import random
import logging

In [7]:
import nest_asyncio
nest_asyncio.apply()

In [2]:
logger = logging.getLogger(__name__)

In [3]:
BASIC_FORMAT = "%(asctime)s-%(levelname)s-%(message)s"
chlr = logging.StreamHandler()
chlr.setFormatter(logging.Formatter(BASIC_FORMAT))
logger.setLevel('DEBUG')
logger.addHandler(chlr)

In [31]:
class RentalCrawler:
    DOMAIN_URL = 'https://rent.591.com.tw/'
    GET_LIST_URL = 'https://rent.591.com.tw/home/search/rsList'
    HEADERS = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }
    REGION_IDS = [1, 3]

    def __init__(self):
        self.df_rentals = pd.DataFrame(columns=['post_id', 'title', 'region_id'])

    def _get_token(self, session):
        res = session.get(self.DOMAIN_URL, headers=self.HEADERS)
        soup = BeautifulSoup(res.text, 'html.parser')
        token = soup.select_one('meta[name="csrf-token"]').get('content')
        return token

    def _get_total_pages(self, session, token, region_id):
        headers = self.HEADERS.copy()
        headers['X-CSRF-TOKEN'] = token
        c = requests.cookies.RequestsCookieJar()
        c.set('urlJumpIp', f'{region_id}',
                domain='.591.com.tw',
                path='/')
        session.cookies.update(c)
        params = f'is_format_data=1&is_new_list=1&type=1&region={region_id}&firstRow=0'
        res = session.get(self.GET_LIST_URL, params=params, headers=headers)
        total_page = int(res.json()['records'].replace(',',''))//30+1
        cookies = session.cookies
        return total_page, cookies
    
    async def _get_rentals_by_page(self, session, region_id, page):
        first_row = 30 * (page-1)
        params = f'is_format_data=1&is_new_list=1&type=1&region={region_id}&firstRow={first_row}'
        async with session.get(crawler.GET_LIST_URL, params=params) as response:
            logger.info(f"""
                Page: {page}, Status: {response.status}
            """)
            data = await response.json()
            data = data['data']['data']
            for house in data:
                self.df_rentals = self.df_rentals.append({'post_id': house['post_id'], 'title': house['title'], 'region_id': region_id}, ignore_index=True)

    def _get_rentals_by_region(self, region_id: int = 1):
        with requests.session() as session:
            token = self._get_token(session=session)
            total_page, cookies = self._get_total_pages(
                session=session, token=token, region_id=region_id)
        headers = self.HEADERS.copy()
        headers['X-CSRF-TOKEN'] = token

        async def get_rentals_all_pages():
            async with ClientSession(headers=headers, cookies=cookies) as session:
                # await self._get_rentals_by_page(session=session, region_id=region_id, page=1)
                tasks = [asyncio.create_task(self._get_rentals_by_page(
                    session=session, region_id=region_id, page=page)) for page in range(1, total_page+1)]
                await asyncio.gather(*tasks)

        loop = asyncio.get_event_loop()
        loop.run_until_complete(get_rentals_all_pages())

    def _get_rentals_all(self):
        for region_id in self.REGION_IDS:
            self._get_rentals_by_region(region_id=region_id)
        logger.info(f'Finish get rentals list.')
        self.df_rentals.drop_duplicates(subset=['post_id'], keep='last', inplace=True)


In [32]:
crawler = RentalCrawler()

In [33]:
crawler._get_rentals_all()

2022-06-28 11:17:42,243-INFO-
                Page: 55, Status: 200
            
2022-06-28 11:17:42,333-INFO-
                Page: 62, Status: 200
            
2022-06-28 11:17:42,467-INFO-
                Page: 54, Status: 200
            
2022-06-28 11:17:42,568-INFO-
                Page: 69, Status: 200
            
2022-06-28 11:17:42,655-INFO-
                Page: 80, Status: 200
            
2022-06-28 11:17:42,763-INFO-
                Page: 99, Status: 200
            
2022-06-28 11:17:42,863-INFO-
                Page: 85, Status: 200
            
2022-06-28 11:17:42,966-INFO-
                Page: 82, Status: 200
            
2022-06-28 11:17:43,064-INFO-
                Page: 58, Status: 200
            
2022-06-28 11:17:43,163-INFO-
                Page: 88, Status: 200
            
2022-06-28 11:17:43,260-INFO-
                Page: 7, Status: 200
            
2022-06-28 11:17:43,358-INFO-
                Page: 79, Status: 200
            
2022-06-28 11:17:43,444-INFO-

In [34]:
crawler.df_rentals

Unnamed: 0,post_id,title,region_id
15,12774249,套房出租，交通便利，近文化大學公車站,1
16,12792555,全新1+1房拎包入住(含車位含管理費),1
17,12799312,24管理、大採光、天然瓦斯、三陽台、浴缸,1
18,12810641,電梯分租套房(捷運市府站),1
19,12668720,免爬高樓層/附個人專用洗衣機之分租套房,1
...,...,...,...
18535,12693199,進海山.土城捷運站.全新完工.四房加車位,3
18536,12693167,近海山.土城捷運站.三房+車位,3
18537,12693159,倉庫店面住家,3
18538,12693095,汐科國泰皇家天下高楼4房+車位,3


In [35]:
import sqlalchemy


ModuleNotFoundError: No module named 'sqlalchemy'

In [131]:
token, cookies, total_page = crawler._get_rentals_by_region(region_id=3)

In [133]:
headers = crawler.HEADERS.copy()
headers['X-CSRF-TOKEN'] = token

In [149]:
def get_data():
    headers = crawler.HEADERS.copy()
    headers['X-CSRF-TOKEN'] = token
    async def main():

        async with ClientSession(headers=headers, cookies=cookies) as session:
            params = f'is_format_data=1&is_new_list=1&type=1&region=3&firstRow=0'
            async with session.get(crawler.GET_LIST_URL, params=params) as response:
                print("Status:", response.status)
                data = await response.json()
                data = data['data']['data']
                print("Body:", "...")
                return data

    loop = asyncio.get_event_loop()
    data = loop.run_until_complete(main())
    return data

In [150]:
data = get_data()

Status: 200
Body: ...


In [151]:
data

[{'title': '金城商圈麥當勞對面精緻幽靜可貓',
  'type': '1',
  'post_id': 12672786,
  'kind_name': '分租套房',
  'room_str': '',
  'floor_str': '5F/5F',
  'community': '',
  'price': '9,500',
  'price_unit': '元/月',
  'photo_list': ['https://img2.591.com.tw/house/2018/01/05/151510757166643101.jpg!510x400.jpg',
   'https://img1.591.com.tw/house/2022/06/03/165421780607775000.jpg!510x400.jpg',
   'https://img2.591.com.tw/house/2022/06/03/165421791339126802.jpg!510x400.jpg',
   'https://img2.591.com.tw/house/2020/02/02/158065021336744604.jpg!510x400.jpg',
   'https://img2.591.com.tw/house/2020/02/05/158086028567538803.jpg!510x400.jpg',
   'https://img1.591.com.tw/house/2018/01/05/151510757295405506.jpg!510x400.jpg',
   'https://img1.591.com.tw/house/2018/01/05/151510757557181009.jpg!510x400.jpg',
   'https://img2.591.com.tw/house/2018/08/05/153344024798396208.jpg!510x400.jpg',
   'https://img2.591.com.tw/house/2016/09/30/147520521083050402.jpg!510x400.jpg',
   'https://img2.591.com.tw/house/2018/08/05/15334401

In [132]:
total_page

255

In [86]:
token

'wxQyh1XLymd8hry1Ebs9uG1zMJbK6zUWRe1RXI0z'

In [None]:
async with ClientSession() as session:

In [80]:
cookies

<RequestsCookieJar[Cookie(version=0, name='PHPSESSID', value='d6g2mgkc8kgfgltm7jq57ipfg5', port=None, port_specified=False, domain='.591.com.tw', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={}, rfc2109=False), Cookie(version=0, name='T591_TOKEN', value='d6g2mgkc8kgfgltm7jq57ipfg5', port=None, port_specified=False, domain='.591.com.tw', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=1971683992, discard=False, comment=None, comment_url=None, rest={}, rfc2109=False), Cookie(version=0, name='newUI', value='1', port=None, port_specified=False, domain='.591.com.tw', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={}, rfc2109=False), Cookie(version=0, name='new_rent_list_kind_test', value='1', port=None, port_specified=

In [57]:
class RentalCrawler:
    DOMAIN_URL = 'https://rent.591.com.tw/'
    GET_LIST_URL = 'https://rent.591.com.tw/home/search/rsList'
    HEADERS = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }

    def __init__(self):
        self.session = ClientSession()

    def _get_token(self):
        async def token_session():
            res = await self.session.get(self.DOMAIN_URL, headers=self.HEADERS)
            # print(res[:15])
            # soup = BeautifulSoup(res.text, 'html.parser')
            # token = soup.select_one('meta[name="csrf-token"]').get('content')
            return res
        loop = asyncio.get_event_loop()
        token = loop.run_until_complete(token_session())
        return token

    def _get_rentals(self, token: str, region_id: int=3):
        headers = self.HEADERS.copy()
        headers['X-CSRF-TOKEN'] = token
        c = requests.cookies.RequestsCookieJar()
        c.set('urlJumpIp', f'{region_id}',
                domain='.591.com.tw',
                path='/')
        self.session.cookies.update(c)
        df_rental_list = pd.DataFrame(columns=['post_id', 'title'])
        params = f'is_format_data=1&is_new_list=1&type=1&region={region_id}&firstRow=0'
        res = self.session.get(self.GET_LIST_URL, params=params, headers=headers)
        time.sleep(random.randint(3, 5))
        for house in res.json()['data']['data']:
            df_rentals = df_rentals.append({'post_id': house['post_id'], 'title': house['title']}, ignore_index=True)
        total_page = int(res.json()['records'].replace(',',''))//30+1
        if total_page > 1:
            for page in range(1, total_page):
                params = f'is_format_data=1&is_new_list=1&type=1&region={region_id}&firstRow={30*page}'
                res = self.session.get(self.GET_LIST_URL, params=params, headers=headers)
                time.sleep(random.randint(3, 5))
                for house in res.json()['data']['data']:
                    df_rentals = df_rentals.append({'post_id': house['post_id'], 'title': house['title']}, ignore_index=True)
        return df_rentals

In [58]:
crawler = RentalCrawler()

In [59]:
token = crawler._get_token()

In [69]:
token.read().t

<coroutine object ClientResponse.read at 0x000001990D71DB48>

In [71]:
soup = BeautifulSoup(token.read(), 'html.parser')

  codeob = compile(source, filename, symbol, self.flags, 1)
  codeob = compile(source, filename, symbol, self.flags, 1)


TypeError: object of type 'coroutine' has no len()

In [36]:
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
s = requests.Session()
url = 'https://rent.591.com.tw/'
r = s.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
token_item = soup.select_one('meta[name="csrf-token"]')

In [37]:
headers = headers.copy()
headers['X-CSRF-TOKEN'] = token_item.get('content')

In [105]:
j = requests.jar()

AttributeError: module 'requests' has no attribute 'jar'

In [None]:
requests.cookies

In [41]:
c = requests.cookies.RequestsCookieJar()
c.set('urlJumpIp', '3',
        domain='.591.com.tw',
        path='/')
s.cookies.update(c)

Cookie(version=0, name='urlJumpIp', value='3', port=None, port_specified=False, domain='.591.com.tw', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)

In [42]:
s.cookies.update(c)

In [44]:
s.cookies['urlJumpIp']

'3'

In [28]:
s.cookies.set('urlJumpIp', '3', domain='rent.591.com.tw', path='/')

Cookie(version=0, name='urlJumpIp', value='3', port=None, port_specified=False, domain='rent.591.com.tw', domain_specified=True, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)

In [45]:
url = 'https://rent.591.com.tw/home/search/rsList'
# params = 'is_format_data=1&is_new_list=1&type=1&region=3&firstRow=0'
params = 'is_format_data=1&is_new_list=1&type=1&region=3'
r = s.get(url, params=params, headers=headers)

In [84]:
# https://rent.591.com.tw/home/search/rsList?is_format_data=1&is_new_list=1&type=1&region=3
r = s.get('', params=params, headers=headers)

In [58]:
r.json().keys()

dict_keys(['status', 'data', 'records', 'is_recom', 'deal_recom', 'online_social_user', 'bluekai_data', 'recommend', 'seo'])

In [70]:
int(r.json()['records'].replace(',',''))//30+1

254

In [69]:
1//30

0

In [55]:
df_rental_list = pd.DataFrame(columns=['post_id', 'title'])
for house in r.json()['data']['data']:
    df_rental_list = df_rental_list.append({'post_id': house['post_id'], 'title': house['title']}, ignore_index=True)

In [56]:
df_rental_list

Unnamed: 0,post_id,title
0,12750791,環球莒光寵物友善兩大房格局
1,12713709,環球莒光寵物友善兩大房格局
2,12815016,近頂溪捷運站精心裝修的二楼舒適套房
3,12696888,【1樓整層】及【2、3樓女性套房】出租
4,12565749,板橋民生花市橋下停車場平日月租
5,12632769,新台五路二段套房出租
6,12684586,獨立衛浴~近捷運~可報稅申請補貼~限女性
7,12764172,台北矽谷大樓地下停車場
8,12805436,板橋.府中.對外窗.獨洗.台藝大致理.
9,12759355,【新大樓】永安捷運旁、陽台套房、中和路


In [50]:
r.json()['data']['data'][0]

{'title': '環球莒光寵物友善兩大房格局',
 'type': '1',
 'post_id': 12750791,
 'kind_name': '整層住家',
 'room_str': '2房1廳',
 'floor_str': '5F/5F',
 'community': '',
 'price': '24,800',
 'price_unit': '元/月',
 'photo_list': ['https://img1.591.com.tw/house/2022/06/14/165520490972242303.jpg!510x400.jpg',
  'https://img1.591.com.tw/house/2022/06/14/165520416894674101.jpg!510x400.jpg',
  'https://img2.591.com.tw/house/2022/06/14/165520416968016308.jpg!510x400.jpg',
  'https://img1.591.com.tw/house/2022/06/14/165520417031995408.jpg!510x400.jpg',
  'https://img1.591.com.tw/house/2022/06/14/165520417064122303.jpg!510x400.jpg'],
 'section_name': '中和區',
 'street_name': '國光街',
 'location': '中和區-國光街107巷',
 'rent_tag': [{'id': '1', 'name': '屋主直租'},
  {'id': '3', 'name': '拎包入住'},
  {'id': '5', 'name': '隨時可遷入'},
  {'id': '6', 'name': '可開伙'},
  {'id': '7', 'name': '可養寵物'}],
 'area': '25',
 'role_name': '屋主',
 'contact': '林先生',
 'refresh_time': '9分鐘內',
 'yesterday_hit': 326,
 'is_vip': 1,
 'is_combine': 1,
 'hurry': 0,
 

In [9]:
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
s = requests.Session()
url = f'https://rent.591.com.tw/home/12670629'
r = s.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
token_item = soup.select_one('meta[name="csrf-token"]')

headers = headers.copy()
headers['X-CSRF-TOKEN'] = token_item.get('content')
headers['deviceid'] = s.cookies.get_dict()['T591_TOKEN']
# headers['token'] = s.cookies.get_dict()['PHPSESSID']
headers['device'] = 'pc'

url = f'https://bff.591.com.tw/v1/house/rent/detail?id=12670629'
r = s.get(url, headers=headers)

In [11]:
r.json()['data']

{'breadcrumb': [{'name': '台北市',
   'id': 1,
   'query': 'region',
   'link': '/?region=1'},
  {'name': '萬華區', 'id': 6, 'query': 'section', 'link': '/?region=1&section=6'},
  {'name': '分租套房',
   'id': 3,
   'query': 'kind',
   'link': '/?region=1&section=6&kind=3'}],
 'title': '西門新套房可炊有烘衣機廚房',
 'deposit': '押金二個月',
 'kind': 3,
 'relieved': 0,
 'regionId': 1,
 'sectionId': 6,
 'shareInfo': {'url': 'https://www.591.com.tw/1R?salt=BYsK&s=',
  'from': '來自【591租屋】',
  'title': '台北市萬華區，分租套房出租，12867元/月，詳情：'},
 'dealText': '',
 'dealTime': 0,
 'browse': {'pc': 70, 'mobile': 161},
 'tags': [{'id': 2, 'value': '近捷運'},
  {'id': 3, 'value': '拎包入住'},
  {'id': 4, 'value': '近商圈'},
  {'id': 5, 'value': '隨時可遷入'},
  {'id': 6, 'value': '可開伙'}],
 'price': '12,867',
 'priceUnit': '元/月',
 'navData': [{'title': '位置與周邊', 'key': 'positionRound', 'active': 1},
  {'title': '設備與服務', 'key': 'service', 'active': 1},
  {'title': '屋況介紹', 'key': 'remark', 'active': 1},
  {'title': '房屋詳情', 'key': 'houseDetail', 'active': 