In [42]:
import pandas as pd
import requests
from aiohttp import ClientSession
import asyncio
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from sqlalchemy.types import BigInteger, INTEGER, VARCHAR

import datetime
import time
import random
import logging

In [7]:
import nest_asyncio
nest_asyncio.apply()

In [2]:
logger = logging.getLogger(__name__)

In [3]:
BASIC_FORMAT = "%(asctime)s-%(levelname)s-%(message)s"
chlr = logging.StreamHandler()
chlr.setFormatter(logging.Formatter(BASIC_FORMAT))
logger.setLevel('DEBUG')
logger.addHandler(chlr)

In [None]:
rental_conn = create_engine("postgresql://postgres:gallon1995@localhost/blackhead")

In [60]:
class RentalCrawler:
    DOMAIN_URL = 'https://rent.591.com.tw/'
    GET_LIST_URL = 'https://rent.591.com.tw/home/search/rsList'
    HEADERS = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }
    REGION_IDS = [1, 3]

    def __init__(self, rental_conn):
        self.df_rentals = pd.DataFrame(columns=['post_id', 'title', 'region_id'])
        self.rental_conn = rental_conn

    def _get_token(self, session):
        res = session.get(self.DOMAIN_URL, headers=self.HEADERS)
        soup = BeautifulSoup(res.text, 'html.parser')
        token = soup.select_one('meta[name="csrf-token"]').get('content')
        return token

    def _get_total_pages(self, session, token, region_id):
        headers = self.HEADERS.copy()
        headers['X-CSRF-TOKEN'] = token
        c = requests.cookies.RequestsCookieJar()
        c.set('urlJumpIp', f'{region_id}',
                domain='.591.com.tw',
                path='/')
        session.cookies.update(c)
        params = f'is_format_data=1&is_new_list=1&type=1&region={region_id}&firstRow=0'
        res = session.get(self.GET_LIST_URL, params=params, headers=headers)
        total_page = int(res.json()['records'].replace(',',''))//30+1
        cookies = session.cookies
        return total_page, cookies
    
    async def _get_rentals_by_page(self, session, region_id, page):
        first_row = 30 * (page-1)
        params = f'is_format_data=1&is_new_list=1&type=1&region={region_id}&firstRow={first_row}'
        async with session.get(crawler.GET_LIST_URL, params=params) as response:
            if (page % 50 == 0):
                logger.info(f"""
                    Page: {page}, Status: {response.status}
                """)
            data = await response.json()
            data = data['data']['data']
            for house in data:
                self.df_rentals = self.df_rentals.append({'post_id': house['post_id'], 'title': house['title'], 'region_id': region_id}, ignore_index=True)

    def _get_rentals_by_region(self, region_id: int = 1):
        with requests.session() as session:
            token = self._get_token(session=session)
            total_page, cookies = self._get_total_pages(
                session=session, token=token, region_id=region_id)
        headers = self.HEADERS.copy()
        headers['X-CSRF-TOKEN'] = token

        async def get_rentals_all_pages():
            async with ClientSession(headers=headers, cookies=cookies) as session:
                # await self._get_rentals_by_page(session=session, region_id=region_id, page=1)
                tasks = [asyncio.create_task(self._get_rentals_by_page(
                    session=session, region_id=region_id, page=page)) for page in range(1, total_page+1)]
                await asyncio.gather(*tasks)

        loop = asyncio.get_event_loop()
        loop.run_until_complete(get_rentals_all_pages())

    def _get_rentals_all(self):
        for region_id in self.REGION_IDS:
            self._get_rentals_by_region(region_id=region_id)
        logger.info(f'Finish get rentals list.')
        self.df_rentals.drop_duplicates(subset=['post_id'], keep='last', inplace=True)

    def _rentals_to_do(self):
        with self.rental_conn.connect() as con:
            con.execute("""
                truncate table rental.rentals
            """)
        df_rentals_type = {
            'post_id': BigInteger,
            'title': VARCHAR(128),
            'region_id': INTEGER
        }
        self.df_rentals.to_sql(name='rentals',
                            schema='rental',
                            if_exists='append',
                            index=False,
                            dtype=df_rentals_type,
                            con=self.rental_conn)
    
    def _read_rentals(self):
        


In [62]:
crawler = RentalCrawler(rental_conn=rental_conn)


In [63]:
crawler._get_rentals_all()

2022-06-28 16:54:07,148-INFO-
                    Page: 100, Status: 200
                
INFO:__main__:
                    Page: 100, Status: 200
                
2022-06-28 16:54:08,976-INFO-
                    Page: 50, Status: 200
                
INFO:__main__:
                    Page: 50, Status: 200
                
2022-06-28 16:54:15,815-INFO-
                    Page: 150, Status: 200
                
INFO:__main__:
                    Page: 150, Status: 200
                
2022-06-28 16:54:22,469-INFO-
                    Page: 200, Status: 200
                
INFO:__main__:
                    Page: 200, Status: 200
                
2022-06-28 16:54:25,621-INFO-
                    Page: 250, Status: 200
                
INFO:__main__:
                    Page: 250, Status: 200
                
2022-06-28 16:54:35,207-INFO-
                    Page: 300, Status: 200
                
INFO:__main__:
                    Page: 300, Status: 200
                
2022-06-28 1

In [66]:
crawler._rentals_to_do()

In [34]:
crawler.df_rentals

Unnamed: 0,post_id,title,region_id
15,12774249,套房出租，交通便利，近文化大學公車站,1
16,12792555,全新1+1房拎包入住(含車位含管理費),1
17,12799312,24管理、大採光、天然瓦斯、三陽台、浴缸,1
18,12810641,電梯分租套房(捷運市府站),1
19,12668720,免爬高樓層/附個人專用洗衣機之分租套房,1
...,...,...,...
18535,12693199,進海山.土城捷運站.全新完工.四房加車位,3
18536,12693167,近海山.土城捷運站.三房+車位,3
18537,12693159,倉庫店面住家,3
18538,12693095,汐科國泰皇家天下高楼4房+車位,3


In [43]:
# BigInteger, INTEGER, VARCHAR
df_rentals_type = {
  'post_id': BigInteger,
  'title': VARCHAR(128),
  'region_id': INTEGER
}

In [55]:
crawler.df_rentals.to_sql(name='rentals',
                          schema='rental',
                          if_exists='append',
                          index=False,
                          dtype=df_rentals_type,
                          con=rental_conn)


In [45]:
rental_conn = create_engine("postgresql://postgres:gallon1995@localhost/blackhead")

In [54]:
with rental_conn.connect() as con:
  con.execute("""
    truncate table rental.rentals
  """)

In [39]:
df = pd.read_sql("""
  select *
  from rental.rentals
""", con=engine)

In [40]:
df

Unnamed: 0,post_id,title,region_id


In [49]:
import datetime

In [52]:
datetime.datetime.today()

datetime.datetime(2022, 6, 28, 16, 32, 36, 846692)

In [131]:
token, cookies, total_page = crawler._get_rentals_by_region(region_id=3)

In [133]:
headers = crawler.HEADERS.copy()
headers['X-CSRF-TOKEN'] = token

In [149]:
def get_data():
    headers = crawler.HEADERS.copy()
    headers['X-CSRF-TOKEN'] = token
    async def main():

        async with ClientSession(headers=headers, cookies=cookies) as session:
            params = f'is_format_data=1&is_new_list=1&type=1&region=3&firstRow=0'
            async with session.get(crawler.GET_LIST_URL, params=params) as response:
                print("Status:", response.status)
                data = await response.json()
                data = data['data']['data']
                print("Body:", "...")
                return data

    loop = asyncio.get_event_loop()
    data = loop.run_until_complete(main())
    return data

In [36]:
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
s = requests.Session()
url = 'https://rent.591.com.tw/'
r = s.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
token_item = soup.select_one('meta[name="csrf-token"]')

In [37]:
headers = headers.copy()
headers['X-CSRF-TOKEN'] = token_item.get('content')

In [None]:
requests.cookies

In [41]:
c = requests.cookies.RequestsCookieJar()
c.set('urlJumpIp', '3',
        domain='.591.com.tw',
        path='/')
s.cookies.update(c)

Cookie(version=0, name='urlJumpIp', value='3', port=None, port_specified=False, domain='.591.com.tw', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)

In [42]:
s.cookies.update(c)

In [44]:
s.cookies['urlJumpIp']

'3'

In [28]:
s.cookies.set('urlJumpIp', '3', domain='rent.591.com.tw', path='/')

Cookie(version=0, name='urlJumpIp', value='3', port=None, port_specified=False, domain='rent.591.com.tw', domain_specified=True, domain_initial_dot=False, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None}, rfc2109=False)

In [45]:
url = 'https://rent.591.com.tw/home/search/rsList'
# params = 'is_format_data=1&is_new_list=1&type=1&region=3&firstRow=0'
params = 'is_format_data=1&is_new_list=1&type=1&region=3'
r = s.get(url, params=params, headers=headers)

In [84]:
# https://rent.591.com.tw/home/search/rsList?is_format_data=1&is_new_list=1&type=1&region=3
r = s.get('', params=params, headers=headers)

In [58]:
r.json().keys()

dict_keys(['status', 'data', 'records', 'is_recom', 'deal_recom', 'online_social_user', 'bluekai_data', 'recommend', 'seo'])

In [70]:
int(r.json()['records'].replace(',',''))//30+1

254

In [106]:
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
s = requests.Session()
url = f'https://rent.591.com.tw/'
r = s.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
token_item = soup.select_one('meta[name="csrf-token"]')

headers = headers.copy()
headers['X-CSRF-TOKEN'] = token_item.get('content')
headers['deviceid'] = s.cookies.get_dict()['T591_TOKEN']
# headers['token'] = s.cookies.get_dict()['PHPSESSID']
headers['device'] = 'pc'

url = f'https://bff.591.com.tw/v1/house/rent/detail?id=12716453'
r = s.get(url, headers=headers)

In [107]:
r.json().keys()

dict_keys(['status', 'msg', 'data'])

In [108]:
r.json()['data'].keys()

dict_keys(['breadcrumb', 'title', 'deposit', 'kind', 'relieved', 'regionId', 'sectionId', 'shareInfo', 'dealText', 'dealTime', 'browse', 'tags', 'price', 'priceUnit', 'navData', 'priceCacheTxt', 'priceCache', 'info', 'publish', 'rooms', 'positionRound', 'service', 'preference', 'remark', 'houseDetail', 'costData', 'infoData', 'questionData', 'linkInfo', 'favData'])

In [142]:
r.json()['data']['favData']


{'thumb': 'https://img2.591.com.tw/house/2021/11/02/163582444018942406.jpg!190x150.water2.jpg',
 'title': '免仲介費電梯套房限女，有管理代收信件包裹',
 'layout': '',
 'address': '台北市士林區文林路765號',
 'price': 13000,
 'area': '7',
 'kindTxt': '分租套房',
 'posttime': 1654660983,
 'count': 13}