In [1]:
import asyncio
import datetime
import logging
import os
import random
import time

import pandas as pd
import requests
from aiohttp import ClientSession
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from sqlalchemy import create_engine, text
from sqlalchemy.types import BigInteger, INTEGER, VARCHAR, Numeric, TEXT

In [2]:
# The notebook kernel already runs an asyncio event loop; nest_asyncio patches
# asyncio so RentalCrawler can call loop.run_until_complete() from inside it.
import nest_asyncio
nest_asyncio.apply()

In [3]:
# Module-level logger; RentalCrawler writes its progress and failure messages to it.
logger = logging.getLogger(__name__)

In [4]:
BASIC_FORMAT = "%(asctime)s-%(levelname)s-%(message)s"

# getLogger() returns the same object for a given name, so fetching it again
# here is harmless and lets this cell be re-run on its own.
logger = logging.getLogger(__name__)
logger.setLevel('DEBUG')
# Keep records away from the root logger; otherwise every message is printed a
# second time in the root format ("INFO:__main__:...") once root has a handler.
logger.propagate = False

# Re-running this cell must not stack another console handler on the logger,
# which would print each message once per run of the cell.
for _stale in [h for h in logger.handlers if type(h) is logging.StreamHandler]:
    logger.removeHandler(_stale)

chlr = logging.StreamHandler()
chlr.setFormatter(logging.Formatter(BASIC_FORMAT))
logger.addHandler(chlr)

In [5]:
# The connection URL (user, password, host, database) is read from the
# environment so that credentials are not stored in the notebook. Set it before
# starting the kernel, e.g.
#   export RENTAL_DB_URL="postgresql://<user>:<password>@localhost/blackhead"
rental_conn = create_engine(os.environ["RENTAL_DB_URL"])

In [77]:
class RentalCrawler:
    """Crawler for rental listings published on rent.591.com.tw.

    The work is split into steps that are meant to be run one after another:

    1. ``_get_rentals_all`` downloads the listing index (post id, title,
       region) of every region in ``REGION_IDS`` into ``self.df_rentals``.
    2. ``_rentals_to_do`` replaces the rows of ``rental.rentals`` with
       ``self.df_rentals``.
    3. ``_get_rentals_detail_all`` reloads the ids from ``rental.rentals`` and
       collects one detail row per listing in ``self.df_rentals_detail``.
    """

    DOMAIN_URL = 'https://rent.591.com.tw/'
    GET_LIST_URL = 'https://rent.591.com.tw/home/search/rsList'
    GET_DETAIL_URL = 'https://bff.591.com.tw/v1/house/rent/detail?id='
    HEADERS = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }
    # Region ids that are crawled by _get_rentals_all.
    REGION_IDS = [1, 3]
    # Number of listings per result page; used for both the page count and the
    # ``firstRow`` offset of each page request.
    PAGE_SIZE = 30

    def __init__(self, rental_conn):
        """Create empty result frames and remember the database connectable.

        Args:
            rental_conn: connectable passed to ``to_sql`` / ``read_sql`` and
                used to truncate ``rental.rentals``.
        """
        self.df_rentals = pd.DataFrame(columns=['post_id', 'title', 'region_id'])
        self.df_rentals_detail = pd.DataFrame(columns=[
            'id', 'title', 'countyname', 'townname', 'tags', 'price', 'price_unit',
            'kind', 'area', 'address', 'lat', 'lon', 'desc', 'rule', 'content'
        ])
        self.rental_conn = rental_conn

    @staticmethod
    def _append_frame(existing, new_rows):
        """Return ``existing`` followed by ``new_rows`` with a fresh 0..n-1 index.

        Empty frames are left out of the concatenation so the dtypes of the
        real rows are not influenced by an empty placeholder frame.
        """
        frames = [frame for frame in (existing, new_rows) if not frame.empty]
        if not frames:
            return existing
        return pd.concat(frames, ignore_index=True)

    def _get_token(self, session):
        """Load the landing page and return the value of its csrf-token meta tag.

        The request is made through ``session``, so any cookies the site sets
        end up in that session as well.
        """
        res = session.get(self.DOMAIN_URL, headers=self.HEADERS)
        soup = BeautifulSoup(res.text, 'html.parser')
        token = soup.select_one('meta[name="csrf-token"]').get('content')
        return token

    def _get_total_pages(self, session, token, region_id):
        """Return ``(total_page, cookies)`` for one region.

        Sets the ``urlJumpIp`` cookie to ``region_id`` on ``session``, requests
        the first result page and derives the page count from the ``records``
        field of the JSON answer. ``cookies`` is the session's cookie jar.
        """
        headers = self.HEADERS.copy()
        headers['X-CSRF-TOKEN'] = token
        region_cookie = requests.cookies.RequestsCookieJar()
        region_cookie.set('urlJumpIp', f'{region_id}',
                          domain='.591.com.tw',
                          path='/')
        session.cookies.update(region_cookie)
        params = f'is_format_data=1&is_new_list=1&type=1&region={region_id}&firstRow=0'
        res = session.get(self.GET_LIST_URL, params=params, headers=headers)
        # ``records`` is a string with thousands separators, e.g. "11,021".
        record_count = int(res.json()['records'].replace(',', ''))
        # NOTE: when the count is an exact multiple of PAGE_SIZE this yields one
        # page more than needed; that extra page is simply requested as well.
        total_page = record_count // self.PAGE_SIZE + 1
        return total_page, session.cookies

    async def _get_rentals_by_page(self, session, region_id, page):
        """Fetch one result page and add its listings to ``self.df_rentals``.

        Args:
            session: aiohttp-style session already carrying headers and cookies.
            region_id: region the page belongs to; stored with every row.
            page: 1-based page number.
        """
        first_row = self.PAGE_SIZE * (page - 1)
        params = f'is_format_data=1&is_new_list=1&type=1&region={region_id}&firstRow={first_row}'
        async with session.get(self.GET_LIST_URL, params=params) as response:
            if (page % 50 == 0):
                logger.info(f"""
                    Page: {page}, Status: {response.status}
                """)
            payload = await response.json()

        records = [
            {'post_id': house['post_id'], 'title': house['title'], 'region_id': region_id}
            for house in payload['data']['data']
        ]
        if not records:
            return
        # One concat per page instead of one DataFrame.append per row:
        # DataFrame.append no longer exists in pandas 2.x and copied the whole
        # frame for every single listing.
        page_rows = pd.DataFrame(records, columns=self.df_rentals.columns)
        self.df_rentals = self._append_frame(self.df_rentals, page_rows)

    def _get_rentals_by_region(self, region_id: int = 1):
        """Download every result page of ``region_id`` into ``self.df_rentals``.

        A short-lived ``requests`` session obtains the CSRF token, the cookies
        and the page count; the pages themselves are then fetched concurrently
        through an aiohttp ``ClientSession``.
        """
        with requests.Session() as session:
            token = self._get_token(session=session)
            total_page, cookies = self._get_total_pages(
                session=session, token=token, region_id=region_id)
        headers = self.HEADERS.copy()
        headers['X-CSRF-TOKEN'] = token

        async def get_rentals_all_pages():
            async with ClientSession(headers=headers, cookies=cookies) as session:
                tasks = [
                    asyncio.create_task(self._get_rentals_by_page(
                        session=session, region_id=region_id, page=page))
                    for page in range(1, total_page + 1)
                ]
                await asyncio.gather(*tasks)

        loop = asyncio.get_event_loop()
        loop.run_until_complete(get_rentals_all_pages())

    def _get_rentals_all(self):
        """Crawl all regions in ``REGION_IDS`` and de-duplicate by ``post_id``.

        When a post id was collected more than once, the last occurrence wins.
        """
        for region_id in self.REGION_IDS:
            self._get_rentals_by_region(region_id=region_id)
        logger.info(f'Finish get rentals list.')
        self.df_rentals = self.df_rentals.drop_duplicates(subset=['post_id'], keep='last')

    def _rentals_to_do(self):
        """Replace the rows of ``rental.rentals`` with ``self.df_rentals``."""
        # text() plus an explicit transaction: SQLAlchemy 2.0 no longer accepts
        # raw SQL strings in execute() and no longer autocommits.
        with self.rental_conn.begin() as con:
            con.execute(text("""
                truncate table rental.rentals
            """))
        df_rentals_type = {
            'post_id': BigInteger,
            'title': VARCHAR(128),
            'region_id': INTEGER
        }
        self.df_rentals.to_sql(name='rentals',
                               schema='rental',
                               if_exists='append',
                               index=False,
                               dtype=df_rentals_type,
                               con=self.rental_conn)

    def _read_rentals(self):
        """Load ``post_id``, ``title`` and ``region_id`` from ``rental.rentals``.

        Uses the connectable given to the constructor (not a notebook global),
        and overwrites ``self.df_rentals`` with the result.
        """
        self.df_rentals = pd.read_sql("""
            select f1.post_id, f1.title, f1.region_id
            from rental.rentals f1
            """, con=self.rental_conn)

    def _use_rental_detail_api(self, session, headers, id):
        """Request the detail record of listing ``id``.

        Returns:
            The response object, or ``None`` when the request itself failed
            (connection error, timeout, ...). The failure is logged.
        """
        url = self.GET_DETAIL_URL + str(id)
        try:
            res = session.get(url, headers=headers, timeout=5)
        except requests.RequestException:
            # Only network-level failures are treated as "not available";
            # KeyboardInterrupt and programming errors still surface.
            logger.info(f'This id {id} is not available.')
            res = None
        return res

    def _parse_response(self, res, id):
        """Turn one detail response into a row of ``self.df_rentals_detail``.

        Nothing is added when ``res`` is ``None``, when the status code is not
        200, or when the payload cannot be parsed; the latter two are logged.
        """
        if res is None:
            # The request failed and was already logged by _use_rental_detail_api.
            return
        if res.status_code != 200:
            logger.info(f'This id {id} is not available.')
            return
        try:
            rental_data = res.json()['data']
            fav_data = rental_data['favData']
            position = rental_data['positionRound']
            service = rental_data['service']
            record = {
                'id': id,
                'title': rental_data['title'],
                'countyname': rental_data['breadcrumb'][0]['name'],
                'townname': rental_data['breadcrumb'][1]['name'],
                'tags': '|'.join([i['value'] for i in rental_data['tags']]),
                'price': fav_data['price'],
                'price_unit': rental_data['priceUnit'],
                'kind': fav_data.get('kindTxt', 'NA'),
                'area': float(fav_data.get('area', -1)),
                'address': fav_data.get('address', ''),
                'lat': float(position.get('lat', 0)),
                'lon': float(position.get('lng', 0)),
                'desc': service.get('desc', ''),
                'rule': service.get('rule', ''),
                # The remark is HTML; keep only its text content.
                'content': BeautifulSoup(rental_data['remark'].get(
                    'content', ''), "html.parser").text,
            }
            self.df_rentals_detail = self._append_frame(
                self.df_rentals_detail, pd.DataFrame([record]))
            # logger.info(f'Finish: {id}')
        except Exception:
            # Best effort: a listing with an unexpected payload is skipped.
            # ``Exception`` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            logger.info(f'Can not get detail from {id}')

    def _get_rentals_detail_all(self):
        """Fetch the detail record of every listing stored in ``rental.rentals``.

        Listings are requested one at a time with a random pause of up to one
        second between requests; progress is logged every 1000 listings.
        """
        self._read_rentals()
        # The context manager closes the session (and its connection pool) even
        # when the crawl is interrupted.
        with requests.Session() as session:
            session.mount('http://', HTTPAdapter(max_retries=3))
            session.mount('https://', HTTPAdapter(max_retries=3))
            token = self._get_token(session=session)
            cookies = session.cookies
            headers = self.HEADERS.copy()
            headers['X-CSRF-TOKEN'] = token
            headers['deviceid'] = cookies.get_dict()['T591_TOKEN']
            headers['device'] = 'pc'
            # Count processed rows explicitly instead of relying on the frame's
            # index labels being 0..n-1.
            for done, post_id in enumerate(self.df_rentals['post_id'], start=1):
                res = self._use_rental_detail_api(session=session, headers=headers, id=post_id)
                self._parse_response(res=res, id=post_id)
                time.sleep(random.random())
                if done % 1000 == 0:
                    logger.info(f'Finish number: {done}')


In [78]:
# Build the crawler around the shared database engine.
crawler = RentalCrawler(rental_conn=rental_conn)

In [79]:
# Step 1: download the listing index of every region in RentalCrawler.REGION_IDS
# into crawler.df_rentals (a progress line is logged for every 50th page).
crawler._get_rentals_all()

2022-06-30 09:05:12,796-INFO-
                    Page: 50, Status: 200
                
2022-06-30 09:05:14,953-INFO-
                    Page: 100, Status: 200
                
2022-06-30 09:05:16,301-INFO-
                    Page: 150, Status: 200
                
2022-06-30 09:05:26,571-INFO-
                    Page: 200, Status: 200
                
2022-06-30 09:05:31,543-INFO-
                    Page: 250, Status: 200
                
2022-06-30 09:05:36,511-INFO-
                    Page: 300, Status: 200
                
2022-06-30 09:05:38,885-INFO-
                    Page: 350, Status: 200
                
2022-06-30 09:05:48,020-INFO-
                    Page: 100, Status: 200
                
2022-06-30 09:05:52,312-INFO-
                    Page: 50, Status: 200
                
2022-06-30 09:05:58,966-INFO-
                    Page: 200, Status: 200
                
2022-06-30 09:06:04,452-INFO-
                    Page: 150, Status: 200
                
2022-06-30 0

In [80]:
# Step 2: truncate rental.rentals and store the freshly crawled listing index in it.
crawler._rentals_to_do()

In [81]:
# Step 3: fetch the detail record of every post id stored in rental.rentals.
# Long-running: one request per listing with a random pause of up to one second.
crawler._get_rentals_detail_all()


2022-06-30 09:23:41,986-INFO-Finish number: 1000
2022-06-30 09:34:46,877-INFO-Finish number: 2000
2022-06-30 09:45:21,217-INFO-Finish number: 3000
2022-06-30 09:56:16,371-INFO-Finish number: 4000
2022-06-30 10:04:19,268-INFO-Can not get detail from 12827702
2022-06-30 10:06:57,451-INFO-Finish number: 5000
2022-06-30 10:08:02,986-INFO-Can not get detail from 12808748
2022-06-30 10:09:13,847-INFO-Can not get detail from 12818003
2022-06-30 10:09:33,553-INFO-Can not get detail from 12743329
2022-06-30 10:17:26,148-INFO-Finish number: 6000
2022-06-30 10:21:07,548-INFO-Can not get detail from 12736520
2022-06-30 10:21:11,894-INFO-Can not get detail from 12822384
2022-06-30 10:28:39,832-INFO-Finish number: 7000
2022-06-30 10:37:26,269-INFO-Can not get detail from 12677070
2022-06-30 10:40:23,426-INFO-Finish number: 8000
2022-06-30 10:53:28,923-INFO-Can not get detail from 12787077
2022-06-30 10:53:39,874-INFO-Finish number: 9000
2022-06-30 10:57:23,874-INFO-Can not get detail from 12794478
2

In [82]:
# Work on a copy so the rows accumulated on the crawler stay untouched.
df_rentals_detail = crawler.df_rentals_detail.copy()

In [47]:
# df_rentals_detail


In [49]:
# Column name -> SQLAlchemy column type handed to to_sql for rental.rentals_info.
df_rentals_detail_type = dict(
    id=BigInteger,
    title=VARCHAR(128),
    countyname=VARCHAR(3),
    townname=VARCHAR(3),
    tags=VARCHAR(128),
    price=INTEGER,
    price_unit=VARCHAR(10),
    kind=VARCHAR(10),
    area=Numeric,
    address=VARCHAR(128),
    lat=Numeric,
    lon=Numeric,
    desc=VARCHAR(128),
    rule=VARCHAR(128),
    content=TEXT,
)

# Append the crawled detail rows to rental.rentals_info (the frame index is not written).
df_rentals_detail.to_sql(
    name='rentals_info',
    con=rental_conn,
    schema='rental',
    dtype=df_rentals_detail_type,
    index=False,
    if_exists='append',
)


In [30]:
# Inspect the crawled listing index (post_id, title, region_id).
crawler.df_rentals

Unnamed: 0,post_id,title,region_id
0,12779674,美樂，東湖站，哈拉影城。百萬裝潢，採光強,1
1,12806475,晴光全新大樓裝潢首租⚡2房2衛✔浴缸陽台,1
2,12820915,🐱陽台電梯套房可寵🐶✨獨立門牌💘,1
3,12801334,近捷運忠孝敦化站陽光套房/可短租,1
4,12813538,西園路/平價套房/萬華火車站/南機場夜市,1
...,...,...,...
18370,12719275,風景優美空氣清新交通方便，單租房間四間,3
18371,8031468,永安市場優美小套房,3
18372,12718775,樹林雅房出租,3
18373,12718723,平面車位出租,3


In [63]:
# Scratch re-run of the listing crawl; same call as step 1 earlier in the notebook.
crawler._get_rentals_all()

2022-06-28 16:54:07,148-INFO-
                    Page: 100, Status: 200
                
INFO:__main__:
                    Page: 100, Status: 200
                
2022-06-28 16:54:08,976-INFO-
                    Page: 50, Status: 200
                
INFO:__main__:
                    Page: 50, Status: 200
                
2022-06-28 16:54:15,815-INFO-
                    Page: 150, Status: 200
                
INFO:__main__:
                    Page: 150, Status: 200
                
2022-06-28 16:54:22,469-INFO-
                    Page: 200, Status: 200
                
INFO:__main__:
                    Page: 200, Status: 200
                
2022-06-28 16:54:25,621-INFO-
                    Page: 250, Status: 200
                
INFO:__main__:
                    Page: 250, Status: 200
                
2022-06-28 16:54:35,207-INFO-
                    Page: 300, Status: 200
                
INFO:__main__:
                    Page: 300, Status: 200
                
2022-06-28 1

In [66]:
# Scratch re-run: truncate rental.rentals and store crawler.df_rentals again.
crawler._rentals_to_do()

In [34]:
# Inspect the listing index again after the repeated crawl.
crawler.df_rentals

Unnamed: 0,post_id,title,region_id
15,12774249,套房出租，交通便利，近文化大學公車站,1
16,12792555,全新1+1房拎包入住(含車位含管理費),1
17,12799312,24管理、大採光、天然瓦斯、三陽台、浴缸,1
18,12810641,電梯分租套房(捷運市府站),1
19,12668720,免爬高樓層/附個人專用洗衣機之分租套房,1
...,...,...,...
18535,12693199,進海山.土城捷運站.全新完工.四房加車位,3
18536,12693167,近海山.土城捷運站.三房+車位,3
18537,12693159,倉庫店面住家,3
18538,12693095,汐科國泰皇家天下高楼4房+車位,3


In [43]:
# Column name -> SQLAlchemy column type (BigInteger, INTEGER, VARCHAR) used when
# writing the listing index with to_sql.
df_rentals_type = dict(
  post_id=BigInteger,
  title=VARCHAR(128),
  region_id=INTEGER,
)

In [55]:
# Append the listing index to rental.rentals; the frame index is not written.
crawler.df_rentals.to_sql(
    name='rentals',
    con=rental_conn,
    schema='rental',
    dtype=df_rentals_type,
    index=False,
    if_exists='append',
)


In [45]:
# (Re)create the engine. The connection URL, including the password, comes from
# the RENTAL_DB_URL environment variable so that no credentials are stored in
# the notebook, e.g. "postgresql://<user>:<password>@localhost/blackhead".
rental_conn = create_engine(os.environ["RENTAL_DB_URL"])

In [54]:
# Empty rental.rentals. The statement is wrapped in text() and run inside an
# explicit transaction (committed when the block exits): SQLAlchemy 2.0 neither
# accepts raw SQL strings in execute() nor autocommits.
with rental_conn.begin() as con:
  con.execute(text("""
    truncate table rental.rentals
  """))

In [39]:
# Read the whole rental.rentals table back to check what was stored.
df = pd.read_sql("""
  select *
  from rental.rentals
""", con=rental_conn)


In [42]:
# Scratch: the same steps as RentalCrawler._get_token, done by hand.
# Load the landing page with a browser user-agent and locate the csrf-token meta tag.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
s = requests.Session()
url = 'https://rent.591.com.tw/'
r = s.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
# select_one returns None when the tag is missing; later cells call .get() on it.
token_item = soup.select_one('meta[name="csrf-token"]')

In [31]:
# Add the CSRF token from the landing page to a copy of the request headers.
headers = headers.copy()
headers['X-CSRF-TOKEN'] = token_item.get('content')

In [46]:
# s.cookies


In [37]:
# Scratch: set the urlJumpIp cookie (the cookie RentalCrawler sets to the region
# id) to '17' on the manual session.
c = requests.cookies.RequestsCookieJar()
c.set('urlJumpIp', '17',
        domain='.591.com.tw',
        path='/')
s.cookies.update(c)

In [43]:
# Show the urlJumpIp cookie currently held by the manual session.
s.cookies['urlJumpIp']

'1'

In [45]:
# url = 'https://rent.591.com.tw/home/search/rsList'
# params = 'is_format_data=1&is_new_list=1&type=1&region=3&firstRow=0'
# params = 'is_format_data=1&is_new_list=1&type=1&region=17&section=250,245&searchtype=1'
# r = s.get(url, params=params, headers=headers)

In [44]:
# r.json()