In [1]:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from aiohttp import ClientSession
import asyncio
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from sqlalchemy.types import BigInteger, INTEGER, VARCHAR, Numeric, TEXT

import datetime
import time
import random
import logging

In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
logger = logging.getLogger(__name__)

In [4]:
BASIC_FORMAT = "%(asctime)s-%(levelname)s-%(message)s"
chlr = logging.StreamHandler()
chlr.setFormatter(logging.Formatter(BASIC_FORMAT))
logger.setLevel('DEBUG')
logger.addHandler(chlr)

In [5]:
rental_conn = create_engine("postgresql://postgres:gallon1995@localhost/blackhead")

In [6]:
class RentalCrawler:
    DOMAIN_URL = 'https://rent.591.com.tw/'
    GET_LIST_URL = 'https://rent.591.com.tw/home/search/rsList'
    GET_DETAIL_URL = 'https://bff.591.com.tw/v1/house/rent/detail?id='
    HEADERS = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }
    REGION_IDS = [1, 3]

    def __init__(self, rental_conn):
        self.df_rentals = pd.DataFrame(columns=['post_id', 'title', 'region_id'])
        self.df_rentals_detail = pd.DataFrame(columns=[
            'id', 'title', 'countyname', 'townname', 'tags', 'price', 'price_unit',
            'kind', 'area', 'address', 'lat', 'lon', 'desc', 'rule', 'content'
        ])
        self.rental_conn = rental_conn

    def _get_token(self, session):
        res = session.get(self.DOMAIN_URL, headers=self.HEADERS)
        soup = BeautifulSoup(res.text, 'html.parser')
        token = soup.select_one('meta[name="csrf-token"]').get('content')
        return token

    def _get_total_pages(self, session, token, region_id):
        headers = self.HEADERS.copy()
        headers['X-CSRF-TOKEN'] = token
        c = requests.cookies.RequestsCookieJar()
        c.set('urlJumpIp', f'{region_id}',
                domain='.591.com.tw',
                path='/')
        session.cookies.update(c)
        params = f'is_format_data=1&is_new_list=1&type=1&region={region_id}&firstRow=0'
        res = session.get(self.GET_LIST_URL, params=params, headers=headers)
        total_page = int(res.json()['records'].replace(',',''))//30+1
        cookies = session.cookies
        return total_page, cookies
    
    async def _get_rentals_by_page(self, session, region_id, page):
        first_row = 30 * (page-1)
        params = f'is_format_data=1&is_new_list=1&type=1&region={region_id}&firstRow={first_row}'
        async with session.get(self.GET_LIST_URL, params=params) as response:
            if (page % 50 == 0):
                logger.info(f"""
                    Page: {page}, Status: {response.status}
                """)
            data = await response.json()
            data = data['data']['data']
            for house in data:
                self.df_rentals = self.df_rentals.append({'post_id': house['post_id'], 'title': house['title'], 'region_id': region_id}, ignore_index=True)

    def _get_rentals_by_region(self, region_id: int = 1):
        with requests.session() as session:
            token = self._get_token(session=session)
            total_page, cookies = self._get_total_pages(
                session=session, token=token, region_id=region_id)
        headers = self.HEADERS.copy()
        headers['X-CSRF-TOKEN'] = token

        async def get_rentals_all_pages():
            async with ClientSession(headers=headers, cookies=cookies) as session:
                tasks = [asyncio.create_task(self._get_rentals_by_page(
                    session=session, region_id=region_id, page=page)) for page in range(1, total_page+1)]
                await asyncio.gather(*tasks)

        loop = asyncio.get_event_loop()
        loop.run_until_complete(get_rentals_all_pages())

    def _get_rentals_all(self):
        for region_id in self.REGION_IDS:
            self._get_rentals_by_region(region_id=region_id)
        logger.info(f'Finish get rentals list.')
        self.df_rentals.drop_duplicates(subset=['post_id'], keep='last', inplace=True)

    def _rentals_to_do(self):
        with self.rental_conn.connect() as con:
            con.execute("""
                truncate table rental.rentals
            """)
        df_rentals_type = {
            'post_id': BigInteger,
            'title': VARCHAR(128),
            'region_id': INTEGER
        }
        self.df_rentals.to_sql(name='rentals',
                            schema='rental',
                            if_exists='append',
                            index=False,
                            dtype=df_rentals_type,
                            con=self.rental_conn)
    
    def _read_rentals(self):
        self.df_rentals = pd.read_sql("""
            select f1.post_id, f1.title, f1.region_id
            from rental.rentals f1
            """, con=rental_conn)
    
    def _use_rental_detail_api(self, session, headers, id):
        url = self.GET_DETAIL_URL + str(id)
        try:
            res = session.get(url, headers=headers, timeout=5)
        except :
            logger.info(f'This id {id} is not available.')
            res = None
        return res

    def _parse_response(self, res, id):
        if (res.status_code != 200):
            logger.info(f'This id {id} is not available.')
            pass
        else:
            try:
                rental_data = res.json()['data']
                title = rental_data['title']
                countyname = rental_data['breadcrumb'][0]['name']
                townname = rental_data['breadcrumb'][1]['name']
                tags = '|'.join([i['value'] for i in rental_data['tags']])
                price = rental_data['favData']['price']
                price_unit = rental_data['priceUnit']
                kind = rental_data['favData'].get('kindTxt', 'NA')
                area = float(rental_data['favData'].get('area', -1))
                address = rental_data['favData'].get('address', '')
                lat = float(rental_data['positionRound'].get('lat', 0))
                lon = float(rental_data['positionRound'].get('lng', 0))
                desc = rental_data['service'].get('desc', '')
                rule = rental_data['service'].get('rule', '')
                content = BeautifulSoup(rental_data['remark'].get(
                    'content', ''), "html.parser").text
                df_rentals_detail = pd.DataFrame({
                    'id': [id], 'title': [title], 'countyname': [countyname], 'townname': [townname],
                    'tags': [tags], 'price': [price], 'price_unit': [price_unit], 'kind': [kind],
                    'area': [area], 'address': [address], 'lat': [lat], 'lon': [lon],
                    'desc': [desc], 'rule': [rule], 'content': [content]
                })
                self.df_rentals_detail = pd.concat([self.df_rentals_detail, df_rentals_detail]).reset_index(drop=True)
                # logger.info(f'Finish: {id}')
            except:
                logger.info(f'Can not get detail from {id}')
                pass
    
    def _get_rentals_detail_all(self):
        self._read_rentals()
        session = requests.Session()
        session.mount('http://', HTTPAdapter(max_retries=3))
        session.mount('https://', HTTPAdapter(max_retries=3))
        token = self._get_token(session=session)
        cookies = session.cookies
        headers = self.HEADERS.copy()
        headers['X-CSRF-TOKEN'] = token
        headers['deviceid'] = cookies.get_dict()['T591_TOKEN']
        headers['device'] = 'pc'
        for idx, data in self.df_rentals.iterrows():
            res = self._use_rental_detail_api(session=session, headers=headers, id=data.post_id)
            self._parse_response(res=res, id=data.post_id)
            time.sleep(random.random())
            if (idx+1) % 1000 == 0:
                logger.info(f'Finish number: {(idx+1)}')


In [7]:
crawler = RentalCrawler(rental_conn=rental_conn)

In [8]:
crawler._get_rentals_all()

2022-09-03 02:23:34,318-INFO-
                    Page: 50, Status: 200
                
2022-09-03 02:23:37,785-INFO-
                    Page: 100, Status: 200
                
2022-09-03 02:23:50,062-INFO-
                    Page: 200, Status: 200
                
2022-09-03 02:23:55,186-INFO-
                    Page: 150, Status: 200
                
2022-09-03 02:24:00,944-INFO-
                    Page: 300, Status: 200
                
2022-09-03 02:24:06,330-INFO-
                    Page: 250, Status: 200
                
2022-09-03 02:24:15,585-INFO-
                    Page: 350, Status: 200
                
2022-09-03 02:24:24,899-INFO-
                    Page: 50, Status: 200
                
2022-09-03 02:24:32,456-INFO-
                    Page: 100, Status: 200
                
2022-09-03 02:24:33,380-INFO-
                    Page: 150, Status: 200
                
2022-09-03 02:24:38,955-INFO-
                    Page: 200, Status: 200
                
2022-09-03 0

In [9]:
crawler._rentals_to_do()

In [10]:
crawler._get_rentals_detail_all()


2022-09-03 02:37:58,445-INFO-Finish number: 1000
2022-09-03 02:54:27,423-INFO-Finish number: 2000
2022-09-03 03:05:00,056-INFO-Can not get detail from 13023196
2022-09-03 03:08:50,983-INFO-Finish number: 3000
2022-09-03 03:21:24,194-INFO-Finish number: 4000
2022-09-03 03:34:09,815-INFO-Finish number: 5000
2022-09-03 03:39:29,534-INFO-Can not get detail from 13177972
2022-09-03 03:39:31,037-INFO-Can not get detail from 13177982
2022-09-03 03:39:31,900-INFO-Can not get detail from 13177986
2022-09-03 03:46:04,053-INFO-Finish number: 6000
2022-09-03 03:54:04,966-INFO-Can not get detail from 13023261
2022-09-03 03:58:36,686-INFO-Finish number: 7000
2022-09-03 04:12:53,744-INFO-Finish number: 8000
2022-09-03 04:16:31,272-INFO-Can not get detail from 13023051
2022-09-03 04:26:26,818-INFO-Finish number: 9000
2022-09-03 04:42:30,849-INFO-Finish number: 10000
2022-09-03 04:43:51,219-INFO-Can not get detail from 13022954
2022-09-03 04:44:07,098-INFO-Can not get detail from 13023122
2022-09-03 04

In [11]:
df_rentals_detail = crawler.df_rentals_detail.copy()

In [23]:
len(df_rentals_detail.query('id == 13125063').desc.values[0])


12

In [30]:
df_rentals_detail_type = {
    'id': BigInteger,
    'title': VARCHAR(128),
    'countyname': VARCHAR(3),
    'townname': VARCHAR(3),
    'tags': VARCHAR(128),
    'price': INTEGER,
    'price_unit': VARCHAR(10),
    'kind': VARCHAR(10),
    'area': Numeric,
    'address': VARCHAR(128),
    'lat': Numeric,
    'lon': Numeric,
    'desc': VARCHAR(128),
    'rule': VARCHAR(128),
    'content': TEXT,
}
df_rentals_detail.to_sql(name='rentals_detail',
                    schema='rental',
                    if_exists='append',
                    index=False,
                    dtype=df_rentals_detail_type,
                    con=rental_conn)


In [31]:
df = pd.read_sql("""
  select *
  from rental.rentals_detail
""", con=rental_conn)


In [32]:
df

Unnamed: 0,id,title,countyname,townname,tags,price,price_unit,kind,area,address,lat,lon,desc,rule,content,create_at
0,13125063,西藏路上超大套房1人舒適2人剛好,台北市,萬華區,屋主直租|拎包入住|隨時可遷入|免管理費,12000,元/月,獨立套房,8.0,台北市萬華區西藏路,25.028762,121.493489,最短租期一年，可隨時遷入,此房屋男女皆可租住，不可養寵物，不可開伙；適合學生及上班族,超寬敞大套房，可輕鬆隔出一個客廳，CP值超高～逼近9坪的使用空間，僅此一間！老公寓邊間，三面...,2022-09-03 07:30:59.892331
1,13123293,飯店管理&拎包入住（琢豐）,台北市,中山區,近捷運|有電梯|隨時可遷入|可開伙|有車位|有陽台,118800,元/月,獨立套房,31.6,台北市中山區松江路,25.053658,121.533704,最短租期一年，可隨時遷入,此房屋男女皆可租住，不可養寵物；適合上班族,大陸工程打造頂客族的城市住家特聘愛馬仕御用建築團隊R A D I室內設計精緻高級物業管理，住...,2022-09-03 07:30:59.892331
2,13126952,飯店客房月租，24小時櫃台飯店式管理,台北市,萬華區,近捷運|近商圈|有電梯|隨時可遷入|可開伙|押一付一,16000,元/月,分租套房,8.0,台北市萬華區昆明街76號,25.045717,121.505484,最短租期3月，可隨時遷入,此房屋男女皆可租住，不可養寵物,"西門町近電影街，交通便利；飯店客房分租，24小時櫃台服務，短期可無窗房12,000-13,0...",2022-09-03 07:30:59.892331
3,13164690,"近文德捷運站,市場,生活機能佳,電梯大樓",台北市,內湖區,屋主直租|近捷運|新上架|有電梯|隨時可遷入|可開伙,33000,元/月,整層住家,30.0,台北市內湖區文德路66巷,25.079144,121.582317,最短租期一年，可隨時遷入,不可養寵物；適合上班族及家庭,"電梯大樓，優質社區內，棟距大,採光好。周邊：附近有7-11、市場，學校, 夜市, 生活機能非...",2022-09-03 07:30:59.892331
4,13167799,Alife士林全新電梯陽台套房,台北市,士林區,新上架|拎包入住|有電梯|隨時可遷入|可開伙|有陽台,25800,元/月,獨立套房,14.6,台北市士林區福林路,25.097358,121.533142,最短租期一年，可隨時遷入,此房屋男女皆可租住，不可養寵物,歡迎申請租金補貼！！知名租屋品牌 Alife 全新落成新據點請加入 Alife LINE ...,2022-09-03 07:30:59.892331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19485,13027954,"全為一個注重衛生的妳,打造完美套房",新北市,蘆洲區,屋主直租|近捷運|拎包入住|有電梯|隨時可遷入|有陽台,11000,元/月,獨立套房,8.5,新北市蘆洲區中正路,25.081984,121.471764,最短租期一年，可隨時遷入,此房屋男女皆可租住，不可養寵物，不可開伙；適合學生及上班族,公車站牌在樓下.有陽台.磚牆隔間\r\n附近有便利商店、傳統市場、學校、醫療機構、夜市。,2022-09-03 07:30:59.892331
19486,13027686,宏泰市場內美甲店門口出租攤位,新北市,新莊區,近捷運|隨時可遷入|可養寵物,3000,元/月,其他,1.0,新北市新莊區自強街23巷,25.045802,121.453846,最短租期一年，可隨時遷入,不可開伙,宏泰市場內門口攤位~人潮多,2022-09-03 07:30:59.892331
19487,13026298,近耕莘護校全新水泥隔間套房,新北市,新店區,屋主直租|近捷運|拎包入住|隨時可遷入,8000,元/月,分租套房,4.0,新北市新店區中正路,24.977019,121.534608,最短租期一年，可隨時遷入,此房屋限女生租住，不可養寵物，不可開伙；適合學生及上班族,全新衛浴設備，磁磚地板，低樓層，全套家電設施，獨立門戶，有對外窗戶，光線佳靠近耕莘護校，安靜...,2022-09-03 07:30:59.892331
19488,13026096,套房出租【板橋大觀路2段】,新北市,板橋區,屋主直租|隨時可遷入|可短租,4000,元/月,分租套房,4.0,新北市板橋區大觀路二段,24.998444,121.441766,最短租期半年，可隨時遷入,此房屋男女皆可租住，不可養寵物，不可開伙,。套 房 出 租。 ◎房屋月租金：4000元~~~ 押 金 2 個 月 ~~ 水 費: 每月...,2022-09-03 07:30:59.892331


In [29]:
# truncate table rental.rentals

# rental_conn.execute("""
#   truncate table rental.rentals_info
# """)

rental_conn.execute("""
  alter table rental.rentals_info add primary key(id, create_at)
""")


ProgrammingError: (psycopg2.errors.InvalidTableDefinition) multiple primary keys for table "rentals_info" are not allowed

[SQL: 
  alter table rental.rentals_info add primary key(id, create_at)
]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [30]:
crawler.df_rentals

Unnamed: 0,post_id,title,region_id
0,12779674,美樂，東湖站，哈拉影城。百萬裝潢，採光強,1
1,12806475,晴光全新大樓裝潢首租⚡2房2衛✔浴缸陽台,1
2,12820915,🐱陽台電梯套房可寵🐶✨獨立門牌💘,1
3,12801334,近捷運忠孝敦化站陽光套房/可短租,1
4,12813538,西園路/平價套房/萬華火車站/南機場夜市,1
...,...,...,...
18370,12719275,風景優美空氣清新交通方便，單租房間四間,3
18371,8031468,永安市場優美小套房,3
18372,12718775,樹林雅房出租,3
18373,12718723,平面車位出租,3


In [63]:
crawler._get_rentals_all()

2022-06-28 16:54:07,148-INFO-
                    Page: 100, Status: 200
                
INFO:__main__:
                    Page: 100, Status: 200
                
2022-06-28 16:54:08,976-INFO-
                    Page: 50, Status: 200
                
INFO:__main__:
                    Page: 50, Status: 200
                
2022-06-28 16:54:15,815-INFO-
                    Page: 150, Status: 200
                
INFO:__main__:
                    Page: 150, Status: 200
                
2022-06-28 16:54:22,469-INFO-
                    Page: 200, Status: 200
                
INFO:__main__:
                    Page: 200, Status: 200
                
2022-06-28 16:54:25,621-INFO-
                    Page: 250, Status: 200
                
INFO:__main__:
                    Page: 250, Status: 200
                
2022-06-28 16:54:35,207-INFO-
                    Page: 300, Status: 200
                
INFO:__main__:
                    Page: 300, Status: 200
                
2022-06-28 1

In [66]:
crawler._rentals_to_do()

In [34]:
crawler.df_rentals

Unnamed: 0,post_id,title,region_id
15,12774249,套房出租，交通便利，近文化大學公車站,1
16,12792555,全新1+1房拎包入住(含車位含管理費),1
17,12799312,24管理、大採光、天然瓦斯、三陽台、浴缸,1
18,12810641,電梯分租套房(捷運市府站),1
19,12668720,免爬高樓層/附個人專用洗衣機之分租套房,1
...,...,...,...
18535,12693199,進海山.土城捷運站.全新完工.四房加車位,3
18536,12693167,近海山.土城捷運站.三房+車位,3
18537,12693159,倉庫店面住家,3
18538,12693095,汐科國泰皇家天下高楼4房+車位,3


In [43]:
# BigInteger, INTEGER, VARCHAR
df_rentals_type = {
  'post_id': BigInteger,
  'title': VARCHAR(128),
  'region_id': INTEGER
}

In [55]:
crawler.df_rentals.to_sql(name='rentals',
                          schema='rental',
                          if_exists='append',
                          index=False,
                          dtype=df_rentals_type,
                          con=rental_conn)


In [45]:
rental_conn = create_engine("postgresql://postgres:gallon1995@localhost/blackhead")

In [54]:
with rental_conn.connect() as con:
  con.execute("""
    truncate table rental.rentals
  """)

In [39]:
df = pd.read_sql("""
  select *
  from rental.rentals
""", con=rental_conn)


In [42]:
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
s = requests.Session()
url = 'https://rent.591.com.tw/'
r = s.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
token_item = soup.select_one('meta[name="csrf-token"]')

In [31]:
headers = headers.copy()
headers['X-CSRF-TOKEN'] = token_item.get('content')

In [46]:
# s.cookies


In [37]:
c = requests.cookies.RequestsCookieJar()
c.set('urlJumpIp', '17',
        domain='.591.com.tw',
        path='/')
s.cookies.update(c)

In [43]:
s.cookies['urlJumpIp']

'1'

In [45]:
# url = 'https://rent.591.com.tw/home/search/rsList'
# params = 'is_format_data=1&is_new_list=1&type=1&region=3&firstRow=0'
# params = 'is_format_data=1&is_new_list=1&type=1&region=17&section=250,245&searchtype=1'
# r = s.get(url, params=params, headers=headers)

In [44]:
# r.json()