# Data Preparation With Web Scraping

In [9]:
from bs4 import BeautifulSoup  
import pandas as pd
from tqdm import tqdm
import math
import requests  
import lxml
import re
import time
import numpy as np

In [85]:
# if regular expression fails, return np.nan
def re_match(re_pattern, string, pattern2=None, errif=np.nan):
    """Return the first regex match in ``string``, stripped of whitespace.

    Args:
        re_pattern: Primary pattern handed to ``re.findall``.
        string: Text to search.
        pattern2: Optional fallback pattern, tried only when ``re_pattern``
            finds nothing.
        errif: Value returned when no pattern matches (``np.nan`` by default).

    Returns:
        The stripped first match of ``re_pattern``; failing that, the stripped
        first match of ``pattern2`` (when given); failing that, ``errif``.
    """
    found = re.findall(re_pattern, string)
    if not found and pattern2 is not None:
        # The fallback may miss as well; that must yield ``errif`` rather
        # than an IndexError escaping to the caller.
        found = re.findall(pattern2, string)
    if not found:
        return errif
    return found[0].strip()

In [129]:
def _parse_listing(info_html, area_name):
    '''
    Turn the HTML of one listing card into a flat dictionary of fields.

    info_html: the listing card serialised to a string
    area_name: label stored in the 'area' field (the key from area_dic)

    Fields whose pattern is not found are filled with np.nan by re_match.
    '''
    info_dic = {}
    # district
    info_dic['area'] = area_name
    # title
    info_dic['title'] = re_match('target="_blank">(.*?)</a><!--', info_html)
    # community name
    info_dic['community'] = re_match('xiaoqu.*?target="_blank">(.*?)</a>', info_html)
    # location
    info_dic['position'] = re_match('<a href.*?target="_blank">(.*?)</a>.*?class="address">', info_html)
    # unit price; str() is kept from the original, so a missing price is
    # stored as the string 'nan' rather than as np.nan
    info_dic['unit_price'] = str(re_match('<div class="unitPrice".*?><span>(.*?)元/平</span></div>', info_html))

    # tax related, deeds over 5 years
    info_dic['tax'] = re_match('class="taxfree">(.*?)</span>', info_html)
    # number of followers / time since release
    info_dic['star'] = re_match('class="starIcon"></span>(.*?)人关注.*?</div>', info_html)
    info_dic['release_time'] = re_match('class="starIcon"></span>.*? / (.*?)以前发布</div>', info_html)
    # whether close to subway stations
    info_dic['subway'] = re_match('class="subway">(.*?)</span>', info_html)
    # allows VR inspection
    info_dic['vr'] = re_match('class="vr">(.*?)</span>', info_html)
    # has key
    info_dic['has_key'] = re_match('class="haskey">(.*?)</span>', info_html)

    # other tags, split by |; a card may carry fewer tags than there are
    # field names, in which case the trailing fields are simply left out
    icon_fields = ('house_type', 'house_size', 'direction', 'fitment',
                   'floor', 'year_built', 'structure_type', 'other_feature')
    icon_found = re.findall('class="houseIcon"></span>(.*?)</div>', info_html)
    icons = icon_found[0].strip().split('|') if icon_found else []
    for field, raw in zip(icon_fields, icons):
        if field == 'house_size':
            try:
                info_dic[field] = float(raw.replace('平米', ''))
            except ValueError:
                # second tag was not a plain "<number>平米" value
                info_dic[field] = np.nan
        else:
            info_dic[field] = raw.strip()
    return info_dic


def scrap(citycode, area_dic):
    '''
    Scrape second-hand housing listings from the Lian Jia website.

    citycode: city sub-domain, e.g. 'cd' for https://cd.lianjia.com
    area_dic: dictionary mapping an area label (stored in the 'area' column)
              to the URL slug of that area

    Returns a DataFrame with one row per listing card found; it is empty
    when no card was found. An area whose listing count cannot be read from
    its first page is reported and skipped instead of aborting the whole run.
    '''
    base_url = f'https://{citycode}.lianjia.com/ershoufang/'
    # header
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
               'Referer': base_url}
    # create a new session; headers are set on the session so that every
    # request sends them, not only the first one
    sess = requests.session()
    sess.headers.update(headers)
    sess.get(base_url)
    # ex. https://sz.lianjia.com/ershoufang/luohuqu/pg2/
    page_url = base_url + '{}/pg{}/'

    # Rows are gathered in a list and converted once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the frame per row.
    rows = []
    for key_, value_ in area_dic.items():
        # total number of real estate
        start_url = 'https://{}.lianjia.com/ershoufang/{}/'.format(citycode, value_)
        html = sess.get(start_url).text
        count_found = re.findall('共找到<span> (.*?) </span>套.*二手房', html)
        try:
            house_num = count_found[0]
            house_count = int(house_num)
        except (IndexError, ValueError):
            print('Skipping {}: could not read the listing count from {}'.format(key_, start_url))
            time.sleep(1)
            continue
        print('👻 District: {}\n😈 Total real estate for sale:「{}」'.format(key_, house_num))
        time.sleep(1)
        # website constraint: at most 3000 listings are paged, 30 per page
        total_page = int(math.ceil(min(3000, house_count) / 30.0))
        for i in tqdm(range(total_page), colour = 'WHITE', desc=key_):
            html = sess.get(page_url.format(value_, i+1)).text
            soup = BeautifulSoup(html, 'lxml')
            for info in soup.find_all(class_="info clear"):
                rows.append(_parse_listing(str(info), key_))
    return pd.DataFrame(rows)

In [144]:
# All districts: Jinjiang, Qingyang, Wuhou, Gaoxin, Chenghua, Jinniu,
# Tianfu New Area, Gaoxin West, Shuangliu, Wenjiang, Pidu, Longquanyi,
# Xindu, Qingbaijiang.
#
# One dictionary per district. Each maps a sub-area's Chinese display name
# (stored by scrap() in the 'area' column) to the URL slug that scrap()
# inserts into https://<citycode>.lianjia.com/ershoufang/<slug>/.
#
# NOTE(review): several sub-areas are listed under more than one district
# with the same slug (e.g. '盐市口' in jinjiang and qingyang, '丽都' in wuhou
# and gaoxin, '华阳' in gaoxin and tianfuxinqu), so those pages are scraped
# once per dictionary and their listings appear more than once in the
# combined data.

# Jinjiang district
jinjiang = {'川师':'chuanshi','大面':'damian','东大路':'dongdalu','东光小区':'dongguangxiaoqu','东湖':'donghu',
'东客站':'dongkezhan','合江亭':'hejiangting','红星路':'hongxinglu','静居寺':'jingjusi','九眼桥':'jiuyanqiao',
'蓝谷地':'langudi','莲花':'lianhua2','琉璃场':'liulichang','攀成钢':'panchenggang','三官堂':'sanguantang',
'三圣乡':'sanshengxiang','沙河堡':'shahebao','水碾河':'shuinianhe','盐市口':'yanshikou','卓锦城':'zhuojincheng'}

# Qingyang district
qingyang = {'八宝街':'babaojie','贝森':'beisen','草市街':'caoshijie','草堂':'caotang','府南新区':'funanxinqu',
'光华泡小':'guanghuapaoxiao','浣花溪':'huanhuaxi','金沙':'jinsha','宽窄巷子':'kuanzhaixiangzi','人民公园':'renmingongyuan',
'蜀汉路':'shuhanlu','太升路':'taishenglu','外光华':'waiguanghua','外金沙':'waijinsha','万家湾':'wanjiawan',
'西南财大':'xinancaida','盐市口':'yanshikou','优品道':'youpindao'}

# Wuhou district
wuhou = {'草金立交':'caojinlijiao','川大':'chuanda','川音':'chuanyin','簇桥':'cuqiao','高升桥':'gaoshengqiao',
'广福桥':'guangfuqiao','航空路':'hangkonglu','红牌楼':'hongpailou','华西':'huaxi','火车南站':'huochenanzhan',
'丽都':'lidu','龙湾':'longwan','双楠':'shuangnan','桐梓林':'tongzilin','外双楠':'waishuangnan','五大花园':'wudahuayuan',
'武侯祠':'wuhouci','武侯立交':'wuhoulijiao','新双楠':'xinshuangnan','玉林':'yulin','棕北':'zongbei'}

# Gaoxin district
gaoxin = {'城南宜家':'chengnanyijia','大源':'dayuan','东苑':'dongyuan2','芳草':'fangcao','高朋':'gaopeng','广都':'guangdou',
'华府':'huafu1','华阳':'huayang','金融城':'jinrongcheng','丽都':'lidu','神仙树':'shenxianshu','市一医院':'shiyiyiyuan',
'天府长城':'tianfuchangcheng','新北':'xinbei','新会展':'xinhuizhan','新双楠':'xinshuangnan','衣冠庙':'yiguanmiao',
'远大':'yuanda','中德':'zhongde','中和':'zhonghe','紫荆':'zijing'}

# Chenghua district
chenghua = {'八里小区':'balixiaoqu','成渝立交':'chengyulijiao','东郊记忆':'dongjiaojiyi','东客站':'dongkezhan','动物园':'dongwuyuan',
'建设路':'jianshelu','理工大':'ligongda','李家沱':'lijiatuo','龙潭寺':'longtansi','猛追湾':'mengzhuiwan','沙河堡':'shahebao',
'驷马桥':'simaqiao','SM广场':'smguangchang','万年场':'wannianchang','万象城':'wanxiangcheng1','新华公园':'xinhuagongyuan'}

# Jinniu district
jinniu = {'八宝街':'babaojie','茶店子':'chadianzi','抚琴小区':'fuqinxiaoqu','高家庄':'gaojiazhuang','国宾':'guobin',
'花牌坊':'huapaifang','华侨城':'huaqiaocheng3','花照壁':'huazhaobi','金府':'jinfu','金牛万达':'jinniuwanda',
'九里堤':'jiulidi','马鞍路':'maanlu','沙湾':'shawanhuizhan','石人小区':'shirenxiaoqu','蜀汉路':'shuhanlu','天回镇':'tianhuizhen',
'通惠门':'tonghuimen','五块石':'wukuaishi','西南交大':'xinanjiaoda','营门口':'yingmenkou','一品天下':'yipintianxia'}

# Tianfu New Area
tianfuxinqu = {'海洋公园':'haiyanggongyuan','华阳':'huayang','锦江生态带':'jinjiangshengtaidai','麓湖生态城':'luhushengtaicheng',
'麓山':'lushan','南湖':'nanhu','仁寿':'renshou1','四河':'sihe','新会展':'xinhuizhan','雅居乐':'yajule'}

# Gaoxin West
gaoxinxi = {'高新西':'gaoxinxi','中海国际':'zhonghaiguoji'}

# Shuangliu district
shuangliu = {'簇桥':'cuqiao','东升镇':'dongshengzhen','公兴':'gongxing','航空港':'hangkonggang','蛟龙港':'jiaolonggang',
'警院':'jingyuan','九龙湖':'jiulonghu1','牧马山':'mumashan','双流城区':'shuangliuchengqu','文星镇':'wenxingzhen',
'新津':'xinjin'}

# Wenjiang district
wenjiang = {'芙蓉古城':'furonggucheng','光华大道沿线':'guanghuadadaoyanxian','国色天乡':'guosetianxiang','花都大道':'huadudadao',
'温江大学城':'wenjiangdaxuecheng','温江老城':'wenjianglaocheng','温江新城':'wenjiangxincheng','珠江新城':'zhujiangxincheng'}

# Pidu district
pidu = {'成外':'chengwai','红光':'hongguang','郫县城区':'pixianchengqu','郫县万达':'pixianwanda','橡树湾':'xiangshuwan',
'犀浦':'xipu'}

# Longquanyi district
longquanyi = {'东山':'dongshan','航天':'hangtian','洪河':'honghe','龙泉驿城区':'longquanyichengqu','十陵':'shiling2',
'西河':'xihe','阳光城':'yangguangcheng'}

# Xindu district
xindu = {'保利公园':'baoligongyuan','大丰':'dafeng','毗河':'pihe','新都城区':'xinduchengqu'}

# Qingbaijiang district
qingbaijiang = {'青白江':'qingbaijiang'}


# District dictionaries grouped into the two batches that are scraped and
# saved separately below.
inner_districts = [jinjiang, qingyang, wuhou, jinniu, gaoxin, chenghua]
outer_districts = [tianfuxinqu, gaoxinxi, shuangliu, wenjiang, pidu, longquanyi, xindu, qingbaijiang]

In [131]:
# Start from empty frames so all three names exist before any scraping runs.
inner_data = pd.DataFrame()
outer_data = pd.DataFrame()
cd_data = pd.DataFrame()

# Scrape each inner-district dictionary ('cd' is the city code passed to
# scrap). The per-district frames are kept in a list and stacked once at the
# end: this avoids re-copying all accumulated rows on every iteration, and the
# frames gathered so far stay available in inner_frames if a later scrape fails.
inner_frames = []
for district in inner_districts:
    inner_frames.append(scrap('cd', district))
if inner_frames:
    inner_data = pd.concat(inner_frames)

👻 District: 川师
😈 Total real estate for sale:「375」


川师: 100%|[37m██████████[0m| 13/13 [00:29<00:00,  2.28s/it]


👻 District: 大面
😈 Total real estate for sale:「2378」


大面: 100%|[37m██████████[0m| 80/80 [02:44<00:00,  2.05s/it]


👻 District: 东大路
😈 Total real estate for sale:「257」


东大路: 100%|[37m██████████[0m| 9/9 [00:19<00:00,  2.17s/it]


👻 District: 东光小区
😈 Total real estate for sale:「241」


东光小区: 100%|[37m██████████[0m| 9/9 [00:19<00:00,  2.12s/it]


👻 District: 东湖
😈 Total real estate for sale:「621」


东湖: 100%|[37m██████████[0m| 21/21 [00:43<00:00,  2.05s/it]


👻 District: 东客站
😈 Total real estate for sale:「276」


东客站: 100%|[37m██████████[0m| 10/10 [00:20<00:00,  2.00s/it]


👻 District: 合江亭
😈 Total real estate for sale:「1140」


合江亭: 100%|[37m██████████[0m| 38/38 [01:19<00:00,  2.09s/it]


👻 District: 红星路
😈 Total real estate for sale:「420」


红星路: 100%|[37m██████████[0m| 14/14 [00:31<00:00,  2.22s/it]


👻 District: 静居寺
😈 Total real estate for sale:「305」


静居寺: 100%|[37m██████████[0m| 11/11 [00:22<00:00,  2.03s/it]


👻 District: 九眼桥
😈 Total real estate for sale:「600」


九眼桥: 100%|[37m██████████[0m| 20/20 [00:44<00:00,  2.21s/it]


👻 District: 蓝谷地
😈 Total real estate for sale:「290」


蓝谷地: 100%|[37m██████████[0m| 10/10 [00:21<00:00,  2.17s/it]


👻 District: 莲花
😈 Total real estate for sale:「346」


莲花: 100%|[37m██████████[0m| 12/12 [00:26<00:00,  2.18s/it]


👻 District: 琉璃场
😈 Total real estate for sale:「344」


琉璃场: 100%|[37m██████████[0m| 12/12 [00:26<00:00,  2.18s/it]


👻 District: 攀成钢
😈 Total real estate for sale:「705」


攀成钢: 100%|[37m██████████[0m| 24/24 [00:52<00:00,  2.19s/it]


👻 District: 三官堂
😈 Total real estate for sale:「412」


三官堂: 100%|[37m██████████[0m| 14/14 [00:33<00:00,  2.41s/it]


👻 District: 三圣乡
😈 Total real estate for sale:「1737」


三圣乡: 100%|[37m██████████[0m| 58/58 [02:19<00:00,  2.40s/it]


👻 District: 沙河堡
😈 Total real estate for sale:「643」


沙河堡: 100%|[37m██████████[0m| 22/22 [00:54<00:00,  2.48s/it]


👻 District: 水碾河
😈 Total real estate for sale:「320」


水碾河: 100%|[37m██████████[0m| 11/11 [00:24<00:00,  2.26s/it]


👻 District: 盐市口
😈 Total real estate for sale:「981」


盐市口: 100%|[37m██████████[0m| 33/33 [01:21<00:00,  2.47s/it]


👻 District: 卓锦城
😈 Total real estate for sale:「783」


卓锦城: 100%|[37m██████████[0m| 27/27 [01:07<00:00,  2.49s/it]


👻 District: 八宝街
😈 Total real estate for sale:「803」


八宝街: 100%|[37m██████████[0m| 27/27 [00:55<00:00,  2.06s/it]


👻 District: 贝森
😈 Total real estate for sale:「522」


贝森: 100%|[37m██████████[0m| 18/18 [00:36<00:00,  2.05s/it]


👻 District: 草市街
😈 Total real estate for sale:「712」


草市街: 100%|[37m██████████[0m| 24/24 [00:49<00:00,  2.05s/it]


👻 District: 草堂
😈 Total real estate for sale:「282」


草堂: 100%|[37m██████████[0m| 10/10 [00:19<00:00,  1.99s/it]


👻 District: 府南新区
😈 Total real estate for sale:「1061」


府南新区: 100%|[37m██████████[0m| 36/36 [01:13<00:00,  2.05s/it]


👻 District: 光华泡小
😈 Total real estate for sale:「597」


光华泡小: 100%|[37m██████████[0m| 20/20 [00:41<00:00,  2.08s/it]


👻 District: 浣花溪
😈 Total real estate for sale:「157」


浣花溪: 100%|[37m██████████[0m| 6/6 [00:11<00:00,  1.90s/it]


👻 District: 金沙
😈 Total real estate for sale:「977」


金沙: 100%|[37m██████████[0m| 33/33 [01:10<00:00,  2.13s/it]


👻 District: 宽窄巷子
😈 Total real estate for sale:「467」


宽窄巷子: 100%|[37m██████████[0m| 16/16 [00:32<00:00,  2.04s/it]


👻 District: 人民公园
😈 Total real estate for sale:「618」


人民公园: 100%|[37m██████████[0m| 21/21 [00:46<00:00,  2.19s/it]


👻 District: 蜀汉路
😈 Total real estate for sale:「425」


蜀汉路: 100%|[37m██████████[0m| 15/15 [00:34<00:00,  2.33s/it]


👻 District: 太升路
😈 Total real estate for sale:「725」


太升路: 100%|[37m██████████[0m| 25/25 [00:53<00:00,  2.14s/it]


👻 District: 外光华
😈 Total real estate for sale:「1506」


外光华: 100%|[37m██████████[0m| 51/51 [01:54<00:00,  2.24s/it]


👻 District: 外金沙
😈 Total real estate for sale:「746」


外金沙: 100%|[37m██████████[0m| 25/25 [00:58<00:00,  2.34s/it]


👻 District: 万家湾
😈 Total real estate for sale:「486」


万家湾: 100%|[37m██████████[0m| 17/17 [00:39<00:00,  2.30s/it]


👻 District: 西南财大
😈 Total real estate for sale:「458」


西南财大: 100%|[37m██████████[0m| 16/16 [00:37<00:00,  2.35s/it]


👻 District: 盐市口
😈 Total real estate for sale:「981」


盐市口: 100%|[37m██████████[0m| 33/33 [01:22<00:00,  2.49s/it]


👻 District: 优品道
😈 Total real estate for sale:「253」


优品道: 100%|[37m██████████[0m| 9/9 [00:22<00:00,  2.45s/it]


👻 District: 草金立交
😈 Total real estate for sale:「669」


草金立交: 100%|[37m██████████[0m| 23/23 [00:46<00:00,  2.01s/it]


👻 District: 川大
😈 Total real estate for sale:「338」


川大: 100%|[37m██████████[0m| 12/12 [00:24<00:00,  2.06s/it]


👻 District: 川音
😈 Total real estate for sale:「574」


川音: 100%|[37m██████████[0m| 20/20 [00:42<00:00,  2.10s/it]


👻 District: 簇桥
😈 Total real estate for sale:「406」


簇桥: 100%|[37m██████████[0m| 14/14 [00:28<00:00,  2.03s/it]


👻 District: 高升桥
😈 Total real estate for sale:「318」


高升桥: 100%|[37m██████████[0m| 11/11 [00:24<00:00,  2.23s/it]


👻 District: 广福桥
😈 Total real estate for sale:「163」


广福桥: 100%|[37m██████████[0m| 6/6 [00:15<00:00,  2.65s/it]


👻 District: 航空路
😈 Total real estate for sale:「452」


航空路: 100%|[37m██████████[0m| 16/16 [00:35<00:00,  2.21s/it]


👻 District: 红牌楼
😈 Total real estate for sale:「741」


红牌楼: 100%|[37m██████████[0m| 25/25 [00:59<00:00,  2.36s/it]


👻 District: 华西
😈 Total real estate for sale:「283」


华西: 100%|[37m██████████[0m| 10/10 [00:23<00:00,  2.37s/it]


👻 District: 火车南站
😈 Total real estate for sale:「356」


火车南站: 100%|[37m██████████[0m| 12/12 [00:26<00:00,  2.20s/it]


👻 District: 丽都
😈 Total real estate for sale:「559」


丽都: 100%|[37m██████████[0m| 19/19 [00:42<00:00,  2.24s/it]


👻 District: 龙湾
😈 Total real estate for sale:「254」


龙湾: 100%|[37m██████████[0m| 9/9 [00:18<00:00,  2.09s/it]


👻 District: 双楠
😈 Total real estate for sale:「772」


双楠: 100%|[37m██████████[0m| 26/26 [00:57<00:00,  2.23s/it]


👻 District: 桐梓林
😈 Total real estate for sale:「532」


桐梓林: 100%|[37m██████████[0m| 18/18 [00:42<00:00,  2.34s/it]


👻 District: 外双楠
😈 Total real estate for sale:「829」


外双楠: 100%|[37m██████████[0m| 28/28 [01:04<00:00,  2.31s/it]


👻 District: 五大花园
😈 Total real estate for sale:「1018」


五大花园: 100%|[37m██████████[0m| 34/34 [01:18<00:00,  2.32s/it]


👻 District: 武侯祠
😈 Total real estate for sale:「75」


武侯祠: 100%|[37m██████████[0m| 3/3 [00:06<00:00,  2.20s/it]


👻 District: 武侯立交
😈 Total real estate for sale:「586」


武侯立交: 100%|[37m██████████[0m| 20/20 [00:48<00:00,  2.42s/it]


👻 District: 新双楠
😈 Total real estate for sale:「378」


新双楠: 100%|[37m██████████[0m| 13/13 [00:31<00:00,  2.40s/it]


👻 District: 玉林
😈 Total real estate for sale:「292」


玉林: 100%|[37m██████████[0m| 10/10 [00:26<00:00,  2.62s/it]


👻 District: 棕北
😈 Total real estate for sale:「661」


棕北: 100%|[37m██████████[0m| 23/23 [00:53<00:00,  2.32s/it]


👻 District: 八宝街
😈 Total real estate for sale:「803」


八宝街: 100%|[37m██████████[0m| 27/27 [00:58<00:00,  2.17s/it]


👻 District: 茶店子
😈 Total real estate for sale:「787」


茶店子: 100%|[37m██████████[0m| 27/27 [00:59<00:00,  2.21s/it]


👻 District: 抚琴小区
😈 Total real estate for sale:「490」


抚琴小区: 100%|[37m██████████[0m| 17/17 [00:37<00:00,  2.23s/it]


👻 District: 高家庄
😈 Total real estate for sale:「773」


高家庄: 100%|[37m██████████[0m| 26/26 [01:03<00:00,  2.43s/it]


👻 District: 国宾
😈 Total real estate for sale:「699」


国宾: 100%|[37m██████████[0m| 24/24 [00:58<00:00,  2.43s/it]


👻 District: 花牌坊
😈 Total real estate for sale:「861」


花牌坊: 100%|[37m██████████[0m| 29/29 [01:10<00:00,  2.41s/it]


👻 District: 华侨城
😈 Total real estate for sale:「313」


华侨城: 100%|[37m██████████[0m| 11/11 [00:26<00:00,  2.39s/it]


👻 District: 花照壁
😈 Total real estate for sale:「410」


花照壁: 100%|[37m██████████[0m| 14/14 [00:31<00:00,  2.23s/it]


👻 District: 金府
😈 Total real estate for sale:「388」


金府: 100%|[37m██████████[0m| 13/13 [00:34<00:00,  2.65s/it]


👻 District: 金牛万达
😈 Total real estate for sale:「1271」


金牛万达: 100%|[37m██████████[0m| 43/43 [01:39<00:00,  2.31s/it]


👻 District: 九里堤
😈 Total real estate for sale:「254」


九里堤: 100%|[37m██████████[0m| 9/9 [00:21<00:00,  2.38s/it]


👻 District: 马鞍路
😈 Total real estate for sale:「535」


马鞍路: 100%|[37m██████████[0m| 18/18 [00:43<00:00,  2.41s/it]


👻 District: 沙湾
😈 Total real estate for sale:「740」


沙湾: 100%|[37m██████████[0m| 25/25 [01:02<00:00,  2.49s/it]


👻 District: 石人小区
😈 Total real estate for sale:「238」


石人小区: 100%|[37m██████████[0m| 8/8 [00:21<00:00,  2.74s/it]


👻 District: 蜀汉路
😈 Total real estate for sale:「425」


蜀汉路: 100%|[37m██████████[0m| 15/15 [00:35<00:00,  2.34s/it]


👻 District: 天回镇
😈 Total real estate for sale:「211」


天回镇: 100%|[37m██████████[0m| 8/8 [00:19<00:00,  2.46s/it]


👻 District: 通惠门
😈 Total real estate for sale:「332」


通惠门: 100%|[37m██████████[0m| 12/12 [00:28<00:00,  2.35s/it]


👻 District: 五块石
😈 Total real estate for sale:「971」


五块石: 100%|[37m██████████[0m| 33/33 [01:23<00:00,  2.53s/it]


👻 District: 西南交大
😈 Total real estate for sale:「346」


西南交大: 100%|[37m██████████[0m| 12/12 [00:29<00:00,  2.45s/it]


👻 District: 营门口
😈 Total real estate for sale:「654」


营门口: 100%|[37m██████████[0m| 22/22 [00:56<00:00,  2.58s/it]


👻 District: 一品天下
😈 Total real estate for sale:「650」


一品天下: 100%|[37m██████████[0m| 22/22 [00:54<00:00,  2.50s/it]


👻 District: 城南宜家
😈 Total real estate for sale:「262」


城南宜家: 100%|[37m██████████[0m| 9/9 [00:19<00:00,  2.18s/it]


👻 District: 大源
😈 Total real estate for sale:「1747」


大源: 100%|[37m██████████[0m| 59/59 [02:12<00:00,  2.25s/it]


👻 District: 东苑
😈 Total real estate for sale:「329」


东苑: 100%|[37m██████████[0m| 11/11 [00:24<00:00,  2.23s/it]


👻 District: 芳草
😈 Total real estate for sale:「421」


芳草: 100%|[37m██████████[0m| 15/15 [00:37<00:00,  2.49s/it]


👻 District: 高朋
😈 Total real estate for sale:「54」


高朋: 100%|[37m██████████[0m| 2/2 [00:04<00:00,  2.28s/it]


👻 District: 广都
😈 Total real estate for sale:「850」


广都: 100%|[37m██████████[0m| 29/29 [01:06<00:00,  2.28s/it]


👻 District: 华府
😈 Total real estate for sale:「1071」


华府: 100%|[37m██████████[0m| 36/36 [01:22<00:00,  2.30s/it]


👻 District: 华阳
😈 Total real estate for sale:「1807」


华阳: 100%|[37m██████████[0m| 61/61 [02:20<00:00,  2.31s/it]


👻 District: 金融城
😈 Total real estate for sale:「1043」


金融城: 100%|[37m██████████[0m| 35/35 [01:16<00:00,  2.19s/it]


👻 District: 丽都
😈 Total real estate for sale:「559」


丽都: 100%|[37m██████████[0m| 19/19 [00:43<00:00,  2.28s/it]


👻 District: 神仙树
😈 Total real estate for sale:「558」


神仙树: 100%|[37m██████████[0m| 19/19 [00:47<00:00,  2.52s/it]


👻 District: 市一医院
😈 Total real estate for sale:「717」


市一医院: 100%|[37m██████████[0m| 24/24 [01:00<00:00,  2.53s/it]


👻 District: 天府长城
😈 Total real estate for sale:「248」


天府长城: 100%|[37m██████████[0m| 9/9 [00:25<00:00,  2.84s/it]


👻 District: 新北
😈 Total real estate for sale:「330」


新北: 100%|[37m██████████[0m| 11/11 [00:29<00:00,  2.68s/it]


👻 District: 新会展
😈 Total real estate for sale:「1021」


新会展: 100%|[37m██████████[0m| 35/35 [01:29<00:00,  2.55s/it]


👻 District: 新双楠
😈 Total real estate for sale:「378」


新双楠: 100%|[37m██████████[0m| 13/13 [00:32<00:00,  2.51s/it]


👻 District: 衣冠庙
😈 Total real estate for sale:「393」


衣冠庙: 100%|[37m██████████[0m| 14/14 [00:38<00:00,  2.72s/it]


👻 District: 远大
😈 Total real estate for sale:「600」


远大: 100%|[37m██████████[0m| 20/20 [00:51<00:00,  2.57s/it]


👻 District: 中德
😈 Total real estate for sale:「556」


中德: 100%|[37m██████████[0m| 19/19 [00:51<00:00,  2.72s/it]


👻 District: 中和
😈 Total real estate for sale:「978」


中和: 100%|[37m██████████[0m| 33/33 [01:37<00:00,  2.95s/it]


👻 District: 紫荆
😈 Total real estate for sale:「423」


紫荆: 100%|[37m██████████[0m| 15/15 [00:37<00:00,  2.53s/it]


👻 District: 八里小区
😈 Total real estate for sale:「1154」


八里小区: 100%|[37m██████████[0m| 39/39 [01:29<00:00,  2.28s/it]


👻 District: 成渝立交
😈 Total real estate for sale:「576」


成渝立交: 100%|[37m██████████[0m| 20/20 [00:45<00:00,  2.26s/it]


👻 District: 东郊记忆
😈 Total real estate for sale:「502」


东郊记忆: 100%|[37m██████████[0m| 17/17 [00:42<00:00,  2.51s/it]


👻 District: 东客站
😈 Total real estate for sale:「276」


东客站: 100%|[37m██████████[0m| 10/10 [00:23<00:00,  2.37s/it]


👻 District: 动物园
😈 Total real estate for sale:「850」


动物园: 100%|[37m██████████[0m| 29/29 [01:10<00:00,  2.41s/it]


👻 District: 建设路
😈 Total real estate for sale:「508」


建设路: 100%|[37m██████████[0m| 17/17 [00:40<00:00,  2.40s/it]


👻 District: 理工大
😈 Total real estate for sale:「456」


理工大: 100%|[37m██████████[0m| 16/16 [00:36<00:00,  2.25s/it]


👻 District: 李家沱
😈 Total real estate for sale:「551」


李家沱: 100%|[37m██████████[0m| 19/19 [00:40<00:00,  2.13s/it]


👻 District: 龙潭寺
😈 Total real estate for sale:「629」


龙潭寺: 100%|[37m██████████[0m| 21/21 [00:47<00:00,  2.25s/it]


👻 District: 猛追湾
😈 Total real estate for sale:「699」


猛追湾: 100%|[37m██████████[0m| 24/24 [00:59<00:00,  2.46s/it]


👻 District: 沙河堡
😈 Total real estate for sale:「643」


沙河堡: 100%|[37m██████████[0m| 22/22 [00:49<00:00,  2.24s/it]


👻 District: 驷马桥
😈 Total real estate for sale:「1700」


驷马桥: 100%|[37m██████████[0m| 57/57 [02:20<00:00,  2.47s/it]


👻 District: SM广场
😈 Total real estate for sale:「325」


SM广场: 100%|[37m██████████[0m| 11/11 [00:29<00:00,  2.72s/it]


👻 District: 万年场
😈 Total real estate for sale:「1272」


万年场: 100%|[37m██████████[0m| 43/43 [01:51<00:00,  2.58s/it]


👻 District: 万象城
😈 Total real estate for sale:「930」


万象城: 100%|[37m██████████[0m| 31/31 [01:21<00:00,  2.64s/it]


👻 District: 新华公园
😈 Total real estate for sale:「777」


新华公园: 100%|[37m██████████[0m| 26/26 [01:12<00:00,  2.78s/it]


In [132]:
# Scrape each outer-district dictionary and stack the results once at the end.
# The frame is rebuilt from a fresh list, so re-running this cell replaces
# outer_data instead of appending a second copy of every row to it.
outer_frames = []
for district in outer_districts:
    outer_frames.append(scrap('cd', district))
outer_data = pd.concat(outer_frames) if outer_frames else pd.DataFrame()

👻 District: 海洋公园
😈 Total real estate for sale:「770」


海洋公园: 100%|[37m██████████[0m| 26/26 [01:01<00:00,  2.36s/it]


👻 District: 华阳
😈 Total real estate for sale:「1807」


华阳: 100%|[37m██████████[0m| 61/61 [02:31<00:00,  2.48s/it]


👻 District: 锦江生态带
😈 Total real estate for sale:「588」


锦江生态带: 100%|[37m██████████[0m| 20/20 [00:48<00:00,  2.41s/it]


👻 District: 麓湖生态城
😈 Total real estate for sale:「96」


麓湖生态城: 100%|[37m██████████[0m| 4/4 [00:08<00:00,  2.18s/it]


👻 District: 麓山
😈 Total real estate for sale:「826」


麓山: 100%|[37m██████████[0m| 28/28 [01:06<00:00,  2.36s/it]


👻 District: 南湖
😈 Total real estate for sale:「2388」


南湖: 100%|[37m██████████[0m| 80/80 [03:32<00:00,  2.65s/it]


👻 District: 仁寿
😈 Total real estate for sale:「5」


仁寿: 100%|[37m██████████[0m| 1/1 [00:02<00:00,  2.01s/it]


👻 District: 四河
😈 Total real estate for sale:「186」


四河: 100%|[37m██████████[0m| 7/7 [00:17<00:00,  2.50s/it]


👻 District: 新会展
😈 Total real estate for sale:「1021」


新会展: 100%|[37m██████████[0m| 35/35 [01:38<00:00,  2.83s/it]


👻 District: 雅居乐
😈 Total real estate for sale:「318」


雅居乐: 100%|[37m██████████[0m| 11/11 [00:29<00:00,  2.70s/it]


👻 District: 高新西
😈 Total real estate for sale:「1278」


高新西: 100%|[37m██████████[0m| 43/43 [01:40<00:00,  2.35s/it]


👻 District: 中海国际
😈 Total real estate for sale:「797」


中海国际: 100%|[37m██████████[0m| 27/27 [01:04<00:00,  2.40s/it]


👻 District: 簇桥
😈 Total real estate for sale:「406」


簇桥: 100%|[37m██████████[0m| 14/14 [00:32<00:00,  2.32s/it]


👻 District: 东升镇
😈 Total real estate for sale:「1269」


东升镇: 100%|[37m██████████[0m| 43/43 [01:40<00:00,  2.33s/it]


👻 District: 公兴
😈 Total real estate for sale:「334」


公兴: 100%|[37m██████████[0m| 12/12 [00:26<00:00,  2.19s/it]


👻 District: 航空港
😈 Total real estate for sale:「2675」


航空港: 100%|[37m██████████[0m| 90/90 [03:46<00:00,  2.52s/it]


👻 District: 蛟龙港
😈 Total real estate for sale:「1680」


蛟龙港: 100%|[37m██████████[0m| 56/56 [02:24<00:00,  2.58s/it]


👻 District: 警院
😈 Total real estate for sale:「109」


警院: 100%|[37m██████████[0m| 4/4 [00:08<00:00,  2.23s/it]


👻 District: 九龙湖
😈 Total real estate for sale:「73」


九龙湖: 100%|[37m██████████[0m| 3/3 [00:06<00:00,  2.22s/it]


👻 District: 牧马山
😈 Total real estate for sale:「341」


牧马山: 100%|[37m██████████[0m| 12/12 [00:37<00:00,  3.16s/it]


👻 District: 双流城区
😈 Total real estate for sale:「510」


双流城区: 100%|[37m██████████[0m| 17/17 [00:44<00:00,  2.64s/it]


👻 District: 文星镇
😈 Total real estate for sale:「440」


文星镇: 100%|[37m██████████[0m| 15/15 [00:40<00:00,  2.70s/it]


👻 District: 新津
😈 Total real estate for sale:「2925」


新津: 100%|[37m██████████[0m| 98/98 [04:28<00:00,  2.74s/it]


👻 District: 芙蓉古城
😈 Total real estate for sale:「497」


芙蓉古城: 100%|[37m██████████[0m| 17/17 [00:44<00:00,  2.61s/it]


👻 District: 光华大道沿线
😈 Total real estate for sale:「2027」


光华大道沿线: 100%|[37m██████████[0m| 68/68 [02:44<00:00,  2.42s/it]


👻 District: 国色天乡
😈 Total real estate for sale:「1035」


国色天乡: 100%|[37m██████████[0m| 35/35 [01:38<00:00,  2.80s/it]


👻 District: 花都大道
😈 Total real estate for sale:「1009」


花都大道: 100%|[37m██████████[0m| 34/34 [01:29<00:00,  2.62s/it]


👻 District: 温江大学城
😈 Total real estate for sale:「2362」


温江大学城: 100%|[37m██████████[0m| 79/79 [03:23<00:00,  2.57s/it]


👻 District: 温江老城
😈 Total real estate for sale:「1837」


温江老城: 100%|[37m██████████[0m| 62/62 [02:39<00:00,  2.58s/it]


👻 District: 温江新城
😈 Total real estate for sale:「878」


温江新城: 100%|[37m██████████[0m| 30/30 [01:20<00:00,  2.67s/it]


👻 District: 珠江新城
😈 Total real estate for sale:「586」


珠江新城: 100%|[37m██████████[0m| 20/20 [00:59<00:00,  2.97s/it]


👻 District: 成外
😈 Total real estate for sale:「1320」


成外: 100%|[37m██████████[0m| 44/44 [01:47<00:00,  2.44s/it]


👻 District: 红光
😈 Total real estate for sale:「1068」


红光: 100%|[37m██████████[0m| 36/36 [01:29<00:00,  2.49s/it]


👻 District: 郫县城区
😈 Total real estate for sale:「3502」


郫县城区: 100%|[37m██████████[0m| 100/100 [03:56<00:00,  2.36s/it]


👻 District: 郫县万达
😈 Total real estate for sale:「1110」


郫县万达: 100%|[37m██████████[0m| 37/37 [01:36<00:00,  2.60s/it]


👻 District: 橡树湾
😈 Total real estate for sale:「958」


橡树湾: 100%|[37m██████████[0m| 32/32 [01:21<00:00,  2.56s/it]


👻 District: 犀浦
😈 Total real estate for sale:「2748」


犀浦: 100%|[37m██████████[0m| 92/92 [03:59<00:00,  2.60s/it]


👻 District: 东山
😈 Total real estate for sale:「532」


东山: 100%|[37m██████████[0m| 18/18 [00:39<00:00,  2.19s/it]


👻 District: 航天
😈 Total real estate for sale:「737」


航天: 100%|[37m██████████[0m| 25/25 [00:57<00:00,  2.30s/it]


👻 District: 洪河
😈 Total real estate for sale:「485」


洪河: 100%|[37m██████████[0m| 17/17 [00:40<00:00,  2.36s/it]


👻 District: 龙泉驿城区
😈 Total real estate for sale:「3186」


龙泉驿城区: 100%|[37m██████████[0m| 100/100 [04:08<00:00,  2.48s/it]


👻 District: 十陵
😈 Total real estate for sale:「520」


十陵: 100%|[37m██████████[0m| 18/18 [00:46<00:00,  2.58s/it]


👻 District: 西河
😈 Total real estate for sale:「494」


西河: 100%|[37m██████████[0m| 17/17 [00:40<00:00,  2.37s/it]


👻 District: 阳光城
😈 Total real estate for sale:「1188」


阳光城: 100%|[37m██████████[0m| 40/40 [01:41<00:00,  2.53s/it]


👻 District: 保利公园
😈 Total real estate for sale:「1057」


保利公园: 100%|[37m██████████[0m| 36/36 [01:28<00:00,  2.46s/it]


👻 District: 大丰
😈 Total real estate for sale:「2134」


大丰: 100%|[37m██████████[0m| 72/72 [03:02<00:00,  2.54s/it]


👻 District: 毗河
😈 Total real estate for sale:「1294」


毗河: 100%|[37m██████████[0m| 44/44 [01:50<00:00,  2.51s/it]


👻 District: 新都城区
😈 Total real estate for sale:「2784」


新都城区: 100%|[37m██████████[0m| 93/93 [03:51<00:00,  2.49s/it]


👻 District: 青白江
😈 Total real estate for sale:「2687」


青白江: 100%|[37m██████████[0m| 90/90 [03:46<00:00,  2.52s/it]


In [139]:
# Stack the outer- and inner-district listings row-wise into one table.
combined_frames = [outer_data, inner_data]
cd_data = pd.concat(combined_frames, axis=0)

In [140]:
# Save the combined table; the row index is not written to the file.
cd_data.to_csv(path_or_buf='cd_raw.csv', index=False)

In [141]:
# Save the inner- and outer-district tables separately, without the row index.
for frame, csv_path in ((inner_data, 'cd_inner_raw.csv'),
                        (outer_data, 'cd_outer_raw.csv')):
    frame.to_csv(csv_path, index=False)