In [1]:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import json

In [2]:
def get_res(url, proxy=False, stream=True, text=True, params=None):
    """Download ``url`` with automatic retries and return the response body.

    Parameters
    ----------
    url : str
        Address to request with HTTP GET.
    proxy : bool, default False
        When true, route the request through the hard-coded proxy below and
        print which proxy is used.
    stream : bool, default True
        Forwarded to ``session.get``. The body is always read in full before
        this function returns, so the flag does not change the result.
    text : bool, default True
        Return ``res.text`` when true, ``res.content`` otherwise.
    params : dict, optional
        Query-string parameters forwarded to ``session.get``.

    Returns
    -------
    str or bytes
        ``res.text`` or ``res.content`` depending on ``text``.

    Notes
    -----
    Nothing is caught here: any exception raised by ``session.get`` (for
    example after the retries are exhausted) propagates to the caller.
    """
    headers_setting = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
    proxy_setting = {
        'http':'http://119.28.222.122:8888',
        'https':'https://119.28.222.122:8888'
    }

    retries = Retry(total=10,
                    backoff_factor=0.1,
                    status_forcelist=[403, 404, 500, 502, 503, 504])

    request_kwargs = {
        'headers': headers_setting,
        'timeout': 40,  # a single number applies to both connect and read
        'stream': stream,
        'params': params,
    }
    if proxy:
        print('Use proxy %s' % proxy_setting['http'])
        # The original referenced ``self.proxy_setting``; there is no ``self``
        # in a plain function, so the proxy branch always raised NameError.
        request_kwargs['proxies'] = proxy_setting

    with requests.Session() as session:
        adapter = HTTPAdapter(max_retries=retries)
        # Apply the retry policy to both schemes, not only plain HTTP.
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        res = session.get(url, **request_kwargs)
        # Read the body while the session (and its connection pool) is still
        # open; with stream=True the payload is fetched lazily.
        return res.text if text else res.content


def get_json(wd, city, pn):
    """Query the Baidu Map search endpoint and return the decoded JSON reply.

    ``wd``, ``city`` and ``pn`` are converted with ``str`` and substituted for
    the ``@@wd``, ``@@city`` and ``@@pn`` placeholders of the URL template, in
    that order. The page is downloaded with ``get_res`` and parsed with
    ``json.loads``.
    """
    template = 'http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=direct&pcevaname=pc4.1&qt=s&da_src=searchBox.button&wd=@@wd&c=@@city&pn=@@pn'
    substitutions = (('@@wd', wd), ('@@city', city), ('@@pn', pn))

    url = template
    for placeholder, value in substitutions:
        url = url.replace(placeholder, str(value))

    payload = get_res(url)
    return json.loads(payload)


def get_menu(dict_):
    """Build a menu table from one search-result record.

    The menu is read from
    ``dict_['ext']['detail_info']['business_scope'][0]['content']``, which is
    expected to be a list of dicts (one per dish).

    Parameters
    ----------
    dict_ : dict
        One entry of the ``'content'`` list of a search response.

    Returns
    -------
    pandas.DataFrame or None
        A frame with the columns ``name``, ``price`` and ``recommend_num``
        sorted by ``price`` in descending order; a key missing from a dish
        becomes ``None``. ``None`` is returned when the record has no menu
        (missing path, or an empty / ``None`` menu list).
    """
    menu_col = ['name', 'price', 'recommend_num']
    try:
        menu_list = dict_['ext']['detail_info']['business_scope'][0]['content']
    except (KeyError, IndexError, TypeError):
        # A level of the nested path is absent or has an unexpected type.
        # Only lookup failures are swallowed, so KeyboardInterrupt and
        # programming errors are no longer hidden by a bare ``except``.
        return None

    if not menu_list:
        return None

    all_foods = [[food.get(col) for col in menu_col] for food in menu_list]
    return pd.DataFrame(all_foods, columns=menu_col).sort_values('price', ascending=False)


def _dig(record, *keys):
    """Follow ``keys`` through nested containers; return None if any step fails."""
    node = record
    try:
        for key in keys:
            node = node[key]
    except (KeyError, IndexError, TypeError):
        return None
    return node


def _to_float(value, default):
    """Convert ``value`` with ``float``; return ``default`` when that is impossible."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def get_all_hotpots(res_json, wd=None, city=None, pn=None):
    """Download every result page of a search and return one tidy DataFrame.

    Parameters
    ----------
    res_json : object
        Not used. Kept so that existing ``get_all_hotpots(res_json)`` calls
        keep working; every page is fetched here with ``get_json``.
    wd, city : optional
        Search keyword and city passed to ``get_json``.
    pn : int, optional
        Maximum number of pages to request (pages ``0 .. pn-1``). Paging stops
        early at the first response without a ``'content'`` key.

    When ``wd``, ``city`` or ``pn`` is omitted, the module-level variable of
    the same name is used, which is how the original version obtained them.
    A ``NameError`` is raised if that variable does not exist.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``name``, sorted by ``avg_price`` (descending)
        then ``comment_num`` (ascending), with a fresh 0..n-1 index. The
        ``menu`` column holds the value returned by ``get_menu`` for the row.
    """
    def _resolve(var_name, value):
        # Explicit arguments win; otherwise fall back to the notebook global.
        if value is not None:
            return value
        try:
            return globals()[var_name]
        except KeyError:
            raise NameError("name %r is not defined" % var_name) from None

    wd = _resolve('wd', wd)
    city = _resolve('city', city)
    pn = _resolve('pn', pn)

    col_list = ['name', 'address_norm', 'aoi', 'area_name', 'alias', 'tel',
                'di_tag', 'menu', 'avg_price', 'comment_num', 'navigation']
    # The last four columns are derived below; the others are copied verbatim.
    plain_cols = col_list[:-4]

    all_values = []
    for p in range(pn):
        page = get_json(wd, city, p)
        if 'content' not in page:
            break
        for dict_ in page['content']:
            values = [dict_.get(col) for col in plain_cols]
            price = _to_float(_dig(dict_, 'ext', 'detail_info', 'price'), None)
            comment_num = _to_float(_dig(dict_, 'ext', 'detail_info', 'comment_num'), 0.)
            navi = _dig(dict_, 'ext', 'detail_info', 'navi_xy', 'diPoint')
            all_values.append(values + [get_menu(dict_), price, comment_num, navi])

    print('Total raw data: %s' % str(len(all_values)))
    df = pd.DataFrame(all_values, columns=col_list).drop_duplicates('name')
    df = df[['name', 'comment_num', 'avg_price', 'aoi', 'area_name', 'address_norm',
             'navigation', 'di_tag', 'tel', 'alias', 'menu']]
    df = df.sort_values(['avg_price', 'comment_num'], ascending=[False, True])
    print('Drop dupliacted data: %s' % str(len(all_values) - len(df)))
    print('%s rows left' % str(len(df)))

    # drop=True discards the old index instead of adding and then removing it.
    return df.reset_index(drop=True)

In [3]:
# Search keyword ("hot pot") and city ("Nanjing"). When get_all_hotpots is
# called with only res_json, it reads wd, city and pn from these module-level
# names, so they must be defined before the call.
wd = '火锅'
city = '南京'
# Upper bound on the number of result pages; paging stops at the first
# response that has no 'content' key.
pn = 9999
# NOTE(review): this requests page index 9999 only. get_all_hotpots does not
# use the res_json it is given (it fetches each page itself), so this extra
# request looks redundant — confirm before removing.
res_json = get_json(wd, city, pn)
df = get_all_hotpots(res_json)
df.head()

Total raw data: 574
Drop dupliacted data: 61
513 rows left


Unnamed: 0,name,comment_num,avg_price,aoi,area_name,address_norm,navigation,di_tag,tel,alias,menu
0,蓝宝湾8号,5.0,300.0,百家湖,南京市江宁区,[江苏省(320000)|PROV|0|][南京市(320100)|CITY|1|][江宁区...,"{'x': 13227556.75, 'y': 3732142.71}",美食 中餐馆 火锅 餐馆,(025)58679718,,name price recommend_num 0 火山...
1,食辣四川秘制,11.0,147.0,,南京市建邺区,[江苏省(320000)|PROV|0|][南京市(320100)|CITY|0|][建邺区...,,美食 中餐馆 火锅 川味火锅 餐馆,,"[食辣秘制重庆火锅, 食辣鱼头火锅]",
2,炎风阁火锅,2.0,143.0,夫子庙,南京市秦淮区,[江苏省(320000)|PROV|0|][南京市(320100)|CITY|0|][秦淮区...,,美食 中餐馆 火锅 餐馆,,,
3,捞王锅物料理(张府园店),3.0,139.0,,南京市白下区,[江苏省(320000)|PROV|0|][南京市(320100)|CITY|0|][秦淮区...,"{'x': 13223745.34, 'y': 3745221.36}",美食 中餐馆 火锅 餐馆,(025)86806946,"[捞王, 捞王锅物料理(张府园店), 捞王火锅]",name price recommend_num 0 胡椒猪肚鸡 ...
4,小龙坎南京1912店,2.0,138.0,珠江路,南京市玄武区,[江苏省(320000)|PROV|0|][南京市(320100)|CITY|0|][玄武区...,,美食 中餐馆 火锅 川味火锅 餐馆,(025)84451212,"[小龙坎(南京旗舰总店), 小龙坎老火锅, 小龙坎老火锅(1912店), 小龙坎重庆老火锅]",


In [4]:
# Number of rows that have a menu (get_menu returns None when there is none).
len(df['menu'].dropna())

262

In [5]:
# Show the menu DataFrame stored in the 'menu' cell of the row labelled 3.
df.at[3, 'menu']

Unnamed: 0,name,price,recommend_num
0,胡椒猪肚鸡,88.0,99
10,胡椒猪肚锅,88.0,6
1,爆浆手打虾丸,56.0,65
15,竹荪虾滑卷,48.0,2
6,雪花牛舌,42.0,16
12,鲜菇杂菌,42.0,5
13,田园大荟萃,42.0,3
5,玫瑰鲷鱼肉,38.0,23
4,马蹄竹蔗水,28.0,33
7,绣球豆腐,25.0,15


In [12]:
# Twice the sum of avg_price over all rows (missing prices are skipped by sum()).
# NOTE(review): the factor 2 is not explained here — presumably a two-person
# estimate; confirm. The execution count (12) also does not follow cell 5.
df['avg_price'].sum()*2

65730.0