# [01]数据获取

## 数据获取方式1 使用API获取数据

In [None]:
!pip install tqdm requests pandas matplotlib

In [5]:
import json
import os
import time

import requests
from tqdm import tqdm

In [9]:
# AMap (Gaode Maps) Web Service API keys; using the API requires applying for a key.
# NOTE(review): keys committed with the notebook are readable by everyone who can open it.
# Set the AMAP_KEYS environment variable (comma-separated) to use your own keys instead;
# the bundled keys below are only the fallback when that variable is unset or empty.
_BUNDLED_KEYS = ["a6d4b3ebaa846c610fad61ec36ffceba",
                 "cca3eba09d3e327449c45cf531e08e7a",
                 "89f0d1fc8a975311789cb1833afcbb9d",
                 "4555f64b2a07e70c17bc756580018e41",
                 "c5d350447d0802b34760e7ba2ea7c1be",
                 "75acd1144e83563fe7442004c46b771c",
                 "0a6c8cc540730b11874e21f866f696f6",
                 "031ac4c04e5da84ec8791bcf1960a58c",
                 "6a7bdadebdc8eb9dc27d120013390a28",
                 "4c0e67a5c3cdbc2950c07fec58f4126c"]
KEYS = [k.strip() for k in os.environ.get("AMAP_KEYS", "").split(",") if k.strip()] or _BUNDLED_KEYS

In [15]:
# Predefined request headers; request_from sends them with every GET.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/51.0.2704.63 Safari/537.36"
    ),
}

In [16]:

def request_from(url, retry_count:int = 5, timeout = 30) -> str:
    """
    Fetch a URL with an HTTP GET request, retrying when an attempt fails.

    An attempt fails when the server answers with a status other than 200 or when
    ``requests`` raises (timeout, connection error, ...).

    :param url: URL to request
    :param retry_count: maximum number of attempts before giving up
    :param timeout: timeout passed to ``requests.get`` for each attempt
    :return: the response body as text, or "" when every attempt failed
    """
    while retry_count > 0:
        try:
            response = requests.get(url, headers=HEADERS, timeout=timeout)
        except requests.RequestException as exc:
            # A network-level error counts as one failed attempt instead of
            # aborting the caller's whole crawl.
            response = None
            print(f"[Warning] 访问 {url} 出现异常: {exc}")
        if response is not None and response.status_code == 200:
            # 200 means success
            return response.text
        retry_count -= 1
        print(f"[Warning] 访问 {url} 失败， 剩余次数{retry_count}")
    # Only reached once all attempts are used up.
    print(f'[Error] 访问{url}失败')
    return ""


In [17]:
def validate_keys(keys:list) -> list:
    """
    Check which of the supplied AMap keys are still usable.

    Each key is tried against a small place/around query. A key is kept when the
    request succeeds and the response's ``infocode`` equals "10000".

    :param keys: keys to validate
    :return: the keys that passed, in the order they were supplied
    """
    key_list = []
    # Iterate over the argument itself (not the module-level KEYS) so that callers
    # can validate any list of keys.
    for key in tqdm(keys, ncols=80, desc="validating keys"):
        url = f"https://restapi.amap.com/v3/place/around?key={key}&location=118,31&types=010000"
        res = request_from(url)
        if res == "":
            continue
        try:
            json_data = json.loads(res)
        except ValueError:
            # A body that is not JSON cannot confirm the key.
            continue
        if isinstance(json_data, dict) and json_data.get('infocode') == "10000":
            key_list.append(key)
    return key_list

In [24]:
import random
def get_key(keys):
    """Return the element of ``keys`` found at a randomly drawn index."""
    return keys[random.randrange(len(keys))]

In [25]:
# Keep only the keys that pass validation; get_poi draws its keys from this list.
valid_keys = validate_keys(KEYS)
print(valid_keys) # keys that are still valid

validating keys: 100%|██████████████████████████| 10/10 [00:01<00:00,  7.20it/s]

['a6d4b3ebaa846c610fad61ec36ffceba', 'cca3eba09d3e327449c45cf531e08e7a', '89f0d1fc8a975311789cb1833afcbb9d', '4555f64b2a07e70c17bc756580018e41', '75acd1144e83563fe7442004c46b771c', '6a7bdadebdc8eb9dc27d120013390a28']





In [46]:
import math
def get_poi(city, radius, types, locations, extensions="base", keys=None):
    """
    Collect POIs around each location, for each POI type, via AMap's place/around API.

    :param city: value sent as the ``city`` query parameter
    :param radius: value sent as the ``radius`` query parameter
    :param types: list of POI type codes; each one is queried separately
    :param locations: list of (lon, lat) pairs, or ready-made "lon,lat" strings
    :param extensions: value sent as the ``extensions`` query parameter
    :param keys: API keys to pick from at random; defaults to the module-level
                 ``valid_keys`` list
    :return: list of comma-separated text rows, one per POI, holding: location index,
             query location, type index, queried type, POI typecode, POI id, name,
             POI location, address, distance, rating
    """
    if keys is None:
        keys = valid_keys

    result = []
    progress = 0
    for i, loc in enumerate(locations):
        # The query string needs "lon,lat" without spaces.
        if isinstance(loc, str):
            location = loc.replace(" ", "")
        else:
            location = ",".join(str(coord) for coord in loc)

        for j, poi_type in enumerate(types):
            key = get_key(keys)  # pick a random key for this query
            url = (f"https://restapi.amap.com/v3/place/around?key={key}&location={location}"
                   f"&types={poi_type}&city={city}&radius={radius}&extensions={extensions}")
            # Share of the overall progress owned by this (location, type) query.
            query_share = 100 / len(locations) / len(types)

            # The first request is only used to learn how many results exist.
            res = request_from(url)
            if res == "":
                # Same policy as for a failed page below: report it and move on.
                print(f"[Error] url: {url}")
                progress += query_share
                continue
            count = int(json.loads(res).get('count', 0))
            # 20 results per page — presumably the API's default page size; confirm
            # against the AMap documentation before changing.
            num_page = math.ceil(count / 20)

            print(f"\n正在获取第{i+1}个点的POI({poi_type}), 共找到{count}个POI， 共{num_page}页")
            if num_page == 0:
                # Nothing to page through; dividing by num_page would raise.
                progress += query_share
                print("\r 总进度："+str(int(progress))+"%",end="")
                continue
            step = query_share / num_page

            # Fetch the results page by page
            for page in range(num_page):
                progress += step
                print("\r 总进度："+str(int(progress))+"%",end="")

                url2 = f"{url}&page={page+1}"
                res = request_from(url2)
                if res == "":
                    print(f"[Error] page:{page} url: {url2}")
                    continue

                for poi in json.loads(res).get('pois') or []:
                    # Commas inside free-text fields would break the comma-separated row.
                    poi_name = str(poi['name']).replace(",", " ")
                    poi_loc = poi['location']
                    poi_id = poi['id']
                    poi_address = str(poi['address']).replace(",", " ")
                    poi_dist = poi['distance']
                    poi_code = poi['typecode']
                    poi_rating = _poi_rating(poi)

                    result.append(f"{i+1}, {location}, {j+1}, {poi_type}, {poi_code}, {poi_id}, {poi_name}, {poi_loc}, {poi_address}, {poi_dist}, {poi_rating}")

    return result


def _poi_rating(poi) -> str:
    """Return a POI's rating as text, or "" when the record carries no rating."""
    biz_ext = poi.get('biz_ext')
    # An empty list stands for "no rating"; a biz_ext that is missing or is not a
    # dict is treated the same way instead of raising.
    rating = biz_ext.get('rating', []) if isinstance(biz_ext, dict) else []
    return "" if rating == [] else str(rating)


In [45]:
# Query parameters for the crawl below.
city = "南京"
lon_list = [118.794748, 118.784136]
lat_list = [32.041717, 32.041806]
# Pair the coordinates up as (lon, lat) tuples, one per query point.
locations = list(zip(lon_list, lat_list))
search_radius = 500  # generally should not exceed 5000 m
# List of POI type codes (longer list kept for reference; presumably disabled to keep the run short)
# types = ["010000", "050000", "060000", "070000", "080000", "090000", "100000", "110000", "120000", "130000",
#         "140000","150000", "160000", "170000"]
types = ["010000", "050000"]

result = get_poi(city=city,radius=search_radius, types=types, locations=locations)


正在获取第1个点的POI(010000), 共找到13个POI， 共1页
 总进度：25%
正在获取第1个点的POI(050000), 共找到621个POI， 共32页
 总进度：32%pass
 总进度：33%pass
 总进度：34%pass
 总进度：35%pass
 总进度：35%pass
 总进度：36%pass
 总进度：37%pass
 总进度：38%pass
 总进度：39%pass
 总进度：39%pass
 总进度：40%pass
 总进度：41%pass
 总进度：42%pass
 总进度：42%pass
 总进度：43%pass
 总进度：44%pass
 总进度：45%pass
 总进度：46%pass
 总进度：46%pass
 总进度：47%pass
 总进度：48%pass
 总进度：49%pass
 总进度：50%pass

正在获取第2个点的POI(010000), 共找到26个POI， 共2页
 总进度：75%
正在获取第2个点的POI(050000), 共找到879个POI， 共44页
 总进度：80%pass
 总进度：81%pass
 总进度：81%pass
 总进度：82%pass
 总进度：82%pass
 总进度：83%pass
 总进度：84%pass
 总进度：84%pass
 总进度：85%pass
 总进度：85%pass
 总进度：86%pass
 总进度：86%pass
 总进度：87%pass
 总进度：88%pass
 总进度：88%pass
 总进度：89%pass
 总进度：89%pass
 总进度：90%pass
 总进度：90%pass
 总进度：91%pass
 总进度：92%pass
 总进度：92%pass
 总进度：93%pass
 总进度：93%pass
 总进度：94%pass
 总进度：94%pass
 总进度：95%pass
 总进度：96%pass
 总进度：96%pass
 总进度：97%pass
 总进度：97%pass
 总进度：98%pass
 总进度：98%pass
 总进度：99%pass
 总进度：99%pass


In [53]:
# Show the first 10 result rows
for line in result[0:10]:
    print(line)

1, 118.794748,32.041717, 1, 010000, 010900, B0G0254XZ2, 大方租车(大行宫), 118.794093,32.040781, 中山东路219号, 121, 3.5
1, 118.794748,32.041717, 1, 010000, 010000, B0IDZC7CG3, 炜程汽车维修保养服务(贴膜), 118.795148,32.039883, 新世纪广场地下停车场负一楼, 207, 4.1
1, 118.794748,32.041717, 1, 010000, 010900, B0FFF4QHU6, 佳偶婚车, 118.795099,32.039863, 中山东路科巷1号大行宫新世纪广场B座1610室, 209, 3.8
1, 118.794748,32.041717, 1, 010000, 011100, B0HKFOYXCL, 依威能源汽车充电站(龙台国际大厦停车场), 118.793036,32.040258, 中山东路198号龙台国际大厦龙台国际大厦停车场, 229, 1.2
1, 118.794748,32.041717, 1, 010000, 010000, B0I1GMU6PS, 酷车族汽车服务连锁新世纪广场店, 118.795256,32.039695, 新世纪广场地下停车场负一楼, 230, 4.1
1, 118.794748,32.041717, 1, 010000, 010400, B0FFLH21EV, 潮玩车酷汽车美容工作室, 118.797313,32.040030, 中山东路300号长发中心负三, 306, 3.8
1, 118.794748,32.041717, 1, 010000, 010000, B0I6T5PIYG, 穗通客车, 118.797994,32.040995, 中山东路与利济巷交叉口西北40米, 316, 3.2
1, 118.794748,32.041717, 1, 010000, 010400, B0HB7R1MRY, 轩轼贴膜工作室, 118.797544,32.040085, 中山东路300号负3层, 320, 4.4
1, 118.794748,32.041717, 1, 010000, 011100, B0FFHVWKMU, 南京易充充电站(长江路

In [54]:
# Save the collected rows to a CSV file, header line first.
title = "index,lon,lat,poi_index,poi_code1,poi_code2,item_id,item_name,item_lon,item_lat,item_address,item_distance,item_rate"
file_path = f"{city}_poi_test.csv"
# The with-block closes the file on exit, so no explicit close() is needed.
with open(file_path,'w',encoding='utf-8') as f:
    f.write(title + "\n")
    f.writelines(line + "\n" for line in result)


In [None]:
!pip install numpy pandas

In [50]:
import pandas as pd

# Load the per-city metro ("地铁") CSV; the path is relative to the working directory.
city = "南京"
df = pd.read_csv(fr"data/metro/{city}/data_{city}地铁.csv")
# Last expression of the cell: display the loaded DataFrame.
df

Unnamed: 0.1,Unnamed: 0,route_index,route_name,name,stop_id,stop_poi_id,lon,lat,route,trans,su
0,0,0,S1号线(机场线),空港新城江宁,970 1565,BV10946753,118.887659,31.737687,320100022467|900000075073,1,1
1,1,0,S1号线(机场线),禄口机场,895 1565,BV10420611,118.873640,31.730279,320100022467,0,1
2,2,0,S1号线(机场线),翔宇路南,801 1510,BV10054284,118.829201,31.754866,320100022467|900000075091,1,1
3,3,0,S1号线(机场线),翔宇路北,801 1435,BV10053120,118.821542,31.791284,320100022467,0,1
4,4,0,S1号线(机场线),正方中路,801 1361,BV10057781,118.805809,31.845115,320100022467,0,1
...,...,...,...,...,...,...,...,...,...,...,...
154,169,9,10号线,浦口万汇城,276 592,BV10057778,118.657082,32.061689,320100022437,0,1
155,170,9,10号线,南京工业大学,247 561,BV10057779,118.648245,32.066794,320100022437,0,1
156,171,9,10号线,龙华路,215 527,BV10055913,118.635204,32.064712,320100022437,0,1
157,172,9,10号线,文德路,186 494,BV10057780,118.626645,32.057396,320100022437,0,1


## 数据获取方式2

使用现有的数据采集工具，例如：

- 后羿采集器 <https://www.houyicaiji.com/>
- 水经注 <http://www.rivermap.cn/>

## 数据获取方式3

使用 selenium 驱动浏览器抓取网页数据。

selenium 需要与本机 Chrome 版本相匹配的 ChromeDriver：
1. 在浏览器地址栏输入 chrome://settings/help 查看 Chrome 版本；
2. 从 ChromeDriver 下载地址 <https://registry.npmmirror.com/binary.html?path=chromedriver/> 下载对应版本。

In [57]:
!pip install selenium pillow opencv-python




[notice] A new release of pip is available: 23.0.1 -> 23.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [58]:
!pip install opencv-python




[notice] A new release of pip is available: 23.0.1 -> 23.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


正在加载网页...
