# 上海疫情地图可视化

In [1]:
from bs4 import BeautifulSoup
import lxml
import requests
import re
import numpy as np
import pandas as pd
from tqdm import notebook
import os
import time
from sklearn.cluster import KMeans

## 1.获取上海市卫生健康委员会疫情通报

In [183]:
def read_first_line(descriptions):
    """Parse the headline sentence of one district section of a bulletin.

    Args:
        descriptions: Text of the section's first paragraph, e.g.
            '2022年4月10日，浦东新区新增162例本土确诊病例，7983例本土无症状感染者，…'.

    Returns:
        A tuple ``(districtname, date, confirmed, asymptomatic)``:
        the district name (text after a full-width comma, ending in '区'),
        the date exactly as written in the text ('…年…月…日'), and the two
        case counts as ints. Every element whose pattern is not found in
        ``descriptions`` is ``None``.
    """
    # Raw strings throughout: '\d' in a plain literal is an invalid escape.
    date_match = re.search(r'\d+年\d+月\d+日', descriptions)
    # Match lazily and never across a comma, so that a later '区' in the
    # sentence (for example in '小区') cannot be pulled into the name.
    district_match = re.search(r'，([^，\n]*?区)', descriptions)
    # The count is the run of digits that starts the matched phrase.
    confirmed_match = re.search(r'(\d+)例本土.*确诊', descriptions)
    asymptomatic_match = re.search(
        r'(\d+)例(?:新冠肺炎)?(?:本土)?无症状', descriptions)

    districtname = district_match[1] if district_match else None
    # A missing date yields None, like the other fields, instead of raising
    # TypeError on subscripting a failed match.
    date = date_match[0] if date_match else None
    confirmed = int(confirmed_match[1]) if confirmed_match else None
    asymptomatic = int(asymptomatic_match[1]) if asymptomatic_match else None

    return districtname, date, confirmed, asymptomatic

def get_metadata(url, minimal_address_size=3, timeout=10):
    """Download one bulletin page and extract per-district data from it.

    The page is searched for ``<span>`` elements whose text contains
    '分别居住于'. For each hit, the element two levels up is treated as the
    district section, and the first ``<p>`` of that section's grandparent is
    parsed with ``read_first_line``.

    Args:
        url: Address of the bulletin page.
        minimal_address_size: An address is kept only when it is longer than
            this many characters.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block forever.

    Returns:
        A tuple ``(date, addresses, confirmedCases, asymptomaticCases)``.
        ``date`` is the date string of the last section processed, or
        ``None`` when the page contains no matching section. The other three
        are dicts keyed by district name, holding the address list, the
        confirmed count and the asymptomatic count respectively.
    """
    html = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(html.text, 'lxml')

    regexp_address = re.compile(u'，|。|、|；', re.IGNORECASE)
    regexp_firstline = re.compile(u'分别居住于', re.IGNORECASE)
    shanghai = soup.find_all('span', text=regexp_firstline)

    # Bound before the loop: with no matching section the loop body never
    # runs and the return statement would otherwise hit an unbound local.
    date = None
    addresses = {}
    confirmedCases = {}
    asymptomaticCases = {}

    for district in shanghai:
        district = district.parent.parent

        # Headline sentence of the section
        descriptions = district.parent.parent.p.get_text()
        districtname, date, confirmed, asymptomatic = read_first_line(descriptions)

        # Addresses sit between two children with empty text: the first
        # empty child switches collection on, the second one ends the scan.
        flag = False
        address_list = []
        for address in district.children:
            address = address.get_text()
            if not address:
                if not flag:
                    flag = True
                else:
                    flag = False
                    break
            if flag:
                # Keep only the text before the first punctuation mark, and
                # only when a punctuation mark was actually present.
                address = re.split(regexp_address, address)
                if len(address) > 1 and len(address[0]) > minimal_address_size:
                    address_list.append(address[0])

        addresses[districtname] = address_list
        confirmedCases[districtname] = confirmed
        asymptomaticCases[districtname] = asymptomatic
    return date, addresses, confirmedCases, asymptomaticCases

# Run the scraper on a single bulletin URL; the four results are kept as
# module-level names.
url = 'https://mp.weixin.qq.com/s/djwW3S9FUYBE2L5Hj94a3A'
date, addresses, confirmedCases, asymptomaticCases = get_metadata(url)

{'浦东新区': 162, '黄浦区': 16, '静安区': 12, '徐汇区': 23, '长宁区': 6, '普陀区': 7, '虹口区': 8, '杨浦区': 11, '宝山区': 7, '闵行区': 23, '嘉定区': 11, '金山区': 1, '松江区': 14, '青浦区': 6, '奉贤区': 2, '崇明区': 2} {'浦东新区': 7983, '黄浦区': 642, '静安区': 290, '徐汇区': 897, '长宁区': 78, '普陀区': 476, '虹口区': 402, '杨浦区': 612, '宝山区': 547, '闵行区': 2914, '嘉定区': 470, '金山区': 76, '松江区': 782, '青浦区': 378, '奉贤区': 142, '崇明区': 77}


## 2. 调用高德地图API获取位置信息

In [160]:
def get_gd_loc(address,
               key = 'c13c96adfa738532fcef1b2ecb931f77',
               url = 'https://restapi.amap.com/v3/geocode/geo?parameters',
               city='上海',
               timeout=5):
    """Geocode one address, or a batch of addresses, through the Amap API.

    Args:
        address: A single address string, or a list of address strings for a
            batch request. Falsy input (``None``, ``''``, ``[]``) is not sent.
        key: API key sent with the request.
            NOTE(review): a real-looking key is hard-coded as the default;
            consider loading it from configuration instead.
        url: Geocoding endpoint.
        city: City used to scope the lookup.
        timeout: Request timeout in seconds.

    Returns:
        ``(None, None)`` for falsy ``address``. Otherwise ``(loc, answer)``
        where ``loc`` has one entry per requested address (the ``location``
        field of the matching geocode) and ``answer`` is the decoded JSON
        response.

    Raises:
        AssertionError: The HTTP status is not 200 or the API reports a
            status other than '1'.
        Exception: Any error from the request or JSON decoding is printed
            and re-raised unchanged.
    """
    if not address:
        return None, None

    if isinstance(address, list):
        batch = 'true'
        num_batch = len(address)
        # Join with a bare '|' and no trailing separator. Empty entries are
        # kept as empty strings so result positions stay aligned with the
        # input list.
        address = '|'.join(add or '' for add in address)
    else:
        batch = 'false'
        num_batch = 1

    parameters = {'address': address, 'key': key, 'city': city, 'batch': batch}

    # NOTE(review): certificate verification is switched off below and the
    # resulting warning is silenced; confirm this is really required.
    requests.packages.urllib3.disable_warnings()
    try:
        # Honour the caller's timeout instead of a hard-coded value.
        response = requests.get(url, parameters, timeout=timeout, verify=False)
        # Raised explicitly so the checks survive ``python -O``.
        if response.status_code != 200:
            raise AssertionError('HTTP status %s' % response.status_code)
        answer = response.json()
        if answer['status'] != '1':
            raise AssertionError('API status %s' % answer['status'])
    except Exception as e:
        print(e)
        raise

    geocodes = answer['geocodes']
    # Pad with [] when fewer geocodes come back than were requested.
    # Presumably [] is also what the API returns for an unresolved address
    # (get_csv treats a list-valued location as missing) — TODO confirm.
    loc = [geocodes[batch_id]['location'] if batch_id < len(geocodes) else []
           for batch_id in range(num_batch)]
    return loc, answer
        
# Try the geocoder on a two-address batch request.
loc, answer = get_gd_loc(['河南中路575弄', '宝山路463号'])

## 3. 使用Pandas构建数据帧（DataFrame）

In [168]:
def get_csv(url):
    """Scrape one bulletin, geocode its addresses and save them as a CSV.

    The output file is ``shanghai/<date>.csv`` where ``<date>`` is the date
    string returned by ``get_metadata``. It has the columns ``date``,
    ``district``, ``address``, ``lons`` and ``lats``; coordinates are
    ``None`` for addresses whose location came back as a list.

    Args:
        url: Address of the bulletin page.

    Returns:
        None. The result is written to disk.
    """
    date, addresses, confirmedCases, asymptomaticCases = get_metadata(url)

    # Turn the '…年…月…日' string into an ISO date.
    date_parts = re.findall(r'\d+', date)
    date_dec = np.datetime64('%4d-%02d-%02d' % (int(date_parts[0]),
                                                int(date_parts[1]),
                                                int(date_parts[2])))

    district_l = []
    address_l = []
    for district, district_addresses in addresses.items():
        for i_add in district_addresses:
            district_l.append(district)
            address_l.append(i_add)
    date_l = [date_dec] * len(address_l)

    lats = []
    lons = []
    batch_size = 10
    for i in notebook.tqdm(range(0, len(address_l), batch_size), desc=url):
        add = address_l[i:i+batch_size]
        locs, _ = get_gd_loc(add)
        for loc in locs:
            if isinstance(loc, list):
                # A list-valued location marks an address without a result.
                lons.append(None)
                lats.append(None)
            else:
                # The location string is 'lon,lat'.
                loc = loc.split(',')
                lons.append(float(loc[0]))
                lats.append(float(loc[1]))

    df_dict = {"date": date_l, "district": district_l, "address": address_l,
               "lons": lons, "lats": lats}
    df = pd.DataFrame(df_dict)
    filename = 'shanghai/' + date + '.csv'
    # to_csv overwrites an existing file, so no explicit removal is needed
    # (the former ``filename in os.listdir()`` test could never be true for
    # a name that carries a directory prefix). The directory must exist.
    os.makedirs('shanghai', exist_ok=True)
    df.to_csv(filename, encoding='utf_8_sig', index=None)
    
# Article URLs handed to the scraping functions below.
urls =['https://mp.weixin.qq.com/s/8bljTUplPh1q4MXb6wd_gg',
       'https://mp.weixin.qq.com/s/HTM47mUp0GF-tWXkPeZJlg',
       'https://mp.weixin.qq.com/s/79NsKhMHbg09Y0xaybTXjA',
       'https://mp.weixin.qq.com/s/_Je5_5_HqBcs5chvH5SFfA',
       'https://mp.weixin.qq.com/s/u0XfHF8dgfEp8vGjRtcwXA',
       'https://mp.weixin.qq.com/s/vxFiV2HeSvByINUlTmFKZA',
       'https://mp.weixin.qq.com/s/OZGM-pNkefZqWr0IFRJj1g',
       'https://mp.weixin.qq.com/s/L9AffT-SoEBV4puBa_mRqg',
       'https://mp.weixin.qq.com/s/5T76lht3s6g_KTiIx3XAYw',
       'https://mp.weixin.qq.com/s/ZkhimhWpa92I2EWn3hmd8w',
       'https://mp.weixin.qq.com/s/dRa-PExJr1qkRis88eGCnQ',
       'https://mp.weixin.qq.com/s/LguiUZj-zxy4xy19WO0_UA',
       'https://mp.weixin.qq.com/s/GWI6LxYLHOvv1dioN5olxg',
       'https://mp.weixin.qq.com/s/puNUP9bjYlZNELsse09Z0w',
       'https://mp.weixin.qq.com/s/8qCvsE578Ehz6UcWYRBfXw',
       'https://mp.weixin.qq.com/s/qFvUyEB-R-GKP7vgKR-c3A',
       'https://mp.weixin.qq.com/s/LySBR0VJswl_ZI1KtWlXqw',
       'https://mp.weixin.qq.com/s/YNeLEO7BZouZRfyD2TWOlA',
       'https://mp.weixin.qq.com/s/9-DRQF8pbz_2uivgscOmbw',
       'https://mp.weixin.qq.com/s/IrtFkZDWaB6io18QXwuQ9g',
       'https://mp.weixin.qq.com/s/SIuDbITNdgWwYyM3eiyrgg',
       'https://mp.weixin.qq.com/s/SKkU2W-Ic1H_qWnYC9NtXA',
       'https://mp.weixin.qq.com/s/aDU54MGe9XPWEMrdKMRFww',
       'https://mp.weixin.qq.com/s/aQMZ8WmeYEaBPFv0yVs4BQ',
       'https://mp.weixin.qq.com/s/qbB7VjEXMTK0zB6JIqBbAA',
       'https://mp.weixin.qq.com/s/agdZHOqVZh9atNHOQEFTog',
       'https://mp.weixin.qq.com/s/s_spcc0OApRItbuq5DG2LA',
       'https://mp.weixin.qq.com/s/KyTRqsRBWbM5cEa2sk2wbg',
       'https://mp.weixin.qq.com/s/J68hA0ncRR_q91ccVINP0g',
       'https://mp.weixin.qq.com/s/IqIqMik_fGpPgfNgIZ8ieg',
       'https://mp.weixin.qq.com/s/bqZp2AqqE-FPzJpx6FlhPA',
       'https://mp.weixin.qq.com/s/Dt_Q7mwgzJIdn7NwqeGNeA',
       'https://mp.weixin.qq.com/s/SU8bV1IqoaH2NeUs_HJBzg',
       'https://mp.weixin.qq.com/s/iUhgNb9-2Ofhsg9zxi2hiw',
       'https://mp.weixin.qq.com/s/V9gNghk8vWinad_VT_YAtg',
       'https://mp.weixin.qq.com/s/i4BwsY-a9zXjkJe-FTea4Q',
       'https://mp.weixin.qq.com/s/YyhqHoMgyetDu6kP9-Ybpw',
       'https://mp.weixin.qq.com/s/lIMFiBlzIXvju2VV_I4j0g',
       'https://mp.weixin.qq.com/s/d1qIhfwsisM2jQpfURml3A',]

# Per-URL CSV export, currently disabled:
#for url in urls:
#    get_csv(url)
# NOTE(review): get_statistics is not defined anywhere in the visible part
# of this file — confirm where it comes from before re-running this cell.
get_statistics(urls)

  0%|          | 0/39 [00:00<?, ?it/s]

ValueError: All arrays must be of the same length

## 3.疫情点聚类

In [15]:
from shanghai import get_gd_add, get_gd_loc

def read_dfcsv(path='shanghai/'):
    """Load every per-day address CSV in ``path`` into one DataFrame.

    Files named ``confirmedCases.csv`` and ``asymptomaticCases.csv`` are
    skipped, as is anything that does not end in ``.csv`` (sub-directories,
    editor lock files and the like).

    Args:
        path: Directory containing the CSV files.

    Returns:
        The concatenation of all loaded files with every row that contains
        a missing value removed. An empty DataFrame when nothing was loaded.
    """
    summary_files = {'confirmedCases.csv', 'asymptomaticCases.csv'}

    frames = []
    for file in os.listdir(path):
        if file in summary_files or not file.endswith('.csv'):
            continue
        # 'utf-8-sig' accepts files both with and without a byte-order mark;
        # the writers in this file use 'utf_8_sig'.
        with open(os.path.join(path, file), encoding="utf-8-sig") as f:
            frames.append(pd.read_csv(f))

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames).dropna()

def cluster(df, num_center=100):
    """Group case locations with K-Means and describe each cluster centre.

    Args:
        df: Frame with ``lons`` and ``lats`` columns, one row per location.
        num_center: Requested number of clusters; capped at ``len(df)``.

    Returns:
        A DataFrame with one row per cluster centre and the columns ``lon``,
        ``lat``, ``centroids_dense`` (number of input rows assigned to the
        cluster), ``district``, ``township``, ``neighborhood``, ``building``
        and ``streetNumber``. Rows containing a missing value are dropped.
        An empty frame with these columns is returned for empty input.
    """
    columns = ['lon', 'lat', 'centroids_dense', 'district', 'township',
               'neighborhood', 'building', 'streetNumber']
    # K-Means cannot be fitted with zero clusters, so empty input is
    # answered directly.
    if len(df) == 0:
        return pd.DataFrame(columns=columns)
    num_center = min(num_center, len(df))

    # K-Means on [lon, lat] pairs
    locs = df[['lons', 'lats']].values.tolist()
    estimator = KMeans(n_clusters=num_center).fit(locs)
    centroids = estimator.cluster_centers_
    labels = estimator.labels_

    # Number of points assigned to each cluster label
    centroids_dense = np.bincount(labels, minlength=len(centroids))

    # Get address from Gaode, 20 centres per call.
    # NOTE(review): get_gd_add is imported from the local `shanghai` module
    # and is assumed to return five parallel lists — confirm.
    district = []
    township = []
    neighborhood = []
    building = []
    streetNumber = []
    batch_size = 20
    for i in notebook.tqdm(range(0, len(centroids), batch_size)):
        (district_, township_, neighborhood_,
         building_, streetNumber_) = get_gd_add(centroids[i:i + batch_size])
        district.extend(district_)
        township.extend(township_)
        neighborhood.extend(neighborhood_)
        building.extend(building_)
        streetNumber.extend(streetNumber_)

    # make Pandas frame
    df_hm = pd.DataFrame({
        'lon': centroids[:, 0], 'lat': centroids[:, 1],
        'centroids_dense': centroids_dense,
        'district': district, 'township': township,
        'neighborhood': neighborhood,
        'building': building, 'streetNumber': streetNumber,
    })
    return df_hm.dropna()
    
# Cluster the merged address table one day at a time. Each day's centroids
# go to '<path><date>.csv' and all days together to 'merged_centroids.csv'.
path='shanghai/centroid/'
# The output directory has to exist before the first to_csv call.
os.makedirs(path, exist_ok=True)
dfm = read_dfcsv()
# dates = np.sort(dfm['date'].unique())
dates = ['2022-04-05', '2022-04-06', '2022-04-07', '2022-04-08',
       '2022-04-09', '2022-04-10', '2022-04-11', '2022-04-12',
       '2022-04-13', '2022-04-14', '2022-04-15', '2022-04-16',
       '2022-04-17', '2022-04-18', '2022-04-19', '2022-04-20',
       '2022-04-21', '2022-04-22', '2022-04-23', '2022-04-24',
       '2022-04-25', '2022-04-26', '2022-04-27', '2022-04-28',
       '2022-04-29', '2022-04-30', '2022-05-01', '2022-05-02',
       '2022-05-03', '2022-05-04', '2022-05-05', '2022-05-06',
       '2022-05-07', '2022-05-08', '2022-05-09', '2022-05-10',
       '2022-05-11', '2022-05-12', '2022-05-13', '2022-05-14']
daily_centroids = []
for date in dates:
    print(date)
    df = dfm[dfm['date'] == date]
    df = cluster(df)
    df['date'] = date
    df.to_csv(path+date+'.csv', encoding='utf_8_sig', index=None)
    daily_centroids.append(df)
# One concat at the end instead of re-copying the growing frame every day.
df_cen = pd.concat(daily_centroids)
df_cen.to_csv(path+'merged_centroids.csv', encoding='utf_8_sig', index=None)

2022-04-05


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-06


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-07


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-08


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-09


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-10


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-11


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-12


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-13


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-14


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-15


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-16


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-17


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-18


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-19


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-20


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-21


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-22


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-23


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-24


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-25


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-26


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-27


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-28


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-29


  0%|          | 0/5 [00:00<?, ?it/s]

2022-04-30


  0%|          | 0/5 [00:00<?, ?it/s]

2022-05-01


  0%|          | 0/5 [00:00<?, ?it/s]

2022-05-02


  0%|          | 0/5 [00:00<?, ?it/s]

2022-05-03


  0%|          | 0/5 [00:00<?, ?it/s]

2022-05-04


  0%|          | 0/5 [00:00<?, ?it/s]

2022-05-05


  0%|          | 0/5 [00:00<?, ?it/s]

2022-05-06


  0%|          | 0/5 [00:00<?, ?it/s]

2022-05-07


  0%|          | 0/5 [00:00<?, ?it/s]

2022-05-08


  0%|          | 0/5 [00:00<?, ?it/s]

2022-05-09


  0%|          | 0/5 [00:00<?, ?it/s]

2022-05-10


  0%|          | 0/5 [00:00<?, ?it/s]

2022-05-11


  0%|          | 0/5 [00:00<?, ?it/s]

2022-05-12


  0%|          | 0/5 [00:00<?, ?it/s]

2022-05-13


  0%|          | 0/5 [00:00<?, ?it/s]

2022-05-14


  0%|          | 0/5 [00:00<?, ?it/s]

PermissionError: [Errno 13] Permission denied: 'shanghai/centroid/merged_centroids.csv'

## 4. 使用Plotly绘制Heat Map

In [111]:
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as pltoff

# Plot the addresses stored in '<date>.csv' on a Mapbox scatter map and
# export the figure as a standalone HTML page.
filename = date+'.csv'
with open(filename, encoding="utf-8") as f:
    df = pd.read_csv(f)

# Read the token inside a context manager so the file handle is closed.
with open(".mapbox_token") as token_file:
    px.set_mapbox_access_token(token_file.read())
color_list = ['#1e1e2a', '#293e5d', '#2e598f', '#397ab2', '#59a3c4',
              '#738a83', '#9abdae', '#e5e8b3', '#d3d179', '#e7c034', ]
hoverD = {'address':True,'date':True,'district':True}
# Centre the view on the mean coordinate of all plotted points.
city_center = {'lon': np.mean(df['lons']), 'lat': np.mean(df['lats'])}

fig = px.scatter_mapbox(df, lat="lats", lon="lons",
                        hover_name="address",hover_data=hoverD,
                        title ='Shanghai COVID Atlas',
                        center=city_center,
                        zoom=8,
                        )

fig.add_annotation(
    x=0, y=0,
    text=date+'：The author is not responsible for the boundaries on the map.',
    font={'color': 'red', 'size': 12, 'family': 'Arial'},
    showarrow=False,
)



pltoff.plot(fig, filename='Shanghai COVID Atlas.html')
fig.show()

In [186]:
# Build one table of confirmed counts and one of asymptomatic counts, with
# a row per bulletin and a column per district, and save both as CSV.
date_l = []

districts = ['浦东新区', '黄浦区', '静安区', '徐汇区', '长宁区', '普陀区','虹口区',
        '杨浦区', '宝山区', '闵行区', '嘉定区', '金山区', '松江区', '青浦区',
        '奉贤区', '崇明区']
# Key the series on the fixed district list, which is also what the loop
# below iterates over. Keying on whatever the first page happened to parse
# raised KeyError for a district missing there, and left any extra key with
# a shorter list than the rest, which pandas rejects.
confirmedCases_dict = {district: [] for district in districts}
asymptomaticCases_dict = {district: [] for district in districts}

for url in notebook.tqdm(urls):
    date, addresses, confirmedCases, asymptomaticCases = get_metadata(url)

    date_l.append(date)
    for district in districts:
        # None marks a district that is absent from this bulletin.
        confirmedCases_dict[district].append(confirmedCases.get(district))
        asymptomaticCases_dict[district].append(asymptomaticCases.get(district))
confirmedCases_dict['date'] = date_l
asymptomaticCases_dict['date'] = date_l

# to_csv overwrites existing files; only the directory has to exist.
os.makedirs('shanghai', exist_ok=True)
df = pd.DataFrame(confirmedCases_dict)
filename = 'shanghai/confirmedCases.csv'
df.to_csv(filename, encoding='utf_8_sig', index=None)
df = pd.DataFrame(asymptomaticCases_dict)
filename = 'shanghai/asymptomaticCases.csv'
df.to_csv(filename, encoding='utf_8_sig', index=None)

  0%|          | 0/39 [00:00<?, ?it/s]