In [1]:
import json
import os
import re

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [5]:

class CoronaVirusSipder(object):
    """Scrape epidemic statistics from the DXY pneumonia page into JSON files.

    The crawl has two stages (see ``run``):

    1. Download the home page, extract the JSON array embedded in the
       ``getAreaStat`` element and save it.
    2. For every entry of that array, download the per-province history
       referenced by its ``statisticsData`` URL and save the merged list.
    """

    # Output locations. Built with os.path.join so the files land inside the
    # ``data`` directory on every platform (a hard-coded backslash would be
    # part of the file name on POSIX systems).
    DATA_DIR = "data"
    LAST_DAY_PATH = os.path.join(DATA_DIR, "last_day_corona_virus_of_china.json")
    HISTORY_PATH = os.path.join(DATA_DIR, "corona_virus_of_china.json")

    def __init__(self):
        self.home_url = "https://ncov.dxy.cn/ncovh5/view/pneumonia"

    def get_content_from_url(self, url, timeout=30):
        """Fetch ``url`` and return the response body decoded as UTF-8 text.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` may block indefinitely.

        Raises:
            requests.RequestException: On connection problems, timeouts, or
                an HTTP error status.
        """
        response = requests.get(url, timeout=timeout)
        # Fail here on 4xx/5xx instead of handing an error page to the parser.
        response.raise_for_status()
        return response.content.decode()

    def parse_home_page(self, homepage, tage_id):
        """Extract the JSON array embedded in the element with id ``tage_id``.

        Args:
            homepage: HTML text of the page.
            tage_id: ``id`` attribute of the element whose text holds the data.

        Returns:
            The Python object decoded from the first ``[...]`` span found in
            the element's text.

        Raises:
            ValueError: If the element is missing, has no single text child,
                or contains no ``[...]`` span (``json.JSONDecodeError`` for
                malformed JSON is a ``ValueError`` subclass too).
        """
        soup = BeautifulSoup(homepage, 'lxml')
        script = soup.find(id=tage_id)
        text = script.string if script is not None else None
        if text is None:
            raise ValueError(
                "element with id %r not found or has no text content" % (tage_id,)
            )

        # Greedy on purpose: the payload is a list of nested objects, so the
        # match has to run up to the last closing bracket on the line.
        match = re.search(r'\[.+\]', text)
        if match is None:
            raise ValueError(
                "no JSON array found in element with id %r" % (tage_id,)
            )
        return json.loads(match.group())

    def load(self, path):
        """Read the UTF-8 JSON file at ``path`` and return the decoded data."""
        with open(path, encoding="utf-8") as fp:
            return json.load(fp)

    def save(self, data, path):
        """Write ``data`` as UTF-8 JSON to ``path``.

        The parent directory is created when it does not exist yet, so a
        first run on a fresh checkout does not fail. Non-ASCII characters are
        written as-is rather than as ``\\uXXXX`` escapes.
        """
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, "w", encoding='utf-8') as fp:
            json.dump(data, fp, ensure_ascii=False)

    def crawl_last_day_corona_virus_of_china(self):
        """Download the home page and save its ``getAreaStat`` data."""
        # 1. Fetch the home page content.
        homepage = self.get_content_from_url(self.home_url)
        # 2. Parse the embedded data.
        last_day_corona_virus = self.parse_home_page(homepage, "getAreaStat")
        # 3. Save the data.
        self.save(last_day_corona_virus, self.LAST_DAY_PATH)

    def crawl_corona_virus_of_china(self):
        """Collect the per-province history (since January 23) into one file.

        Reads the file written by ``crawl_last_day_corona_virus_of_china``,
        so that stage has to run first.
        """
        # 1. Load the latest per-province data.
        last_day_corona_virus = self.load(self.LAST_DAY_PATH)
        corona_virus_of_china = []
        # 2. Walk the provinces and read each one's statistics URL.
        for province in tqdm(last_day_corona_virus, "采集1月23日以来各省信息"):
            # 3. Request the province's history as JSON text.
            statistic_data_url = province['statisticsData']
            statistic_data_json_str = self.get_content_from_url(statistic_data_url)
            # 4. Decode it and tag every daily record with its province,
            #    since the merged list no longer carries that context.
            statistic_data = json.loads(statistic_data_json_str)['data']
            for one_day in statistic_data:
                one_day['provinceName'] = province['provinceName']
                one_day['provinceShortName'] = province['provinceShortName']
            corona_virus_of_china.extend(statistic_data)
        # 5. Save the merged list as JSON.
        self.save(corona_virus_of_china, self.HISTORY_PATH)

    def run(self):
        """Run both crawl stages in order."""
        self.crawl_last_day_corona_virus_of_china()
        self.crawl_corona_virus_of_china()
        
        
if __name__ == '__main__':
    # Script entry point: build the spider and run both crawl stages.
    CoronaVirusSipder().run()

采集1月23日以来各省信息: 100%|██████████| 34/34 [00:38<00:00,  1.14s/it]
