## 爬取腾讯的疫情数据

In [1]:
import json

import requests

# Fetch the raw disease_h5 feed and show the top-level keys of its envelope.
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as JSON.
res = requests.get(url, timeout=10)
res.raise_for_status()
d = json.loads(res.text)
print(d.keys())

dict_keys(['ret', 'data'])


In [2]:
# The envelope's 'data' value is passed through json.loads again, i.e. it is
# a JSON document stored as a string.
payload_text = d['data']
data_all = json.loads(payload_text)
print(data_all.keys())

dict_keys(['lastUpdateTime', 'chinaTotal', 'chinaAdd', 'isShowAdd', 'showAddSwitch', 'areaTree'])


In [3]:
# Look at the size of the area tree and at the fields of its first entry.
area_tree = data_all['areaTree']
print(len(area_tree))
print(area_tree[0].keys())

1
dict_keys(['name', 'today', 'total', 'children'])


In [4]:
# Print the name, today's figures and the running totals of the first
# (country-level) entry of the area tree, in that order.
country = data_all['areaTree'][0]
for field in ('name', 'today', 'total'):
    print(country[field])

中国
{'confirm': 85, 'isUpdated': True}
{'nowConfirm': 766, 'confirm': 92914, 'suspect': 2, 'dead': 4749, 'deadRate': '5.11', 'showRate': False, 'heal': 87399, 'healRate': '94.06', 'showHeal': True}


In [5]:
# Count the children of the country-level entry and list their names.
provinces = data_all['areaTree'][0]['children']
print(len(provinces))

for province in provinces:
    print(province['name'])

34
香港
台湾
上海
四川
福建
陕西
广东
内蒙古
天津
浙江
湖北
江苏
河南
山西
云南
北京
广西
辽宁
山东
重庆
河北
安徽
新疆
澳门
青海
吉林
海南
湖南
甘肃
宁夏
贵州
黑龙江
西藏
江西


In [6]:
import json
import time
import traceback

import pymysql

In [7]:
def _fetch_tencent_payload(url, headers, timeout=10):
    """Download one getOnsInfo feed and return its decoded ``data`` object."""
    # ``headers`` must be passed by keyword: the second positional parameter
    # of requests.get() is ``params`` (the query string), not the headers.
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # The envelope's "data" field is itself a JSON document stored as a
    # string, so it is decoded a second time.
    return json.loads(json.loads(response.text)["data"])


def _history_date(entry, default_year="2020"):
    """Turn a feed entry's "MM.DD" date into a "YYYY-MM-DD" string."""
    # NOTE(review): entries are assumed to carry only month and day; a "y"
    # year field is honoured if present (confirm against the live payload),
    # otherwise the year falls back to 2020 exactly as before.
    year = entry.get("y", default_year)
    parsed = time.strptime(f"{year}.{entry['date']}", "%Y.%m.%d")
    return time.strftime("%Y-%m-%d", parsed)


def _build_history(day_totals, day_additions):
    """Merge cumulative and per-day figures into one dict keyed by date."""
    history = {}
    for entry in day_totals:
        history[_history_date(entry)] = {
            "confirm": entry["confirm"],
            "suspect": entry["suspect"],
            "heal": entry["heal"],
            "dead": entry["dead"],
        }
    for entry in day_additions:
        # setdefault: a day that appears only in the additions list gets its
        # own record instead of raising KeyError.
        history.setdefault(_history_date(entry), {}).update({
            "confirm_add": entry["confirm"],
            "suspect_add": entry["suspect"],
            "heal_add": entry["heal"],
            "dead_add": entry["dead"],
        })
    return history


def _build_details(update_time, provinces):
    """Flatten the province -> city tree into one row per city."""
    details = []
    for province in provinces:
        for city in province["children"]:
            totals = city["total"]
            details.append([
                update_time,
                province["name"],
                city["name"],
                totals["confirm"],
                city["today"]["confirm"],
                totals["heal"],
                totals["dead"],
            ])
    return details


def get_tencent_data():
    """Fetch Tencent's epidemic feeds and reshape them for storage.

    Two feeds are read: ``disease_h5`` (the area tree with per-city figures)
    and ``disease_other`` (``chinaDayList`` / ``chinaDayAddList``).

    Returns:
        tuple: ``(history, details)``.

        ``history`` maps a ``"YYYY-MM-DD"`` string to a dict holding
        ``confirm``, ``suspect``, ``heal`` and ``dead`` for days listed in
        ``chinaDayList``, plus ``confirm_add``, ``suspect_add``, ``heal_add``
        and ``dead_add`` for days listed in ``chinaDayAddList``.

        ``details`` is a list of
        ``[update_time, province, city, confirm, confirm_add, heal, dead]``
        rows, one per city under the first entry of ``areaTree``.

    Raises:
        requests.RequestException: on connection failure, timeout or an HTTP
            error status.
        KeyError: if a feed lacks one of the fields read here.
    """
    url1 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
    url2 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_other"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    data_all1 = _fetch_tencent_payload(url1, headers)
    data_all2 = _fetch_tencent_payload(url2, headers)

    history = _build_history(
        data_all2["chinaDayList"], data_all2["chinaDayAddList"])
    details = _build_details(
        data_all1["lastUpdateTime"], data_all1["areaTree"][0]["children"])
    return history, details

## 存储数据

In [8]:
def get_conn():
    """Open a connection to the local ``cov`` MySQL database.

    Returns:
        tuple: ``(conn, cursor)`` - the pymysql connection and a cursor
        created from it.
    """
    settings = {
        'host': 'localhost',
        'user': 'root',
        'password': '123456',
        'db': 'cov',
    }
    connection = pymysql.connect(**settings)
    return connection, connection.cursor()


def close_conn(conn, cursor):
    """Close the cursor first and then the connection, skipping falsy ones."""
    for resource in (cursor, conn):
        if resource:
            resource.close()

In [9]:
def update_details():
    """Append the latest per-city snapshot to the ``details`` table.

    The rows come from ``get_tencent_data()``. They are inserted only when
    the snapshot's ``update_time`` differs from the ``update_time`` of the row
    with the highest ``id``; otherwise the table is reported as up to date.

    Failures are printed as a traceback instead of being raised, and the
    connection is always closed. Nothing is returned.
    """
    cursor = None
    conn = None
    try:
        rows = get_tencent_data()[1]
        if not rows:
            # An empty scrape has no timestamp to compare; say so explicitly
            # rather than failing with an IndexError on rows[0].
            print(f'{time.asctime()}未获取到数据')
            return
        conn, cursor = get_conn()
        sql = 'insert into details(update_time,province,city,confirm,confirm_add,heal,dead) values(%s,%s,%s,%s,%s,%s,%s)'
        # Compare the snapshot timestamp with the most recently stored one.
        # On an empty table the subquery is NULL, the comparison is NULL and
        # the result is falsy, so the first snapshot is inserted.
        sql_query = 'select %s=(select update_time from details order by id desc limit 1)'
        cursor.execute(sql_query, (rows[0][0],))
        if not cursor.fetchone()[0]:
            print(f'{time.asctime()}开始更新最新数据')
            cursor.executemany(sql, rows)
            conn.commit()
            print(f'{time.asctime()}更新最新数据完毕')
        else:
            print(f'{time.asctime()}已是最新数据')
    except Exception:
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate while still reporting every ordinary failure.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

In [10]:
def insert_history():
    """Insert every day returned by ``get_tencent_data()`` into ``history``.

    Each row is ``(date, confirm, confirm_add, suspect, suspect_add, heal,
    heal_add, dead, dead_add)``; a figure missing for a day is inserted as
    NULL. No check for existing rows is made, so this is meant for the
    initial load only.

    Failures are printed as a traceback instead of being raised, and the
    connection is always closed. Nothing is returned.
    """
    cursor = None
    conn = None
    try:
        dic = get_tencent_data()[0]
        print(f'{time.asctime()}开始插入历史数据')
        conn, cursor = get_conn()
        sql = 'insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        rows = [
            [day, figures.get('confirm'), figures.get('confirm_add'),
             figures.get('suspect'), figures.get('suspect_add'),
             figures.get('heal'), figures.get('heal_add'),
             figures.get('dead'), figures.get('dead_add')]
            for day, figures in dic.items()
        ]
        cursor.executemany(sql, rows)
        conn.commit()
        print(f'{time.asctime()}插入历史数据完毕')
    except Exception:
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate while still reporting every ordinary failure.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

In [11]:
def update_history():
    """Add to ``history`` only the days that are not stored yet.

    For every day returned by ``get_tencent_data()`` the table is queried by
    ``ds``; a row is inserted only when that query matches nothing. Rows that
    already exist are left unchanged.

    Failures are printed as a traceback instead of being raised, and the
    connection is always closed. Nothing is returned.
    """
    cursor = None
    conn = None
    try:
        dic = get_tencent_data()[0]
        print(f'{time.asctime()}开始更新历史数据')
        conn, cursor = get_conn()
        sql = 'insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        sql_query = 'select confirm from history where ds=%s'
        for day, figures in dic.items():
            # execute() returns the number of matched rows; a non-zero count
            # means the day is already stored.
            if cursor.execute(sql_query, day):
                continue
            cursor.execute(sql, [day, figures.get('confirm'), figures.get('confirm_add'),
                                 figures.get('suspect'), figures.get('suspect_add'),
                                 figures.get('heal'), figures.get('heal_add'),
                                 figures.get('dead'), figures.get('dead_add')])
        conn.commit()
        print(f'{time.asctime()}历史数据更新完毕')
    except Exception:
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate while still reporting every ordinary failure.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

In [12]:
# 插入历史数据(仅第一次运行)
# insert_history()

In [13]:
# 建立连接
# Peek at the first rows of the history table.
conn = cursor = None
try:
    # Reuse the file's connection helper instead of repeating the settings;
    # the cursor returns rows as tuples by default.
    conn, cursor = get_conn()
    sql = 'select * from history limit 5'
    cursor.execute(sql)
    # Fetch the result rows (a read-only query needs no commit).
    res = cursor.fetchall()
    print(res)
finally:
    # Release the cursor and connection even if the query fails.
    close_conn(conn, cursor)

()


In [14]:
# Refresh the details table with the latest per-city snapshot.
update_details()

Wed Nov 25 15:32:59 2020开始更新最新数据
Wed Nov 25 15:32:59 2020更新最新数据完毕


In [15]:
# Add any days that are not yet stored to the history table.
update_history()

Wed Nov 25 15:33:01 2020开始更新历史数据
Wed Nov 25 15:33:02 2020历史数据更新完毕


## 爬取热搜数据

In [16]:
from selenium.webdriver import Chrome, ChromeOptions

In [17]:
def get_baidu_hot():
    """Scrape the Baidu hot-search board with a headless Chrome browser.

    Opens the board, clicks the first link matched by the "more" XPath,
    waits one second and then reads the list from the resulting page.

    Returns:
        tuple: ``(context, hot)`` - the text of every ``a.list-title`` cell
        and the text of every ``td.last > span`` cell, in page order.

    Raises:
        IndexError: if the "more" link is not found on the page.
    """
    # Headless browser
    option = ChromeOptions()
    option.add_argument('--headless')
    # option.add_argument('--no-sandbox')  # left disabled, as before

    url = 'http://top.baidu.com/?fr=mhd_card'
    browser = Chrome(options=option, executable_path='chromedriver.exe')
    try:
        browser.get(url)
        # Locate and click the "more" link
        button = browser.find_elements_by_xpath(
            '//*[@id="main"]/div[1]/div[1]/div[3]/div[1]/a')[0]
        button.click()
        time.sleep(1)
        # Collect the hot-search titles
        title_cells = browser.find_elements_by_xpath(
            '//*[@id="main"]/div[2]/div/table/tbody/tr/td/a[@class="list-title"]')
        context = [cell.text for cell in title_cells]
        hot_cells = browser.find_elements_by_xpath(
            '//*[@id="main"]/div/div/table/tbody/tr/td[@class="last"]/span')
        hot = [cell.text for cell in hot_cells]
        return context, hot
    finally:
        # Always shut the driver down; otherwise every call (and every
        # failure) leaves a headless Chrome process running.
        browser.quit()

In [18]:
def update_hotsearch():
    """Store the current Baidu hot-search list in the ``hotsearch`` table.

    Every (title, heat) pair returned by ``get_baidu_hot()`` is inserted with
    one shared timestamp taken when the insert starts. If the two lists
    differ in length, the extra items of the longer one are dropped.

    Failures are printed as a traceback instead of being raised, and the
    connection is always closed. Nothing is returned.
    """
    cursor = None
    conn = None
    try:
        context, hot = get_baidu_hot()
        print(f'{time.asctime()}开始更新热搜数据')
        conn, cursor = get_conn()
        sql = 'insert into hotsearch(dt,content,hot) values(%s,%s,%s)'
        # Spell the time out instead of using %X, whose output depends on the
        # active locale; in the default C locale the result is unchanged.
        ts = time.strftime('%y-%m-%d %H:%M:%S')
        rows = [(ts, title, heat) for title, heat in zip(context, hot)]
        cursor.executemany(sql, rows)
        conn.commit()
        print(f'{time.asctime()}数据更新完毕')
    except Exception:
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate while still reporting every ordinary failure.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

In [19]:
# Scrape the hot-search list once and store it in the hotsearch table.
update_hotsearch()

Wed Nov 25 15:33:17 2020开始更新热搜数据
Wed Nov 25 15:33:17 2020数据更新完毕
