## 爬取腾讯的疫情数据

In [1]:
import json

import requests

# Request the Tencent epidemic feed and print the top-level keys of the decoded JSON.
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5'
res = requests.get(url)
d = json.loads(res.text)
print(d.keys())

dict_keys(['ret', 'data'])


In [2]:
# The 'data' field is itself a JSON-encoded string, so it is decoded a second time.
data_all = json.loads(d['data'])
print(data_all.keys())

dict_keys(['lastUpdateTime', 'chinaTotal', 'chinaAdd', 'isShowAdd', 'showAddSwitch', 'areaTree'])


In [3]:
# Show how many top-level nodes the area tree has and which keys the first node carries.
area_tree = data_all['areaTree']
print(len(area_tree))
print(area_tree[0].keys())

1
dict_keys(['name', 'today', 'total', 'children'])


In [4]:
# Print the name, today's figures and the cumulative totals of the first area-tree node.
china = data_all['areaTree'][0]
for field in ('name', 'today', 'total'):
    print(china[field])

中国
{'confirm': 21, 'isUpdated': True}
{'nowConfirm': 521, 'confirm': 92511, 'suspect': 0, 'dead': 4749, 'deadRate': '5.13', 'showRate': False, 'heal': 87241, 'healRate': '94.30', 'showHeal': True}


In [5]:
# Count the children of the first area-tree node, then list each child's name.
provinces = data_all['areaTree'][0]['children']
print(len(provinces))

for province in provinces:
    print(province['name'])

34
香港
上海
台湾
四川
陕西
广东
福建
内蒙古
天津
江苏
湖北
浙江
山西
辽宁
云南
河南
河北
新疆
北京
广西
甘肃
湖南
安徽
山东
重庆
青海
江西
吉林
宁夏
黑龙江
海南
贵州
澳门
西藏


In [6]:
import json
import time
import traceback

import pymysql

In [7]:
def _to_iso_date(month_day):
    """Convert a feed date such as "01.13" into "2020-01-13".

    NOTE(review): the feed entries read here carry only month and day, so the
    year stays pinned to 2020 exactly as before; revisit if other years are
    ever needed.
    """
    parsed = time.strptime("2020." + month_day, "%Y.%m.%d")
    return time.strftime("%Y-%m-%d", parsed)


def get_tencent_data():
    """Fetch national history and per-city detail figures from the Tencent feed.

    Returns:
        A ``(history, details)`` tuple.

        ``history`` maps an ISO date string to a dict holding ``confirm``,
        ``suspect``, ``heal`` and ``dead`` for dates in ``chinaDayList`` and
        the matching ``*_add`` keys for dates in ``chinaDayAddList``.

        ``details`` is a list of
        ``[update_time, province, city, confirm, confirm_add, heal, dead]``
        rows, one per city.

    Raises:
        requests.RequestException: if a request fails or times out.
        ValueError: if a response is not valid JSON or a date cannot be parsed.
        KeyError: if the payload lacks one of the expected keys.
    """
    url1 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
    url2 = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_other"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    # ``headers`` must be passed by keyword: the second positional argument of
    # ``requests.get`` is ``params``, which would turn the dict into a query
    # string instead of sending it as request headers.
    r1 = requests.get(url1, headers=headers, timeout=10)
    r2 = requests.get(url2, headers=headers, timeout=10)

    # Each response is a JSON envelope whose "data" field is a JSON string.
    data_all1 = json.loads(json.loads(r1.text)["data"])
    data_all2 = json.loads(json.loads(r2.text)["data"])

    history = {}
    for day in data_all2["chinaDayList"]:
        history[_to_iso_date(day["date"])] = {
            "confirm": day["confirm"],
            "suspect": day["suspect"],
            "heal": day["heal"],
            "dead": day["dead"],
        }
    for day in data_all2["chinaDayAddList"]:
        # setdefault: a date present only in the increment list must not raise.
        history.setdefault(_to_iso_date(day["date"]), {}).update({
            "confirm_add": day["confirm"],
            "suspect_add": day["suspect"],
            "heal_add": day["heal"],
            "dead_add": day["dead"],
        })

    update_time = data_all1["lastUpdateTime"]
    data_province = data_all1["areaTree"][0]["children"]
    details = [
        [
            update_time,
            pro_infos["name"],
            city_infos["name"],
            city_infos["total"]["confirm"],
            city_infos["today"]["confirm"],
            city_infos["total"]["heal"],
            city_infos["total"]["dead"],
        ]
        for pro_infos in data_province
        for city_infos in pro_infos["children"]
    ]
    return history, details

## 存储数据

In [8]:
def get_conn():
    """Open a connection to the local ``cov`` MySQL database.

    Returns:
        A ``(conn, cursor)`` pair: the pymysql connection and a cursor
        created from it.
    """
    # NOTE(review): credentials are hard-coded here; consider loading them
    # from configuration instead.
    settings = {
        'host': 'localhost',
        'user': 'root',
        'password': '123456',
        'db': 'cov',
    }
    conn = pymysql.connect(**settings)
    return conn, conn.cursor()


def close_conn(conn, cursor):
    """Close the cursor first, then the connection; falsy arguments are skipped."""
    for resource in (cursor, conn):
        if resource:
            resource.close()

In [9]:
def update_details():
    """Insert the latest per-city figures into the ``details`` table.

    The rows are written only when the feed's update time differs from the
    ``update_time`` of the most recently inserted row (highest ``id``), so
    calling this repeatedly does not store the same snapshot twice.

    Errors are reported by printing the traceback instead of being raised;
    the connection is always closed.
    """
    cursor = None
    conn = None
    try:
        li = get_tencent_data()[1]
        conn, cursor = get_conn()
        sql = 'insert into details(update_time,province,city,confirm,confirm_add,heal,dead) values(%s,%s,%s,%s,%s,%s,%s)'
        # Compare the feed timestamp with the newest stored one. The result is
        # 1 when they match and 0/NULL otherwise (NULL when the table is empty).
        sql_query = 'select %s=(select update_time from details order by id desc limit 1)'
        cursor.execute(sql_query, (li[0][0],))
        if not cursor.fetchone()[0]:
            print(f'{time.asctime()}开始更新最新数据')
            cursor.executemany(sql, li)
            conn.commit()
            print(f'{time.asctime()}更新最新数据完毕')
        else:
            print(f'{time.asctime()}已是最新数据')
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

In [10]:
def insert_history():
    """Insert every date returned by the feed into the ``history`` table.

    Intended for the initial load: each date becomes one row of nine values
    (date, then value/increment pairs for confirm, suspect, heal and dead).
    Increment values missing for a date are stored as NULL.

    Errors are reported by printing the traceback instead of being raised;
    the connection is always closed.
    """
    cursor = None
    conn = None
    try:
        dic = get_tencent_data()[0]
        print(f'{time.asctime()}开始插入历史数据')
        conn, cursor = get_conn()
        sql = 'insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        rows = [
            [k, v.get('confirm'), v.get('confirm_add'),
             v.get('suspect'), v.get('suspect_add'),
             v.get('heal'), v.get('heal_add'),
             v.get('dead'), v.get('dead_add')]
            for k, v in dic.items()
        ]
        cursor.executemany(sql, rows)
        conn.commit()
        print(f'{time.asctime()}插入历史数据完毕')
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

In [11]:
def update_history():
    """Add to the ``history`` table only the dates that are not stored yet.

    For every date returned by the feed, the table is queried first and the
    row is inserted only when no row with that ``ds`` exists, which makes the
    function safe to run repeatedly.

    Errors are reported by printing the traceback instead of being raised;
    the connection is always closed.
    """
    cursor = None
    conn = None
    try:
        dic = get_tencent_data()[0]
        print(f'{time.asctime()}开始更新历史数据')
        conn, cursor = get_conn()
        sql = 'insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        sql_query = 'select confirm from history where ds=%s'
        for k, v in dic.items():
            # Skip dates already present so a re-run never inserts duplicates.
            cursor.execute(sql_query, (k,))
            if cursor.fetchone() is not None:
                continue
            cursor.execute(sql, [k, v.get('confirm'), v.get('confirm_add'),
                                 v.get('suspect'), v.get('suspect_add'),
                                 v.get('heal'), v.get('heal_add'),
                                 v.get('dead'), v.get('dead_add')])
        conn.commit()
        print(f'{time.asctime()}历史数据更新完毕')
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

In [12]:
# Insert the historical data
insert_history()

Thu Nov 19 10:16:05 2020开始插入历史数据
Thu Nov 19 10:16:06 2020插入历史数据完毕


In [13]:
# Read back the first five rows of ``history`` to check what was stored.
# The shared helper is reused so the connection settings live in one place;
# the default cursor returns each row as a tuple.
conn, cursor = get_conn()
try:
    sql = 'select * from history limit 5'
    cursor.execute(sql)
    # Fetch the result set (a read-only query, so there is nothing to commit).
    res = cursor.fetchall()
    print(res)
finally:
    # Release the cursor and connection even if the query fails.
    close_conn(conn, cursor)

((datetime.datetime(2020, 1, 13, 0, 0), 41, None, 0, None, 0, None, 1, None), (datetime.datetime(2020, 1, 14, 0, 0), 41, None, 0, None, 0, None, 1, None), (datetime.datetime(2020, 1, 15, 0, 0), 41, None, 0, None, 5, None, 2, None), (datetime.datetime(2020, 1, 16, 0, 0), 45, None, 0, None, 8, None, 2, None), (datetime.datetime(2020, 1, 17, 0, 0), 62, None, 0, None, 12, None, 2, None))


In [14]:
# Store the latest per-city snapshot in the details table.
update_details()

Thu Nov 19 10:16:07 2020开始更新最新数据
Thu Nov 19 10:16:08 2020更新最新数据完毕


## 爬取热搜数据

In [15]:
from selenium.webdriver import Chrome, ChromeOptions

In [16]:
def get_baidu_hot():
    """Scrape the Baidu hot-search titles with a headless Chrome browser.

    Returns:
        list[str]: the text of every title link found in the hot-search
        table, which is also printed.

    Raises:
        IndexError: if the "more" button is not found on the page.
    """
    # Headless browser
    option = ChromeOptions()
    option.add_argument('--headless')

    url = 'http://top.baidu.com/?fr=mhd_card'
    browser = Chrome(options=option, executable_path='chromedriver.exe')
    try:
        browser.get(url)
        # Locate and click the "more" button
        button = browser.find_elements_by_xpath(
            '//*[@id="main"]/div[1]/div[1]/div[3]/div[1]/a')[0]
        button.click()
        time.sleep(1)  # pause before reading the table that the click opens
        # Collect the hot-search titles
        title_links = browser.find_elements_by_xpath(
            '//*[@id="main"]/div[2]/div/table/tbody/tr/td/a[@class="list-title"]')
        context = [link.text for link in title_links]
    finally:
        # Always end the session so no Chrome/chromedriver process is left
        # running, including when scraping fails part-way.
        browser.quit()
    print(context)
    return context

In [17]:
def update_hotsearch():
    """Insert the current hot-search titles into the ``hotsearch`` table.

    Every title returned by ``get_baidu_hot`` is stored with the same
    timestamp, taken once before the insert.

    Errors are reported by printing the traceback instead of being raised;
    the connection is always closed.
    """
    cursor = None
    conn = None
    try:
        context = get_baidu_hot()
        print(f'{time.asctime()}开始更新热搜数据')
        conn, cursor = get_conn()
        sql = 'insert into hotsearch(dt,content) values(%s,%s)'
        # Four-digit year and explicit H:M:S: '%y' yields an ambiguous
        # two-digit year and '%X' changes with the process locale.
        ts = time.strftime('%Y-%m-%d %H:%M:%S')
        cursor.executemany(sql, [(ts, title) for title in context])
        conn.commit()
        print(f'{time.asctime()}数据更新完毕')
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

In [18]:
# Scrape the hot-search titles and store them in the hotsearch table.
update_hotsearch()

['特朗普政府官员悄悄接触拜登团队', '特斯拉CEO马斯克确诊新冠', '国务院鼓励各地增加号牌指标投放', '洗手间设偷看女性标志 网红店道歉', '普陀山景区天价便饭调查结果', '特朗普团队已申请威斯康星重新计票', '上海杀妻焚尸案受害者家属发声', '周星驰被前女友追讨7000万案开审', '哈尔滨鼓励房企让利销售新房', '秦岭太白山雪后雾凇美景', '外媒:特朗普计划视频出席APEC', '拜登:美国要与他国结盟抗衡中国', '12岁男生被断裂球拍击中身亡', '猪肉价格已连续两个多月下降', '孙正义出售软银800亿美元资产', '辉瑞称其疫苗有效率达95%', '10省份出手根治欠薪', '2.9元买头绳收3元好评返现卡', '上海一公厕15分钟不出来自动报警', '男子服刑22年出狱申诉 案件再审', '秦霄贤妈妈被银行告上法院', '民航局对多个入境航班发熔断指令', '中方回应孟晚舟案关键证人拒出庭', '厦门2份进口牛肉包装核酸阳性', '国家邮政局回应信息泄露', '日本40岁男性成为化妆品市场主力', '赵立坚2个5概括中国疫苗研发进展', '湖南一餐馆发生爆炸 多人受伤', '"黑老大"14年半刑期坐不到7年牢', '河南杀6人嫌犯行凶前视频曝光']
Thu Nov 19 10:16:18 2020开始更新热搜数据
Thu Nov 19 10:16:18 2020数据更新完毕
