# 美国新冠肺炎疫情预测项目-数据分析以及可视化部分

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import pyecharts.options as opts
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB
from pyecharts.charts import Bar
from pyecharts.charts import Line
from pyecharts.components import Table
from pyecharts.charts import WordCloud
from pyecharts.charts import Pie
from pyecharts.charts import Funnel
from pyecharts.charts import Scatter
from pyecharts.charts import Map
from pyecharts.charts import PictorialBar
from pyecharts.options import ComponentTitleOpts
from pyecharts.globals import SymbolType
from pyecharts.globals import ThemeType

In [2]:
data = pd.read_csv('us-counties.csv')

## 1.数据基本信息与预处理

### 1.1 数据基本信息

In [3]:
data.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


In [4]:
data.columns

Index(['date', 'county', 'state', 'fips', 'cases', 'deaths'], dtype='object')

In [5]:
data.info

<bound method DataFrame.info of               date      county       state     fips  cases  deaths
0       2020-01-21   Snohomish  Washington  53061.0      1       0
1       2020-01-22   Snohomish  Washington  53061.0      1       0
2       2020-01-23   Snohomish  Washington  53061.0      1       0
3       2020-01-24        Cook    Illinois  17031.0      1       0
4       2020-01-24   Snohomish  Washington  53061.0      1       0
...            ...         ...         ...      ...    ...     ...
233735  2020-06-13  Sweetwater     Wyoming  56037.0     40       0
233736  2020-06-13       Teton     Wyoming  56039.0    104       1
233737  2020-06-13       Uinta     Wyoming  56041.0     71       0
233738  2020-06-13    Washakie     Wyoming  56043.0     38       3
233739  2020-06-13      Weston     Wyoming  56045.0      1       0

[233740 rows x 6 columns]>

### 1.2 数据预处理
这里主要分析了数据的缺失情况

In [6]:
'''
显示不同字段的缺失比例
'''
na = [(c, data[c].isna().mean()*100) for c in data]
na = pd.DataFrame(na, columns=["column_name", "percentage"])
na

Unnamed: 0,column_name,percentage
0,date,0.0
1,county,0.0
2,state,0.0
3,fips,1.057585
4,cases,0.0
5,deaths,0.0


从上表可以看出，在所有的数据中，只用`fips`字段存在缺失情况，这个字段与前面的`county`是一一对应的，因此我们使用之前的`country`列的信息来填充这一列中的缺失值

In [7]:
county_list = list(data['county'])
fips_list = list(data['fips'])
county_fips_dict = {}
for i in range(len(county_list)):
    if not np.isnan(fips_list[i]):
        county_fips_dict[county_list[i]] = fips_list[i]
    
for index in data.index:
    if np.isnan(data.loc[index, 'fips']):
        if data.loc[index, 'county'] in county_fips_dict:
            print(county_fips_dict[data.loc[index, 'county']])
            data.loc[index, 'fips'] = county_fips_dict[data.loc[index, 'county']]
        else:
            data.loc[index, 'fips'] = -1

In [8]:
na1 = [(c, data[c].isna().mean()*100) for c in data]
na1 = pd.DataFrame(na1, columns=["column_name", "percentage"])
na1

Unnamed: 0,column_name,percentage
0,date,0.0
1,county,0.0
2,state,0.0
3,fips,0.0
4,cases,0.0
5,deaths,0.0


可以看到，此时数据已经没有缺失值了，`fips`表示的含义之前的`county`一致，因此，实际上我们并没有使用`fips`中的数据

## 2.数据分析以及可视化
这里对数据的可视化以及分析主要有：
- 全美新冠肺炎确诊以及死亡病例日趋势
- 目前全美各州的疫情情况

In [9]:
'''
按照日期统计全美的确诊病例以及死亡病例信息
'''
data_accu = data.groupby(["date"])[["cases", "deaths"]].sum().reset_index()

In [10]:
data_accu.head()

Unnamed: 0,date,cases,deaths
0,2020-01-21,1,0
1,2020-01-22,1,0
2,2020-01-23,1,0
3,2020-01-24,2,0
4,2020-01-25,3,0


### 2.1 全美新冠肺炎确诊病例数量以及死亡病例数量整体趋势分析

In [17]:
bar = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.MACARONS))
    .add_xaxis(list(data_accu['date']))
    .add_yaxis("确诊人数", list(data_accu['cases']), stack="stack1")
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(title_opts=opts.TitleOpts(title="全美确诊病例发展趋势图"))
)
bar.load_javascript()
bar.render_notebook()

  super().__init__(init_opts=init_opts)


In [18]:
bar = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.MACARONS))
    .add_xaxis(list(data_accu['date']))
    .add_yaxis("死亡人数", list(data_accu['deaths']), stack="stack1")
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(title_opts=opts.TitleOpts(title="全美死亡病例发展趋势图"))
)
bar.load_javascript()
bar.render_notebook()

  super().__init__(init_opts=init_opts)


In [19]:
bar = (
    Bar(init_opts=opts.InitOpts(theme=ThemeType.MACARONS))
    .add_xaxis(list(data_accu['date']))
    .add_yaxis("确诊人数", list(data_accu['cases']), stack="stack1")
    .add_yaxis("死亡人数", list(data_accu['deaths']), stack="stack1")
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(title_opts=opts.TitleOpts(title="全美确诊-死亡病例发展趋势图"))
)
bar.load_javascript()
bar.render_notebook()

  super().__init__(init_opts=init_opts)


### 2.2 每日全美确诊病例以及死亡病例数量发展趋势分析

In [20]:
dates = list(data_accu['date'][1:])
cases_per_day = [int(data_accu['cases'][i+1]-data_accu['cases'][i]) for i in range(len(dates))]
deaths_per_day = [int(data_accu['deaths'][i+1]-data_accu['deaths'][i]) for i in range(len(dates))]

In [21]:
line = (
    Line()
    .add_xaxis(dates)
    .add_yaxis(series_name="日确认病例量", 
               y_axis=cases_per_day,
               markpoint_opts=opts.MarkPointOpts(
                   data=[opts.MarkPointItem(type_="max", name="最大值")]
               ),
               markline_opts=opts.MarkLineOpts(
                   data=[
                       opts.MarkLineItem(type_="average", name="平均值"),
                       opts.MarkLineItem(symbol="none", x="90%", y="max"),
                       opts.MarkLineItem(symbol="circle", type_="max", name="最高点"),
                        ]
               ),)
    .set_global_opts(title_opts=opts.TitleOpts(title="全美日确诊病例数量趋势图"))
)

line.load_javascript()
line.render_notebook()

  super().__init__(init_opts=init_opts)


In [22]:
line = (
    Line()
    .add_xaxis(dates)
    .add_yaxis(series_name="日死亡病例量", 
               y_axis=deaths_per_day,
               markpoint_opts=opts.MarkPointOpts(
                   data=[opts.MarkPointItem(type_="max", name="最大值")]
               ),
               markline_opts=opts.MarkLineOpts(
                   data=[
                       opts.MarkLineItem(type_="average", name="平均值"),
                       opts.MarkLineItem(symbol="none", x="90%", y="max"),
                       opts.MarkLineItem(symbol="circle", type_="max", name="最高点"),
                        ]
               ),)
    .set_global_opts(title_opts=opts.TitleOpts(title="全美日确诊病例数量趋势图"))
)

line.load_javascript()
line.render_notebook()

  super().__init__(init_opts=init_opts)


### 2.3 截止目前全美各州新冠肺炎确诊与死亡病例比例情况

In [23]:
last_data = data[data["date"]=='2020-06-13']
last_data = last_data.groupby(["state"])[["cases", "deaths"]].sum().reset_index()
last_data.head()

Unnamed: 0,state,cases,deaths
0,Alabama,24601,773
1,Alaska,722,10
2,Arizona,34773,1190
3,Arkansas,12095,177
4,California,150418,5059


In [24]:
'''
按照确诊病例以及死亡病例数量排序
'''
cases_data_now = last_data.sort_values(by="cases", ascending=False).reset_index()
cases_data_now.head()

Unnamed: 0,index,state,cases,deaths
0,33,New York,387402,30565
1,31,New Jersey,166605,12589
2,4,California,150418,5059
3,14,Illinois,133117,6491
4,22,Massachusetts,105395,7576


In [25]:
deaths_data_now = last_data.sort_values(by="deaths", ascending=False).reset_index()
deaths_data_now.head()

Unnamed: 0,index,state,cases,deaths
0,33,New York,387402,30565
1,31,New Jersey,166605,12589
2,22,Massachusetts,105395,7576
3,14,Illinois,133117,6491
4,40,Pennsylvania,82988,6264


In [32]:
c = (
    Bar()
    .add_xaxis(list(cases_data_now['state'])[9:0:-1])
    .add_yaxis("确诊病例", list(cases_data_now['cases'])[9:0:-1])
    .add_yaxis("死亡病例", list(cases_data_now['deaths'])[9:0:-1])
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position="right"))
    .set_global_opts(title_opts=opts.TitleOpts(title="全美各州确诊-死亡病例对比"))
)
c.load_javascript()
c.render_notebook()

  super().__init__(init_opts=init_opts)


In [35]:
last_data['percent'] = last_data['deaths'] / last_data['cases']
percent_data = last_data.sort_values(by='percent', ascending=False).reset_index()
percent_data[:10]

Unnamed: 0,index,state,cases,deaths,percent
0,6,Connecticut,44994,4186,0.093035
1,23,Michigan,66024,6017,0.091134
2,49,Virgin Islands,72,6,0.083333
3,33,New York,387402,30565,0.078897
4,31,New Jersey,166605,12589,0.075562
5,40,Pennsylvania,82988,6264,0.075481
6,22,Massachusetts,105395,7576,0.071882
7,36,Northern Mariana Islands,30,2,0.066667
8,19,Louisiana,46396,3004,0.064747
9,37,Ohio,40848,2554,0.062524


In [26]:
pbar = (
    PictorialBar()
    .add_xaxis(list(cases_data_now['state'][9::-1]))
    .add_yaxis(
        "",
        list(cases_data_now['cases'][9::-1]),
        label_opts=opts.LabelOpts(is_show=False),
        symbol_size=18,
        symbol_repeat="fixed",
        symbol_offset=[0, 0],
        is_symbol_clip=True,
        symbol=SymbolType.ROUND_RECT,
    )
    .reversal_axis()
    .set_global_opts(
        title_opts=opts.TitleOpts(title="全美各州确诊人数TOP10"),
        xaxis_opts=opts.AxisOpts(is_show=False),
        yaxis_opts=opts.AxisOpts(
            axistick_opts=opts.AxisTickOpts(is_show=False),
            axisline_opts=opts.AxisLineOpts(
                linestyle_opts=opts.LineStyleOpts(opacity=0)
            ),
        ),
    )
)
pbar.load_javascript()
pbar.render_notebook()

  super().__init__(init_opts=init_opts)


In [36]:
pbar = (
    PictorialBar()
    .add_xaxis(list(deaths_data_now['state'][9::-1]))
    .add_yaxis(
        "",
        list(deaths_data_now['deaths'][9::-1]),
        label_opts=opts.LabelOpts(is_show=False),
        symbol_size=18,
        symbol_repeat="fixed",
        symbol_offset=[0, 0],
        is_symbol_clip=True,
        symbol=SymbolType.ROUND_RECT,
    )
    .reversal_axis()
    .set_global_opts(
        title_opts=opts.TitleOpts(title="全美各州死亡人数TOP10"),
        xaxis_opts=opts.AxisOpts(is_show=False),
        yaxis_opts=opts.AxisOpts(
            axistick_opts=opts.AxisTickOpts(is_show=False),
            axisline_opts=opts.AxisLineOpts(
                linestyle_opts=opts.LineStyleOpts(opacity=0)
            ),
        ),
    )
)
pbar.load_javascript()
pbar.render_notebook()

  super().__init__(init_opts=init_opts)


In [39]:
sumCases = last_data['cases'].sum()
sumDeaths = last_data['deaths'].sum()
values = [["Cases", int(sumCases)], ["Deaths", int(sumDeaths)]]
pie = (
    Pie()
    .add("", values)
    .set_global_opts(title_opts=opts.TitleOpts(title="全美病死率"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
pie.load_javascript()
pie.render_notebook()

  super().__init__(init_opts=init_opts)


In [41]:
sumCases = last_data['cases'].sum()
sumDeaths = last_data['deaths'].sum()
values = [list(z) for z in zip(list(cases_data_now['state'][:10]), list(cases_data_now['cases'][:10]))]
# values = [["Cases", int(sumCases)], ["Deaths", int(sumDeaths)]]
pie = (
    Pie()
    .add("", values)
    .set_global_opts(title_opts=opts.TitleOpts(title="全美确诊病例数量TOP10州比例"), legend_opts = opts.LegendOpts(pos_right='0%', orient='vertical'))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
pie.load_javascript()
pie.render_notebook()

  super().__init__(init_opts=init_opts)


In [42]:
values1 = [list(z) for z in zip(list(deaths_data_now['state'][:10]), list(deaths_data_now['deaths'][:10]))]
# values = [["Cases", int(sumCases)], ["Deaths", int(sumDeaths)]]
pie1 = (
    Pie()
    .add("", values1)
    .set_global_opts(title_opts=opts.TitleOpts(title="全美死亡病例TOP10比例"), legend_opts = opts.LegendOpts(pos_right='0%', orient='vertical'))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
pie1.load_javascript()
pie1.render_notebook()

  super().__init__(init_opts=init_opts)


In [43]:
funnel = (
    Funnel()
    .add(
        "State",
        [list(z) for z in zip(list(cases_data_now['state'][-10:]), cases_data_now['cases'][-10:])],
        sort_="ascending",
        label_opts=opts.LabelOpts(position="right"),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="全美各州确诊人数最少TOP10"), legend_opts = opts.LegendOpts(pos_right='0%', orient='vertical'))
)
funnel.load_javascript()
funnel.render_notebook()

  super().__init__(init_opts=init_opts)


In [44]:
funnel = (
    Funnel()
    .add(
        "State",
        [list(z) for z in zip(list(deaths_data_now['state'][-10:]), deaths_data_now['deaths'][-10:])],
        sort_="ascending",
        label_opts=opts.LabelOpts(position="right"),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="全美各州死亡人数最少TOP10"), legend_opts = opts.LegendOpts(pos_right='0%', orient='vertical'))
)
funnel.load_javascript()
funnel.render_notebook()

  super().__init__(init_opts=init_opts)


In [45]:
state_list = list(last_data['state'])
cases_list = list(last_data['cases'])
deaths_list = list(last_data['deaths'])

In [47]:
from pyecharts.datasets import register_url
c = (
    Map()
    .add("确诊人数", [list(z) for z in zip(state_list, cases_list)], "美国")
    .set_global_opts(title_opts=opts.TitleOpts(title="美国各州确诊人数分布"),
                     visualmap_opts=opts.VisualMapOpts(max_=400000),)
    .render('usa-cases.html')
)

# c.load_javascript()
# c.render_notebook()

In [None]:
from pyecharts.datasets import register_url
c = (
    Map()
    .add("确诊人数", [list(z) for z in zip(stateList, deathsList)], "美国")
    .set_global_opts(title_opts=opts.TitleOpts(title="美国各州死亡人数分布"),
                     visualmap_opts=opts.VisualMapOpts(max_=35000),)
    .render('usa-deaths.html')
)