In [None]:
#@title Student Information
# The `#@title` / `#@param` annotations appear to be Colab form markup; each
# assignment below is a plain module-level string.
# NOTE(review): `Login_ID` is empty and `SIS_ID` still holds 'value' --
# confirm whether these are meant to be filled in before submission.
Name = 'Lingxuan Ye' #@param {type:"string"}
Login_ID = '' #@param {type:"string"}
SIS_ID = 'value' #@param {type:"string"}

# The USA COVID-19 Daily State Reports in the IE6600 git repo

The USA COVID-19 Daily State Reports in the JHU COVID-19 git repo https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports_us have been converted into a pandas DataFrame and saved into our course git repo at https://github.com/hr8799/IE6600/blob/main/JHU_COVID-US_Database/JHU_COVID-US.h5. The HDF key value for the converted Pandas DataFrame is `csse_covid_19_data/csse_covid_19_daily_reports_us`. 

# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data Retrieving

## Repo Cloning 

In [None]:
!git clone git@github.com:hr8799/IE6600.git _data/

## Read HDF5

In [None]:
# Name the HDF key explicitly (the one given in the assignment text) so the
# read does not depend on the file containing exactly one pandas object.
HDF_KEY = "csse_covid_19_data/csse_covid_19_daily_reports_us"

data = pd.read_hdf("_data/JHU_COVID-US_Database/JHU_COVID-US.h5", key=HDF_KEY)
data

# Problem 1. Case_Fatality_Ratio of Massachusetts and California (2 percentage points)

Create a line plot showing the Case_Fatality_Ratios of Massachusetts and California in the same plot. 

**Requirements**

1. The two lines should be plotted together.
2. A legend should be created showing which line is for which state.
3. The `Last_Update` column in the DataFrame should be mapped to the x-axis and its values should be sorted in the ascending order.
4. The `Case_Fatality_Ratio` column in the DataFrame should be mapped to the y-axis.
5. x-axis and y-axis labels should be created.
6. A plot title should be created.

**Hints**

1. `pandas.DataFrame.sort_values` can be used to sort a DataFrame by one or more columns

In [None]:
# Columns mapped to the x-axis and y-axis, in that order.
COLUMNS = ["Last_Update", "Case_Fatality_Ratio"]

# One sub-frame per state, sorted by `Last_Update` in ascending order so each
# line is drawn chronologically.
groups = data.groupby("Province_State")
data_MA = groups.get_group("Massachusetts")[COLUMNS].sort_values("Last_Update")
data_CA = groups.get_group("California")[COLUMNS].sort_values("Last_Update")

fig, ax = plt.subplots()

fig.set_size_inches(16, 8)

ax.set_title("Case_Fatality_Ratios of Massachusetts and California")
ax.set_xlabel(COLUMNS[0])
ax.set_ylabel(COLUMNS[1])

# Label every line with its state so the legend tells the two curves apart
# (the axis column names are not meaningful legend entries).
line_MA, = ax.plot(data_MA[COLUMNS[0]], data_MA[COLUMNS[1]], label="Massachusetts")
line_CA, = ax.plot(data_CA[COLUMNS[0]], data_CA[COLUMNS[1]], label="California")

ax.legend()

plt.show()

# Problem 2. Distributions of the average Case_Fatality_Ratios after 04/01/2022 and before 01/01/2021 (2 percentage points)

Create a plot consisting of two histogram subplots showing the distributions of the state-wise average Case_Fatality_Ratios after 04/01/2022 and before 01/01/2021

**Notes**

1. Use the `Last_Update` column to filter out data based on time
2. The state-wise average Case_Fatality_Ratios should include those of the US territories and DC: 'American Samoa', 'Diamond Princess', 'District of Columbia', 'Grand Princess', 'Guam', 'Northern Mariana Islands', 'Puerto Rico', 'Recovered', 'Virgin Islands'

**Requirements**

1. Use 8 bins for the histograms.
2. The two histogram subplots should have the same x-axis range
3. The two histogram subplots should occupy two rows of the plot
4. A plot title should be created for each subplot.
5. x-axis and y-axis labels should be created for each subplot.

**Hints**
1. Use `pandas.to_datetime()` to convert a str into a pandas Timestamp so that you can compare it with values in the `Last_Update` column
2. The generated plot should look similar to this: https://github.com/hr8799/IE6600/blob/main/JHU_COVID-US_Database/HW3_Problem2.png

In [None]:
# Columns needed to compute the state-wise mean Case_Fatality_Ratio.
CFR_COLUMNS = ["Province_State", "Case_Fatality_Ratio"]

# State-wise mean Case_Fatality_Ratio over the rows strictly after / strictly
# before the cut-off dates. Both results are single-column DataFrames indexed
# by `Province_State`.
after = (
    data.loc[data["Last_Update"] > pd.to_datetime("04/01/2022"), CFR_COLUMNS]
    .groupby("Province_State")
    .mean()
)

before = (
    data.loc[data["Last_Update"] < pd.to_datetime("01/01/2021"), CFR_COLUMNS]
    .groupby("Province_State")
    .mean()
)

# Two rows, one histogram per period.
fig, (ax_after, ax_before) = plt.subplots(2)

fig.set_size_inches(16, 16)

# Shared x-range spanning both periods so the two histograms are comparable.
# The column is selected by label: integer keys such as `.min()[0]` on a
# label-indexed Series are deprecated positional lookups in recent pandas.
range_ = (
    min(after["Case_Fatality_Ratio"].min(), before["Case_Fatality_Ratio"].min()),
    max(after["Case_Fatality_Ratio"].max(), before["Case_Fatality_Ratio"].max()),
)

# Same styling for both panels; only the data and the period text differ.
for panel, means, period in (
    (ax_after, after, "after 04/01/2022"),
    (ax_before, before, "before 01/01/2021"),
):
    panel.hist(means["Case_Fatality_Ratio"], bins=8, range=range_)
    panel.set_title(f"Distribution of State-wise Case Fatality Ratio {period}")
    # 9 ticks == the edges of the 8 bins.
    panel.set_xticks(np.linspace(*range_, num=9))
    panel.set_xlabel("Case Fatality Ratio (%)")
    panel.set_ylabel("Counts")

plt.show()

# Problem 3. Distributions of the average Case_Fatality_Ratios after 04/01/2022 according to party affiliations (2 percentage points)

Create a single plot showing two histograms of the state-wise average Case_Fatality_Ratios after 04/01/2022, one for states that voted for a Republican Governor in the 2020 election and one for states that voted for a Democratic Governor in the 2020 election. The party affiliations of the states can be retrieved from the `Governor` column under the section `U.S. state party control as of January 2022` in this wiki page: https://en.wikipedia.org/wiki/Political_party_strength_in_U.S._states


**Notes**

1. The state-wise average Case_Fatality_Ratios should **NOT** include those of the US territories and DC because there were no Governors elected for them in the 2020 presidential election

**Requirements**

1. Use 8 bins for the histograms.
2. The two histograms need to be plotted in the same plot
3. The histogram on the front layer should be semi-transparent so that it does not completely block part of the histogram on the back layer
4. Party affiliations can only be extracted from the provided [wiki page](https://en.wikipedia.org/wiki/Political_party_strength_in_U.S._states)
5. A plot title should be created.
6. x-axis and y-axis labels should be created.
7. A legend should be created
8. The histogram for states that voted for a Republican Governor should be plotted in red, while the histogram for states that voted for a Democratic Governor should be plotted in blue

**Hints**
1. Use `pandas.to_datetime()` to convert a str into a pandas Timestamp so that you can compare it with values in the `Last_Update` column
2. The generated plot should look similar to this: https://github.com/hr8799/IE6600/blob/main/JHU_COVID-US_Database/HW3_Problem3.png

In [None]:
# `Province_State` entries that Problem 3 leaves out (listed in the problem
# statement as the US territories and DC).
EXCLUDES = [
    'American Samoa',
    'Diamond Princess',
    'District of Columbia',
    'Grand Princess',
    'Guam',
    'Northern Mariana Islands',
    'Puerto Rico',
    'Recovered',
    'Virgin Islands',
]

# Keep only the rows of `after` whose index label is not in EXCLUDES.
is_kept = ~after.index.isin(EXCLUDES)
after_ = after[is_kept]

# Parse every HTML table on the wiki page, then keep two columns of one table.
# NOTE(review): table index 5 is hard-coded; presumably it is the
# "U.S. state party control as of January 2022" table -- verify if the page
# layout changes.
wiki_tables = pd.read_html(
    "https://en.wikipedia.org/wiki/Political_party_strength_in_U.S._states"
)
affiliations = wiki_tables[5][["State", "Governor"]]

groups = affiliations.groupby("Governor")

# Sanity check: show the per-party summary to confirm that only the
# Democratic and Republican parties are present, which may not always be true.
groups.describe()

In [None]:
# Split the state-wise means by the party of each state's Governor.
democratic = after_[after_.index.isin(groups.get_group("Democratic")["State"])]
republican = after_[after_.index.isin(groups.get_group("Republican")["State"])]

fig, ax = plt.subplots()

fig.set_size_inches(16, 8)

# Both histograms share the range of all retained states so their bins line
# up. The column is selected by label: integer keys such as `.min()[0]` on a
# label-indexed Series are deprecated positional lookups in recent pandas.
range_ = (
    after_["Case_Fatality_Ratio"].min(),
    after_["Case_Fatality_Ratio"].max(),
)

# The trailing space on the first literal keeps the two concatenated parts
# from running together ("...04/01/2022by Party...").
ax.set_title(
    "Distribution of State-wise Case Fatality Ratio after 04/01/2022 "
    "by Party Affiliation of the Governor"
)
# 9 ticks == the edges of the 8 bins.
ax.set_xticks(np.linspace(*range_, num=9))
ax.set_xlabel("Case Fatality Ratio (%)")
ax.set_ylabel("Counts")

# Back layer: fully opaque.
ax.hist(
    democratic["Case_Fatality_Ratio"],
    bins=8, range=range_, alpha=1, color="b", label="Democratic",
)

# Front layer: partially transparent so the back layer stays visible.
# If "semi" is meant as 50% transparency, set `alpha` to 0.5 (ugly though).
ax.hist(
    republican["Case_Fatality_Ratio"],
    bins=8, range=range_, alpha=0.8, color="r", label="Republican",
)

ax.legend()

plt.show()