In [1]:
# import dependencies 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as sts 
from scipy.stats import linregress

In [2]:
# Load data 
confirmed_xl = 'Resources/tx_confirmed.xlsx'
fatalities_xl = 'Resources/tx_fatalities.xlsx'

# Note: get dataset for county populations  

# APIs - documentation in resources folder
hospital_cap_url = 'https://opendata.arcgis.com/datasets/1044bb19da8d4dbfb6a96eb1b4ebf629_0.geojson'
nursing_homes_url = 'https://data.cms.gov/resource/s2uc-8wxp.json'# requires login and maybe sodapy ?
deaths_county_race = 'https://data.cdc.gov/api/views/k8wy-p9cg'
excess_death_comparisons = 'https://data.cdc.gov/api/views/m74n-4hbs'

# Save in dataframe 
confirmed_tx = pd.read_excel(confirmed_xl)
fatalities_tx = pd.read_excel(fatalities_xl)

In [3]:
# Drop null rows 
confirmed_tx = confirmed_tx.drop([254, 255])

# View data
confirmed_tx

Unnamed: 0,County,New Cases 03-04-2020,New Cases 03-05-2020,New Cases 03-06-2020,New Cases 03-09-2020,New Cases 03-10-2020,New Cases 03-11-2020,New Cases 03-12-2020,New Cases 03-13-2020,New Cases 03-15-2020,...,New Cases 01-25-2021,New Cases 01-26-2021,New Cases 01-27-2021,New Cases 01-28-2021,New Cases 01-29-2021,New Cases 01-30-2021,New Cases 01-31-2021,New Cases 02-01-2021,New Cases 02-02-2021,New Cases 02-03-2021
0,Anderson,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8.0,1.0,32.0,6.0,-2.0,12.0,0.0,93.0,10.0
1,Andrews,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,6.0,0.0
2,Angelina,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,112.0,0.0
3,Aransas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,7.0,5.0,12.0,16.0,4.0,0.0,19.0,4.0
4,Archer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,Wood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,85.0,10.0,29.0,0.0,9.0,0.0,0.0,39.0
250,Yoakum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0
251,Young,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,12.0,6.0,6.0,8.0,6.0,0.0,0.0,20.0,5.0
252,Zapata,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,25.0,1.0,3.0,18.0,3.0,17.0,0.0,21.0,6.0


In [4]:
# Clean up: Delete last row (avoid totaling the total count) / null values & running total columns
fatalities_tx = fatalities_tx[['County Name','Fatalities 02-02-2021']].drop([254,255])

# View data (* data is a running total not additional fatalities per day)
fatalities_tx

Unnamed: 0,County Name,Fatalities 02-02-2021
0,ANDERSON,91
1,ANDREWS,44
2,ANGELINA,225
3,ARANSAS,31
4,ARCHER,11
...,...,...
249,WOOD,98
250,YOAKUM,25
251,YOUNG,37
252,ZAPATA,23


In [6]:
# Get total for each county 
confirmed_tx['Total Confirmed'] = confirmed_tx.sum(axis=1)

# Merge data (to make sure indices match)
merged_df = pd.merge(confirmed_tx, fatalities_tx, on='County', how='right')

KeyError: 'County'

In [None]:
# Merge lists (making sure county names match up)
total_confirmed

In [None]:
## Create dataframe with county name, confirmed count, fatality count, proportion 65+ in population. 

# County list 
county_list = fatalities_tx['County Name']

# Create dictionary 
tx_summary_dict = {'County Name': county_list, 
                  'Confirmed Cases': total_confirmed,
                  'Total Fatalities': total_fatalities}
tx_summary = pd.DataFrame(tx_summary_dict)

# View dataframe
tx_summary

In [None]:
# Add death rate (# deaths / # infections)
fatality_rate = total_fatalities / total_confirmed
tx_summary['Fatalities per Infection'] = fatality_rate.round(2)

# View table
tx_summary

In [None]:
# Print dataset for counties with: highest infection (over popn), highest fatalities (over popn)
counties_highest_fatality = tx_summary.sort_values(['Fatalities per Infection'], ascending=False, ignore_index=True).head(5)
counties_highest_fatality

In [None]:
# Compare infections to fatalities (% recovered vs. % not recovered)


In [None]:
# Excess deaths api - may be useful to chart 


In [None]:
# Hospital capacity compared to death rate - may also be useful to get number of hospitals 


In [None]:
# Compare nursing home data to rest of population (?) 
