In [18]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from datetime import datetime

In [19]:
dtypes = {
    'cases': 'float64',
    'deaths': 'float64',
    'population': 'float64',
    'aggregate': 'object',
    'city': 'object',
    'country': 'object',
    'county': 'object',
    'state': 'object',
    'level': 'object'
}

In [20]:
df = dd.read_csv('timeseries.csv', 
                 dtype=dtypes,
                 low_memory=False)

In [21]:
# Convert date column to datetime
df['date'] = dd.to_datetime(df['date'])

# Filter for US states and date range
mask = (df['country'] == 'United States') & \
       (df['level'] == 'state') & \
       (df['date'] >= '2020-01-01') & \
       (df['date'] <= '2021-02-28')

In [22]:
columns_needed = ['state', 'date', 'cases', 'deaths', 'population']
us_states_df = df[mask][columns_needed].compute()

In [23]:
us_states_df = us_states_df.dropna()

# Calculate per-capita mortality
state_metrics = us_states_df.groupby('state').agg({
    'deaths': lambda x: x.iloc[-1] - x.iloc[0],  # Total deaths in period
    'population': 'mean'  # Average population
}).reset_index()

state_metrics['per_capita_mortality'] = state_metrics['deaths'] / state_metrics['population']
state_metrics['mortality_rank'] = state_metrics['per_capita_mortality'].rank(ascending=False)
print(state_metrics.nsmallest(10, 'mortality_rank')[['state', 'mortality_rank', 'per_capita_mortality']])

               state  mortality_rank  per_capita_mortality
30        New Jersey             1.0              0.001712
32          New York             2.0              0.001280
6        Connecticut             3.0              0.001216
21     Massachusetts             4.0              0.001187
41      Rhode Island             5.0              0.000903
51  Washington, D.C.             6.0              0.000791
18         Louisiana             7.0              0.000706
22          Michigan             8.0              0.000623
13          Illinois             9.0              0.000553
20          Maryland            10.0              0.000536


In [24]:
us_states_df['month_year'] = us_states_df['date'].dt.to_period('M')

# Get end of month values for cases and deaths
end_of_month = us_states_df.groupby(['state', 'month_year']).last()[['cases', 'deaths']]
start_of_month = us_states_df.groupby(['state', 'month_year']).first()[['cases', 'deaths']]

# Calculate differences
monthly_metrics = (end_of_month - start_of_month).reset_index()

# Handle potential negative values or zero cases
monthly_metrics['cases'] = monthly_metrics['cases'].clip(lower=1)
monthly_metrics['deaths'] = monthly_metrics['deaths'].clip(lower=0)

# Calculate CFR (Case Fatality Rate)
monthly_metrics['CFR'] = (monthly_metrics['deaths'] / monthly_metrics['cases']) * 100

cfr_matrix = monthly_metrics.pivot(index='state', 
                                 columns='month_year', 
                                 values='CFR').fillna(0)
cfr_changes = cfr_matrix.diff(axis=1).fillna(0)
total_cfr_change = cfr_changes.abs().sum(axis=1)
cfr_change_ranks = total_cfr_change.rank(ascending=False)

In [25]:
print(cfr_change_ranks.nsmallest(10).to_frame('rank'))

                              rank
state                             
Rhode Island                   1.0
United States Virgin Islands   2.0
New Jersey                     3.0
Montana                        4.0
Michigan                       5.0
Missouri                       6.0
Pennsylvania                   7.0
Delaware                       8.0
Connecticut                    9.0
New Hampshire                 10.0
