# AmericaData Prep

# Setup

## Imports

In [2]:
import pandas as pd
import numpy as np

## Parameters

In [3]:
# Raw input CSV (relative path); loaded into `df` with pd.read_csv below.
AMERICA_DATASET = "../../../data/RQ3/raw/AmericaData.csv"

# Destination CSV (relative path) for the per-state diabetes death table saved at the end of the notebook.
PROCESSED_AMERICA_DATASET = "../../../data/RQ3/processed/state_diabetes_death_df.csv"

# Loading the Dataset

In [4]:
# Read the raw CSV into a DataFrame, then show its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=AMERICA_DATASET)
df.shape

(2080, 10)

In [5]:
# Preview the first ten rows of the raw frame.
df.iloc[:10]

Unnamed: 0,measure,location,sex,age,cause,metric,year,val,upper,lower
0,Deaths,United States of America,Both,All Ages,Chronic kidney disease due to diabetes mellitu...,Number,2018,2200.241096,3434.386975,1375.86671
1,Deaths,United States of America,Both,All Ages,Chronic kidney disease due to diabetes mellitu...,Percent,2018,0.000766,0.001195,0.000479
2,Deaths,United States of America,Both,All Ages,Chronic kidney disease due to diabetes mellitu...,Number,2018,31956.368547,39416.322034,24984.750774
3,Deaths,United States of America,Both,All Ages,Chronic kidney disease due to diabetes mellitu...,Percent,2018,0.011133,0.013693,0.008716
4,Deaths,Alaska,Both,All Ages,Interpersonal violence,Number,2018,37.827691,43.129855,33.053277
5,Deaths,Alaska,Both,All Ages,Interpersonal violence,Percent,2018,0.007634,0.008233,0.006977
6,Deaths,United States of America,Both,All Ages,Interpersonal violence,Number,2018,18085.63356,18573.392026,17133.089412
7,Deaths,United States of America,Both,All Ages,Interpersonal violence,Percent,2018,0.0063,0.006465,0.005966
8,Deaths,Arizona,Both,All Ages,Chronic kidney disease due to diabetes mellitu...,Number,2018,590.794301,766.754618,442.170847
9,Deaths,Arizona,Both,All Ages,Chronic kidney disease due to diabetes mellitu...,Percent,2018,0.009657,0.012018,0.007475


In [6]:
# Spot-check a single state: every row whose location contains 'Arizona'.
df.loc[lambda rows: rows['location'].str.contains('Arizona')]

Unnamed: 0,measure,location,sex,age,cause,metric,year,val,upper,lower
8,Deaths,Arizona,Both,All Ages,Chronic kidney disease due to diabetes mellitu...,Number,2018,590.794301,766.754618,442.170847
9,Deaths,Arizona,Both,All Ages,Chronic kidney disease due to diabetes mellitu...,Percent,2018,0.009657,0.012018,0.007475
64,Deaths,Arizona,Both,All Ages,Chronic kidney disease due to diabetes mellitu...,Number,2018,41.744007,65.9275,24.510901
65,Deaths,Arizona,Both,All Ages,Chronic kidney disease due to diabetes mellitu...,Percent,2018,0.000682,0.001064,0.000425
100,Deaths,Arizona,Both,All Ages,Diabetes mellitus,Number,2018,1704.999247,1972.62629,1462.503578
101,Deaths,Arizona,Both,All Ages,Diabetes mellitus,Percent,2018,0.027866,0.029815,0.025615
146,Deaths,Arizona,Both,All Ages,Interpersonal violence,Number,2018,398.961806,467.243424,338.307769
147,Deaths,Arizona,Both,All Ages,Interpersonal violence,Percent,2018,0.006519,0.007074,0.00594
452,YLLs (Years of Life Lost),Arizona,Both,All Ages,Chronic kidney disease due to diabetes mellitu...,Number,2018,1044.848103,1577.748743,613.932625
453,YLLs (Years of Life Lost),Arizona,Both,All Ages,Chronic kidney disease due to diabetes mellitu...,Percent,2018,0.000854,0.001281,0.000516


In [7]:
# List the column labels of the loaded frame.
df.columns

Index(['measure', 'location', 'sex', 'age', 'cause', 'metric', 'year', 'val',
       'upper', 'lower'],
      dtype='object')

In [8]:
# Remove every row whose cause mentions 'kidney' (case-sensitive substring match),
# then report the shape of what remains.
df = df.loc[lambda rows: ~rows['cause'].str.contains('kidney')]
df.shape

(1040, 10)

In [9]:
# Keep only the columns the rest of the notebook uses; the others
# (sex, age, year, upper, lower) are left behind in `df`.
key_cols = ['measure', 'location', 'cause', 'metric', 'val']
df2 = df.loc[:, key_cols]
df2.iloc[:10]

Unnamed: 0,measure,location,cause,metric,val
4,Deaths,Alaska,Interpersonal violence,Number,37.827691
5,Deaths,Alaska,Interpersonal violence,Percent,0.007634
6,Deaths,United States of America,Interpersonal violence,Number,18085.63356
7,Deaths,United States of America,Interpersonal violence,Percent,0.0063
10,Deaths,California,Diabetes mellitus,Number,7011.540842
11,Deaths,California,Diabetes mellitus,Percent,0.025681
12,Deaths,California,Interpersonal violence,Number,2122.337312
13,Deaths,California,Interpersonal violence,Percent,0.007772
14,Deaths,Indiana,Diabetes mellitus,Number,1839.277706
15,Deaths,Indiana,Diabetes mellitus,Percent,0.028074


In [10]:
# Show only the rows whose measure contains 'Deaths'.
df2.loc[lambda rows: rows['measure'].str.contains('Deaths')]

Unnamed: 0,measure,location,cause,metric,val
4,Deaths,Alaska,Interpersonal violence,Number,37.827691
5,Deaths,Alaska,Interpersonal violence,Percent,0.007634
6,Deaths,United States of America,Interpersonal violence,Number,18085.633560
7,Deaths,United States of America,Interpersonal violence,Percent,0.006300
10,Deaths,California,Diabetes mellitus,Number,7011.540842
...,...,...,...,...,...
411,Deaths,Utah,Diabetes mellitus,Percent,0.034213
412,Deaths,Vermont,Diabetes mellitus,Number,159.771998
413,Deaths,Vermont,Diabetes mellitus,Percent,0.025828
414,Deaths,Vermont,Interpersonal violence,Number,11.535084


In [11]:
# National totals: narrow df2 step by step to the United States rows,
# then to the 'Deaths' measure, then to the 'Number' metric.
US_total_df = (
    df2
    .loc[lambda rows: rows['location'].str.contains('United States of America')]
    .loc[lambda rows: rows['measure'].str.contains('Deaths')]
    .loc[lambda rows: rows['metric'].str.contains('Number')]
)
US_total_df

Unnamed: 0,measure,location,cause,metric,val
6,Deaths,United States of America,Interpersonal violence,Number,18085.63356
24,Deaths,United States of America,Diabetes mellitus,Number,75081.298757


### Prepare diabetes death counts by state

In [12]:
# Look at the exact cause string stored in the third row (position 2).
df2['cause'].iat[2]

'Interpersonal violence'

In [13]:
# Build the per-state diabetes death counts from df2.
#
# The cause is selected positively ('Diabetes mellitus') instead of by excluding
# 'Interpersonal violence': with the exclusion form, any other cause present in
# df2 would pass the filter and later be summed into the "Diabetes Deaths"
# column. The remaining categorical fields are compared for equality rather
# than with substring/regex `str.contains`, so only the intended category matches.
state_df = (
    df2
    .loc[
        lambda rows: rows['location'].ne('United States of America')  # drop the national aggregate
        & rows['metric'].eq('Number')                                 # absolute counts, not 'Percent'
        & rows['measure'].eq('Deaths')
        & rows['cause'].eq('Diabetes mellitus')
    ]
    .drop(columns=['measure'])  # constant after filtering, so no longer informative
)
state_df

Unnamed: 0,location,cause,metric,val
10,California,Diabetes mellitus,Number,7011.540842
14,Indiana,Diabetes mellitus,Number,1839.277706
22,Delaware,Diabetes mellitus,Number,242.393749
26,Alabama,Diabetes mellitus,Number,1422.852766
34,Kansas,Diabetes mellitus,Number,668.137047
44,New Jersey,Diabetes mellitus,Number,2177.957953
48,Georgia,Diabetes mellitus,Number,2268.024269
62,West Virginia,Diabetes mellitus,Number,797.195811
86,Maine,Diabetes mellitus,Number,407.820605
88,New York,Diabetes mellitus,Number,3810.14252


In [14]:
# Reduce the state frame to location and value by dropping the 'cause' and 'metric' columns.
state_diabetes_death_df = state_df.drop(columns=['cause', 'metric'])
state_diabetes_death_df

Unnamed: 0,location,val
10,California,7011.540842
14,Indiana,1839.277706
22,Delaware,242.393749
26,Alabama,1422.852766
34,Kansas,668.137047
44,New Jersey,2177.957953
48,Georgia,2268.024269
62,West Virginia,797.195811
86,Maine,407.820605
88,New York,3810.14252


In [15]:
# Give the columns their final names and upper-case the state names.
state_diabetes_death_df = (
    state_diabetes_death_df
    .rename(columns={"location": "State", "val": "Diabetes Deaths"})
    .assign(State=lambda frame: frame['State'].str.upper())
)

state_diabetes_death_df

Unnamed: 0,State,Diabetes Deaths
10,CALIFORNIA,7011.540842
14,INDIANA,1839.277706
22,DELAWARE,242.393749
26,ALABAMA,1422.852766
34,KANSAS,668.137047
44,NEW JERSEY,2177.957953
48,GEORGIA,2268.024269
62,WEST VIRGINIA,797.195811
86,MAINE,407.820605
88,NEW YORK,3810.14252


In [16]:
# Sum the death counts per state; 'State' becomes the index of the result.
state_diabetes_death_df = state_diabetes_death_df.groupby('State').sum()
state_diabetes_death_df.shape

(51, 1)

In [17]:
# Round the death counts to the nearest whole number and store them as integers.
state_diabetes_death_df['Diabetes Deaths'] = (
    state_diabetes_death_df['Diabetes Deaths']
    .round()  # decimals defaults to 0
    .astype(int)
)

In [18]:
# Display the aggregated table (indexed by State after the groupby above).
state_diabetes_death_df

Unnamed: 0_level_0,Diabetes Deaths
State,Unnamed: 1_level_1
ALABAMA,1423
ALASKA,131
ARIZONA,1705
ARKANSAS,867
CALIFORNIA,7012
COLORADO,856
CONNECTICUT,710
DELAWARE,242
DISTRICT OF COLUMBIA,150
FLORIDA,5556


In [19]:
# Preview the first five rows of the final table.
state_diabetes_death_df.head(n=5)

Unnamed: 0_level_0,Diabetes Deaths
State,Unnamed: 1_level_1
ALABAMA,1423
ALASKA,131
ARIZONA,1705
ARKANSAS,867
CALIFORNIA,7012


# Saving the Dataframe

In [20]:
# Write the final table to the processed-data CSV; the State index is written as the first column.
state_diabetes_death_df.to_csv(path_or_buf=PROCESSED_AMERICA_DATASET)