In [1]:
import pandas as pd
import os

## Obtain datasets from data.gov

https://catalog.data.gov/dataset

In [2]:
from six.moves import urllib

# where the datasets will be placed
ROOT_DATA = "../../../ROOT_DATA/data_gov/"

def fetch_data_from_URL(housing_url, file_name, sub_dir="tmp", root_path=ROOT_DATA):
    """Download a file once and return the local path it is cached at.

    The file is stored at ``root_path/sub_dir/file_name``. If a file already
    exists at that location nothing is downloaded and the existing path is
    returned.

    Parameters
    ----------
    housing_url : str
        URL of the resource to download (any URL ``urlretrieve`` accepts).
    file_name : str
        Name to give the downloaded file inside the placement directory.
    sub_dir : str, optional
        Sub-directory of ``root_path`` to place the file in (default ``"tmp"``).
    root_path : str, optional
        Root directory for downloaded datasets (default ``ROOT_DATA``).

    Returns
    -------
    str
        Path of the (possibly pre-existing) local file.

    Raises
    ------
    Exception
        Whatever ``urlretrieve`` raises on a failed download is propagated;
        in that case no file is left at the returned location.
    """
    placement_dir = os.path.join(root_path, sub_dir)
    if not os.path.isdir(placement_dir):
        os.makedirs(placement_dir)
    placement_path = os.path.join(placement_dir, file_name)
    # only download if not already present
    if not os.path.isfile(placement_path):
        # Download to a temporary sibling first: writing straight to
        # placement_path would leave a truncated file behind if the transfer
        # is interrupted, and the isfile() check above would then treat that
        # corrupt file as a valid cached download on every later call.
        partial_path = placement_path + ".part"
        try:
            urllib.request.urlretrieve(housing_url, partial_path)
            # placement_path was just checked to be absent, so a plain rename
            # is sufficient to publish the completed download.
            os.rename(partial_path, placement_path)
        finally:
            # After a successful rename the partial file no longer exists;
            # after a failure this removes whatever was written so far.
            if os.path.isfile(partial_path):
                os.remove(partial_path)
    return placement_path

In [3]:
# NCHS cause-of-death table, fetched as CSV and stored under the "NCHS" sub-directory
cod_csv_path = fetch_data_from_URL(
    housing_url="https://data.cdc.gov/api/views/bi63-dtpu/rows.csv?accessType=DOWNLOAD",
    file_name="cause_of_death.csv",
    sub_dir="NCHS",
)

In [4]:
# read entire file into a dataframe
cod_df = pd.read_csv(cod_csv_path)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emits a stray "None" line after the report.
cod_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15028 entries, 0 to 15027
Data columns (total 6 columns):
Year                       15028 non-null int64
113 Cause Name             15028 non-null object
Cause Name                 15028 non-null object
State                      15028 non-null object
Deaths                     15013 non-null float64
Age-adjusted Death Rate    14917 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 704.5+ KB
None


In [5]:
# Pull the 'Deaths' column out as a Series; the info() report above shows it
# has fewer non-null entries than rows, i.e. it contains NaN values.
d_ds = cod_df['Deaths']
print(d_ds.head())

0    2313.0
1     294.0
2    2214.0
3    1287.0
4    9198.0
Name: Deaths, dtype: float64


In [6]:
# len() counts every row, while Series.count() counts only non-null entries;
# the difference between the two is the number of missing values.
print(len(d_ds)) # will count NaN
print(d_ds.count()) # excludes NaN

15028
15013


In [7]:
# Number of missing values; must equal len(d_ds) - d_ds.count().
print(d_ds.isnull().sum()) # should be 15

15


In [8]:
# Series.sum() skips NaN by default (skipna=True), so missing rows contribute nothing.
print(d_ds.sum()) # adds all values in series

153622150.0


In [9]:
# mean() skips NaN, so it divides the sum by the number of non-null values.
print(d_ds.mean()) # average
print(d_ds.sum()/d_ds.count()) # equivalent to average

# len() also counts the NaN rows, so the denominator is too large here.
print(d_ds.sum()/len(d_ds)) # notice that this is different!

10232.608406
10232.608406
10222.3948629


In [10]:
# Sample standard deviation (pandas normalizes by N-1 by default, ddof=1); NaN is skipped.
print(d_ds.std())

90032.6104217


In [11]:
# Smallest and largest non-null values in the series.
print(d_ds.min())
print(d_ds.max())

10.0
2712630.0


In [12]:
# Median of the non-null values; compare with mean() to gauge how skewed the data is.
print(d_ds.median())

838.0


In [13]:
# mode() returns a Series rather than a scalar, because several values can
# tie for most frequent.
print(d_ds.mode())

0     23.0
1     74.0
2     86.0
3    133.0
dtype: float64


In [14]:
# using value_counts
# value_counts() sorts by frequency (descending), so the modal values from
# mode() should appear at the top with equal counts.
d_ds.value_counts().head(6) # validating mode output

133.0    25
74.0     25
86.0     25
23.0     25
52.0     24
109.0    24
Name: Deaths, dtype: int64

In [15]:
# describe() gathers count, mean, std, min, quartiles and max in a single call.
print(d_ds.describe())

count    1.501300e+04
mean     1.023261e+04
std      9.003261e+04
min      1.000000e+01
25%      2.940000e+02
50%      8.380000e+02
75%      2.737000e+03
max      2.712630e+06
Name: Deaths, dtype: float64


## Index max and min

In [16]:
# idxmax() returns the index label of the first occurrence of the maximum value.
print(d_ds.idxmax())

1760


In [17]:
# validate: looking up the label returned by idxmax() must give back max()
print(d_ds.max())
print(d_ds.get(d_ds.idxmax()))

2712630.0
2712630.0


In [18]:
# idxmin() returns the index label of the first occurrence of the minimum value.
print(d_ds.idxmin())

2755


In [19]:
# validate: looking up the label returned by idxmin() must give back min()
print(d_ds.min())
print(d_ds.get(d_ds.idxmin()))

10.0
10.0
