In [1]:
import pandas as pd
import os

## Obtain different dataset data.gov

https://catalog.data.gov/dataset

In [2]:
from six.moves import urllib

# where the datasets will be placed
ROOT_DATA = "../../../ROOT_DATA/data_gov/"

def fetch_data_from_URL(housing_url, file_name, sub_dir="tmp", root_path=ROOT_DATA):
    placement_dir = os.path.join(root_path, sub_dir)
    if not os.path.isdir(placement_dir):
        os.makedirs(placement_dir)
    placement_path = os.path.join(placement_dir, file_name)
    # only download if not already present
    if not os.path.isfile(placement_path):
        urllib.request.urlretrieve(housing_url, placement_path)
    return placement_path

In [3]:
# .CSV data
la_csv_path = fetch_data_from_URL("https://data.lacity.org/api/views/nxs9-385f/rows.csv?accessType=DOWNLOAD",
                    "2010.csv", sub_dir="la_cencus")

In [4]:
# read entire file into a dataframe
la_df = pd.read_csv(la_csv_path)

# summmary of dataframe
print(la_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319 entries, 0 to 318
Data columns (total 7 columns):
Zip Code                  319 non-null int64
Total Population          319 non-null int64
Median Age                319 non-null float64
Total Males               319 non-null int64
Total Females             319 non-null int64
Total Households          319 non-null int64
Average Household Size    319 non-null float64
dtypes: float64(2), int64(5)
memory usage: 17.5 KB
None


## Viewing Index (labels) and values

In [5]:
# counts of each datatype
print(la_df.head())

   Zip Code  Total Population  Median Age  Total Males  Total Females  \
0     91371                 1        73.5            0              1   
1     90001             57110        26.6        28468          28642   
2     90002             51223        25.5        24876          26347   
3     90003             66266        26.3        32631          33635   
4     90004             62180        34.8        31302          30878   

   Total Households  Average Household Size  
0                 1                    1.00  
1             12971                    4.40  
2             11731                    4.36  
3             15642                    4.22  
4             22547                    2.73  


## Sum (rows or columns)

In [6]:
# will sum all columns (whether or not it makes sense)
print(la_df.sum())

Zip Code                  29029215.00
Total Population          10603988.00
Median Age                   11652.30
Total Males                5228909.00
Total Females              5375079.00
Total Households           3497698.00
Average Household Size         902.17
dtype: float64


In [7]:
# will sum all values in a row (whether or not it makes sense)
print(la_df.sum(axis=1).head())

0     91448.50
1    217223.00
2    204208.86
3    238207.52
4    236948.53
dtype: float64


## Extracting single columns

In [8]:
# bracket notation
# note: dot notation `la_df.colname` also works for simple
# column names
th_s = la_df['Total Households']
print(th_s.head())

0        1
1    12971
2    11731
3    15642
4    22547
Name: Total Households, dtype: int64


## Extracting multiple columns

In [9]:
# can use as many column names as you'd like and they'll
# be returned in the order you specify
select = ['Zip Code','Total Households']
ziphouse_df = la_df[select]
print(ziphouse_df.head())

   Zip Code  Total Households
0     91371                 1
1     90001             12971
2     90002             11731
3     90003             15642
4     90004             22547


In [10]:
# you can also use the list of the values you want to extract
select = la_df.columns
print(select)

Index(['Zip Code', 'Total Population', 'Median Age', 'Total Males',
       'Total Females', 'Total Households', 'Average Household Size'],
      dtype='object')


In [11]:
# only want the first 3
select = la_df.columns[:3]
three_df = la_df[select]
print(three_df.head())

   Zip Code  Total Population  Median Age
0     91371                 1        73.5
1     90001             57110        26.6
2     90002             51223        25.5
3     90003             66266        26.3
4     90004             62180        34.8


## Adding new columns

In [12]:
# be careful not to overwrite an existing column
# loc == location, 0 index
# column = "name"
# value = value to insert

# example (adding a scalar) 
three_df.insert(1, column="State", value="California")
print(three_df.head())

   Zip Code       State  Total Population  Median Age
0     91371  California                 1        73.5
1     90001  California             57110        26.6
2     90002  California             51223        25.5
3     90003  California             66266        26.3
4     90004  California             62180        34.8


In [13]:
# adding to the end of the columns, and using values from the df
three_df["Dog Years"] = three_df['Median Age']/7
print(three_df.head())

   Zip Code       State  Total Population  Median Age  Dog Years
0     91371  California                 1        73.5  10.500000
1     90001  California             57110        26.6   3.800000
2     90002  California             51223        25.5   3.642857
3     90003  California             66266        26.3   3.757143
4     90004  California             62180        34.8   4.971429


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
print(la_df.head())

   Zip Code  Total Population  Median Age  Total Males  Total Females  \
0     91371                 1        73.5            0              1   
1     90001             57110        26.6        28468          28642   
2     90002             51223        25.5        24876          26347   
3     90003             66266        26.3        32631          33635   
4     90004             62180        34.8        31302          30878   

   Total Households  Average Household Size  
0                 1                    1.00  
1             12971                    4.40  
2             11731                    4.36  
3             15642                    4.22  
4             22547                    2.73  


In [15]:
la_df['Total Households'].value_counts().head()

0        8
2        3
11944    2
9289     2
14038    2
Name: Total Households, dtype: int64

## Example

In [16]:
select = ['Zip Code','Total Males','Total Females']
gender_df = la_df[select]
print(gender_df.head())

   Zip Code  Total Males  Total Females
0     91371            0              1
1     90001        28468          28642
2     90002        24876          26347
3     90003        32631          33635
4     90004        31302          30878


In [17]:
gender_df.insert(3, column="MtoF Ratio", 
                 value=gender_df['Total Males']/gender_df['Total Females'])
print(gender_df.head())

   Zip Code  Total Males  Total Females  MtoF Ratio
0     91371            0              1    0.000000
1     90001        28468          28642    0.993925
2     90002        24876          26347    0.944168
3     90003        32631          33635    0.970150
4     90004        31302          30878    1.013731


In [18]:
print(gender_df.sort_values(by='MtoF Ratio').head())

     Zip Code  Total Males  Total Females  MtoF Ratio
0       91371            0              1    0.000000
180     91046           51            105    0.485714
210     91330         1103           1599    0.689806
90      90263          665            947    0.702218
48      90056         3436           4391    0.782510


In [19]:
print(gender_df.sort_values(by='MtoF Ratio', ascending=False).head())

     Zip Code  Total Males  Total Females  MtoF Ratio
62      90073          506             33   15.333333
166     90822          109              8   13.625000
61      90071           13              2    6.500000
303     93252         3301            875    3.772571
20      90021         2790           1161    2.403101
