In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [5]:
# Load the table that maps each state's full name to its abbreviation.
abb = pd.read_csv(filepath_or_buffer='作业数据/state-abbrevs.csv')

In [6]:
# Preview the first five rows of the abbreviation table.
abb.head(5)

Unnamed: 0,state,abbreviation
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


In [8]:
# Load the per-state area table and preview its first five rows.
area = pd.read_csv(filepath_or_buffer='作业数据/state-areas.csv')
area.head(5)

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


In [10]:
# Load the population table (one row per region / age group / year) and
# preview its first five rows.
pop = pd.read_csv(filepath_or_buffer='作业数据/state-population.csv')
pop.head(5)

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


In [11]:
# First check how many state mappings the abb table contains (rows, columns).
abb.shape

(51, 2)

In [13]:
# Example: list the distinct values that occur in a column containing duplicates.
pop["year"].unique()

array([2012, 2010, 2011, 2009, 2013, 2007, 2008, 2005, 2006, 2004, 2003,
       2001, 2002, 1999, 2000, 1998, 1997, 1996, 1995, 1994, 1993, 1992,
       1991, 1990])

In [19]:
# Count the distinct abbreviations: this is how many full-name/abbreviation
# mappings the abb table provides.
len(abb["abbreviation"].unique())

51

In [18]:
# Check, column by column, whether the mapping table contains any missing values.
abb.isna().any()

state           False
abbreviation    False
dtype: bool

In [20]:
# abb will eventually be merged with the population table (pop) and the area
# table (area), so first check whether the number of states appearing in pop
# and area matches the mapping table.
pop.head(5)

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0


In [24]:
# Count the distinct regions in pop: the population table has two more than abb.
len(pop["state/region"].unique())

53

In [26]:
# Count the distinct states in area: the area table has one more than abb.
len(area["state"].unique())

52

In [28]:
# Show a single row of pop to recall its column names before merging.
pop.iloc[:1]

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0


In [29]:
# Show a single row of abb to recall its column names before merging.
abb.iloc[:1]

Unnamed: 0,state,abbreviation
0,Alabama,AL


In [32]:
# Outer-join the population rows to the abbreviation table, matching pop's
# "state/region" codes against abb's "abbreviation" column. The outer join
# keeps rows from either side that have no match.
pop_abb = pop.merge(
    abb,
    how='outer',
    left_on="state/region",
    right_on="abbreviation",
)

In [33]:
# Display the merged population/abbreviation table.
pop_abb

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AK,total,1990,553290.0,Alaska,AK
1,AK,under18,1990,177502.0,Alaska,AK
2,AK,total,1992,588736.0,Alaska,AK
3,AK,under18,1991,182180.0,Alaska,AK
4,AK,under18,1992,184878.0,Alaska,AK
...,...,...,...,...,...,...
2539,WY,under18,1993,137458.0,Wyoming,WY
2540,WY,total,1991,459260.0,Wyoming,WY
2541,WY,under18,1991,136720.0,Wyoming,WY
2542,WY,under18,1990,136078.0,Wyoming,WY


In [38]:
# Find the regions that have population records but no full-name/abbreviation
# mapping. After the outer merge these are exactly the rows whose "state" is
# missing, so test that column only: checking every column would also select
# rows that are merely missing a population value.
bool_list = pop_abb["state"].isnull()  # True marks a row with no matched state name
tem = pop_abb.loc[bool_list]

In [41]:
# A True for state and abbreviation means both columns are entirely empty in tem.
tem.isna().all()

state/region    False
ages            False
year            False
population      False
state            True
abbreviation     True
dtype: bool

In [43]:
# List the region codes that have no full name.
tem.loc[:, "state/region"].unique()

array(['PR', 'USA'], dtype=object)

In [46]:
# The full names for PR and USA can be filled into the combined pop_abb table.
pop_abb

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AK,total,1990,553290.0,Alaska,AK
1,AK,under18,1990,177502.0,Alaska,AK
2,AK,total,1992,588736.0,Alaska,AK
3,AK,under18,1991,182180.0,Alaska,AK
4,AK,under18,1992,184878.0,Alaska,AK
...,...,...,...,...,...,...
2539,WY,under18,1993,137458.0,Wyoming,WY
2540,WY,total,1991,459260.0,Wyoming,WY
2541,WY,under18,1991,136720.0,Wyoming,WY
2542,WY,under18,1990,136078.0,Wyoming,WY


In [51]:
# Fill in the full name for PR.
# Assignment patterns:
#   df.loc[index, column]
#   df.loc[bool_list, column]
pop_abb.loc[pop_abb["state/region"].eq("PR"), "state"] = "Puerto Rico"

In [53]:
# Fill in the full name for the USA rows.
pop_abb.loc[pop_abb["state/region"].eq("USA"), "state"] = "USA"

In [55]:
# Check the missing data: population and abbreviation still have missing values.
# abbreviation carries the same codes as state/region, and state/region has no
# missing values, so the abbreviation column can be dropped.

pop_abb.isna().any()

state/region    False
ages            False
year            False
population       True
state           False
abbreviation     True
dtype: bool

In [56]:
# Drop the redundant abbreviation column. Rebinding the result (instead of
# inplace=True) together with errors="ignore" makes the cell safe to re-run:
# the in-place version raised KeyError on a second run because the column was
# already gone.
pop_abb = pop_abb.drop(columns="abbreviation", errors="ignore")

In [58]:
# Re-check which columns still contain missing values after the drop.
pop_abb.isna().any()

state/region    False
ages            False
year            False
population       True
state           False
dtype: bool

In [63]:
# Number of distinct state names in the merged population table.
len(pop_abb["state"].unique())

53

In [61]:
# Number of distinct state names in the area table.
len(area["state"].unique())

52

In [66]:
# Find which state name appears in pop_abb but not in area: the extra one is USA.
set(pop_abb["state"].unique()).difference(area["state"].unique())

{'USA'}

In [67]:
# Build the combined table. Join on "state" explicitly: without `on=` pandas
# joins on every column name the two frames share, so an unrelated shared
# column would silently change the join key.
total = pd.merge(pop_abb, area, how='outer', on="state")

In [68]:
# Check which columns of the combined table contain missing values.
total.isna().any()

state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)     True
dtype: bool

In [75]:
# Pattern: df.loc[bool_list, column]
# Only USA has a missing area. USA is the nationwide aggregate, so it plays no
# part in per-state density and can be removed.
total.loc[total["area (sq. mi)"].isna(), "state/region"].unique()

array(['USA'], dtype=object)

In [None]:
# Two equivalent ways to remove the rows whose area is missing. These are
# placeholders only and nothing here is executed; as code they raised NameError
# because the bracketed descriptions are not defined names.
#   drop logic:   total.drop([<row labels where area is null>])
#   filter logic: total.loc[<boolean mask of rows where area is not null>]

In [79]:
# Collect the row labels where the area is missing, then drop those rows.
emtpy_indexes = total.index[total["area (sq. mi)"].isna()]
total1 = total.drop(index=emtpy_indexes)

In [80]:
# Confirm that the area column has no missing values after the drop.
total1.isna().any()

state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)    False
dtype: bool

In [82]:
# Second approach: keep only the rows whose area is present.
total2 = total[total["area (sq. mi)"].notna()]

In [83]:
# Confirm that the filtered table has no missing area values.
total2.isna().any()

state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)    False
dtype: bool

In [87]:
# Next, look at which rows are missing population data.
total2[total2["population"].isna()]

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
1872,PR,under18,1990,,Puerto Rico,3515.0
1873,PR,total,1990,,Puerto Rico,3515.0
1874,PR,total,1991,,Puerto Rico,3515.0
1875,PR,under18,1991,,Puerto Rico,3515.0
1876,PR,total,1993,,Puerto Rico,3515.0
1877,PR,under18,1993,,Puerto Rico,3515.0
1878,PR,under18,1992,,Puerto Rico,3515.0
1879,PR,total,1992,,Puerto Rico,3515.0
1880,PR,under18,1994,,Puerto Rico,3515.0
1881,PR,total,1994,,Puerto Rico,3515.0


In [90]:
# These population figures were never recorded at the source, so remove the rows.
# The result is obtained by keeping only rows whose population is present.
result = total2[total2["population"].notna()]

In [91]:
# Confirm that no column of the cleaned table contains missing values.
result.isna().any()

state/region     False
ages             False
year             False
population       False
state            False
area (sq. mi)    False
dtype: bool

In [92]:
# Preview the first five rows of the cleaned table.
result.head(5)

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0


In [None]:
# Filter conditions for the density calculation:
#   year == 2010
#   ages == "total"

In [99]:
# Keep only the all-ages ("total") population rows for the year 2010.
condition = result["year"].eq(2010) & result["ages"].eq("total")
total_2010 = result[condition]

In [102]:
# Same selection as the boolean-mask version, written as a query expression.
total_2010_a = result.query(expr="year==2010&ages=='total'")

In [103]:
# Population density (people per square mile), labelled by the original row index.
total_2010["population"].div(total_2010["area (sq. mi)"])

3         91.287603
91         1.087509
101       56.214497
189       54.948667
197      228.051342
283       48.493718
293      645.600649
379      460.445752
389     8898.897059
475      286.597129
485      163.409902
570      124.746707
581       18.794338
666      221.687472
677      178.197831
762       54.202751
773       34.745266
858      107.586994
869       87.676099
954       37.509990
965      466.445797
1050     621.815538
1061     102.015794
1146      61.078373
1157      61.321530
1242      86.015622
1253       6.736171
1338      23.654153
1349      24.448796
1434     140.799273
1445    1009.253268
1530      16.982737
1541     356.094135
1626     177.617157
1637       9.537565
1722     257.549634
1733      53.778278
1818      39.001565
1829     275.966651
1914    1058.665149
1962     681.339159
1973     144.854594
2058      10.583512
2069     150.825298
2154      93.987655
2213      32.677188
2298      65.085075
2309     187.622273
2394      94.557817
2405      76.519582


In [107]:
# Re-index the 2010 rows by state name so the density is labelled by state.
temp1 = total_2010.set_index(keys="state")

In [110]:
# Population density per state: population divided by area in square miles.
density = temp1["population"].div(temp1["area (sq. mi)"])

In [111]:
# Display the per-state density Series.
density

state
Alabama                   91.287603
Alaska                     1.087509
Arizona                   56.214497
Arkansas                  54.948667
California               228.051342
Colorado                  48.493718
Connecticut              645.600649
Delaware                 460.445752
District of Columbia    8898.897059
Florida                  286.597129
Georgia                  163.409902
Hawaii                   124.746707
Idaho                     18.794338
Illinois                 221.687472
Indiana                  178.197831
Iowa                      54.202751
Kansas                    34.745266
Kentucky                 107.586994
Louisiana                 87.676099
Maine                     37.509990
Maryland                 466.445797
Massachusetts            621.815538
Michigan                 102.015794
Minnesota                 61.078373
Mississippi               61.321530
Missouri                  86.015622
Montana                    6.736171
Nebraska              

In [115]:
# The five entries with the lowest population density.
density.sort_values().iloc[:5]

state
Alaska           1.087509
Wyoming          5.768079
Montana          6.736171
North Dakota     9.537565
South Dakota    10.583512
dtype: float64

In [117]:
# The five entries with the highest population density. `ascending` is a
# boolean flag, so pass False rather than relying on the integer 0 being falsy.
density.sort_values(ascending=False).head(5)

state
District of Columbia    8898.897059
Puerto Rico             1058.665149
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
dtype: float64

In [119]:
#查看是否保存