# 案例分析 ：美国各州人口数据分析

## 任务1： 首先导入文件,并查看数据样本

In [26]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# 各州的缩写

s_abb = pd.read_csv("../data/state-abbrevs.csv")

In [27]:
# 各州的面积

s_area = pd.read_csv("../data/state-areas.csv")

In [28]:
# 各州的人口

s_pop = pd.read_csv("../data/state-population.csv")

## 任务2：合并 pop 与abbrevs　
### 两个DataFrame 分别依据state/region列 和abbreviation，为了保留所有信息，使用外合并

In [29]:
pop = pd.merge(s_pop, s_abb, left_on = "state/region", right_on = "abbreviation", how = "outer")
pop.head()

Unnamed: 0,state/region,ages,year,population,state,abbreviation
0,AL,under18,2012,1117489.0,Alabama,AL
1,AL,total,2012,4817528.0,Alabama,AL
2,AL,under18,2010,1130966.0,Alabama,AL
3,AL,total,2010,4785570.0,Alabama,AL
4,AL,under18,2011,1125763.0,Alabama,AL


## 任务3：删除重复属性列

In [30]:
pop.drop("abbreviation", axis = 1,inplace = True)

In [31]:
pop.head()

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama


## 任务4：查看存在缺失数据的列

### 使用 isnull().any(),只要某一列存在一个缺失数据，就会显示True

In [32]:
pop.isnull().any()

state/region    False
ages            False
year            False
population       True
state            True
dtype: bool

## 任务5： 查看缺失数据

### 根据数据是否缺失情况显示数据，如果缺失为True， 那么显示

In [33]:
pop[pop["state"].isnull()]

Unnamed: 0,state/region,ages,year,population,state
2448,PR,under18,1990,,
2449,PR,total,1990,,
2450,PR,total,1991,,
2451,PR,under18,1991,,
2452,PR,total,1993,,
2453,PR,under18,1993,,
2454,PR,under18,1992,,
2455,PR,total,1992,,
2456,PR,under18,1994,,
2457,PR,total,1994,,


### 找到哪些state/region 使得state 的值为NaN，使用unqiue查看非重复值

In [34]:
df_lost = pop[pop["state"].isnull()]
df_lost["state/region"].unique()

array(['PR', 'USA'], dtype=object)

## 任务6： 补充缺失数据

### 为找到的这些state/region的state 想填补正确的值，从而去掉state这一列的所有NaN值

In [36]:
s_abb.loc[s_abb["abbreviation"] == "PR", ["state"]]

Unnamed: 0,state


In [37]:
s_abb.shape

(51, 2)

In [38]:
s_area.shape

(52, 2)

In [39]:
pr = pop["state/region"] == "PR"

In [40]:
pop["state"][pr] = "Puerto Rico"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [41]:
pop["state"][pr]

2448    Puerto Rico
2449    Puerto Rico
2450    Puerto Rico
2451    Puerto Rico
2452    Puerto Rico
2453    Puerto Rico
2454    Puerto Rico
2455    Puerto Rico
2456    Puerto Rico
2457    Puerto Rico
2458    Puerto Rico
2459    Puerto Rico
2460    Puerto Rico
2461    Puerto Rico
2462    Puerto Rico
2463    Puerto Rico
2464    Puerto Rico
2465    Puerto Rico
2466    Puerto Rico
2467    Puerto Rico
2468    Puerto Rico
2469    Puerto Rico
2470    Puerto Rico
2471    Puerto Rico
2472    Puerto Rico
2473    Puerto Rico
2474    Puerto Rico
2475    Puerto Rico
2476    Puerto Rico
2477    Puerto Rico
2478    Puerto Rico
2479    Puerto Rico
2480    Puerto Rico
2481    Puerto Rico
2482    Puerto Rico
2483    Puerto Rico
2484    Puerto Rico
2485    Puerto Rico
2486    Puerto Rico
2487    Puerto Rico
2488    Puerto Rico
2489    Puerto Rico
2490    Puerto Rico
2491    Puerto Rico
2492    Puerto Rico
2493    Puerto Rico
2494    Puerto Rico
2495    Puerto Rico
Name: state, dtype: object

In [42]:
pop["state"][pop["state/region"] == "USA"] = "United States"
pop["state"][pop["state/region"] == "USA"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


2496    United States
2497    United States
2498    United States
2499    United States
2500    United States
2501    United States
2502    United States
2503    United States
2504    United States
2505    United States
2506    United States
2507    United States
2508    United States
2509    United States
2510    United States
2511    United States
2512    United States
2513    United States
2514    United States
2515    United States
2516    United States
2517    United States
2518    United States
2519    United States
2520    United States
2521    United States
2522    United States
2523    United States
2524    United States
2525    United States
2526    United States
2527    United States
2528    United States
2529    United States
2530    United States
2531    United States
2532    United States
2533    United States
2534    United States
2535    United States
2536    United States
2537    United States
2538    United States
2539    United States
2540    United States
2541    Un

In [43]:
pop.isnull().any()

state/region    False
ages            False
year            False
population       True
state           False
dtype: bool

## 任务7： 合并各州的面积 area 使用左合并 
## 思考一下为什么使用左合并？

In [44]:
pop.head()

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama


In [45]:
s_area.head()

Unnamed: 0,state,area (sq. mi)
0,Alabama,52423
1,Alaska,656425
2,Arizona,114006
3,Arkansas,53182
4,California,163707


In [47]:
pop2 = pd.merge(pop, s_area, how = "left")

In [48]:
pop2.isnull().any()

state/region     False
ages             False
year             False
population        True
state            False
area (sq. mi)     True
dtype: bool

### 继续寻找存在缺失数据的列

### 发现面积这一列缺失数据，为了找出是哪一行，我们需要找出哪个state没有数据

## 任务8：去除含有的缺失数据

In [50]:
pop3 = pop2.dropna()
pop3.isnull().any()

state/region     False
ages             False
year             False
population       False
state            False
area (sq. mi)    False
dtype: bool

## 任务9：找出2013年的各个州成年人口数据  df.query(查询语句) 

In [51]:
pop3.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423.0
1,AL,total,2012,4817528.0,Alabama,52423.0
2,AL,under18,2010,1130966.0,Alabama,52423.0
3,AL,total,2010,4785570.0,Alabama,52423.0
4,AL,under18,2011,1125763.0,Alabama,52423.0


In [53]:
pop_2013 = pop3.query("year == 2013 & ages == 'total'")
pop_2013.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
9,AL,total,2013,4833722.0,Alabama,52423.0
87,AK,total,2013,735132.0,Alaska,656425.0
103,AZ,total,2013,6626624.0,Arizona,114006.0
185,AR,total,2013,2959373.0,Arkansas,53182.0
199,CA,total,2013,38332521.0,California,163707.0


## 任务10： 对查询结果进行处理，以state列作为新的行索引：set_index

In [61]:
pop_2013.set_index("state", inplace = True)

KeyError: 'state'

## 任务11: 计算2013年每个洲的成年人口密度。注意是Series/Series 其结构还是一个Series

## 任务12：排序，并找:2013年人口密度最高的5个洲

## 任务13：将人口密度merge 到之前的数据中 pop3

### 要点总结：
### （1） 统一用loc（）索引
### （2）善于使用 isnull().any()找到NaN的列
### （3）善于使用unique()确定该列中哪些key是我们需要的
### （4）一般使用外合并、左合并，目的宁可该列是NaN也不要丢弃其他列的信息   

## 任务14： 可视化展示