# 4 数据清洗

## 4.1 合并数据集

### 4.1.1 merge()
**通过键将数据连接起来, 操作仅在axis=1方向上进行**

**一对多**

In [43]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from numpy import arange, random

In [44]:
# Left table: the key values repeat ("a" x3, "b" x3, "c" x1), i.e. the
# "many" side of the one-to-many join.
df1 = DataFrame(
    {
        "key": ["b", "b", "a", "c", "a", "a", "b"],
        "data1": arange(7),
    }
)
# Right table: every key occurs exactly once, i.e. the "one" side.
df2 = DataFrame({"key": ["a", "b", "d"], "data2": arange(3)})
# Same keys as df2, plus a "data1" column whose name overlaps with df1.
df3 = DataFrame(
    {
        "key": ["a", "b", "d"],
        "data1": arange(3),
        "data2": arange(5, 8),
    }
)

In [45]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [46]:
s1 = df1["key"].value_counts()
s1

a    3
b    3
c    1
Name: key, dtype: int64

In [47]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [48]:
s2 = df2["key"].value_counts()
s2

b    1
d    1
a    1
Name: key, dtype: int64

In [49]:
df3

Unnamed: 0,key,data1,data2
0,a,0,5
1,b,1,6
2,d,2,7


In [50]:
s3 = df3["key"].value_counts()
s3

b    1
d    1
a    1
Name: key, dtype: int64

In [51]:
# Use "key" as the row index and sort by it so that equal keys are grouped
# together. (The result has a single-level index, not a hierarchical one.)
df1.set_index(["key"]).sort_index(level=0)

Unnamed: 0_level_0,data1
key,Unnamed: 1_level_1
a,2
a,4
a,5
b,0
b,1
b,6
c,3


In [52]:
pd.merge(df1, df2, on="key")  # 用key列做连接

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [53]:
s1 * s2

a    3.0
b    3.0
c    NaN
d    NaN
Name: key, dtype: float64

In [54]:
pd.merge(df1, df3, on="key", sort=False)

Unnamed: 0,key,data1_x,data1_y,data2
0,b,0,1,6
1,b,1,1,6
2,b,6,1,6
3,a,2,0,5
4,a,4,0,5
5,a,5,0,5


In [55]:
s1 * s3

a    3.0
b    3.0
c    NaN
d    NaN
Name: key, dtype: float64

In [56]:
pd.merge(df2, df3, on="key")

Unnamed: 0,key,data2_x,data1,data2_y
0,a,0,0,5
1,b,1,1,6
2,d,2,2,7


In [57]:
s2 * s3

b    1
d    1
a    1
Name: key, dtype: int64

**总结**  
1. merge()是按键连接
2. merge()的结果是键相同的行之间的笛卡尔乘积（每个键的结果行数 = 左表该键的出现次数 × 右表该键的出现次数，可用上面的 value_counts 乘积验证）
3. merge()仅在row方向（即axis=1，横向拼接列）进行
4. 除键外，重叠的列都保留在结果中，并自动加上后缀（默认为_x和_y）

**merge()默认做的是inner连接，结果取的是键的交集，因此不会因为键不匹配而产生NaN值**

In [58]:
pd.merge(df1, df2, on="key", how="inner")  # 默认：交集 

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [59]:
pd.merge(df1, df2, how="outer")  # outer：合集

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [60]:
pd.merge(df1, df2, on="key", how="left")  # 保留左侧数据值及顺序

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,1,1.0
2,a,2,0.0
3,c,3,
4,a,4,0.0
5,a,5,0.0
6,b,6,1.0


In [61]:
pd.merge(df1, df2, how="right")  # 保留右侧数据值及顺序

Unnamed: 0,key,data1,data2
0,b,0.0,1
1,b,1.0,1
2,b,6.0,1
3,a,2.0,0
4,a,4.0,0
5,a,5.0,0
6,d,,2


**针对列名不同的列，可以分别指定连接的键**

In [64]:
# The join column has a different name on each side: "lkey" vs. "rkey".
df3 = DataFrame(
    {
        "lkey": ["b", "b", "a", "c", "a", "a", "b"],
        "data1": arange(7),
    }
)
df4 = DataFrame(
    {
        "rkey": ["a", "b", "d"],
        "data2": arange(3),
    }
)

In [65]:
pd.merge(df3, df4, left_on="lkey", right_on="rkey")

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


**多对多**

In [66]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from numpy import arange, random

In [67]:
# Many-to-many setup: "a" and "b" repeat on BOTH sides of the join.
df1 = DataFrame(
    {
        "key": ["b", "b", "a", "c", "a", "a", "b"],
        "data1": arange(7),
    }
)
df2 = DataFrame(
    {
        "key": ["a", "b", "a", "b", "d"],
        "data2": arange(5),
    }
)

In [68]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [69]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [70]:
pd.merge(df1, df2, on="key")

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,6,1
5,b,6,3
6,a,2,0
7,a,2,2
8,a,4,0
9,a,4,2


In [71]:
pd.merge(df1, df2, on="key", how="left")

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,a,5,0.0


In [99]:
# Two tables that are joined on the pair (key1, key2). They are written
# row by row here so each (key1, key2, value) record is visible at a glance.
df3 = DataFrame(
    [
        ("foo", "one", 1),
        ("foo", "two", 2),
        ("bar", "one", 3),
    ],
    columns=["key1", "key2", "lval"],
)
df4 = DataFrame(
    [
        ("foo", "one", 4),
        ("foo", "one", 5),
        ("foo", "one", 6),
        ("bar", "two", 7),
        ("bar", "three", 8),
        ("foo", "three", 9),
    ],
    columns=["key1", "key2", "rval"],
)

In [100]:
df3

Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3


In [101]:
key3 = df3["key1"] + "_" + df3["key2"]
s3 = key3.value_counts()
s3

foo_one    1
foo_two    1
bar_one    1
dtype: int64

In [102]:
df4

Unnamed: 0,key1,key2,rval
0,foo,one,4
1,foo,one,5
2,foo,one,6
3,bar,two,7
4,bar,three,8
5,foo,three,9


In [103]:
key4 = df4["key1"] + "_" + df4["key2"]
s4 = key4.value_counts()
s4

foo_one      3
foo_three    1
bar_two      1
bar_three    1
dtype: int64

In [104]:
pd.merge(df3, df4, on=["key1", "key2"], how="outer")

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,one,1.0,6.0
3,foo,two,2.0,
4,bar,one,3.0,
5,bar,two,,7.0
6,bar,three,,8.0
7,foo,three,,9.0


In [108]:
s3 * s4

bar_one      NaN
bar_three    NaN
bar_two      NaN
foo_one      3.0
foo_three    NaN
foo_two      NaN
dtype: float64

**suffixes处理重叠列名**

In [83]:
pd.merge(df3, df4, on="key1")

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,one,1,one,6
3,foo,two,2,one,4
4,foo,two,2,one,5
5,foo,two,2,one,6
6,bar,one,3,two,7
7,bar,one,3,three,8


In [84]:
pd.merge(df3, df4, on="key1", suffixes=("_left", "_right"))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,one,1,one,6
3,foo,two,2,one,4
4,foo,two,2,one,5
5,foo,two,2,one,6
6,bar,one,3,two,7
7,bar,one,3,three,8


**处理大数据集时，禁用sort会带来更好的性能（较新版本的pandas中，merge()的sort参数默认就是False）**

In [86]:
pd.merge(df3, df4, on="key1", suffixes=("_left", "_right"), sort=False)

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,one,1,one,6
3,foo,two,2,one,4
4,foo,two,2,one,5
5,foo,two,2,one,6
6,bar,one,3,two,7
7,bar,one,3,three,8


### 4.1.2 merge()：按索引连接
**通过索引将数据连接起来**

In [109]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from numpy import arange, random

In [110]:
# left1 carries the join key as an ordinary column ...
left1 = DataFrame(
    {"key": ["a", "b", "a", "a", "b", "c"], "value": arange(6)}
)
# ... while right1 carries it as its row index ("a", "b").
right1 = DataFrame({"group_val": [3.5, 7.0]}, index=["a", "b"])

In [111]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [112]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [113]:
pd.merge(left1, right1, left_on="key", right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [114]:
pd.merge(left1, right1, left_on="key", right_index=True, how="outer")

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


**层次化数据**

In [115]:
# Left table: the two join keys (state, year) are ordinary columns.
lefth = DataFrame(
    {
        "key1": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada"],
        "key2": list(range(2000, 2005)),
        "data": arange(5),
    }
)
# Right table: the same two keys form a two-level row index instead.
righth = DataFrame(
    data=arange(12).reshape(6, 2),
    index=[
        ["Nevada", "Nevada", "Ohio", "Ohio", "Ohio", "Ohio"],
        [2000, 2000, 2000, 2000, 2001, 2002],
    ],
    columns=["event1", "event2"],
)

In [116]:
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2000,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [117]:
lefth

Unnamed: 0,key1,key2,data
0,Ohio,2000,0
1,Ohio,2001,1
2,Ohio,2002,2
3,Nevada,2003,3
4,Nevada,2004,4


In [106]:
pd.merge(lefth, righth, left_on=["key1", "key2"], right_index=True, how="outer")

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0.0,4.0,5.0
0,Ohio,2000,0.0,6.0,7.0
1,Ohio,2001,1.0,8.0,9.0
2,Ohio,2002,2.0,10.0,11.0
3,Nevada,2003,3.0,,
4,Nevada,2004,4.0,,
4,Nevada,2000,,0.0,1.0
4,Nevada,2000,,2.0,3.0


**同时使用双方索引**

In [119]:
# Move both key columns of lefth into a two-level row index.
lefth1 = lefth.set_index(["key1", "key2"])
# Work on a copy of righth (presumably so that naming the index levels below
# leaves righth itself untouched).
righth1 = righth.copy()
# Name the two index levels "key1"/"key2", the same names lefth1's index
# levels get from set_index above.
righth1.index.names = ["key1", "key2"]

In [120]:
righth1

Unnamed: 0_level_0,Unnamed: 1_level_0,event1,event2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
Nevada,2000,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [121]:
lefth1

Unnamed: 0_level_0,Unnamed: 1_level_0,data
key1,key2,Unnamed: 2_level_1
Ohio,2000,0
Ohio,2001,1
Ohio,2002,2
Nevada,2003,3
Nevada,2004,4


In [122]:
pd.merge(lefth1, righth1, left_index=True, right_index=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,data,event1,event2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ohio,2000,0,4,5
Ohio,2000,0,6,7
Ohio,2001,1,8,9
Ohio,2002,2,10,11


**总结**  
![merge](https://github.com/JacobWongUED/DataScience-100-Days/blob/master/01_%E5%88%A9%E7%94%A8Python%E8%BF%9B%E8%A1%8C%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90/img/merge.png?raw=true)

### 4.1.3 join()
**按照索引进行连接；如果两者有重叠的列名，必须通过lsuffix/rsuffix指定后缀，否则会抛出ValueError**  
**同merge()一样，join()仅在row方向进行**

In [133]:
lefth1

Unnamed: 0_level_0,Unnamed: 1_level_0,data
key1,key2,Unnamed: 2_level_1
Ohio,2000,0
Ohio,2001,1
Ohio,2002,2
Nevada,2003,3
Nevada,2004,4


In [135]:
righth1

Unnamed: 0_level_0,Unnamed: 1_level_0,event1,event2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
Nevada,2000,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [121]:
lefth1.join(righth1, how="outer")

Unnamed: 0_level_0,Unnamed: 1_level_0,data,event1,event2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nevada,2000,,0.0,1.0
Nevada,2000,,2.0,3.0
Nevada,2003,3.0,,
Nevada,2004,4.0,,
Ohio,2000,0.0,4.0,5.0
Ohio,2000,0.0,6.0,7.0
Ohio,2001,1.0,8.0,9.0
Ohio,2002,2.0,10.0,11.0


In [144]:
# Two float tables with partly overlapping row labels ("c", "e") and one
# overlapping column name ("Ohio").
left = DataFrame(
    {"Ohio": [1.0, 3.0, 5.0], "Nevada": [2.0, 4.0, 6.0]},
    index=["a", "c", "e"],
)
right = DataFrame(
    {"Ohio": [7.0, 9.0, 11.0, 13.0], "Alabama": [8.0, 10.0, 12.0, 14.0]},
    index=["b", "c", "d", "e"],
)

In [145]:
left

Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [146]:
right

Unnamed: 0,Ohio,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [153]:
left.join(right, how="outer", lsuffix = "_left", rsuffix="_right")

Unnamed: 0,Ohio_left,Nevada,Ohio_right,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


**join()支持索引和列之间的连接**

In [154]:
# left keeps the join key in its "key" column; right keeps the same labels
# ("one", "two") in its row index.
left = DataFrame(
    data={
        "key": ["one", "one", "two", "two"],
        "value": arange(4),
    }
)
right = DataFrame(
    data={"area": ["Nevada", "Ohio"]},
    index=["one", "two"],
)

In [155]:
left

Unnamed: 0,key,value
0,one,0
1,one,1
2,two,2
3,two,3


In [156]:
right

Unnamed: 0,area
one,Nevada
two,Ohio


In [172]:
left.join(right, on="key")

Unnamed: 0,key,value,area
0,one,0,Nevada
1,one,1,Nevada
2,two,2,Ohio
3,two,3,Ohio


**可以向join()传入一组DataFrame（此时只能按索引连接）；需要按列（on）连接多个DataFrame时，可以像下面这样链式调用join()**

In [168]:
another = DataFrame(data={"year": [2000, 2001]}, index=["one", "two"])
another

Unnamed: 0,year
one,2000
two,2001


In [170]:
left.join(right, on="key").join(another, on="key")

Unnamed: 0,key,value,area,year
0,one,0,Nevada,2000
1,one,1,Nevada,2000
2,two,2,Ohio,2001
3,two,3,Ohio,2001


**总结**  
![join](https://github.com/JacobWongUED/DataScience-100-Days/blob/master/01_%E5%88%A9%E7%94%A8Python%E8%BF%9B%E8%A1%8C%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90/img/join.png?raw=true)

### 4.1.4 轴向连接

**Series**

In [254]:
import numpy as np
from numpy import arange, random
import pandas as pd
from pandas import DataFrame, Series

In [231]:
# Three named Series whose index labels do not overlap; the name is passed
# to the constructor instead of being assigned afterwards.
s1 = Series([0, 1], index=["a", "b"], name="one")
s2 = Series([2, 3, 4], index=["c", "d", "e"], name="two")
s3 = Series([5, 6], index=["f", "g"], name="three")

In [232]:
s1

a    0
b    1
Name: one, dtype: int64

In [233]:
s2

c    2
d    3
e    4
Name: two, dtype: int64

In [234]:
s3

f    5
g    6
Name: three, dtype: int64

In [235]:
pd.concat([s1, s2, s3]).sort_index()

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [236]:
pd.concat([s1, s2, s3], axis=1, sort=False)

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [237]:
s4 = pd.concat([s1*5, s3])
s4

a    0
b    5
f    5
g    6
dtype: int64

In [238]:
pd.concat([s1, s4], axis=1, join="inner")

Unnamed: 0,one,0
a,0,0
b,1,5


concat()的join参数默认为"outer"，即对其他轴上的索引取并集

**通过join_axes指定对齐的轴索引（注意：join_axes已在pandas 0.25中弃用、1.0中移除，新版本请改用reindex()）**

In [290]:
# `join_axes` was deprecated in pandas 0.25 and removed in pandas 1.0, so
# passing it to concat() raises TypeError on current versions. The documented
# replacement is reindex(): align every input to the wanted labels first,
# then concatenate. Reindexing before concat (rather than after) keeps the
# integer dtype, because no NaN is ever introduced for labels 'a' and 'b'.
pd.concat([s.reindex(['a', 'b']) for s in (s1, s4)], axis=1)
# Other alignments can be written the same way, e.g.:
# pd.concat([s1, s4], axis=1).reindex(s4.index)
# pd.concat([s1, s4], axis=1).reindex(['a', 'b', 'c', 'd'])

Unnamed: 0,one,0
a,0,0
b,1,5


**使用keys建立层次化索引**

In [242]:
result = pd.concat([s1, s2, s3], keys=[s1.name, s2.name, s3.name])
result

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64

In [243]:
result.unstack()

Unnamed: 0,a,b,c,d,e,f,g
one,0.0,1.0,,,,,
two,,,2.0,3.0,4.0,,
three,,,,,,5.0,6.0


**沿着axis=1的方向合并，keys会变成列头**

In [244]:
pd.concat([s1, s2, s3], axis=1, keys=[s1.name, s2.name, s3.name], sort=False)

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [245]:
result.unstack().T

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


**DataFrame**

In [246]:
# Two integer tables that share the column name "two"; df2 only has rows
# for the labels "a" and "c".
df1 = DataFrame(
    arange(6).reshape(3, 2),
    index=["a", "b", "c"],
    columns=["one", "two"],
)
df2 = DataFrame(
    arange(5, 9).reshape(2, 2),
    index=["a", "c"],
    columns=["two", "three"],
)

In [248]:
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [249]:
df2

Unnamed: 0,two,three
a,5,6
c,7,8


In [247]:
pd.concat([df1, df2], axis=1, keys=["level1", "level2"], sort=False)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,two,three
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


**names参数可以对层次化索引命名**

In [217]:
pd.concat({"level1": df1, "level2": df2}, axis=1, names=["upper","lower"], sort=False)

upper,level1,level1,level2,level2
lower,one,two,two,three
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [250]:
pd.concat({"level1": df1, "level2": df2}, axis=1, sort=False)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,two,three
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


**verify_integrity检查结果对象的连接轴上是否存在重复的标签，存在则抛出ValueError**

In [251]:
# With axis=1 the frames are concatenated along the column axis;
# verify_integrity=True makes concat() raise ValueError when the resulting
# labels on that axis contain duplicates.
try:
    pd.concat([df1, df2], axis=1, verify_integrity=True, sort=False)
except ValueError as e:
    # Both frames have a "two" column, so the duplicate is reported here.
    print("ValueError:", e)

ValueError: Indexes have overlapping values: Index(['two'], dtype='object')


**ignore_index参数不保留连接轴上的索引**

In [294]:
pd.concat([df1, df2], ignore_index=True, sort=False)

Unnamed: 0,one,two,three
0,0.0,1,
1,2.0,3,
2,4.0,5,
3,,5,6.0
4,,7,8.0


In [296]:
pd.concat([df1, df2], axis=1, ignore_index=True, sort=False)

Unnamed: 0,0,1,2,3
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [297]:
pd.concat([df1, df2], axis=1, sort=False)

Unnamed: 0,one,two,two.1,three
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


**column方向**  
列相同

In [265]:
# df4 has only two of df3's three columns ("key", "data") and shares the
# row label "c" with it.
df3 = DataFrame(
    data=arange(12).reshape(4, 3),
    index=["a", "b", "c", "d"],
    columns=["key", "data", "rating"],
)
df4 = DataFrame(
    data=arange(4).reshape(2, 2),
    index=["c", "e"],
    columns=["key", "data"],
)

In [266]:
df3

Unnamed: 0,key,data,rating
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11


In [267]:
df4

Unnamed: 0,key,data
c,0,1
e,2,3


In [270]:
pd.concat([df3, df4], sort=False)

Unnamed: 0,key,data,rating
a,0,1,2.0
b,3,4,5.0
c,6,7,8.0
d,9,10,11.0
c,0,1,
e,2,3,


**row方向**  
键相同

In [287]:
# Both frames hold a state abbreviation ("key") and full name ("state");
# df5 adds a "low" value and df6 a "high" value. Only "Ca" and "Oh" appear
# in both frames.
df5 = DataFrame(
    data={
        "key": ["Ca", "Oh", "Te", "Ut"],
        "state": ["California", "Ohio", "Texas", "Utah"],
        "low": [1.5, 2.0, 1.7, 2.8],
    }
)
df6 = DataFrame(
    data={
        "key": ["Oh", "NY", "Ca", "Wa"],
        "state": ["Ohio", "NewYork", "California", "Washington"],
        "high": [3.0, 2.0, 2.9, 2.8],
    }
)

In [288]:
pd.concat([df5, df6], axis=1, sort=False, join="outer")

Unnamed: 0,key,state,low,key.1,state.1,high
0,Ca,California,1.5,Oh,Ohio,3.0
1,Oh,Ohio,2.0,NY,NewYork,2.0
2,Te,Texas,1.7,Ca,California,2.9
3,Ut,Utah,2.8,Wa,Washington,2.8


In [289]:
pd.merge(df5, df6, on=["key", "state"], how="outer", sort=False)

Unnamed: 0,key,state,low,high
0,Ca,California,1.5,2.9
1,Oh,Ohio,2.0,3.0
2,Te,Texas,1.7,
3,Ut,Utah,2.8,
4,NY,NewYork,,2.0
5,Wa,Washington,,2.8


**总结**  
![concat](https://github.com/JacobWongUED/DataScience-100-Days/blob/master/01_%E5%88%A9%E7%94%A8Python%E8%BF%9B%E8%A1%8C%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90/img/concat.png?raw=true)

### 4.1.5 合并重叠数据
**用参数对象的数据修补调用对象的数据**

In [301]:
import numpy as np
from numpy import arange, random
from numpy import nan as NAN
import pandas as pd
from pandas import Series, DataFrame

**combine_first()**

In [229]:
# df1 has gaps (NaN) in columns "a" and "b"; df2 has one extra row (index 4)
# and no "c" column.
df1 = DataFrame(
    {
        "a": [1.0, np.nan, 5.0, np.nan],
        "b": [np.nan, 2.0, np.nan, 6.0],
        "c": arange(2, 18, 4),
    }
)
df2 = DataFrame(
    {
        "a": [5.0, 4.0, np.nan, 3.0, 7.0],
        "b": [np.nan, 3.0, 4.0, 6.0, 8.0],
    }
)

In [230]:
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [231]:
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


In [232]:
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


In [310]:
# df3 is the table to be patched: it has missing prices/percents.
df3 = DataFrame(
    data={
        "states": ["California", "Texas", "Ohio", "NewYork"],
        "price": [3.12, NAN, NAN, 2.19],
        "percent": [1.7, 2.8, NAN, 3.1],
    }
)
# df4 is the patch source: more states, in a different row order.
# NOTE(review): "Wshinton" looks like a typo for "Washington"; it is kept
# as-is because the displayed outputs use the same spelling.
df4 = DataFrame(
    data={
        "states": ["California", "Utah", "Wshinton", "NewYork", "Texas", "Ohio"],
        "price": [3.12, 1.9, 2.7, 2.19, 1.8, 2.5],
        "percent": [1.7, 2.8, 1.9, 3.1, 2.8, NAN],
    }
)

In [323]:
df3

Unnamed: 0,states,price,percent
0,California,3.12,1.7
1,Texas,,2.8
2,Ohio,,
3,NewYork,2.19,3.1


In [312]:
df4

Unnamed: 0,states,price,percent
0,California,3.12,1.7
1,Utah,1.9,2.8
2,Wshinton,2.7,1.9
3,NewYork,2.19,3.1
4,Texas,1.8,2.8
5,Ohio,2.5,


In [335]:
df3["states"]

0    California
1         Texas
2          Ohio
3       NewYork
Name: states, dtype: object

In [334]:
# Index a copy of df4 by state name, then select its rows in exactly df3's
# state order; reset_index() turns "states" back into a column and gives the
# result a fresh 0..n-1 row index.
df4_copy = df4.copy().set_index("states")
result = df4_copy.loc[df3["states"]].reset_index()
result

Unnamed: 0,states,price,percent
0,California,3.12,1.7
1,Texas,1.8,2.8
2,Ohio,2.5,
3,NewYork,2.19,3.1


In [336]:
df3.combine_first(result)

Unnamed: 0,states,price,percent
0,California,3.12,1.7
1,Texas,1.8,2.8
2,Ohio,2.5,
3,NewYork,2.19,3.1


**总结**  
combine_first()按索引（和列名）对齐后再修补缺失值，结果取两者索引的并集（见上面df1.combine_first(df2)的例子）；因此两个数据集中相同的索引标签必须对应同一条记录，否则需要像上面那样先把行顺序对齐

**np.where()**

In [239]:
# s1 has gaps (NaN) at labels "f", "d" and "a"; s2 has a float value for
# every one of the same labels.
s1 = Series(
    [np.nan, 2.0, np.nan, 3.0, 4.0, np.nan],
    index=["f", "e", "d", "c", "b", "a"],
)
s2 = Series(
    arange(len(s1)),
    index=["f", "e", "d", "c", "b", "a"],
    dtype=np.float64,
)

In [240]:
s1

f    NaN
e    2.0
d    NaN
c    3.0
b    4.0
a    NaN
dtype: float64

In [241]:
s2

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

In [243]:
np.where(pd.isnull(s1), s2, s1)

array([0., 2., 2., 3., 4., 5.])

## 4.2 重塑和轴转向