In [1]:
import pandas as pd
import numpy as np

#### DataFrame 是一种表格型的二维数据结构，既有行索引（index），又有列索引（columns），且默认都是从0开始递增的整数。可以把每一列看作是共同用一个索引的 Series，且不同列的数据类型可以不同。DataFrame 的结构图，如下所示：
![DataFrame](picture/DataFrame.png)

#### pd.DataFrame(data=None, index=None, columns=None, dtype=None)
- 创建 DataFrame 对象
- data：array-like, dict
- index：行索引。不指定时，默认为从 0 开始依次递增的整数
- columns：列索引。不指定时，默认为从 0 开始依次递增的整数
- dtype：数据类型，如果没有指定，则会自动推断得出

In [2]:
# With no data at all the constructor returns an empty frame (no rows, no columns).
df = pd.DataFrame(data=None)
print(df)

Empty DataFrame
Columns: []
Index: []


#### 单一列表创建DataFrame对象

In [3]:
# A list of one-element rows and a flat list of scalars both end up as a
# single-column frame with the default column label 0.
lst = [[name] for name in ('Tom', 'Bob', 'Linda')]
df = pd.DataFrame(lst)
print(df)
print('*' * 30)
lst = ['Tom', 'Bob', 'Linda']
df = pd.DataFrame(lst)
print(df)

       0
0    Tom
1    Bob
2  Linda
******************************
       0
0    Tom
1    Bob
2  Linda


#### 嵌套列表创建DataFrame对象

In [4]:
# Each inner list becomes one row; row and column labels default to 0, 1, 2, ...
lst = [
    ['Tom', 18, 188, 75],
    ['Bob', 19, 179, 68],
    ['Linda', 17, 177, 62],
]
df = pd.DataFrame(data=lst)
print(df)

       0   1    2   3
0    Tom  18  188  75
1    Bob  19  179  68
2  Linda  17  177  62


#### ndarray创建DataFrame对象

In [5]:
# lst is the nested list left behind by the previous cell. A NumPy array has a
# single dtype, so mixing names with numbers makes np.array() store every
# element as a string; the printed frame looks the same, but the "numbers"
# in it are text.
arr = np.array(lst)
df = pd.DataFrame(arr)
print(df)

       0   1    2   3
0    Tom  18  188  75
1    Bob  19  179  68
2  Linda  17  177  62


#### Series创建DataFrame对象

In [6]:
# Every Series in the list becomes one row of the frame; the Series' default
# integer index (0..3) supplies the column labels.
lst = [
    pd.Series(row)
    for row in (['Tom', 18, 188, 75],
                ['Bob', 19, 179, 68],
                ['Linda', 17, 177, 62])
]
df = pd.DataFrame(data=lst)
print(df)

       0   1    2   3
0    Tom  18  188  75
1    Bob  19  179  68
2  Linda  17  177  62


In [7]:
# Named, labelled Series: each Series' index supplies the column labels and
# its name supplies the row label.
index = ['name', 'age', 'height', 'weight']
lst = [
    pd.Series(['Tom', 18, 188, 75], index=index, name='a'),
    pd.Series(['Bob', 19, 179, 68], index=index, name='b'),
    pd.Series(['Linda', 17, 177, 62], index=index, name='c'),
]
df = pd.DataFrame(data=lst)
print(df)
print('*' * 30)
# The same frame built from plain lists, with the labels passed explicitly.
lst = [
    ['Tom', 18, 188, 75],
    ['Bob', 19, 179, 68],
    ['Linda', 17, 177, 62],
]
df = pd.DataFrame(data=lst, index=list('abc'), columns=index)
print(df)

    name  age  height  weight
a    Tom   18     188      75
b    Bob   19     179      68
c  Linda   17     177      62
******************************
    name  age  height  weight
a    Tom   18     188      75
b    Bob   19     179      68
c  Linda   17     177      62


#### 字典嵌套列表创建DataFrame对象
- 字典data中, 所有键对应的值的元素个数必须相同
- 默认情况下，字典的键被用作列索引

In [8]:
# Dict of Series: the keys become column labels and the shared Series index
# becomes the row index.
index = ['a', 'b', 'c']
data = {
    'name': pd.Series(['Tom', 'Bob', 'Linda'], index=index),
    'age': pd.Series([18, 19, 17], index=index),
    'height': pd.Series([188, 179, 177], index=index),
    'weight': pd.Series([75, 68, 62], index=index),
}
df = pd.DataFrame(data=data)
print(df)
print('*' * 30)
# Dict of plain lists: the row labels have to be passed to the constructor.
data = dict(
    name=['Tom', 'Bob', 'Linda'],
    age=[18, 19, 17],
    height=[188, 179, 177],
    weight=[75, 68, 62],
)
df = pd.DataFrame(data=data, index=['a', 'b', 'c'])
print(df)

    name  age  height  weight
a    Tom   18     188      75
b    Bob   19     179      68
c  Linda   17     177      62
******************************
    name  age  height  weight
a    Tom   18     188      75
b    Bob   19     179      68
c  Linda   17     177      62


In [9]:
# columns= selects and orders the dict keys to keep; a label that is not a
# key ('score') yields a column filled with NaN.
data = dict(
    name=['Tom', 'Bob', 'Linda'],
    age=[18, 19, 17],
    height=[188, 179, 177],
    weight=[75, 68, 62],
)
df = pd.DataFrame(data=data, index=['a', 'b', 'c'], columns=['height', 'score', 'age'])
print(df)

   height score  age
a     188   NaN   18
b     179   NaN   19
c     177   NaN   17


In [10]:
lst = [['Tom', 18, 1.88, 75],
       ['Bob', 19, 179, 68],
       ['Linda', 17, 177, 62]]
# Without dtype= each column's type is inferred on its own:
# age/weight become integers, height becomes float because of 1.88.
df = pd.DataFrame(lst,
                  index=['a', 'b', 'c'],
                  columns=['name', 'age', 'height', 'weight'])
print(df)
print('*' * 30)
# The constructor's dtype= argument is applied to every column. Older pandas
# silently skipped columns that could not be cast (the string column 'name'),
# but pandas >= 2.0 raises instead, so the numeric columns are cast
# explicitly with astype().
df = pd.DataFrame(lst,
                  index=['a', 'b', 'c'],
                  columns=['name', 'age', 'height', 'weight']
                  ).astype({'age': np.float32,
                            'height': np.float32,
                            'weight': np.float32})
print(df)
print('*' * 30)
lst = [['Tom', 18.0, 1.88, 75],
       ['Bob', 19, 179, 68],
       ['Linda', 17, 177, 62]]
# age holds only whole numbers (18.0, 19, 17), so it converts to int32 without
# loss. height is left as float because 1.88 cannot be represented as an integer.
df = pd.DataFrame(lst,
                  index=['a', 'b', 'c'],
                  columns=['name', 'age', 'height', 'weight']
                  ).astype({'age': np.int32, 'weight': np.int32})
print(df)

    name  age  height  weight
a    Tom   18    1.88      75
b    Bob   19  179.00      68
c  Linda   17  177.00      62
******************************
    name   age  height  weight
a    Tom  18.0    1.88    75.0
b    Bob  19.0  179.00    68.0
c  Linda  17.0  177.00    62.0
******************************
    name  age  height  weight
a    Tom   18    1.88      75
b    Bob   19  179.00      68
c  Linda   17  177.00      62


In [11]:
# dtype=object keeps every value exactly as written (18.0 stays a float,
# 19 stays an int) instead of inferring one numeric type per column.
lst = [
    ['Tom', 18.0, 1.88, 75],
    ['Bob', 19, 179, 68],
    ['Linda', 17, 177, 62],
]
df = pd.DataFrame(data=lst,
                  index=list('abc'),
                  columns=['name', 'age', 'height', 'weight'],
                  dtype='object')
print(df)

    name   age height weight
a    Tom  18.0   1.88     75
b    Bob    19    179     68
c  Linda    17    177     62


#### DataFrame 常用属性
- T &emsp;&emsp;&emsp;&emsp;&emsp; 转置
- dtypes &emsp;&emsp;&emsp; 返回每一列的数据类型
- shape &emsp;&emsp;&emsp;返回 DataFrame 的形状
- size &emsp;&emsp;&emsp;&emsp; 返回 DataFrame 中的元素数量
- index &emsp;&emsp;&emsp; 返回行索引
- columns &emsp;&emsp; 返回列索引
- axes &emsp;&emsp;&emsp;&emsp; 以列表形式返回行索引和列索引
- values &emsp;&emsp;&emsp; 以 ndarray 数组的形式返回 DataFrame 中的数据

In [12]:
# df is still the object-dtype frame from the previous cell; shown again as
# the reference for the attribute examples that follow.
print(df)

    name   age height weight
a    Tom  18.0   1.88     75
b    Bob    19    179     68
c  Linda    17    177     62


In [13]:
# Per-column dtypes, the frame itself, and its transpose (rows and columns swapped).
print(df.dtypes, '*' * 30, df, '*' * 30, df.transpose(), sep='\n')

name      object
age       object
height    object
weight    object
dtype: object
******************************
    name   age height weight
a    Tom  18.0   1.88     75
b    Bob    19    179     68
c  Linda    17    177     62
******************************
           a    b      c
name     Tom  Bob  Linda
age     18.0   19     17
height  1.88  179    177
weight    75   68     62


In [14]:
# shape is (rows, columns); size is rows * columns.
print(df.shape, df.size, df.index, sep='\n')
# axes is the list [row index, column index]; values is the underlying ndarray.
print('*' * 30, df.columns, sep='\n')
print('*' * 30, df.axes, sep='\n')
print('*' * 30, df.values, sep='\n')

(3, 4)
12
Index(['a', 'b', 'c'], dtype='object')
******************************
Index(['name', 'age', 'height', 'weight'], dtype='object')
******************************
[Index(['a', 'b', 'c'], dtype='object'), Index(['name', 'age', 'height', 'weight'], dtype='object')]
******************************
[['Tom' 18.0 1.88 75]
 ['Bob' 19 179 68]
 ['Linda' 17 177 62]]


In [15]:
# Build the frame with default labels, then relabel both axes by assigning
# to the index / columns attributes.
df = pd.DataFrame(data=lst)
df.index, df.columns = [7, 8, 9], list('ABCD')
print(df)

print('*' * 30, df.index, sep='\n')
print('*' * 30, df.columns, sep='\n')
print('*' * 30, df.axes, sep='\n')
print('*' * 30, df.values, sep='\n')

       A     B       C   D
7    Tom  18.0    1.88  75
8    Bob  19.0  179.00  68
9  Linda  17.0  177.00  62
******************************
Int64Index([7, 8, 9], dtype='int64')
******************************
Index(['A', 'B', 'C', 'D'], dtype='object')
******************************
[Int64Index([7, 8, 9], dtype='int64'), Index(['A', 'B', 'C', 'D'], dtype='object')]
******************************
[['Tom' 18.0 1.88 75]
 ['Bob' 19.0 179.0 68]
 ['Linda' 17.0 177.0 62]]


#### DataFrame 的运算
#### DataFrame 保留了 NumPy 中的数组运算，且 DataFrame 进行数组运算的时候，索引与值之间的映射关系不会发生改变。在进行 DataFrame 和 DataFrame 的运算时，会先按行索引和列索引对齐：行索引名和列索引名都相同的位置上的值进行运算；结果的行、列索引取两个 DataFrame 索引的并集并排序，只在其中一个 DataFrame 中出现的位置，其结果为 NaN

In [16]:
# Seed NumPy's global generator so the random integers are reproducible.
np.random.seed(3)
arr = np.random.randint(1, 10, size=(4, 3))
df1 = pd.DataFrame(data=arr, index=['n', 't', 'b', 'c'], columns=['F', 'A', 'M'])
# A scalar operand is applied to every element.
print(df1.add(10))

arr = np.random.randint(0, 10, size=(3, 3))
df2 = pd.DataFrame(data=arr, index=['c', 'g', 't'], columns=['A', 'Q', 'M'])
print(df2.add(10))
# Frame + frame aligns on row and column labels: the result covers the union
# of both label sets and is NaN wherever only one frame has a value.
print(df1.add(df2).add(20))

    F   A   M
n  19  14  19
t  19  11  16
b  14  16  18
c  17  11  15
    A   Q   M
c  17  18  11
g  16  12  12
t  11  13  15
      A   F     M   Q
b   NaN NaN   NaN NaN
c  28.0 NaN  26.0 NaN
g   NaN NaN   NaN NaN
n   NaN NaN   NaN NaN
t  22.0 NaN  31.0 NaN


#### 访问 DataFrame 数据
- 索引获取列数据，切片获取行数据

In [17]:
# df is still the frame relabelled with rows 7-9 and columns A-D in an earlier cell.
print(df)

       A     B       C   D
7    Tom  18.0    1.88  75
8    Bob  19.0  179.00  68
9  Linda  17.0  177.00  62


In [18]:
# Fresh sample frame (string row labels, named columns) used by the
# indexing examples in the following cells.
lst = [
    ['Tom', 18, 188, 75],
    ['Bob', 19, 179, 68],
    ['Linda', 17, 177, 62],
]
df = pd.DataFrame(data=lst, index=list('abc'), columns=['name', 'age', 'height', 'weight'])
print(df)

    name  age  height  weight
a    Tom   18     188      75
b    Bob   19     179      68
c  Linda   17     177      62


#### 索引针对列操作(只能给标签值)
#### 切片针对行操作(既可以用下标，也可以用标签)

In [19]:
# Reference view of df before the [] indexing and slicing examples.
print(df)

    name  age  height  weight
a    Tom   18     188      75
b    Bob   19     179      68
c  Linda   17     177      62


In [20]:
# A single label returns a Series; a list of labels (even with one entry)
# returns a DataFrame with the columns in the order listed.
print(
    df['age'],
    '*' * 30,
    df[['age']],
    '*' * 30,
    df[['age', 'name', 'weight']],
    sep='\n',
)

a    18
b    19
c    17
Name: age, dtype: int64
******************************
   age
a   18
b   19
c   17
******************************
   age   name  weight
a   18    Tom      75
b   19    Bob      68
c   17  Linda      62


In [21]:
# Slicing with [] selects rows. A positional slice excludes its stop (rows 0
# and 1), while a label slice includes it ('a' through 'b'), so both print
# the same two rows.
print(df[:2], df['a':'b'], sep='\n')

  name  age  height  weight
a  Tom   18     188      75
b  Bob   19     179      68
  name  age  height  weight
a  Tom   18     188      75
b  Bob   19     179      68


#### df.loc[针对行的操作，针对列的操作] &emsp;&emsp; 参数必须给标签
#### df.iloc[针对行的操作，针对列的操作] &emsp;&emsp; 参数必须给下标

In [22]:
# Reference view of df before the .loc / .iloc examples.
print(df)

    name  age  height  weight
a    Tom   18     188      75
b    Bob   19     179      68
c  Linda   17     177      62


In [23]:
# A bare ':' selects everything, so both accessors return the whole frame.
print(df.loc[:, ], '*' * 30, df.iloc[:], sep='\n')

    name  age  height  weight
a    Tom   18     188      75
b    Bob   19     179      68
c  Linda   17     177      62
******************************
    name  age  height  weight
a    Tom   18     188      75
b    Bob   19     179      68
c  Linda   17     177      62


In [24]:
# Row slices: .loc takes labels and includes the stop label 'b';
# .iloc takes positions and excludes the stop position 2.
print(
    df.loc['a':'b'],
    '*' * 30,
    df.iloc[0:2, ],
    sep='\n',
)

  name  age  height  weight
a  Tom   18     188      75
b  Bob   19     179      68
******************************
  name  age  height  weight
a  Tom   18     188      75
b  Bob   19     179      68


In [25]:
# All rows of one column: by label with .loc, by position (1) with .iloc.
print(df.loc[:, 'age'], df.iloc[:, 1], sep='\n')

a    18
b    19
c    17
Name: age, dtype: int64
a    18
b    19
c    17
Name: age, dtype: int64


In [26]:
# Column slices: the label slice includes 'height'; the positional slice
# 1:3 stops before position 3. Both select age and height.
print(df.loc[:, 'age':'height'], df.iloc[:, 1:3], sep='\n')

   age  height
a   18     188
b   19     179
c   17     177
   age  height
a   18     188
b   19     179
c   17     177


In [27]:
# A step of -2 walks the columns backwards from 'height' (position 2),
# taking every second one: height, then name.
print(df.loc[:, 'height'::-2], df.iloc[:, 2::-2], sep='\n')

   height   name
a     188    Tom
b     179    Bob
c     177  Linda
   height   name
a     188    Tom
b     179    Bob
c     177  Linda


In [28]:
# A single row label / position returns that row as a Series.
print(df.loc['b'], '*' * 30, df.iloc[1], sep='\n')

name      Bob
age        19
height    179
weight     68
Name: b, dtype: object
******************************
name      Bob
age        19
height    179
weight     68
Name: b, dtype: object


In [29]:
# Wrapping the selector in a list returns a one-row DataFrame instead of a Series.
print(df.loc[['b']], '*' * 30, df.iloc[[1]], sep='\n')

  name  age  height  weight
b  Bob   19     179      68
******************************
  name  age  height  weight
b  Bob   19     179      68


In [30]:
# A slice always returns a DataFrame, even when it covers a single row.
print(df.loc['b':'b'], '*' * 30, df.iloc[1:2], sep='\n')

  name  age  height  weight
b  Bob   19     179      68
******************************
  name  age  height  weight
b  Bob   19     179      68


In [31]:
# A list of labels / positions selects exactly the listed rows.
print(df.loc[['a', 'b']], '*' * 30, df.iloc[[0, 1]], sep='\n')

  name  age  height  weight
a  Tom   18     188      75
b  Bob   19     179      68
******************************
  name  age  height  weight
a  Tom   18     188      75
b  Bob   19     179      68


#### 修改 DataFrame 的索引和数据
- 修改索引：直接给 index、columns 属性重新赋值即可
- 修改数据：对访问到的数据重新赋值，即可修改数据

In [32]:
# Reference view of df before its values are modified.
print(df)

    name  age  height  weight
a    Tom   18     188      75
b    Bob   19     179      68
c  Linda   17     177      62


In [33]:
# Replace the whole 'height' column with a list (one value per row).
df['height'] = [1.88, 1.79, 1.77]
# 'weight' holds integers. Writing floats into it through .loc relies on
# silent upcasting, which pandas 2.1 deprecated, so cast the column first.
df['weight'] = df['weight'].astype('float64')
# Series with the same index as df: values land in the written order.
df.loc[:, 'weight'] = pd.Series([7.5, 6.8, 6.2], index=df.index)
# Series with a reversed index: assignment aligns on labels, not position,
# so row 'c' receives 7.5 and row 'a' receives 6.2.
df.loc[:, 'weight'] = pd.Series([7.5, 6.8, 6.2], index=list('cba'))
print(df)

    name  age  height  weight
a    Tom   18    1.88     6.2
b    Bob   19    1.79     6.8
c  Linda   17    1.77     7.5


In [34]:
# Overwrite two columns at once: each inner list supplies [height, age] for
# one row, in the order the column labels are listed on the left.
df[['height', 'age']] = [[188, 20],
                         [179, 21],
                         [177, 22]]
print(df)

    name  age  height  weight
a    Tom   20     188     6.2
b    Bob   21     179     6.8
c  Linda   22     177     7.5


In [35]:
# Build a helper frame that shares df's row index.
data = {'height': [1.88, 1.79, 1.77],
        'age': [18, 19, 20]}
df2 = pd.DataFrame(data, index=df.index)
print(df2)
# Assigning a DataFrame to a column selection overwrites those columns of df
# with the values from df2.
df[['height', 'age']] = df2
print(df)

   height  age
a    1.88   18
b    1.79   19
c    1.77   20
    name  age  height  weight
a    Tom   18    1.88     6.2
b    Bob   19    1.79     6.8
c  Linda   20    1.77     7.5


In [36]:
# Overwrite the whole row labelled 'b' with one value per column.
df.loc['b'] = ['sam', 20, 1.79, 86]
print(df)
# A step-2 slice selects every other row (here 'a' and 'c'); each inner list
# replaces one of the selected rows.
df[::2] = [['Tony', 20, 1.88, 80], ['Rose', 23, 1.77, 68]]
print(df)

    name  age  height  weight
a    Tom   18    1.88     6.2
b    sam   20    1.79    86.0
c  Linda   20    1.77     7.5
   name  age  height  weight
a  Tony   20    1.88    80.0
b   sam   20    1.79    86.0
c  Rose   23    1.77    68.0


#### 如果访问数据不存在，则会添加数据

In [37]:
# Reference view of df before new columns and rows are added.
print(df)

   name  age  height  weight
a  Tony   20    1.88    80.0
b   sam   20    1.79    86.0
c  Rose   23    1.77    68.0


In [38]:
# Assigning to a column label that does not exist yet appends it as a new column.
df['score'] = [100, 98, 80]
print(df)

   name  age  height  weight  score
a  Tony   20    1.88    80.0    100
b   sam   20    1.79    86.0     98
c  Rose   23    1.77    68.0     80


In [39]:
# 'score' already exists and is overwritten; 'stu_id' does not exist yet and
# is appended. Each inner list is [score, stu_id] for one row.
df[['score', 'stu_id']] = [[100, 101],
                           [98, 102],
                           [95, 103]]
print(df)

   name  age  height  weight  score  stu_id
a  Tony   20    1.88    80.0    100     101
b   sam   20    1.79    86.0     98     102
c  Rose   23    1.77    68.0     95     103


In [40]:
# Assigning to a row label that does not exist yet appends a new row.
# NOTE(review): height is 187 here while the other rows hold 1.88 / 1.79 / 1.77;
# this looks like a unit mix-up (cm vs m) - confirm the intended value.
df.loc['d'] = ['Jake', 23, 187, 68, 80, 104]
print(df)

   name  age  height  weight  score  stu_id
a  Tony   20    1.88    80.0    100     101
b   sam   20    1.79    86.0     98     102
c  Rose   23    1.77    68.0     95     103
d  Jake   23  187.00    68.0     80     104


#### 删除

In [41]:
# Reference view of df before columns and rows are deleted.
print(df)

   name  age  height  weight  score  stu_id
a  Tony   20    1.88    80.0    100     101
b   sam   20    1.79    86.0     98     102
c  Rose   23    1.77    68.0     95     103
d  Jake   23  187.00    68.0     80     104


In [42]:
# del removes the named columns from df in place.
del df['age'], df['weight']
# del works on column labels, so transpose to turn the row labels into
# columns, delete rows 'a' and 'c' that way, then transpose back.
# Side effect: the round trip through .T leaves the remaining columns with
# object dtype, because the transposed frame mixes strings and numbers.
df = df.T
del df['a'], df['c']
df = df.T
print(df)

   name height score stu_id
b   sam   1.79    98    102
d  Jake  187.0    80    104


#### DataFrame 常用方法

#### DataFrame.isnull() / DataFrame.notnull()
- 检测 DataFrame 中的缺失值

In [43]:
# df is still the two-row frame left after the deletions; the next cell
# replaces it with data that contains missing values.
print(df)

   name height score stu_id
b   sam   1.79    98    102
d  Jake  187.0    80    104


In [44]:
d = [[8, np.nan],
     [np.nan, 7],
     [0, 2],
     [np.nan, np.nan]]
df = pd.DataFrame(d)
print(df)
print(df.isnull())
print(df.notnull())

     0    1
0  8.0  NaN
1  NaN  7.0
2  0.0  2.0
3  NaN  NaN
       0      1
0  False   True
1   True  False
2  False  False
3   True   True
       0      1
0   True  False
1  False   True
2   True   True
3  False  False


#### DataFrame.insert(loc, column, value)
- loc：int，整数列索引，指定插入数据列的位置
- column：新插入的数据列的名字
- value：int, Series, or array-like，插入的数据

In [45]:
# df is still the NaN example frame; the next cell rebuilds the name/age table.
print(df)

     0    1
0  8.0  NaN
1  NaN  7.0
2  0.0  2.0
3  NaN  NaN


In [46]:
# Rebuild the sample frame; lst is whatever nested list an earlier cell left behind.
df = pd.DataFrame(lst, index=['a', 'b', 'c'], columns=['name', 'age', 'height', 'weight'])
# insert() adds the new column in place at position 1, i.e. right after 'name'.
df.insert(1, 'score', [100, 98, 95])
print(df)
# insert() only handles columns. To insert a row, transpose, insert 'd' as a
# column at position 2 (between 'b' and 'c'), then transpose back.
df = df.T
df.insert(2, 'd', ['Sam', 100, 20, 199, 68])
df = df.T
print(df)

    name  score  age  height  weight
a    Tom    100   18     188      75
b    Bob     98   19     179      68
c  Linda     95   17     177      62
    name score age height weight
a    Tom   100  18    188     75
b    Bob    98  19    179     68
d    Sam   100  20    199     68
c  Linda    95  17    177     62


#### DataFrame.reindex(labels=None, axis=0, index=None, columns=None, fill_value=np.nan)
- labels：要获取数据的列标签或者行标签，传入列表，与axis对应
- axis：轴的方向，0为行，1为列
- index：要获取数据的行索引，传入列表
- columns：要获取数据的列索引，传入列表
- fill_value：填充的缺失值（标量），默认为 np.nan
- 返回重新索引组成的新的 DataFrame 对象

In [47]:
# Reference view of df (including the inserted row 'd') before the reindex examples.
print(df)

    name score age height weight
a    Tom   100  18    188     75
b    Bob    98  19    179     68
d    Sam   100  20    199     68
c  Linda    95  17    177     62


In [48]:
# reindex() returns a new frame laid out by the requested labels; labels that
# do not exist in df ('m', 'k', 'score') produce NaN entries.
df = pd.DataFrame(data=lst, index=['a', 'b', 'c'], columns=['name', 'age', 'height', 'weight'])
print(
    df.reindex(labels=['c', 'm', 'a', 'k'], axis=0),
    df.reindex(labels=['height', 'score', 'name'], axis=1),
    sep='\n',
)

    name   age  height  weight
c  Linda  17.0   177.0    62.0
m    NaN   NaN     NaN     NaN
a    Tom  18.0   188.0    75.0
k    NaN   NaN     NaN     NaN
   height  score   name
a     188    NaN    Tom
b     179    NaN    Bob
c     177    NaN  Linda


In [49]:
# index= and columns= reindex both axes in one call; entries for labels that
# do not exist ('m', 'k', 'score') are filled with fill_value instead of NaN.
df = pd.DataFrame(lst, index=list('abc'), columns=['name', 'age', 'height', 'weight'])
print(df.reindex(index=list('cmak'), columns=['height', 'score', 'name'], fill_value=0))

   height  score   name
c     177      0  Linda
m       0      0      0
a     188      0    Tom
k       0      0      0


#### DataFrame.drop(labels=None, axis=0, index=None, columns=None,inplace=False)
- labels：要删除的列标签或者行标签，如果要删除多个，传入列表，与axis对应
- axis：轴的方向，0为行，1为列
- index：要删除的行索引，如果要删除多个，传入列表
- columns：要删除的列索引，如果要删除多个，传入列表
- inplace：inplace=True时，对原数据操作，返回None

In [50]:
# drop() returns a new frame and leaves df untouched: with the default axis=0
# the labels name rows, with axis=1 they name columns.
print(
    df.drop(labels=['a', 'c']),
    '*' * 30,
    df.drop(labels=['height', 'age'], axis=1),
    sep='\n',
)

  name  age  height  weight
b  Bob   19     179      68
******************************
    name  weight
a    Tom      75
b    Bob      68
c  Linda      62


In [51]:
# df is unchanged by the drop() calls in the previous cell, which returned new frames.
print(df)

    name  age  height  weight
a    Tom   18     188      75
b    Bob   19     179      68
c  Linda   17     177      62


In [52]:
# index= and columns= drop rows and columns in one call. With inplace=True
# the change is applied to df itself and drop() returns None, so df is
# printed in a separate statement.
df.drop(index=list('ac'), columns=['height', 'age'], inplace=True)
print(df)

  name  weight
b  Bob      68


In [53]:
# Rebuild the full sample frame after the in-place drop in the previous cell.
lst = [
    ['Tom', 18, 188, 75],
    ['Bob', 19, 179, 68],
    ['Linda', 17, 177, 62],
]
df = pd.DataFrame(data=lst, index=list('abc'), columns=['name', 'age', 'height', 'weight'])
print(df)

    name  age  height  weight
a    Tom   18     188      75
b    Bob   19     179      68
c  Linda   17     177      62


#### pd.concat(objs, axis=0, join='outer', ignore_index=False)
- objs：DataFrame对象的序列
- axis：要拼接的轴
- join：外连接（'outer'）保留两个表中的所有信息；内连接（'inner'）只保留共有信息
- ignore_index：如果指定为 True，则索引将变为从0开始递增的整数
- 返回一个新的 DataFrame

In [54]:
# Two frames that share only column 'A'; df2 keeps the default row labels 0 and 1.
df1 = pd.DataFrame(data=[[1, 2], [3, 4]], index=['p1', 'p2'], columns=['A', 'B'])
df2 = pd.DataFrame(data=[[5, 6], [7, 8]], columns=['A', 'C'])
# Default concat stacks rows (axis=0) with an outer join on the columns:
# columns missing from one frame are filled with NaN.
print(df1, df2, '*' * 30, pd.concat([df1, df2]), sep='\n')

    A  B
p1  1  2
p2  3  4
   A  C
0  5  6
1  7  8
******************************
    A    B    C
p1  1  2.0  NaN
p2  3  4.0  NaN
0   5  NaN  6.0
1   7  NaN  8.0


In [55]:
# join='inner' keeps only the columns present in both frames ('A');
# ignore_index=True replaces the row labels with 0..n-1.
print(
    pd.concat([df1, df2], join='inner'),
    '*' * 30,
    pd.concat([df1, df2], join='inner', ignore_index=True),
    sep='\n',
)

    A
p1  1
p2  3
0   5
1   7
******************************
   A
0  1
1  3
2  5
3  7


In [56]:
# axis=1 places the frames side by side, aligned on row labels (outer join by
# default); with ignore_index=True the column labels are renumbered 0..n-1.
print(
    pd.concat([df1, df2], axis=1),
    '*' * 30,
    pd.concat([df1, df2], axis=1, ignore_index=True),
    sep='\n',
)

      A    B    A    C
p1  1.0  2.0  NaN  NaN
p2  3.0  4.0  NaN  NaN
0   NaN  NaN  5.0  6.0
1   NaN  NaN  7.0  8.0
******************************
      0    1    2    3
p1  1.0  2.0  NaN  NaN
p2  3.0  4.0  NaN  NaN
0   NaN  NaN  5.0  6.0
1   NaN  NaN  7.0  8.0


In [57]:
# With axis=1 an inner join keeps only row labels present in both frames.
# df1 (p1, p2) and df2 (0, 1) share none, so the result is empty.
print(pd.concat([df1, df2], axis=1, join='inner'))
print('*' * 30)
df2.index = [0, 'p1']  # relabel df2's rows so that 'p1' exists in both frames
print(df2)
# Now 'p1' is the only common row label, so exactly one row survives.
print(pd.concat([df1, df2], axis=1, join='inner'))

Empty DataFrame
Columns: [A, B, A, C]
Index: []
******************************
    A  C
0   5  6
p1  7  8
    A  B  A  C
p1  1  2  7  8


In [58]:
# df2 now carries exactly the same row and column labels as df1.
df2 = pd.DataFrame([[5, 6], [7, 8]], index=['p1', 'p2'], columns=list('AB'))
# Stacking rows (default axis=0) keeps the duplicate row labels p1/p2.
print(pd.concat([df1, df2]))
# Placing the frames side by side (axis=1) keeps the duplicate column labels A/B.
print(pd.concat([df1, df2], axis=1))

    A  B
p1  1  2
p2  3  4
p1  5  6
p2  7  8
    A  B  A  B
p1  1  2  5  6
p2  3  4  7  8


#### pd.merge(left, right, how='inner', on=None)
- left：左侧 DataFrame 对象
- right：右侧 DataFrame 对象
- how：要执行的合并类型
  - 'inner'为内连接，取左右两个 DataFrame 的键的交集进行合并；
  - 'left'为左连接，以左侧 DataFrame 的键为基准进行合并，如果左侧 DataFrame 中的键在右侧不存在，则用缺失值 NaN 填充；
  - 'right'为右连接，以右侧 DataFrame 的键为基准进行合并，如果右侧 DataFrame 中的键在左侧不存在，则用缺失值 NaN 填充；
  - 'outer'为外连接，取左右两个 DataFrame 的键的并集进行合并
- on：指定用于连接的键（即列标签的名字），该键必须同时存在于左右两个 DataFrame 中，如果没有指定，那么将会以两个DataFrame 的列名交集做为连接键

In [59]:
# Two tables sharing the columns 'name' and 'weight'. Jack's weight differs
# between them (67 vs 68) and Bob appears only in the first one.
d1 = dict(name=['Tom', 'Bob', 'Jack'], age=[18, 17, 19], weight=[65, 66, 67])
df1 = pd.DataFrame(d1)
d2 = dict(name=['Tom', 'Jack'], height=[168, 187], weight=[65, 68])
df2 = pd.DataFrame(d2)
print(df1, df2, sep='\n')

   name  age  weight
0   Tom   18      65
1   Bob   17      66
2  Jack   19      67
   name  height  weight
0   Tom     168      65
1  Jack     187      68


In [60]:
# All three calls perform the same join: 'inner' is the default for how=, and
# when on= is omitted the shared columns (name, weight) are used as the key.
print(
    pd.merge(left=df1, right=df2),
    pd.merge(left=df1, right=df2, how='inner'),
    pd.merge(left=df1, right=df2, how='inner', on=['name', 'weight']),
    sep='\n',
)

  name  age  weight  height
0  Tom   18      65     168
  name  age  weight  height
0  Tom   18      65     168
  name  age  weight  height
0  Tom   18      65     168


In [61]:
# Outer join on (name, weight): rows from both frames are kept. Jack shows up
# twice because his weight differs (67 vs 68), so the key pairs do not match.
print(pd.merge(df1, df2, how='outer', on=['name','weight']))

   name   age  weight  height
0   Tom  18.0      65   168.0
1   Bob  17.0      66     NaN
2  Jack  19.0      67     NaN
3  Jack   NaN      68   187.0


In [62]:
# Left join on (name, weight): every row of df1 is kept; height is NaN where
# df2 has no row with the same name and weight.
print(pd.merge(df1, df2, how='left', on=['name','weight']))

   name  age  weight  height
0   Tom   18      65   168.0
1   Bob   17      66     NaN
2  Jack   19      67     NaN


In [63]:
# Right join on (name, weight): every row of df2 is kept; age is NaN for
# (Jack, 68), which has no matching row in df1.
print(pd.merge(df1, df2, how='right', on=['name','weight']))

   name   age  weight  height
0   Tom  18.0      65     168
1  Jack   NaN      68     187


In [64]:
# Joining on 'name' only: 'weight' exists in both frames but is not part of
# the key, so it is kept twice with the suffixes _x (left) and _y (right).
# Every name in df2 also occurs in df1, so inner and right joins agree here.
print(
    pd.merge(left=df1, right=df2, how='inner', on='name'),
    pd.merge(left=df1, right=df2, how='right', on='name'),
    sep='\n',
)

   name  age  weight_x  height  weight_y
0   Tom   18        65     168        65
1  Jack   19        67     187        68
   name  age  weight_x  height  weight_y
0   Tom   18        65     168        65
1  Jack   19        67     187        68


In [65]:
# Right join on 'weight' only: 'name' is in both frames but is not a key, so
# it is split into name_x / name_y. Weight 68 exists only in df2, hence the
# NaN values on the left-hand columns.
print(pd.merge(df1, df2, how='right', on='weight'))

  name_x   age  weight name_y  height
0    Tom  18.0      65    Tom     168
1    NaN   NaN      68   Jack     187


In [66]:
# Left join on 'weight' only: all df1 rows are kept; weights 66 and 67 have
# no match in df2, so name_y and height are NaN for those rows.
print(pd.merge(df1, df2, how='left', on='weight'))

  name_x  age  weight name_y  height
0    Tom   18      65    Tom   168.0
1    Bob   17      66    NaN     NaN
2   Jack   19      67    NaN     NaN


In [67]:
# Outer join on 'weight' only: the union of the weights from both frames is
# kept, with NaN on whichever side has no row for that weight.
print(pd.merge(df1, df2, how='outer', on='weight'))

  name_x   age  weight name_y  height
0    Tom  18.0      65    Tom   168.0
1    Bob  17.0      66    NaN     NaN
2   Jack  19.0      67    NaN     NaN
3    NaN   NaN      68   Jack   187.0
