# Chapter 6 连接

In [1]:
import numpy as np
import pandas as pd

## 1. 关系型连接
### 1.1 连接的基本概念
按照某一关键词/特征把两个表合并为一张表。    
how=连接形式（left、right、inner、outer）。   
on=键。

### 1.2 值连接
df.merge

In [2]:
#same key: on
df1=pd.DataFrame({'Name':['San Zhang','Si Li'], 'Age':[20,30]})
df2=pd.DataFrame({'Name':['Si Li','Wu Wang'], 'Gender':['F','M']})
df1.merge(df2, on='Name', how='left')

Unnamed: 0,Name,Age,Gender
0,San Zhang,20,
1,Si Li,30,F


In [3]:
#different keys: left_on, right_on
df1=pd.DataFrame({'df1_Name':['San Zhang','Si Li'], 'Age':[20,30]})
df2=pd.DataFrame({'df2_Name':['Si Li','Wu Wang'], 'Gender':['F','M']})
df1.merge(df2, left_on='df1_Name', right_on='df2_Name', how='left')

Unnamed: 0,df1_Name,Age,df2_Name,Gender
0,San Zhang,20,,
1,Si Li,30,Si Li,F


In [4]:
#same two or more cols, but one key: suffixes
df1=pd.DataFrame({'Name':['San Zhang'], 'Grade':[70]})
df2=pd.DataFrame({'Name':['San Zhang'], 'Grade':[80]})
df1.merge(df2, on='Name', how='left', suffixes=['_Chinese','_Math'])

Unnamed: 0,Name,Grade_Chinese,Grade_Math
0,San Zhang,70,80


In [5]:
#two or more keys
df1 = pd.DataFrame({'Name':['San Zhang','San Zhang'], 'Age':[20, 21], 'Class':['one','two']})
df2 = pd.DataFrame({'Name':['San Zhang','San Zhang'], 'Gender':['F','M'], 'Class':['two','one']})
df1.merge(df2, on=['Name','Class'], how='left')

Unnamed: 0,Name,Age,Class,Gender
0,San Zhang,20,one,M
1,San Zhang,21,two,F


用duplicated检查重复，或用merge中的validate参数。

In [6]:
#练一练
df2 = pd.DataFrame({'Name':['San Zhang','San Zhang'], 'Gender':['F','M'], 'Class':['one','one']})
df1.merge(df2, on=['Name','Class'], how='left', validate='1:m')
#df1.merge(df2, on=['Name','Class'], how='left', validate='m:1') error!

Unnamed: 0,Name,Age,Class,Gender
0,San Zhang,20,one,F
1,San Zhang,20,one,M
2,San Zhang,21,two,


### 1.3 索引连接
把索引作为键。     
join函数参数：on（单层索引时可忽略）、how、lsuffix、rsuffix。

In [7]:
df1 = pd.DataFrame({'Age':[20,30]}, index=pd.Series(['San Zhang','Si Li'], name='Name'))
df2 = pd.DataFrame({'Gender':['F','M']}, index=pd.Series(['Si Li','Wu Wang'], name='Name'))
df1.join(df2, how='left')

Unnamed: 0_level_0,Age,Gender
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
San Zhang,20,
Si Li,30,F


In [8]:
df1 = pd.DataFrame({'Grade':[70]}, index=pd.Series(['San Zhang'], name='Name'))
df2 = pd.DataFrame({'Grade':[80]}, index=pd.Series(['San Zhang'], name='Name'))
df1.join(df2, how='left', lsuffix='_Chinese', rsuffix='_Math')

Unnamed: 0_level_0,Grade_Chinese,Grade_Math
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
San Zhang,70,80


In [9]:
df1 = pd.DataFrame({'Age':[20,21]}, index=pd.MultiIndex.from_arrays([['San Zhang','San Zhang'], ['one','two']], names=('Name','Class')))
df2 = pd.DataFrame({'Gender':['F','M']}, index=pd.MultiIndex.from_arrays([['San Zhang','San Zhang'], ['two','one']], names=('Name','Class')))
df1.join(df2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Gender
Name,Class,Unnamed: 2_level_1,Unnamed: 3_level_1
San Zhang,one,20,M
San Zhang,two,21,F


## 2. 方向连接
### 2.1 concat
将两个及以上表按照纵向或横向拼接，但不关心键。    
concat函数参数：axis（0/1: 纵向/横向）、join（outer/inner）、keys（产生多级索引标记数据来源表）。     
参数默认分别为axis=0、join=‘outer’     

In [10]:
#纵向
df1 = pd.DataFrame({'Name':['San Zhang','Si Li'], 'Age':[20,30]})
df2 = pd.DataFrame({'Name':['Wu Wang'], 'Age':[40]})
pd.concat([df1,df2])

Unnamed: 0,Name,Age
0,San Zhang,20
1,Si Li,30
0,Wu Wang,40


In [11]:
#横向
pd.concat([df1,df2], axis=1)

Unnamed: 0,Name,Age,Name.1,Age.1
0,San Zhang,20,Wu Wang,40.0
1,Si Li,30,,


In [12]:
#outer
pd.concat([df1,df2], axis=1, join='outer')

Unnamed: 0,Name,Age,Name.1,Age.1
0,San Zhang,20,Wu Wang,40.0
1,Si Li,30,,


In [13]:
#keys
pd.concat([df1,df2], keys=['one','two'])

Unnamed: 0,Unnamed: 1,Name,Age
one,0,San Zhang,20
one,1,Si Li,30
two,0,Wu Wang,40


### 2.2 序列与表的合并
追加：append（行末）、assign（列末）。   
在append中使用ignore_index=True对新序列对应索引的自动标号，否则需要指定name属性。     
assign返回一个临时副本。

In [14]:
#append
s=pd.Series(['Wu Wang',21], index=df1.columns)
df1.append(s, ignore_index=True)

Unnamed: 0,Name,Age
0,San Zhang,20
1,Si Li,30
2,Wu Wang,21


In [15]:
#assign
s=pd.Series([80,90])
df1.assign(Grade=s)

Unnamed: 0,Name,Age,Grade
0,San Zhang,20,80
1,Si Li,30,90


In [16]:
#直接通过定义列的方式创建
df1['Grade']=s
df1

Unnamed: 0,Name,Age,Grade
0,San Zhang,20,80
1,Si Li,30,90


## 3. 类连接操作
### 3.1 比较
compare：比较两个表或序列的不同并汇总展示，结果返回不同值所在的行列。

In [17]:
df1 = pd.DataFrame({'Name':['San Zhang','Si Li','Wu Wang'], 'Age':[20,21,21], 'Class':['one','two','three']})
df2 = pd.DataFrame({'Name':['San Zhang','Li Si','Wu Wang'], 'Age':[20,21,21], 'Class':['one','two','Three']})
df1.compare(df2)

Unnamed: 0_level_0,Name,Name,Class,Class
Unnamed: 0_level_1,self,other,self,other
1,Si Li,Li Si,,
2,,,three,Three


In [18]:
#设置参数keep_shape=True来完整显示表中所有元素的比较情况。
df1.compare(df2, keep_shape=True)

Unnamed: 0_level_0,Name,Name,Age,Age,Class,Class
Unnamed: 0_level_1,self,other,self,other,self,other
0,,,,,,
1,Si Li,Li Si,,,,
2,,,,,three,Three


### 3.2 组合
combine：按照一定的规则进行组合，并自动进行列索引的对齐。     
参数为两个表的同名Series，列为两个表列名的并集。

In [19]:
def choose_min(s1, s2): 
    s2 = s2.reindex_like(s1)
    res = s1.where(s1<s2, s2)
    res = res.mask(s1.isna()) # isna表示是否为缺失值，返回布尔序列
    return res

df1 = pd.DataFrame({'A':[1,2], 'B':[3,4], 'C':[5,6]})
df2 = pd.DataFrame({'B':[5,6], 'C':[7,8], 'D':[9,10]}, index=[1,2])
df1.combine(df2, choose_min)
#注意index是不一样的。。。

Unnamed: 0,A,B,C,D
0,,,,
1,,4.0,6.0,
2,,,,


In [20]:
#练一练
def choose_min1(s1, s2): 
    s2 = s2.reindex_like(s1)
    res = s1.where(s1<s2, s2)
    res = res.mask(s1.isna(), s2)
    return res
df1.combine(df2, choose_min1)

Unnamed: 0,A,B,C,D
0,,,,
1,,4.0,6.0,9.0
2,,6.0,8.0,10.0


In [21]:
#overwrite=False用来保留被调用表（df1）中未出现在传入的参数表中的列
df1.combine(df2, choose_min, overwrite=False)

Unnamed: 0,A,B,C,D
0,1.0,,,
1,2.0,4.0,6.0,
2,,,,


In [22]:
#练一练
#combine_first：若第二个表在第一个表中对应索引位置的值不是缺失状态，则用第一个表的值填充
df1.combine_first(df2)

Unnamed: 0,A,B,C,D
0,1.0,3.0,5.0,
1,2.0,4.0,6.0,9.0
2,,6.0,8.0,10.0


In [23]:
#combine
def choose_min2(s1, s2): 
    s2 = s2.reindex_like(s1)
    res = s1.where(s1<s2, s2)
    res = res.mask(s1.isna(), s2)
    res = res.mask(s2.isna(), s1)
    return res
df1.combine(df2, choose_min2)

Unnamed: 0,A,B,C,D
0,1.0,3.0,5.0,
1,2.0,4.0,6.0,9.0
2,,6.0,8.0,10.0


## 4. 练习
### Ex1：美国疫情数据集

In [24]:
date = pd.date_range('20200412','20201116').to_series()
date = date.dt.month.astype('string').str.zfill(2) +'-'+ date.dt.day.astype('string').str.zfill(2) +'-'+ '2020'
date = date.tolist()
date[:5]

['04-12-2020', '04-13-2020', '04-14-2020', '04-15-2020', '04-16-2020']

In [25]:
#难点在提取数据部分。。。参考了答案
L=[]
for d in date:
    df=pd.read_csv('/Users/jie/Documents/Python/joyful-pandas-master/data/us_report/' + d + '.csv', index_col='Province_State')
    data=df.loc['New York', ['Confirmed','Deaths','REcovered','Active']]
    L.append(data.to_frame().T)

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Index(['REcovered'], dtype='object'). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"