In [1]:
import pandas as pd

# Pandas 数据整合

## concat 拼接数据

##### 作用：将 DataFrame 或 Series 数据按行或按列进行拼接

### 行拼接：按照列标签索引对齐

#### DataFrame和DataFrame 进行拼接

In [2]:
# Load the three sample tables used throughout the concat examples.
df1, df2, df3 = [
    pd.read_csv(csv_path)
    for csv_path in (
        './data/concat_1.csv',
        './data/concat_2.csv',
        './data/concat_3.csv',
    )
]

In [3]:
# Display df1, the frame read from ./data/concat_1.csv.
df1

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2
3,a3,b3,c3,d3


In [4]:
# Display df2, the frame read from ./data/concat_2.csv.
df2

Unnamed: 0,A,B,C,D
0,a4,b4,c4,d4
1,a5,b5,c5,d5
2,a6,b6,c6,d6
3,a7,b7,c7,d7


In [5]:
# Display df3, the frame read from ./data/concat_3.csv.
df3

Unnamed: 0,A,B,C,D
0,a8,b8,c8,d8
1,a9,b9,c9,d9
2,a10,b10,c10,d10
3,a11,b11,c11,d11


In [6]:
# Row-wise concat (axis=0): rows are appended and columns are matched by label.
# Each input keeps its own row labels, so labels repeat in the result.
row_concat = pd.concat(objs=[df1, df2, df3], axis=0)
row_concat

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2
3,a3,b3,c3,d3
0,a4,b4,c4,d4
1,a5,b5,c5,d5
2,a6,b6,c6,d6
3,a7,b7,c7,d7
0,a8,b8,c8,d8
1,a9,b9,c9,d9


In [10]:
# Select a row by integer position: position 3 is the fourth row of the result.
row_concat.iloc[3]

A    a3
B    b3
C    c3
D    d3
Name: 3, dtype: object

In [11]:
# Select rows by label: label 3 exists once per source frame, so several rows come back.
row_concat.loc[3]

Unnamed: 0,A,B,C,D
3,a3,b3,c3,d3
3,a7,b7,c7,d7
3,a11,b11,c11,d11


In [12]:
# ignore_index=True drops the original row labels; the result is renumbered from 0.
pd.concat([df1, df2, df3], ignore_index=True)

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2
3,a3,b3,c3,d3
4,a4,b4,c4,d4
5,a5,b5,c5,d5
6,a6,b6,c6,d6
7,a7,b7,c7,d7
8,a8,b8,c8,d8
9,a9,b9,c9,d9


#### DataFrame 和 Series 进行拼接

In [7]:
# Four-element string Series with the default integer index (0-3) and no name.
new_series = pd.Series(data=['n1', 'n2', 'n3', 'n4'])
new_series

0    n1
1    n2
2    n3
3    n4
dtype: object

In [13]:
# Row-wise concat of a DataFrame and an unnamed Series: the Series values land in a
# new column labelled 0, which matches none of A-D, so the gaps are filled with NaN.
pd.concat([df1, new_series])

Unnamed: 0,A,B,C,D,0
0,a0,b0,c0,d0,
1,a1,b1,c1,d1,
2,a2,b2,c2,d2,
3,a3,b3,c3,d3,
0,,,,,n1
1,,,,,n2
2,,,,,n3
3,,,,,n4


### 列拼接：按照行标签索引对齐

#### DataFrame和DataFrame 进行拼接

In [14]:
# Column-wise concat (axis=1): frames are placed side by side, aligned on row labels.
pd.concat([df1, df2, df3], axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1,A.2,B.2,C.2,D.2
0,a0,b0,c0,d0,a4,b4,c4,d4,a8,b8,c8,d8
1,a1,b1,c1,d1,a5,b5,c5,d5,a9,b9,c9,d9
2,a2,b2,c2,d2,a6,b6,c6,d6,a10,b10,c10,d10
3,a3,b3,c3,d3,a7,b7,c7,d7,a11,b11,c11,d11


#### DataFrame 和 Series 进行拼接

In [8]:
# Four-element string Series used by the column-wise concat example that follows.
new_series = pd.Series(data=['n1', 'n2', 'n3', 'n4'])
new_series

0    n1
1    n2
2    n3
3    n4
dtype: object

In [15]:
# Column-wise concat with a Series: row labels 0-3 match on both sides, so the
# Series is appended as one extra column labelled 0.
pd.concat([df1, new_series], axis=1)

Unnamed: 0,A,B,C,D,0
0,a0,b0,c0,d0,n1
1,a1,b1,c1,d1,n2
2,a2,b2,c2,d2,n3
3,a3,b3,c3,d3,n4


### concat join 参数的设置

In [16]:
# Display df1 before its column labels are reassigned.
df1

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2
3,a3,b3,c3,d3


In [17]:
# Display df3 before its column labels are reassigned.
df3

Unnamed: 0,A,B,C,D
0,a8,b8,c8,d8
1,a9,b9,c9,d9
2,a10,b10,c10,d10
3,a11,b11,c11,d11


In [19]:
# Reassign column labels in place (positional: the i-th label names the i-th column).
# df1 already carries exactly these labels, so this line leaves it unchanged.
df1.columns = ['A', 'B', 'C', 'D']
# df3 now shares only 'A' and 'C' with df1; the values formerly under 'B' sit under 'C'.
# NOTE(review): this mutates df3, so re-running earlier cells that use df3 gives different output.
df3.columns = ['A', 'C', 'F', 'H']

In [20]:
# Display df1 after the reassignment (labels are still A, B, C, D).
df1

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2
3,a3,b3,c3,d3


In [21]:
# Display df3 with its new column labels A, C, F, H.
df3

Unnamed: 0,A,C,F,H
0,a8,b8,c8,d8
1,a9,b9,c9,d9
2,a10,b10,c10,d10
3,a11,b11,c11,d11


##### concat 方法的 join参数：
**1）默认为 outer：保留所有行/列标签（并集），无法对齐的位置在拼接结果中填充为 NaN**<br/>
**2）设置为 inner：只有能够对齐的行/列，才会出现在拼接的结果中**

In [22]:
# Default join='outer': the result has the union of column labels (A, B, C, D, F, H);
# cells for columns a frame does not have are filled with NaN.
pd.concat([df1, df3])

Unnamed: 0,A,B,C,D,F,H
0,a0,b0,c0,d0,,
1,a1,b1,c1,d1,,
2,a2,b2,c2,d2,,
3,a3,b3,c3,d3,,
0,a8,,b8,,c8,d8
1,a9,,b9,,c9,d9
2,a10,,b10,,c10,d10
3,a11,,b11,,c11,d11


In [23]:
# join='inner': only the column labels present in both frames (A and C) are kept.
pd.concat([df1, df3], join='inner')

Unnamed: 0,A,C
0,a0,c0
1,a1,c1
2,a2,c2
3,a3,c3
0,a8,b8
1,a9,b9
2,a10,b10
3,a11,b11


## merge 关联数据

##### 作用：类似于 SQL 中的 join 语句，用于将两个数据集按照指定的列或行标签（索引）进行连接，默认是 inner，可以设置为：left、right、outer

**加载部门数据和员工数据**

In [24]:
# Load the departments table and display it in full.
departments = pd.read_csv('./data/departments.csv')
departments

Unnamed: 0,id,name
0,1,IT
1,2,Management
2,3,Human Resources
3,4,Accounting
4,5,Help Desk


In [25]:
# Inspect the column structure: names, non-null counts and dtypes.
departments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5 non-null      int64 
 1   name    5 non-null      object
dtypes: int64(1), object(1)
memory usage: 208.0+ bytes


In [26]:
# Load the employees table and preview its first five rows.
employees = pd.read_csv('./data/employees.csv')
employees.head()

Unnamed: 0,id,first_name,last_name,department_id,salary,years_worked
0,1,Diane,Turner,1,5330,4
1,2,Clarence,Robinson,1,3617,2
2,3,Eugene,Phillips,1,4877,2
3,4,Philip,Mitchell,1,5259,3
4,5,Ann,Wright,2,2094,5


In [27]:
# Inspect the column structure: names, non-null counts and dtypes.
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             20 non-null     int64 
 1   first_name     20 non-null     object
 2   last_name      20 non-null     object
 3   department_id  20 non-null     int64 
 4   salary         20 non-null     int64 
 5   years_worked   20 non-null     int64 
dtypes: int64(4), object(2)
memory usage: 1.1+ KB


**将部门数据和员工数据进行关联**

In [29]:
# Join departments to employees where departments.id == employees.department_id
# (inner join by default). Both frames have an `id` column, so pandas suffixes
# them as id_x (departments) and id_y (employees) in the result.
#
# The method form below is equivalent to the function form
#   pd.merge(departments, employees, left_on='id', right_on='department_id')
# so the merge only needs to be executed once.
merge_data = departments.merge(employees, left_on='id', right_on='department_id')
merge_data.head()

Unnamed: 0,id_x,name,id_y,first_name,last_name,department_id,salary,years_worked
0,1,IT,1,Diane,Turner,1,5330,4
1,1,IT,2,Clarence,Robinson,1,3617,2
2,1,IT,3,Eugene,Phillips,1,4877,2
3,1,IT,4,Philip,Mitchell,1,5259,3
4,2,Management,5,Ann,Wright,2,2094,5


## join 关联数据

##### 作用：join 方法是 merge 方法的一种特殊情况，调用方数据集按照行标签（索引）或指定的列，与另一个数据集的行标签（索引）进行关联，
##### 默认是 left，可以设置为：right、inner、outer

In [30]:
# Load the stock data sets, one CSV file per year.
stock_2016, stock_2017, stock_2018 = [
    pd.read_csv(csv_path)
    for csv_path in (
        './data/stocks_2016.csv',
        './data/stocks_2017.csv',
        './data/stocks_2018.csv',
    )
]

In [31]:
# Display the frame read from ./data/stocks_2016.csv.
stock_2016

Unnamed: 0,Symbol,Shares,Low,High
0,AAPL,80,95,110
1,TSLA,50,80,130
2,WMT,40,55,70
