# Chapter 8 資料處理：連接、合併和重塑

## Data Wrangling: Join, Combine and Reshape

### 階層式索引 Hierarchical indexing

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
data=pd.Series(np.random.randn(9), index=[['a','a','b','b','c','c','d','d','d'],[1,2,3,1,2,3,1,2,3]])

In [4]:
data

a  1    0.520187
   2    0.969870
b  3   -0.315344
   1    1.362334
c  2    0.465128
   3    0.919127
d  1   -1.444234
   2   -0.217776
   3    1.659840
dtype: float64

In [5]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 3),
            ('b', 1),
            ('c', 2),
            ('c', 3),
            ('d', 1),
            ('d', 2),
            ('d', 3)],
           )

### 部份索引 partial indexing

In [6]:
data['b']

3   -0.315344
1    1.362334
dtype: float64

In [7]:
data['b':'c']

b  3   -0.315344
   1    1.362334
c  2    0.465128
   3    0.919127
dtype: float64

In [8]:
data[['b','d']]

b  3   -0.315344
   1    1.362334
d  1   -1.444234
   2   -0.217776
   3    1.659840
dtype: float64

In [9]:
data.loc[:,2]

a    0.969870
c    0.465128
d   -0.217776
dtype: float64

### unstack(): 可以將多重index的Series轉成DataFrame。

In [10]:
data.unstack()

Unnamed: 0,1,2,3
a,0.520187,0.96987,
b,1.362334,,-0.315344
c,,0.465128,0.919127
d,-1.444234,-0.217776,1.65984


### stack(): unstack()的相反。

In [11]:
data.unstack().stack()

a  1    0.520187
   2    0.969870
b  1    1.362334
   3   -0.315344
c  2    0.465128
   3    0.919127
d  1   -1.444234
   2   -0.217776
   3    1.659840
dtype: float64

### DataFrame也可以有多重的index與column name。

In [12]:
# 4x3 block of integers with a two-level row index and a two-level column index.
frame = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=[
        ['a', 'b', 'c', 'b'],
        ['one', 'one', 'two', 'two'],
    ],
    columns=[
        ['Jerry', 'Tom', 'Jerry'],
        ['Water', 'Tea', 'Tea'],
    ],
)

In [13]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Jerry,Tom,Jerry
Unnamed: 0_level_1,Unnamed: 1_level_1,Water,Tea,Tea
a,one,0,1,2
b,one,3,4,5
c,two,6,7,8
b,two,9,10,11


In [14]:
frame.index.names=['alphabet','numbers']

In [15]:
frame.columns.names=['names','drinks']

In [16]:
frame

Unnamed: 0_level_0,names,Jerry,Tom,Jerry
Unnamed: 0_level_1,drinks,Water,Tea,Tea
alphabet,numbers,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,one,0,1,2
b,one,3,4,5
c,two,6,7,8
b,two,9,10,11


### 欄位有多重index的DataFrame，可以用外層欄位名稱（names）做部份索引取出一組欄，也可以用(names, drinks)的組合取出單一欄。

In [17]:
frame['Jerry']

Unnamed: 0_level_0,drinks,Water,Tea
alphabet,numbers,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,0,2
b,one,3,5
c,two,6,8
b,two,9,11


In [18]:
frame['Jerry','Tea']

alphabet  numbers
a         one         2
b         one         5
c         two         8
b         two        11
Name: (Jerry, Tea), dtype: int64

### pd.MultiIndex.from_arrays: 可以將多個index儲存成物件。

In [19]:
a=pd.MultiIndex.from_arrays([['Work','Lesiure','Rest'],['KPI','SCRUM','PDCA']], names=['matters','time_management'])

In [20]:
a

MultiIndex([(   'Work',   'KPI'),
            ('Lesiure', 'SCRUM'),
            (   'Rest',  'PDCA')],
           names=['matters', 'time_management'])

In [21]:
b_frame=pd.DataFrame(np.arange(6).reshape(3,2),index=a, columns=['a','b'])

In [22]:
b_frame

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
matters,time_management,Unnamed: 2_level_1,Unnamed: 3_level_1
Work,KPI,0,1
Lesiure,SCRUM,2,3
Rest,PDCA,4,5


### 重排階層與依階層排序值

In [23]:
frame

Unnamed: 0_level_0,names,Jerry,Tom,Jerry
Unnamed: 0_level_1,drinks,Water,Tea,Tea
alphabet,numbers,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,one,0,1,2
b,one,3,4,5
c,two,6,7,8
b,two,9,10,11


### swaplevel(): 可以將Index前後對調，數值不會變動。

In [24]:
frame.swaplevel('alphabet','numbers')

Unnamed: 0_level_0,names,Jerry,Tom,Jerry
Unnamed: 0_level_1,drinks,Water,Tea,Tea
numbers,alphabet,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,a,0,1,2
one,b,3,4,5
two,c,6,7,8
two,b,9,10,11


### sort_index(): 讓數值按照index的level做排序。

In [25]:
frame

Unnamed: 0_level_0,names,Jerry,Tom,Jerry
Unnamed: 0_level_1,drinks,Water,Tea,Tea
alphabet,numbers,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,one,0,1,2
b,one,3,4,5
c,two,6,7,8
b,two,9,10,11


In [26]:
frame.sort_index(level=0) #按照alphabet排序。

Unnamed: 0_level_0,names,Jerry,Tom,Jerry
Unnamed: 0_level_1,drinks,Water,Tea,Tea
alphabet,numbers,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,one,0,1,2
b,one,3,4,5
b,two,9,10,11
c,two,6,7,8


In [27]:
frame.swaplevel('alphabet','numbers').sort_index(level=1) #按照alphabet排序。

Unnamed: 0_level_0,names,Jerry,Tom,Jerry
Unnamed: 0_level_1,drinks,Water,Tea,Tea
numbers,alphabet,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,a,0,1,2
one,b,3,4,5
two,b,9,10,11
two,c,6,7,8


### 指定階層統計資訊

In [28]:
frame

Unnamed: 0_level_0,names,Jerry,Tom,Jerry
Unnamed: 0_level_1,drinks,Water,Tea,Tea
alphabet,numbers,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,one,0,1,2
b,one,3,4,5
c,two,6,7,8
b,two,9,10,11


In [29]:
# Sum the rows that share the same 'alphabet' index label.
# sum(level=...) is deprecated and was removed in pandas 2.0; groupby(level=...) is the replacement.
frame.groupby(level='alphabet').sum()


names,Jerry,Tom,Jerry
drinks,Water,Tea,Tea
alphabet,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,0,1,2
b,12,14,16
c,6,7,8


In [30]:
frame

Unnamed: 0_level_0,names,Jerry,Tom,Jerry
Unnamed: 0_level_1,drinks,Water,Tea,Tea
alphabet,numbers,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,one,0,1,2
b,one,3,4,5
c,two,6,7,8
b,two,9,10,11


In [31]:
# Sum the columns that share the same 'names' column label.
# sum(level=..., axis=1) was removed in pandas 2.0 and groupby(axis=1) is deprecated too,
# so transpose, group the former columns by level, then transpose back.
frame.T.groupby(level='names').sum().T


Unnamed: 0_level_0,names,Jerry,Tom
alphabet,numbers,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,2,1
b,one,8,4
c,two,14,7
b,two,20,10


### 用DataFrame的欄當index

In [32]:
# Seven-row frame; columns 'c' and 'd' are later used as index keys.
frame = pd.DataFrame(
    {
        'a': range(7),
        'b': range(7, 0, -1),
        'c': ['one'] * 3 + ['two'] * 4,
        'd': [0, 1, 2, 0, 1, 2, 3],
    }
)

In [33]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


### set_index(['c','d']): 將c,d欄位轉變成frame2的index。

In [34]:
frame2=frame.set_index(['c','d'])

In [35]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


### set_index(['c','d']): 預設值會把c,d欄位刪除，但是也可以寫上drop=False，就可以保留c,d欄位。

In [36]:
frame.set_index(['c','d'],drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


### reset_index(): 和set_index()是相反的功能，會將frame2攤平成一維度的index。

In [37]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [38]:
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


### 合併資料集合

### DataFrame資料庫中的join動作

In [39]:
df1= pd.DataFrame({'key':['b','b','a','a','c','c','b'], 'data1': range(7)})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,a,3
4,c,4
5,c,5
6,b,6


In [40]:
df2=pd.DataFrame({'key':['a','b','d'],'data2':range(3)})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


### pd.merge(df1,df2): 會依據兩個DataFrame共同的key進行值的合併。 
### 預設值為inner join，兩邊的共有值才會出現，故c,d不見了。

In [41]:
pd.merge(df1,df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,3,0


### pd.merge(df1,df2, on='key')：可以透過on來指定要合併的欄位。

In [42]:
pd.merge(df1,df2,on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,3,0


In [43]:
df3=pd.DataFrame({'1key':['b','b','a','c','a','a','b'],'data1':range(7)})
df3

Unnamed: 0,1key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [44]:
df4=pd.DataFrame({'rkey':['a','b','d'],'data2':range(3)})
df4

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


### pd.merge(df3, df4, left_on='1key', right_on='rkey'): 如果都沒有共同的key名稱，可以使用left_on, right_on去定義連接的欄位。
### 因為預設為inner join，兩邊對應不到的key都會被移除，像是df3的c與df4的d就不會出現在合併的結果內。

In [45]:
pd.merge(df3,df4, left_on='1key', right_on='rkey')

Unnamed: 0,1key,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


### pd.merge(df1, df2, how='outer') ：將預設值從inner改為outer，可以讓對應不到的值也出現。

In [46]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,a,3
4,c,4
5,c,5
6,b,6


In [47]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [48]:
pd.merge(df1,df2, how='outer') #c,d就出現了！

Unnamed: 0,key,data1,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,3.0,0.0
5,c,4.0,
6,c,5.0,
7,d,,2.0


In [49]:
pd.merge(df1,df2, how='right') #出現df2所有值。

Unnamed: 0,key,data1,data2
0,a,2.0,0
1,a,3.0,0
2,b,0.0,1
3,b,1.0,1
4,b,6.0,1
5,d,,2


In [50]:
pd.merge(df1,df2, how='left') #出現df1所有值。

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,1,1.0
2,a,2,0.0
3,a,3,0.0
4,c,4,
5,c,5,
6,b,6,1.0


In [51]:
pd.merge(df1, df2, how='inner') # 兩邊都有共同值才會出現。

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,3,0


### 多對多的合併: 會產生笛卡爾積，原本df1只有3個b，合併後出現6個b。

In [52]:
df1=pd.DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [53]:
df2=pd.DataFrame({'key':['a','b','a','b','d'],'data2':range(5)})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [54]:
pd.merge(df1,df2, on='key', how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [55]:
pd.merge(df1,df2, how='inner')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,5,1
5,b,5,3
6,a,2,0
7,a,2,2
8,a,4,0
9,a,4,2


In [56]:
left=pd.DataFrame({'key1':['cool','cool','bar'],'key2':['one','two','one'],'lval':[1,2,3]})
left

Unnamed: 0,key1,key2,lval
0,cool,one,1
1,cool,two,2
2,bar,one,3


In [57]:
right=pd.DataFrame({'key1':['cool','cool','bar','bar'],'key2':['one','one','two','two'],'rval':[4,5,6,7]})
right

Unnamed: 0,key1,key2,rval
0,cool,one,4
1,cool,one,5
2,bar,two,6
3,bar,two,7


### 如果遇到多個key，在寫入merge內的時候可以使用list。
### 在進行欄位名稱合併時，系統會自動忽略Index。

In [58]:
pd.merge(left,right, on=['key1','key2'], how='outer')

Unnamed: 0,key1,key2,lval,rval
0,cool,one,1.0,4.0
1,cool,one,1.0,5.0
2,cool,two,2.0,
3,bar,one,3.0,
4,bar,two,,6.0
5,bar,two,,7.0


### 當兩個data有重複的欄位名稱，merge內建suffixes的功能，會在欄位後面補上_x, _y作為欄位區分。

In [59]:
pd.merge(left, right, on='key1')

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,cool,one,1,one,4
1,cool,one,1,one,5
2,cool,two,2,one,4
3,cool,two,2,one,5
4,bar,one,3,two,6
5,bar,one,3,two,7


### suffixes=()：你也可以自己定義後綴的名稱要叫做什麼。

In [60]:
pd.merge(left, right, on='key1', suffixes=('_left','_right'))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,cool,one,1,one,4
1,cool,one,1,one,5
2,cool,two,2,one,4
3,cool,two,2,one,5
4,bar,one,3,two,6
5,bar,one,3,two,7


### 依據index做合併

In [61]:
left1=pd.DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [62]:
right1=pd.DataFrame({'group_val':[3.5,7]}, index=['a','b'])
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [63]:
pd.merge(left1, right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [64]:
pd.merge(left1, right1, left_on='key', right_index=True, )

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


### 如果要使用多個欄位進行merge，可以使用left_on=['key1','key2']，只是要注意合併後的值如果要全數出現，需要how='outer'。

In [65]:
lefth=pd.DataFrame({'key1':['Sun','Sun','Sun','Moon','Moon'],'key2':[2022,2021,2020,2021,2020],
                   'data':np.arange(5)})
lefth

Unnamed: 0,key1,key2,data
0,Sun,2022,0
1,Sun,2021,1
2,Sun,2020,2
3,Moon,2021,3
4,Moon,2020,4


In [66]:
# Six rows keyed by a two-level index; ('Moon', 2021) and ('Sun', 2022) each appear twice.
righth = pd.DataFrame(
    data=np.arange(12).reshape((6, 2)),
    index=[
        ['Moon', 'Moon', 'Sun', 'Sun', 'Moon', 'Moon'],
        [2021, 2022, 2022, 2022, 2021, 2020],
    ],
    columns=['event1', 'event2'],
)
righth.sort_index()

Unnamed: 0,Unnamed: 1,event1,event2
Moon,2020,10,11
Moon,2021,0,1
Moon,2021,8,9
Moon,2022,2,3
Sun,2022,4,5
Sun,2022,6,7


### 只有key1, key2兩邊都有出現的值，才會出現在合併的表格內。

In [67]:
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)

Unnamed: 0,key1,key2,data,event1,event2
0,Sun,2022,0,4,5
0,Sun,2022,0,6,7
3,Moon,2021,3,0,1
3,Moon,2021,3,8,9
4,Moon,2020,4,10,11


### how='outer'，即使兩邊有不一樣的key1與key2，會使用NaN的方式全數出現。

In [68]:
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True, how='outer')

Unnamed: 0,key1,key2,data,event1,event2
0,Sun,2022,0.0,4.0,5.0
0,Sun,2022,0.0,6.0,7.0
1,Sun,2021,1.0,,
2,Sun,2020,2.0,,
3,Moon,2021,3.0,0.0,1.0
3,Moon,2021,3.0,8.0,9.0
4,Moon,2020,4.0,10.0,11.0
4,Moon,2022,,2.0,3.0


### 如果merge的依據為左右兩個表的index，那麼可以寫入left_index=True, right_index=True

In [69]:
left2=pd.DataFrame([[1,2],[3,4],[5,6]],index=['a','c','e'],columns=['Sun','Moon'])
left2

Unnamed: 0,Sun,Moon
a,1,2
c,3,4
e,5,6


In [70]:
right2=pd.DataFrame([[7,8],[9,10],[11,12],[13,14]],index=['b','c','d','e'],columns=['Sky','Cloud'])
right2

Unnamed: 0,Sky,Cloud
b,7,8
c,9,10
d,11,12
e,13,14


In [71]:
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

Unnamed: 0,Sun,Moon,Sky,Cloud
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


### join(): 本身合併的條件就是依據兩個表格的Index，如果是要用index合併表格，很簡單的直接寫上join即可。

In [72]:
left2.join(right2,how='outer')

Unnamed: 0,Sun,Moon,Sky,Cloud
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [73]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [74]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


### join(): 本身採用的預設合併原則為left join，會保留所有左邊表格的資料，以及採用Index為key值。
### left1與right1在合併時，系統會判定left1的連接條件是range(6)，對應到right1的a,b就會找不到值。

In [75]:
left1.join(right1)

Unnamed: 0,key,value,group_val
0,a,0,
1,b,1,
2,a,2,
3,a,3,
4,b,4,
5,c,5,


### 因此要告訴系統left1的連接條件來自key，這樣才能對應到right1的a,b。

In [76]:
left1.join(right1, on='key')

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [77]:
another=pd.DataFrame([[7,8],[9,10],[11,12],[16,17]],index=['a','c','e','f'],columns=['Rainbow','Rain'])
another

Unnamed: 0,Rainbow,Rain
a,7,8
c,9,10
e,11,12
f,16,17


In [78]:
left2

Unnamed: 0,Sun,Moon
a,1,2
c,3,4
e,5,6


In [79]:
right2

Unnamed: 0,Sky,Cloud
b,7,8
c,9,10
d,11,12
e,13,14


### 以left2的index=['a','c','e']為主，同時合併right2表格與another表格，left2會全數保留，其餘表格沒有對應到會消失。

In [80]:
left2.join([right2,another])

Unnamed: 0,Sun,Moon,Sky,Cloud,Rainbow,Rain
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0


### how='outer'，即使沒有對應到left2的index，其餘沒有對到的值都會以NaN出現。

In [81]:
left2.join([right2,another],how='outer')

Unnamed: 0,Sun,Moon,Sky,Cloud,Rainbow,Rain
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0
b,,,7.0,8.0,,
d,,,11.0,12.0,,
f,,,,,16.0,17.0


### 沿軸做連接

### np.concatenate(): 不同於加法，會將兩個表格做垂直或橫向的擴編。

In [82]:
arr=np.arange(12).reshape(4,3)
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

### axis=1 以列位為主的橫向合併。

In [83]:
np.concatenate([arr,arr],axis=1)

array([[ 0,  1,  2,  0,  1,  2],
       [ 3,  4,  5,  3,  4,  5],
       [ 6,  7,  8,  6,  7,  8],
       [ 9, 10, 11,  9, 10, 11]])

### np.concatenate([arr,arr]): 預設值為以欄位為主的垂直合併。

In [84]:
np.concatenate([arr,arr])

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [85]:
arr+arr

array([[ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16],
       [18, 20, 22]])

### pandas的concat

In [86]:
s1=pd.Series([0,1],index=['a','b'])
s2=pd.Series([2,3,4],index=['c','d','e'])
s3=pd.Series([5,6],index=['f','g'])

### pd.concat預設值也會是垂直的拓展資料。

In [87]:
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

### 如果axis=1，將會以欄為主橫向拓展資料，形成一個DataFrame。

In [88]:
pd.concat([s1,s2,s3],axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


### 對於np.concatenate來說s1,s2,s3=array，只有一個維度，沒辦法使用axis=1。

In [89]:
np.concatenate([s1,s2,s3])

array([0, 1, 2, 3, 4, 5, 6])

In [90]:
s4=pd.concat([s1,s3])
s4

a    0
b    1
f    5
g    6
dtype: int64

### axis=1 橫向的拓展欄位。

### pd.concat的預設值為outer，所以可以看到NAN。

In [91]:
pd.concat([s1,s4],axis=1)

Unnamed: 0,0,1
a,0.0,0
b,1.0,1
f,,5
g,,6


### 如果不想要看到NAN的值，可以使用join='inner'，就可以只取兩個表格都有的資料。

In [92]:
pd.concat([s1,s4],axis=1,join='inner')

Unnamed: 0,0,1
a,0,0
b,1,1


### join_axes=[] 在pandas 1.0之後已經移除，可改用reindex()去指定要保留的Index。

In [93]:
df1=pd.concat([s1,s4],axis=1)
df1=df1.reindex(['a','c','b','e'])
df1

Unnamed: 0,0,1
a,0.0,0.0
c,,
b,1.0,1.0
e,,


### 有時候想要標記合併的index區塊，就可以使用keys=[]去標示值原先來自哪裡或是另作區分。

In [94]:
result=pd.concat([s1,s2,s3],keys=['one','two','three'])
result

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64

### unstack(): 會將重疊的index攤平，重組成另一個DataFrame。

In [95]:
result.unstack()

Unnamed: 0,a,b,c,d,e,f,g
one,0.0,1.0,,,,,
two,,,2.0,3.0,4.0,,
three,,,,,,5.0,6.0


### 如果同時寫上axis=1, keys=[]，此時的keys會成為欄位的名稱。

In [96]:
pd.concat([s1,s2,s3],axis=1,keys=['one','two','three'])

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


### pd.concat([df1,df2])

In [97]:
df1=pd.DataFrame(np.arange(6).reshape(3,2), index=['a','b','c'], columns=['one','two'])
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [98]:
df2=pd.DataFrame(5+np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])
df2

Unnamed: 0,three,four
a,5,6
c,7,8


### 同時寫上axis=1, keys=[]，此時keys一樣會變成欄位名稱。

In [99]:
pd.concat([df1,df2],axis=1,keys=['level1','level2'])

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


### pd.concat(dict) 如果是直接傳入dict，dict的key值會直接被當成keys。

In [100]:
pd.concat({'level1':df1, 'level2':df2}, axis=1)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


### names 可以用來命名軸的名稱。

In [101]:
pd.concat([df1,df2], axis=1, keys=['level1','level2'], names=['upper','lower'])

upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


### pd.concat遇到列index和資料完全不相關，也就是想合併的不是index而是columns。

In [102]:
df1=pd.DataFrame(np.random.randn(3,4), columns=['a','b','c','d'])
df1

Unnamed: 0,a,b,c,d
0,-0.532369,0.580585,-0.79869,-0.761613
1,-0.007583,0.383612,-0.010075,0.44294
2,0.541938,0.435125,0.069307,-0.242656


In [103]:
df2=pd.DataFrame(np.random.randn(2,3), columns=['b','d','a'])
df2

Unnamed: 0,b,d,a
0,0.735761,-1.407513,-0.267585
1,-0.673174,-0.212824,0.353483


### 當列index沒有意義、不需要保留時，可以寫上ignore_index=True，合併後會重新產生0到n-1的index，欄位則依名稱對齊。

In [104]:
pd.concat([df1,df2],ignore_index=True)

Unnamed: 0,a,b,c,d
0,-0.532369,0.580585,-0.79869,-0.761613
1,-0.007583,0.383612,-0.010075,0.44294
2,0.541938,0.435125,0.069307,-0.242656
3,-0.267585,0.735761,,-1.407513
4,0.353483,-0.673174,,-0.212824


### 合併有重複的資料

In [105]:
a=pd.Series([np.nan,2.5,0,3.5,4.5,np.nan], index=['f','e','d','c','b','a'])
a

f    NaN
e    2.5
d    0.0
c    3.5
b    4.5
a    NaN
dtype: float64

In [106]:
b=pd.Series([0,np.nan,2,np.nan,np.nan,5],index=['a','b','c','d','e','f'])
b

a    0.0
b    NaN
c    2.0
d    NaN
e    NaN
f    5.0
dtype: float64

### pd.isnull(): 為布林值，判斷a內誰為null值。
### np.where(判斷條件, True的狀況, False的狀況)：如下為如果是null值，填入b值，如果不是null值，填入a值。
### np.where(): 會直接忽略index，直接按照值的排序做資料替換。

In [107]:
np.where(pd.isnull(a),b,a)

array([0. , 2.5, 0. , 3.5, 4.5, 5. ])

### combine_first(): b.combine_first(a)會以b的值為主，b的缺失值再依index對齊後以a的值遞補；和np.where不同的是，combine_first會做index對齊。

In [108]:
b.combine_first(a)

a    0.0
b    4.5
c    2.0
d    0.0
e    2.5
f    5.0
dtype: float64

### combine_first(DataFrame)

### range(start, stop, step)
### range(2,18,4)= [2,6,10,14]

In [109]:
df1=pd.DataFrame({'a':[1,np.nan,5,np.nan],
                 'b':[np.nan,2,np.nan,6],
                 'c': range(2,18,4)})
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [110]:
df2=pd.DataFrame({'a':[5,4,np.nan,3,7], 'b':[np.nan,3,4,6,8]})
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


### df1.combine_first(df2): 如果df1有缺失值，優先以df2遞補。

In [111]:
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


### 重塑和旋轉 Reshape & Pivot

### pd.Index([],name='') 可以幫軸命名。

In [112]:
# 2x3 frame whose row axis is named 'sky' and whose column axis is named 'number'.
data = pd.DataFrame(
    data=np.arange(6).reshape(2, 3),
    index=pd.Index(['Sun', 'Moon'], name='sky'),
    columns=pd.Index(['one', 'two', 'three'], name='number'),
)

In [113]:
data

number,one,two,three
sky,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sun,0,1,2
Moon,3,4,5


### stack(): 會將欄位名稱旋轉到列。

In [114]:
result=data.stack()
result

sky   number
Sun   one       0
      two       1
      three     2
Moon  one       3
      two       4
      three     5
dtype: int64

### unstack(): 將列旋轉到欄位。

In [115]:
result.unstack()

number,one,two,three
sky,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sun,0,1,2
Moon,3,4,5


### 可以指定層編號作為stack(), unstack()的變動層。
### sky=0, number=1，unstack(0)即是指將sky變成欄位名稱。

In [116]:
result.unstack(0)

sky,Sun,Moon
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


### 也可以直接寫入軸名稱。

In [117]:
result.unstack('sky')

sky,Sun,Moon
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [118]:
s1=pd.Series([0,1,2,3],index=['a','b','c','d'])
s2=pd.Series([4,5,6],index=['c','d','e'])

In [119]:
data2=pd.concat([s1,s2], keys=['one','two'])
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

### 進行unstack()時，如果有缺失值，系統會直接補上NAN。

In [120]:
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


### 進行stack()時，會自動濾除遺失值。

In [121]:
data2.unstack().stack()

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

### 如果stack(dropna=False)，就不會濾除遺失值，NaN會被保留下來。

In [122]:
data2.unstack().stack(dropna=False)

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
     e    NaN
two  a    NaN
     b    NaN
     c    4.0
     d    5.0
     e    6.0
dtype: float64

In [123]:
result

sky   number
Sun   one       0
      two       1
      three     2
Moon  one       3
      two       4
      three     5
dtype: int64

### columns=pd.Index(['left','right'],name='side')：給予軸名稱。

In [124]:
df=pd.DataFrame({'left':result,'right':result+5},columns=pd.Index(['left','right'],name='side'))
df

Unnamed: 0_level_0,side,left,right
sky,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Sun,one,0,5
Sun,two,1,6
Sun,three,2,7
Moon,one,3,8
Moon,two,4,9
Moon,three,5,10


In [125]:
df.unstack('sky')

side,left,left,right,right
sky,Sun,Moon,Sun,Moon
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


### 在進行stack()由欄轉為列時，也可以指定欄位名稱。

In [126]:
df.unstack('sky').stack('side')

Unnamed: 0_level_0,sky,Moon,Sun
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,3,0
one,right,8,5
two,left,4,1
two,right,9,6
three,left,5,2
three,right,10,7


### 長格式旋轉成寬格式

In [127]:
data=pd.read_csv('examples/macrodata.csv')

In [128]:
data.head()

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959,1,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959,2,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959,3,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959,4,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960,1,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


### pd.PeriodIndex(): 合併時間欄位。

In [129]:
# Combine the 'year' and 'quarter' columns into a single PeriodIndex named 'date'.
# NOTE(review): pandas 2.2 deprecates the year=/quarter= constructor keywords in favour of
# pd.PeriodIndex.from_fields(year=..., quarter=...); that classmethod is not available on
# pandas 1.x, so confirm the installed version before switching.
period=pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')

In [130]:
period, type(period)

(PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
              '1960Q3', '1960Q4', '1961Q1', '1961Q2',
              ...
              '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
              '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
             dtype='period[Q-DEC]', name='date', length=203),
 pandas.core.indexes.period.PeriodIndex)

### columns=pd.Index() 挑出需要的欄位。

In [131]:
columns=pd.Index(['realgdp','infl','unemp'],name='item')

In [132]:
columns, type(columns)

(Index(['realgdp', 'infl', 'unemp'], dtype='object', name='item'),
 pandas.core.indexes.base.Index)

### data.reindex() 將欄位做置換。

In [133]:
data=data.reindex(columns=columns)

In [134]:
data.head()

item,realgdp,infl,unemp
0,2710.349,0.0,5.8
1,2778.801,2.34,5.1
2,2775.488,2.74,5.3
3,2785.204,0.27,5.6
4,2847.699,2.31,5.2


### period.to_timestamp('D'): 將PeriodIndex轉換成DatetimeIndex，預設取每一期的起始日。

In [135]:
data.index=period.to_timestamp('D')

In [136]:
data.index

DatetimeIndex(['1959-01-01', '1959-04-01', '1959-07-01', '1959-10-01',
               '1960-01-01', '1960-04-01', '1960-07-01', '1960-10-01',
               '1961-01-01', '1961-04-01',
               ...
               '2007-04-01', '2007-07-01', '2007-10-01', '2008-01-01',
               '2008-04-01', '2008-07-01', '2008-10-01', '2009-01-01',
               '2009-04-01', '2009-07-01'],
              dtype='datetime64[ns]', name='date', length=203, freq='QS-OCT')

In [137]:
data.head()

item,realgdp,infl,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,2710.349,0.0,5.8
1959-04-01,2778.801,2.34,5.1
1959-07-01,2775.488,2.74,5.3
1959-10-01,2785.204,0.27,5.6
1960-01-01,2847.699,2.31,5.2


In [138]:
data.stack() ###將item從欄轉為列。

date        item   
1959-01-01  realgdp     2710.349
            infl           0.000
            unemp          5.800
1959-04-01  realgdp     2778.801
            infl           2.340
                         ...    
2009-04-01  infl           3.370
            unemp          9.200
2009-07-01  realgdp    12990.341
            infl           3.560
            unemp          9.600
Length: 609, dtype: float64

In [139]:
data.stack().reset_index() ###給予data新的index。

Unnamed: 0,date,item,0
0,1959-01-01,realgdp,2710.349
1,1959-01-01,infl,0.000
2,1959-01-01,unemp,5.800
3,1959-04-01,realgdp,2778.801
4,1959-04-01,infl,2.340
...,...,...,...
604,2009-04-01,infl,3.370
605,2009-04-01,unemp,9.200
606,2009-07-01,realgdp,12990.341
607,2009-07-01,infl,3.560


### rename(columns={0:'value'}) 將欄位名稱從0改為value。

In [140]:
ldata=data.stack().reset_index().rename(columns={0:'value'})

### 確認前10列的資料。

In [141]:
ldata[:10] 

Unnamed: 0,date,item,value
0,1959-01-01,realgdp,2710.349
1,1959-01-01,infl,0.0
2,1959-01-01,unemp,5.8
3,1959-04-01,realgdp,2778.801
4,1959-04-01,infl,2.34
5,1959-04-01,unemp,5.1
6,1959-07-01,realgdp,2775.488
7,1959-07-01,infl,2.74
8,1959-07-01,unemp,5.3
9,1959-10-01,realgdp,2785.204


### pivot手法(index, columns, values): 使用此功能，可以將折成列的品項，翻轉成欄位的形式。

In [142]:
ldata

Unnamed: 0,date,item,value
0,1959-01-01,realgdp,2710.349
1,1959-01-01,infl,0.000
2,1959-01-01,unemp,5.800
3,1959-04-01,realgdp,2778.801
4,1959-04-01,infl,2.340
...,...,...,...
604,2009-04-01,infl,3.370
605,2009-04-01,unemp,9.200
606,2009-07-01,realgdp,12990.341
607,2009-07-01,infl,3.560


In [143]:
# Long -> wide: one row per date, one column per item, cells taken from 'value'.
# Keyword arguments are required by pandas >= 2.0 (positional use was deprecated in 1.5).
pivoted=ldata.pivot(index='date', columns='item', values='value')

### 可以看到原先在item內的infl, realgdp, unemp都從行被翻轉為欄位。

In [144]:
pivoted

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,0.00,2710.349,5.8
1959-04-01,2.34,2778.801,5.1
1959-07-01,2.74,2775.488,5.3
1959-10-01,0.27,2785.204,5.6
1960-01-01,2.31,2847.699,5.2
...,...,...,...
2008-07-01,-3.16,13324.600,6.0
2008-10-01,-8.79,13141.920,6.9
2009-01-01,0.94,12925.410,8.1
2009-04-01,3.37,12901.504,9.2


In [145]:
len(ldata) #ldata的總行數有609行。

609

### ldata['value2']: 插入一個新的欄位value2，並且給予該欄位隨機亂數609個數值。

In [146]:
ldata['value2']=np.random.randn(len(ldata))

In [147]:
ldata[:10]

Unnamed: 0,date,item,value,value2
0,1959-01-01,realgdp,2710.349,-1.013319
1,1959-01-01,infl,0.0,0.45138
2,1959-01-01,unemp,5.8,-0.340262
3,1959-04-01,realgdp,2778.801,0.76277
4,1959-04-01,infl,2.34,-2.073874
5,1959-04-01,unemp,5.1,0.116674
6,1959-07-01,realgdp,2775.488,0.070825
7,1959-07-01,infl,2.74,0.427771
8,1959-07-01,unemp,5.3,0.211883
9,1959-10-01,realgdp,2785.204,-1.321146


### pivot如果省略第三個欄位的值，會出現如下更加完整的表格。

In [148]:
# Omitting values= pivots every remaining column, giving hierarchical columns (value / value2).
# Keyword arguments are required by pandas >= 2.0 (positional use was deprecated in 1.5).
pivoted=ldata.pivot(index='date', columns='item')

In [149]:
pivoted[:5]

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-01-01,0.0,2710.349,5.8,0.45138,-1.013319,-0.340262
1959-04-01,2.34,2778.801,5.1,-2.073874,0.76277,0.116674
1959-07-01,2.74,2775.488,5.3,0.427771,0.070825,0.211883
1959-10-01,0.27,2785.204,5.6,1.609821,-1.321146,0.09423
1960-01-01,2.31,2847.699,5.2,0.241108,0.760125,0.713418


In [150]:
pivoted['value'][:5]

item,infl,realgdp,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,0.0,2710.349,5.8
1959-04-01,2.34,2778.801,5.1
1959-07-01,2.74,2775.488,5.3
1959-10-01,0.27,2785.204,5.6
1960-01-01,2.31,2847.699,5.2


### set_index().unstack()=pivot

In [151]:
ldata[:5]

Unnamed: 0,date,item,value,value2
0,1959-01-01,realgdp,2710.349,-1.013319
1,1959-01-01,infl,0.0,0.45138
2,1959-01-01,unemp,5.8,-0.340262
3,1959-04-01,realgdp,2778.801,0.76277
4,1959-04-01,infl,2.34,-2.073874


In [152]:
ldata.set_index(['date','item']) ### 先讓date, item成為index

Unnamed: 0_level_0,Unnamed: 1_level_0,value,value2
date,item,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-01-01,realgdp,2710.349,-1.013319
1959-01-01,infl,0.000,0.451380
1959-01-01,unemp,5.800,-0.340262
1959-04-01,realgdp,2778.801,0.762770
1959-04-01,infl,2.340,-2.073874
...,...,...,...
2009-04-01,infl,3.370,-2.090778
2009-04-01,unemp,9.200,-0.443409
2009-07-01,realgdp,12990.341,0.312084
2009-07-01,infl,3.560,-1.106816


In [153]:
unstacked=ldata.set_index(['date','item']).unstack('item') ### 將item從列轉成欄位名稱。

In [154]:
unstacked[:7] ### 呈現的結果就會跟pivot一模一樣。

Unnamed: 0_level_0,value,value,value,value2,value2,value2
item,infl,realgdp,unemp,infl,realgdp,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1959-01-01,0.0,2710.349,5.8,0.45138,-1.013319,-0.340262
1959-04-01,2.34,2778.801,5.1,-2.073874,0.76277,0.116674
1959-07-01,2.74,2775.488,5.3,0.427771,0.070825,0.211883
1959-10-01,0.27,2785.204,5.6,1.609821,-1.321146,0.09423
1960-01-01,2.31,2847.699,5.2,0.241108,0.760125,0.713418
1960-04-01,0.14,2834.39,5.2,0.820053,0.796722,-0.593472
1960-07-01,2.7,2839.022,5.6,0.633291,0.36106,1.422959


### 寬格式旋轉成長格式

### pivot: 把一個欄位，轉為多個欄位。

### pandas.melt：把多個欄位合併成一個欄位。

In [155]:
# Wide-format example: one identifier column ('key') and three value columns.
df = pd.DataFrame(
    {
        'key': ['sun', 'moon', 'lake'],
        'A': [1, 2, 3],
        'B': [4, 5, 6],
        'C': [7, 8, 9],
    }
)

In [156]:
df

Unnamed: 0,key,A,B,C
0,sun,1,4,7
1,moon,2,5,8
2,lake,3,6,9


### pd.melt(DataFrame, ['key']): 可以將多個欄位儲存為一個欄位，並將其value獨立為一個欄位。

In [157]:
melted=pd.melt(df,['key'])

In [158]:
melted

Unnamed: 0,key,variable,value
0,sun,A,1
1,moon,A,2
2,lake,A,3
3,sun,B,4
4,moon,B,5
5,lake,B,6
6,sun,C,7
7,moon,C,8
8,lake,C,9


### pivot(index, columns, values): 就可以將一個欄位轉為多個欄位。

In [159]:
# Undo the melt: 'key' becomes the index, each 'variable' becomes a column, cells come from 'value'.
# Keyword arguments are required by pandas >= 2.0 (positional use was deprecated in 1.5).
reshaped=melted.pivot(index='key', columns='variable', values='value')

In [160]:
reshaped

variable,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lake,3,6,9
moon,2,5,8
sun,1,4,7


### reset_index(): 可以用來重置index。

In [161]:
reshaped.reset_index()

variable,key,A,B,C
0,lake,3,6,9
1,moon,2,5,8
2,sun,1,4,7


### id_vars=[], value_vars=[]: 可以指定只有哪幾個欄位需要被轉置。

In [162]:
pd.melt(df,id_vars=['key'],value_vars=['A','B'])

Unnamed: 0,key,variable,value
0,sun,A,1
1,moon,A,2
2,lake,A,3
3,sun,B,4
4,moon,B,5
5,lake,B,6


### pd.melt() 也可以不指定任何分組的欄。

In [163]:
pd.melt(df,value_vars=['A','B','C'])

Unnamed: 0,variable,value
0,A,1
1,A,2
2,A,3
3,B,4
4,B,5
5,B,6
6,C,7
7,C,8
8,C,9


In [164]:
pd.melt(df,value_vars=['key','A','B'])

Unnamed: 0,variable,value
0,key,sun
1,key,moon
2,key,lake
3,A,1
4,A,2
5,A,3
6,B,4
7,B,5
8,B,6
