Outline
* Reshaping
* Pivoting
* Duplicates in DataFrames
* Mapping
* Replace
* Rename Index
* Binning
* Outliers
* Permutation

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Reshaping

In [4]:
dframe1 = DataFrame(np.arange(8).reshape(2, 4),
                    index=pd.Index(['LA', 'SF'], name='city'),
                    columns=pd.Index(['A', 'B', 'C', 'D'], name='letter'))
# 這邊使用pd.Index(...)的原因是可以同時指定name
# 若只用index=['LA', 'SF']，之後要再指定dframe.index.name = 'city'
dframe1

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [12]:
# stack(): Dataframe -> Series
dframe_st = dframe1.stack()
dframe_st

city  letter
LA    A         0
      B         1
      C         2
      D         3
SF    A         4
      B         5
      C         6
      D         7
dtype: int64

In [10]:
# unstack(): Series -> Dataframe
dframe_st.unstack()

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [13]:
dframe_st.unstack('letter')

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [14]:
dframe_st.unstack('city')

city,LA,SF
letter,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,4
B,1,5
C,2,6
D,3,7


### 處理no value

In [24]:
ser1 = Series([0, 1, 2], index=list('QXY'))
ser2 = Series([4, 5, 6], index=list('XYZ'))

In [25]:
dframe = pd.concat([ser1, ser2], keys=['Alpha', 'Beta'])
dframe

Alpha  Q    0
       X    1
       Y    2
Beta   X    4
       Y    5
       Z    6
dtype: int64

In [26]:
dframe = dframe.unstack()
dframe

Unnamed: 0,Q,X,Y,Z
Alpha,0.0,1.0,2.0,
Beta,,4.0,5.0,6.0


In [28]:
dframe.stack()

Alpha  Q    0.0
       X    1.0
       Y    2.0
Beta   X    4.0
       Y    5.0
       Z    6.0
dtype: float64

In [30]:
# 如果我們想要保留NaN?
dframe.stack(dropna=False)

Alpha  Q    0.0
       X    1.0
       Y    2.0
       Z    NaN
Beta   Q    NaN
       X    4.0
       Y    5.0
       Z    6.0
dtype: float64

# Pivoting
樞紐分析: [Pivot Table wiki](https://en.wikipedia.org/wiki/Pivot_table)

這邊不用管example data是怎麼產生的

In [33]:
# import pandas testing utility
import pandas.util.testing as tm; tm.N = 3

# Create a unpivoted function
def unpivot(frame):
    N, K = frame.shape
    
    data = {'value' : frame.values.ravel('F'),
            'variable' : np.asarray(frame.columns).repeat(N),
            'date' : np.tile(np.asarray(frame.index), K)}
    
    # Return the DataFrame
    return DataFrame(data, columns=['date', 'variable', 'value'])

# Set the DataFrame we'll be using
dframe = unpivot(tm.makeTimeDataFrame())

In [32]:
dframe

Unnamed: 0,date,variable,value
0,2000-01-03,A,0.419584
1,2000-01-04,A,0.664967
2,2000-01-05,A,-0.535116
3,2000-01-03,B,1.606345
4,2000-01-04,B,0.289858
5,2000-01-05,B,0.125475
6,2000-01-03,C,0.786324
7,2000-01-04,C,-0.237918
8,2000-01-05,C,-0.303522
9,2000-01-03,D,1.00848


In [35]:
# arg: rows / columns / values
dframe_piv = dframe.pivot('date', 'variable', 'value')
dframe_piv

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,-2.413799,-1.179769,-0.711841,0.656675
2000-01-04,-1.480727,-1.476714,-0.549341,-0.135639
2000-01-05,0.010679,1.125492,0.466099,-0.288909


# Duplicates in DataFrames
檢查重複的資料

In [39]:
dframe = DataFrame({'key1': ['A']*2 + ['B']*3,
                    'key2': [2, 2, 2, 3, 3]})
dframe

Unnamed: 0,key1,key2
0,A,2
1,A,2
2,B,2
3,B,3
4,B,3


In [40]:
dframe.duplicated()

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [41]:
# 不會真得改變原object
dframe.drop_duplicates()

Unnamed: 0,key1,key2
0,A,2
2,B,2
3,B,3


In [43]:
# 只檢查key1是否重複，取第一筆
dframe.drop_duplicates(['key1'])

Unnamed: 0,key1,key2
0,A,2
2,B,2


In [45]:
# 只檢查key1是否重複，取最後一筆
dframe.drop_duplicates(['key1'], keep='last')

Unnamed: 0,key1,key2
1,A,2
4,B,3


# Mapping

In [46]:
dframe = DataFrame({'city': ['Alam', 'Brian Head', 'Fox Park'],
                    'altitude': [3158, 3000, 2762]})
dframe

Unnamed: 0,altitude,city
0,3158,Alam
1,3000,Brian Head
2,2762,Fox Park


In [50]:
# Mapping dictionary
state_map = {'Alam': 'Colorado', 'Brian Head': 'Utah', 'Fox Park': 'Wyoming'}

In [51]:
# create a new column 'state'
dframe['state'] = dframe['city'].map(state_map)

In [52]:
dframe

Unnamed: 0,altitude,city,state
0,3158,Alam,Colorado
1,3000,Brian Head,Utah
2,2762,Fox Park,Wyoming


# Replace

In [53]:
ser1 = Series([1, 2, 3, 4, 1, 2, 3, 4])
ser1

0    1
1    2
2    3
3    4
4    1
5    2
6    3
7    4
dtype: int64

In [54]:
ser1.replace(1, np.nan)

0    NaN
1    2.0
2    3.0
3    4.0
4    NaN
5    2.0
6    3.0
7    4.0
dtype: float64

In [56]:
ser1.replace([1, 4], [100, 400])

0    100
1      2
2      3
3    400
4    100
5      2
6      3
7    400
dtype: int64

In [57]:
ser1.replace({4: np.nan})

0    1.0
1    2.0
2    3.0
3    NaN
4    1.0
5    2.0
6    3.0
7    NaN
dtype: float64

# Rename Index

In [73]:
dframe = DataFrame(np.arange(12).reshape(3, 4),
                   index=['NY', 'LA', 'SF'],
                   columns=list('ABCD'))
dframe

Unnamed: 0,A,B,C,D
NY,0,1,2,3
LA,4,5,6,7
SF,8,9,10,11


In [74]:
dframe.index.map(str.lower)

array(['ny', 'la', 'sf'], dtype=object)

In [75]:
# 不使用rename
dframe.index = dframe.index.map(str.lower)
dframe

Unnamed: 0,A,B,C,D
ny,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


In [76]:
# 使用rename，傳入function
dframe.rename(index=str.title, columns=str.lower)

Unnamed: 0,a,b,c,d
Ny,0,1,2,3
La,4,5,6,7
Sf,8,9,10,11


In [77]:
# 使用rename，傳入dictionary
dframe.rename(index={'ny': 'NEW YORK'}, columns={'A': 'ALPHA'}) # 要取代原object，使用inplace=True

Unnamed: 0,ALPHA,B,C,D
NEW YORK,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


# Binning

In [78]:
years = [1990, 1991, 1992, 2008, 2012, 2015, 1987, 1969, 2013, 1999]

In [81]:
decade_bins = [1960, 1970, 1980, 1990, 2000, 2010, 2020]

In [87]:
decade_cat = pd.cut(years, decade_bins)
decade_cat
# 注意這邊的頭尾符號: (不包含開頭, 包含結尾]
# 第一個element，1990，屬於(1980, 1990]的區間

[(1980, 1990], (1990, 2000], (1990, 2000], (2000, 2010], (2010, 2020], (2010, 2020], (1980, 1990], (1960, 1970], (2010, 2020], (1990, 2000]]
Categories (6, object): [(1960, 1970] < (1970, 1980] < (1980, 1990] < (1990, 2000] < (2000, 2010] < (2010, 2020]]

In [84]:
# 根據decade_bins來切，總共有6個區間
decade_cat.categories

Index([u'(1960, 1970]', u'(1970, 1980]', u'(1980, 1990]', u'(1990, 2000]',
       u'(2000, 2010]', u'(2010, 2020]'],
      dtype='object')

In [85]:
# 計算每個區間的element數量
pd.value_counts(decade_cat)

(2010, 2020]    3
(1990, 2000]    3
(1980, 1990]    2
(2000, 2010]    1
(1960, 1970]    1
(1970, 1980]    0
dtype: int64

In [103]:
# 區間不使用list，將soucre分成3等分，精確度設為小數一位
pd.cut(years, 3, precision=1)

[(1984.3, 1999.7], (1984.3, 1999.7], (1984.3, 1999.7], (1999.7, 2015], (1999.7, 2015], (1999.7, 2015], (1984.3, 1999.7], (1969, 1984.3], (1999.7, 2015], (1984.3, 1999.7]]
Categories (3, object): [(1969, 1984.3] < (1984.3, 1999.7] < (1999.7, 2015]]

# Outliers

In [104]:
np.random.seed(12345)

In [105]:
# 1000 x 4 random values
dframe = DataFrame(np.random.randn(1000, 4))

In [109]:
dframe.head()

Unnamed: 0,0,1,2,3
0,-0.204708,0.478943,-0.519439,-0.55573
1,1.965781,1.393406,0.092908,0.281746
2,0.769023,1.246435,1.007189,-1.296221
3,0.274992,0.228913,1.352917,0.886429
4,-2.001637,-0.371843,1.669025,-0.43857


In [110]:
dframe.tail()

Unnamed: 0,0,1,2,3
995,1.089085,0.251232,-1.451985,1.653126
996,-0.478509,-0.010663,-1.060881,-1.50287
997,-1.946267,1.013592,0.037333,0.133304
998,-1.293122,-0.322542,-0.78296,-0.30334
999,0.089987,0.292291,1.177706,0.882755


In [111]:
dframe.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [112]:
col = dframe[0]
col.head()

0   -0.204708
1    1.965781
2    0.769023
3    0.274992
4   -2.001637
Name: 0, dtype: float64

In [113]:
# Series的case
col[np.abs(col)>3]

523   -3.428254
900    3.366626
Name: 0, dtype: float64

In [122]:
# Dataframe的case
dframe[(np.abs(dframe)>3).any(axis=1)]

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


### 拆解

In [142]:
# Step 1
a = (np.abs(dframe)>3).head()
a

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False


In [145]:
# Step 2
b = a.any(axis=1).head() # 該row至少有一個True就為True
b

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [147]:
# Step 3 DataFrame[Series]: Series的element必須為True / False，長度和DataFrame的index相等，表示DataFrame中對應的index是否選取
# dframe[b]

In [149]:
# 將>3的設成3，<3的設成-3
dframe[np.abs(dframe)>3] = np.sign(dframe)*3 # sign()根據正負return -1 / 0 / 1

In [150]:
dframe.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067623,0.068473,0.025153,-0.002081
std,0.995485,0.990253,1.003977,0.989736
min,-3.0,-3.0,-3.0,-3.0
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.0,2.653656,3.0,3.0


# Permutation

In [159]:
dframe = DataFrame(np.arange(16).reshape(4, 4))
dframe

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [160]:
# 隨機排列0-3
blender = np.random.permutation(4)
blender

array([3, 1, 2, 0])

In [161]:
dframe.take(blender)
# 等同於dframe.ix[blender]

Unnamed: 0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11
0,0,1,2,3


### ndarray

In [176]:
box = np.array([1, 2, 3])

In [178]:
shaker = np.random.randint(0, len(box), size=10)
shaker

array([0, 2, 2, 1, 1, 2, 1, 2, 1, 2])

In [179]:
hand_grabs = box.take(shaker) # 取box對應index的位置
hand_grabs

array([1, 3, 3, 2, 2, 3, 2, 3, 2, 3])