# Chapter 7 資料整理和前處理

## Data Cleaning and Preparation

In [1]:
import pandas as pd
import numpy as np

### 處理遺失資料

### NAN: 在pandas內，遺失的資料表示為NAN，也就是not a number。

In [2]:
string_data=pd.Series(['a','b','c','d',np.nan,'e'])

In [3]:
string_data

0      a
1      b
2      c
3      d
4    NaN
5      e
dtype: object

In [4]:
string_data.isnull()

0    False
1    False
2    False
3    False
4     True
5    False
dtype: bool

### Python內建中的None值，也是NA。

In [5]:
string_data[0]=None

In [6]:
string_data.isnull()

0     True
1    False
2    False
3    False
4     True
5    False
dtype: bool

### 過濾遺失值。

### dropna(): 可以過濾遺失值。

In [7]:
from numpy import nan as NA

In [8]:
data=pd.Series([1,NA,3.5,NA,7])

In [9]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

### notnull(): 就是isnull()的相反，可以顯示布林值。

In [10]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
data=pd.DataFrame([[1,5,7],[NA,6,NA],[NA,NA,NA],[NA,8,9]])

In [12]:
data

Unnamed: 0,0,1,2
0,1.0,5.0,7.0
1,,6.0,
2,,,
3,,8.0,9.0


### 遇到dataframe時，使用dropna()會把有NA的列移除。

In [13]:
cleaned=data.dropna()

In [14]:
cleaned

Unnamed: 0,0,1,2
0,1.0,5.0,7.0


### dropna(how='all'): 會移除該列每一個數值都是NA的列。

In [15]:
cleaned2=data.dropna(how='all')

In [16]:
cleaned2

Unnamed: 0,0,1,2
0,1.0,5.0,7.0
1,,6.0,
3,,8.0,9.0


In [17]:
data[4]=NA

In [18]:
data

Unnamed: 0,0,1,2,4
0,1.0,5.0,7.0,
1,,6.0,,
2,,,,
3,,8.0,9.0,


In [19]:
data.dropna(axis=1,how='all')

Unnamed: 0,0,1,2
0,1.0,5.0,7.0
1,,6.0,
2,,,
3,,8.0,9.0


In [20]:
df=pd.DataFrame(np.random.randn(7,3))

In [21]:
df

Unnamed: 0,0,1,2
0,-0.911266,0.319673,-0.254247
1,0.80363,-1.347411,-0.211894
2,0.355464,1.061805,0.377615
3,-0.934533,0.348289,-0.040785
4,0.002128,0.892894,0.027967
5,-0.246219,-0.284493,-0.877898
6,0.71944,0.869055,-0.464252


In [22]:
df.iloc[:4,1]=NA

In [23]:
df.iloc[:2,0]=NA

In [24]:
df

Unnamed: 0,0,1,2
0,,,-0.254247
1,,,-0.211894
2,0.355464,,0.377615
3,-0.934533,,-0.040785
4,0.002128,0.892894,0.027967
5,-0.246219,-0.284493,-0.877898
6,0.71944,0.869055,-0.464252


In [25]:
df.dropna()

Unnamed: 0,0,1,2
4,0.002128,0.892894,0.027967
5,-0.246219,-0.284493,-0.877898
6,0.71944,0.869055,-0.464252


### thresh＝2 表示該列至少要有2筆數字非遺失值，才會保留該行。

In [26]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.355464,,0.377615
3,-0.934533,,-0.040785
4,0.002128,0.892894,0.027967
5,-0.246219,-0.284493,-0.877898
6,0.71944,0.869055,-0.464252


### 爲遺失值填值。

### fillna(數字)： 放入常數，就可以在NAN的位置補上一個常數。

In [27]:
df

Unnamed: 0,0,1,2
0,,,-0.254247
1,,,-0.211894
2,0.355464,,0.377615
3,-0.934533,,-0.040785
4,0.002128,0.892894,0.027967
5,-0.246219,-0.284493,-0.877898
6,0.71944,0.869055,-0.464252


In [28]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.0,0.0,-0.254247
1,0.0,0.0,-0.211894
2,0.355464,0.0,0.377615
3,-0.934533,0.0,-0.040785
4,0.002128,0.892894,0.027967
5,-0.246219,-0.284493,-0.877898
6,0.71944,0.869055,-0.464252


### fillna(dict): 可以指定欄位，在遺失值的位置填入常數。

In [29]:
df.fillna({0:20, 1:30})

Unnamed: 0,0,1,2
0,20.0,30.0,-0.254247
1,20.0,30.0,-0.211894
2,0.355464,30.0,0.377615
3,-0.934533,30.0,-0.040785
4,0.002128,0.892894,0.027967
5,-0.246219,-0.284493,-0.877898
6,0.71944,0.869055,-0.464252


### fillna(0): 本身可以存取為獨立的物件。

In [30]:
a=df.fillna({0:30,1:100})

In [31]:
a

Unnamed: 0,0,1,2
0,30.0,100.0,-0.254247
1,30.0,100.0,-0.211894
2,0.355464,100.0,0.377615
3,-0.934533,100.0,-0.040785
4,0.002128,0.892894,0.027967
5,-0.246219,-0.284493,-0.877898
6,0.71944,0.869055,-0.464252


### inplace=True 可以直接修改原本的資料。

In [32]:
df.fillna(0,inplace=True)

In [33]:
df

Unnamed: 0,0,1,2
0,0.0,0.0,-0.254247
1,0.0,0.0,-0.211894
2,0.355464,0.0,0.377615
3,-0.934533,0.0,-0.040785
4,0.002128,0.892894,0.027967
5,-0.246219,-0.284493,-0.877898
6,0.71944,0.869055,-0.464252


In [34]:
df=pd.DataFrame(np.random.randn(6,3))

In [35]:
df

Unnamed: 0,0,1,2
0,0.404117,1.335955,-0.644789
1,-0.392918,-0.573654,-1.166073
2,0.328231,1.336829,0.317249
3,-0.892051,1.3587,-0.795727
4,-0.91083,-0.739205,0.685204
5,-0.143754,-0.34074,-1.168837


In [36]:
df.iloc[4:,1]=NA

In [37]:
df.iloc[2:,2]=NA

In [38]:
df

Unnamed: 0,0,1,2
0,0.404117,1.335955,-0.644789
1,-0.392918,-0.573654,-1.166073
2,0.328231,1.336829,
3,-0.892051,1.3587,
4,-0.91083,,
5,-0.143754,,


### method='ffill' 重複前一個值。

In [39]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.404117,1.335955,-0.644789
1,-0.392918,-0.573654,-1.166073
2,0.328231,1.336829,-1.166073
3,-0.892051,1.3587,-1.166073
4,-0.91083,1.3587,-1.166073
5,-0.143754,1.3587,-1.166073


### limit=2，遇到連續的遺失值，限定最多一次可以連續補上2個值。

In [40]:
df

Unnamed: 0,0,1,2
0,0.404117,1.335955,-0.644789
1,-0.392918,-0.573654,-1.166073
2,0.328231,1.336829,
3,-0.892051,1.3587,
4,-0.91083,,
5,-0.143754,,


In [41]:
df.fillna(method='ffill',limit=2)

Unnamed: 0,0,1,2
0,0.404117,1.335955,-0.644789
1,-0.392918,-0.573654,-1.166073
2,0.328231,1.336829,-1.166073
3,-0.892051,1.3587,-1.166073
4,-0.91083,1.3587,
5,-0.143754,1.3587,


### fillna(data.mean()): 還可以在遺失值的位置補上該數列的平均值、最大值、最小值、中位數等。

In [42]:
data=pd.Series([1,3,5,NA,6,NA])

In [43]:
data

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    NaN
dtype: float64

In [44]:
data.fillna(data.mean())

0    1.00
1    3.00
2    5.00
3    3.75
4    6.00
5    3.75
dtype: float64

In [45]:
data.fillna(data.max())

0    1.0
1    3.0
2    5.0
3    6.0
4    6.0
5    6.0
dtype: float64

### 資料的轉換

### 移除重複值

In [46]:
data=pd.DataFrame({'k1':['one','two']*3+['one'], 'k2':[1,2,1,2,3,4,3]})

In [47]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,2
2,one,1
3,two,2
4,one,3
5,two,4
6,one,3


### duplicated(): 用來判斷該列前面有沒有出現過，有的話，會顯示Ture。

In [48]:
data.duplicated()

0    False
1    False
2     True
3     True
4    False
5    False
6     True
dtype: bool

### drop_duplicates(): 移除重複的值，也就是會保留下duplicated() 顯示為False的地方。

In [49]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,2
4,one,3
5,two,4


In [50]:
data['v1']=range(7)

In [51]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,2,1
2,one,1,2
3,two,2,3
4,one,3,4
5,two,4,5
6,one,3,6


### 可以針對特定的欄位進行drop_duplicates(), duplicated()

In [52]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,2,1


In [53]:
data.duplicated(['k1'])

0    False
1    False
2     True
3     True
4     True
5     True
6     True
dtype: bool

### 一般來說，drop_duplicates(), duplicated()會保留看到的第一個值。
### 如果是keep='last'，就會保留看到的最後一個值。

In [54]:
data.duplicated(['k1','k2'],keep='last')

0     True
1     True
2    False
3    False
4     True
5    False
6    False
dtype: bool

In [55]:
data.drop_duplicates(['k1','k2'], keep='last')

Unnamed: 0,k1,k2,v1
2,one,1,2
3,two,2,3
5,two,4,5
6,one,3,6


### 使用函式或對應關係轉換值

In [56]:
data=pd.DataFrame({'food':['Bacon','pork','bacon','Chicken','Beef','chicken','Ham','bacon','ham','beef'],
                   'ounces':[4,6,8,8,10,12,4,3,9,10]})

In [57]:
data

Unnamed: 0,food,ounces
0,Bacon,4
1,pork,6
2,bacon,8
3,Chicken,8
4,Beef,10
5,chicken,12
6,Ham,4
7,bacon,3
8,ham,9
9,beef,10


In [58]:
origin_of_meat={'bacon':'Taiwan','pork':'Japan','chicken':'France',
                'beef':'Australia','ham':'Spain'}
type(origin_of_meat)

dict

### 由於data有些大寫、有些小寫，透過str.lower()全面改成小寫。

In [59]:
lowercased=data['food'].str.lower()

In [60]:
print(lowercased, type(lowercased))

0      bacon
1       pork
2      bacon
3    chicken
4       beef
5    chicken
6        ham
7      bacon
8        ham
9       beef
Name: food, dtype: object <class 'pandas.core.series.Series'>


### 透過series.map(dict)，可以對應出每一個肉品的產地。

In [61]:
lowercased.map(origin_of_meat)

0       Taiwan
1        Japan
2       Taiwan
3       France
4    Australia
5       France
6        Spain
7       Taiwan
8        Spain
9    Australia
Name: food, dtype: object

In [62]:
data['origin']=lowercased.map(origin_of_meat)

In [63]:
data

Unnamed: 0,food,ounces,origin
0,Bacon,4,Taiwan
1,pork,6,Japan
2,bacon,8,Taiwan
3,Chicken,8,France
4,Beef,10,Australia
5,chicken,12,France
6,Ham,4,Spain
7,bacon,3,Taiwan
8,ham,9,Spain
9,beef,10,Australia


### 也可以僅透過一個式子就完成配對。

In [64]:
data['food'].map(lambda x: origin_of_meat[x.lower()])

0       Taiwan
1        Japan
2       Taiwan
3       France
4    Australia
5       France
6        Spain
7       Taiwan
8        Spain
9    Australia
Name: food, dtype: object

### 取代值

In [65]:
data=pd.Series([10,-999,20,-999,30,-999,100])

In [66]:
data

0     10
1   -999
2     20
3   -999
4     30
5   -999
6    100
dtype: int64

### -999可能是遺失值，需要改寫為NAN，可以使用replace()

In [67]:
a=data.replace(-999,np.nan)
a

0     10.0
1      NaN
2     20.0
3      NaN
4     30.0
5      NaN
6    100.0
dtype: float64

### inplace=Ture 可以直接修改原始資料

### 如果一口氣要取代許多值，可以用[]呈現。

In [68]:
data.replace([-999,100], np.nan, inplace=True)

In [69]:
data

0    10.0
1     NaN
2    20.0
3     NaN
4    30.0
5     NaN
6     NaN
dtype: float64

In [70]:
data=pd.Series([10,-999,20,-999,30,-999,100])

### 如果想要置換不同的值，也可以使用雙括號。

In [71]:
data.replace([-999,100],[np.nan,0])

0    10.0
1     NaN
2    20.0
3     NaN
4    30.0
5     NaN
6     0.0
dtype: float64

### 也可以在replace()內放入dict。

In [72]:
data.replace({-999:np.nan, 100: 0})

0    10.0
1     NaN
2    20.0
3     NaN
4    30.0
5     NaN
6     0.0
dtype: float64

### 更名軸index

In [73]:
data=pd.DataFrame(np.arange(12).reshape(3,4),index=['France','Spain','Netherland'], 
                  columns=['first','second','third','fourth'])

In [74]:
data

Unnamed: 0,first,second,third,fourth
France,0,1,2,3
Spain,4,5,6,7
Netherland,8,9,10,11


### index 透過map也可以做格式化的調整。

In [75]:
transform=lambda x:x[:3].upper()

In [76]:
data.index.map(transform)

Index(['FRA', 'SPA', 'NET'], dtype='object')

### 直接對index給值，就可以原地進行資料調整。

In [77]:
data.index=data.index.map(transform)

In [78]:
data

Unnamed: 0,first,second,third,fourth
FRA,0,1,2,3
SPA,4,5,6,7
NET,8,9,10,11


### 如果不想動到原本的東西，可以使用rename。

In [79]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,FIRST,SECOND,THIRD,FOURTH
Fra,0,1,2,3
Spa,4,5,6,7
Net,8,9,10,11


### rename(index={}, columns={})：可以放入dict，修改標籤名稱。

In [80]:
data.rename(index={'NET':'ENL'}, columns={'fourth':'third'})

Unnamed: 0,first,second,third,third.1
FRA,0,1,2,3
SPA,4,5,6,7
ENL,8,9,10,11


### rename() 如果加上inplace=True，就可以很節省時間的直接修改原始資料。

In [81]:
data.rename(index={'NET':'ENL'}, inplace=True)

In [82]:
data

Unnamed: 0,first,second,third,fourth
FRA,0,1,2,3
SPA,4,5,6,7
ENL,8,9,10,11


### 離散化和分組

In [83]:
ages=[18,19,20,22,25,27,30,32,40,45,50,56,60,61,70,79]

### 將年齡分為18-25, 25-35, 35-45, 55-65, 65以上。

In [84]:
bins=[18,25,35,55,65,100]

In [85]:
cats=pd.cut(ages,bins)

### 小括號代表open，中括號代表close。

In [86]:
cats

[NaN, (18.0, 25.0], (18.0, 25.0], (18.0, 25.0], (18.0, 25.0], ..., (55, 65], (55, 65], (55, 65], (65, 100], (65, 100]]
Length: 16
Categories (5, interval[int64, right]): [(18, 25] < (25, 35] < (35, 55] < (55, 65] < (65, 100]]

### 使用cuts可以將資料做分群，會產生特殊的categorical物件，可以透過codes看到categorical陣列。

In [87]:
cats.codes

array([-1,  0,  0,  0,  0,  1,  1,  1,  2,  2,  2,  3,  3,  3,  4,  4],
      dtype=int8)

### categories 可以看出有哪些分群。

In [88]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 55], (55, 65], (65, 100]], dtype='interval[int64, right]')

### value_counts() 可以計算每個分群有幾筆數值。

In [89]:
pd.value_counts(cats)

(18, 25]     4
(25, 35]     3
(35, 55]     3
(55, 65]     3
(65, 100]    2
dtype: int64

### right=flase 表示右邊的值為closed。

In [90]:
cats=pd.cut(ages,bins, right=False)

In [91]:
cats

[[18, 25), [18, 25), [18, 25), [18, 25), [25, 35), ..., [55, 65), [55, 65), [55, 65), [65, 100), [65, 100)]
Length: 16
Categories (5, interval[int64, left]): [[18, 25) < [25, 35) < [35, 55) < [55, 65) < [65, 100)]

### 指定名稱給lables，就可以幫不同的分群取名。

In [92]:
group_name=['Youth','Young_Adult','Middle_Age','Seniors','Elders']

In [93]:
pd.cut(ages,bins,labels=group_name)

[NaN, 'Youth', 'Youth', 'Youth', 'Youth', ..., 'Seniors', 'Seniors', 'Seniors', 'Elders', 'Elders']
Length: 16
Categories (5, object): ['Youth' < 'Young_Adult' < 'Middle_Age' < 'Seniors' < 'Elders']

In [94]:
data=np.random.randn(20)

### 除了給bins，也可以指定分成4群。

### precision=2,表示小數點後兩位。

In [95]:
pd.cut(data,4,precision=2)

[(-0.75, 0.13], (-1.63, -0.75], (-0.75, 0.13], (-0.75, 0.13], (1.01, 1.89], ..., (-0.75, 0.13], (0.13, 1.01], (1.01, 1.89], (-1.63, -0.75], (1.01, 1.89]]
Length: 20
Categories (4, interval[float64, right]): [(-1.63, -0.75] < (-0.75, 0.13] < (0.13, 1.01] < (1.01, 1.89]]

### cut依據資料的分佈，分組時有時後會每組樣本數不同，使用qcut就可以平均分配樣本數囉！

In [96]:
data=np.random.randn(1000) #常態分佈

In [97]:
cats=pd.qcut(data,4)

In [98]:
pd.value_counts(cats) #平均分配

(-3.0269999999999997, -0.674]    250
(-0.674, -0.0629]                250
(-0.0629, 0.564]                 250
(0.564, 3.426]                   250
dtype: int64

In [99]:
cats2=pd.cut(data,4)

In [100]:
pd.value_counts(cats2) #依照數值區間分配

(-1.413, 0.2]       517
(0.2, 1.813]        366
(-3.033, -1.413]     88
(1.813, 3.426]       29
dtype: int64

### 偵測和濾除離群值

In [101]:
data=pd.DataFrame(np.random.randn(100,4))

In [102]:
data

Unnamed: 0,0,1,2,3
0,-1.156006,-1.041441,-1.523949,-0.688716
1,-0.371170,0.452690,0.164494,-0.196833
2,-0.922207,0.366299,0.292682,0.756499
3,0.020676,0.656298,-1.673750,0.856715
4,1.387991,-0.274587,-1.644329,0.001996
...,...,...,...,...
95,-0.678326,-1.157943,0.255729,0.464355
96,1.217492,-0.150644,-0.137437,0.569122
97,0.812648,1.598039,-0.467408,1.363090
98,-2.202130,0.010385,-0.226516,0.121855


In [103]:
data.describe()

Unnamed: 0,0,1,2,3
count,100.0,100.0,100.0,100.0
mean,-0.080002,-0.002403,-0.208463,0.000841
std,0.964844,0.946555,1.016092,0.94584
min,-2.20213,-2.050799,-2.664219,-2.047694
25%,-0.730851,-0.67551,-0.87126,-0.699701
50%,-0.190575,0.011343,-0.168712,0.050811
75%,0.54763,0.519222,0.49304,0.766262
max,2.355491,2.179362,2.444315,1.723315


In [104]:
col=data[2]

In [105]:
col[np.abs(col)>2]

6    -2.500444
20   -2.647673
25   -2.000173
27   -2.664219
60    2.444315
Name: 2, dtype: float64

In [106]:
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3


In [107]:
data[np.abs(data)>3]=np.sign(data)*3

In [108]:
data.describe()

Unnamed: 0,0,1,2,3
count,100.0,100.0,100.0,100.0
mean,-0.080002,-0.002403,-0.208463,0.000841
std,0.964844,0.946555,1.016092,0.94584
min,-2.20213,-2.050799,-2.664219,-2.047694
25%,-0.730851,-0.67551,-0.87126,-0.699701
50%,-0.190575,0.011343,-0.168712,0.050811
75%,0.54763,0.519222,0.49304,0.766262
max,2.355491,2.179362,2.444315,1.723315


### sign(): 如果正值，給1，如果負值，給-1。

In [109]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,-1.0,-1.0,-1.0
1,-1.0,1.0,1.0,-1.0
2,-1.0,1.0,1.0,1.0
3,1.0,1.0,-1.0,1.0
4,1.0,-1.0,-1.0,1.0


### 排列與隨機取樣

### np.random.permutation(n): 會將0~n-1的數列隨機排序。

In [110]:
np.random.permutation(7)

array([6, 0, 4, 1, 2, 3, 5])

In [111]:
df=pd.DataFrame(np.arange(5*4).reshape(5,4))

In [112]:
sampler=np.random.permutation(5)

In [113]:
sampler

array([2, 4, 3, 1, 0])

In [114]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


### take(array): 可以將index換成array內的數值。

In [115]:
df.take(sampler)

Unnamed: 0,0,1,2,3
2,8,9,10,11
4,16,17,18,19
3,12,13,14,15
1,4,5,6,7
0,0,1,2,3


### sample(n=4) 表示可以隨機從原數據挑選兩行。

In [116]:
df.sample(n=4)

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
4,16,17,18,19
0,0,1,2,3


In [117]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
2,8,9,10,11
4,16,17,18,19
1,4,5,6,7


In [118]:
choices=pd.Series([5,7,-1,6,4])

In [119]:
choices

0    5
1    7
2   -1
3    6
4    4
dtype: int64

### 如果想抽取的資料行數大於原本的資料，就必須填入replace=True，才可以重複抽取。

In [120]:
draws=choices.sample(n=10,replace=True)

In [121]:
a=choices.sample(n=3)
a

1    7
4    4
0    5
dtype: int64

In [122]:
draws

3    6
4    4
4    4
1    7
0    5
0    5
3    6
4    4
2   -1
3    6
dtype: int64

### 指標、虛擬變數

### pd.get_dummies(): 可以告訴你變數的所在位置，該位置會顯示為1。

In [123]:
df=pd.DataFrame({'key':['a','a','b','b','c','c'], 'data1':range(6)})

In [124]:
df

Unnamed: 0,key,data1
0,a,0
1,a,1
2,b,2
3,b,3
4,c,4
5,c,5


In [125]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,1,0,0
1,1,0,0
2,0,1,0
3,0,1,0
4,0,0,1
5,0,0,1


### prefix='key' 可以在欄位名稱上做註記，表示是key欄位的a的所在位置。

In [126]:
dummies=pd.get_dummies(df['key'], prefix='key')

In [127]:
dummies

Unnamed: 0,key_a,key_b,key_c
0,1,0,0
1,1,0,0
2,0,1,0
3,0,1,0
4,0,0,1
5,0,0,1


### join(): 以index為key值，使用join可以合併兩個表格。

In [128]:
df_with_dummy=df[['data1']].join(dummies)

In [129]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,1,0,0
1,1,1,0,0
2,2,0,1,0
3,3,0,1,0
4,4,0,0,1
5,5,0,0,1


In [130]:
pd.get_dummies(df)

Unnamed: 0,data1,key_a,key_b,key_c
0,0,1,0,0
1,1,1,0,0
2,2,0,1,0
3,3,0,1,0
4,4,0,0,1
5,5,0,0,1


In [131]:
movies=pd.DataFrame({'movie_id':range(10),
                     'title': ['Toy Story(1995)','Jumanji(1995)','Grumpier Old Man(1995)',
                              'Waiting to Exhale(1995)','Father of the Bride Part II(1995)',
                              'Heart(1995)','Sabrina(1995)','Tom and Huck(1995)','Sudden Death(1995)',
                              'Goldeneye(1995)'],
                    'genres': ['Animation|Children\'s|Comedy','Adventure|Children\'s|Fantasy',
                               'Comedy|Romance','Comedy|Drama','Comedy','Action|Crime|Thriller',
                               'Comedy|Romance','Adventure|Children\'s','Action','Action|Adventure|Thriller']})

In [132]:
movies

Unnamed: 0,movie_id,title,genres
0,0,Toy Story(1995),Animation|Children's|Comedy
1,1,Jumanji(1995),Adventure|Children's|Fantasy
2,2,Grumpier Old Man(1995),Comedy|Romance
3,3,Waiting to Exhale(1995),Comedy|Drama
4,4,Father of the Bride Part II(1995),Comedy
5,5,Heart(1995),Action|Crime|Thriller
6,6,Sabrina(1995),Comedy|Romance
7,7,Tom and Huck(1995),Adventure|Children's
8,8,Sudden Death(1995),Action
9,9,Goldeneye(1995),Action|Adventure|Thriller


In [133]:
all_genres=[]

### x.split('|'): 移除文字與文字之間的符號。

In [134]:
for x in movies.genres:
    all_genres.extend(x.split('|'))

### pd.unique: 移除重複值。

In [135]:
genres=pd.unique(all_genres)

In [136]:
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller'], dtype=object)

### np.zeros（x,y): 可以製作x*y的矩陣，並以0為值。

### len(movies): 表示movies的列數有10列。 len(genres): 表示genres有10個品項。

In [137]:
zero_matrix=np.zeros((len(movies),len(genres)))

In [138]:
zero_matrix

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [139]:
dummies=pd.DataFrame(zero_matrix, columns=genres)

In [140]:
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [141]:
movies

Unnamed: 0,movie_id,title,genres
0,0,Toy Story(1995),Animation|Children's|Comedy
1,1,Jumanji(1995),Adventure|Children's|Fantasy
2,2,Grumpier Old Man(1995),Comedy|Romance
3,3,Waiting to Exhale(1995),Comedy|Drama
4,4,Father of the Bride Part II(1995),Comedy
5,5,Heart(1995),Action|Crime|Thriller
6,6,Sabrina(1995),Comedy|Romance
7,7,Tom and Huck(1995),Adventure|Children's
8,8,Sudden Death(1995),Action
9,9,Goldeneye(1995),Action|Adventure|Thriller


In [142]:
gen=movies.genres[0]
gen

"Animation|Children's|Comedy"

In [143]:
gen.split('|')

['Animation', "Children's", 'Comedy']

In [144]:
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [145]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2])

In [146]:
for i, gen in enumerate(movies.genres):
    indices= dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i,indices]=1

In [147]:
movies_windic=movies.join(dummies.add_prefix('Genre_'))

In [148]:
movies_windic

Unnamed: 0,movie_id,title,genres,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Adventure,Genre_Fantasy,Genre_Romance,Genre_Drama,Genre_Action,Genre_Crime,Genre_Thriller
0,0,Toy Story(1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,Jumanji(1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Grumpier Old Man(1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,3,Waiting to Exhale(1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,4,Father of the Bride Part II(1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,Heart(1995),Action|Crime|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
6,6,Sabrina(1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,7,Tom and Huck(1995),Adventure|Children's,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8,Sudden Death(1995),Action,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,9,Goldeneye(1995),Action|Adventure|Thriller,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [149]:
movies_windic.iloc[6]

movie_id                         6
title                Sabrina(1995)
genres              Comedy|Romance
Genre_Animation                0.0
Genre_Children's               0.0
Genre_Comedy                   1.0
Genre_Adventure                0.0
Genre_Fantasy                  0.0
Genre_Romance                  1.0
Genre_Drama                    0.0
Genre_Action                   0.0
Genre_Crime                    0.0
Genre_Thriller                 0.0
Name: 6, dtype: object

In [150]:
np.random.seed(12345)

### np.random.rand() 會出現n個介於0-1的數值。

In [151]:
values=np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [152]:
bins=[0,0.2,0.4,0.6,0.8,1]

### pd.cut(values,bins)：先對資料進行分組。 pd.get_dummies(): 可以找出每個數值在群組中的位置。

In [153]:
pd.get_dummies(pd.cut(values,bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


### 字串操作

### split('標點符號')：依據標點符號的位置，可以切分文字。

In [154]:
string='a,b, hello'

In [155]:
string.split(',')

['a', 'b', ' hello']

### strip(): 可以移除前後空白（包括換行）

In [156]:
pieces= [x.strip() for x in string.split(',')]

In [157]:
pieces

['a', 'b', 'hello']

### ＋：字串可以用加號連結起來。

In [158]:
first, second, third= pieces

In [159]:
first+'::'+second+'::'+third

'a::b::hello'

### ‘文字’.join(物件): 可以將點點前面的文字串連到每一個join內的物件。

In [160]:
'::'.join(pieces)

'a::b::hello'

### in : 可以確認該物件是否有在tuple, list, dict內。

In [161]:
'hello' in pieces

True

### index():可以尋找文字所在位置。

In [162]:
string

'a,b, hello'

In [163]:
string.index('hello')

5

In [164]:
string.index(' ')

4

In [165]:
string.index(',')

1

In [166]:
string.index(':') #如果找不到該物件，會顯示ValueError。

ValueError: substring not found

### find(): 可以尋找文字。

In [167]:
string.find('hello')

5

In [168]:
string.find(' ')

4

In [169]:
string.find(':') #如果找不到該物件，會顯示-1。

-1

### count(): 計算文字出現次數。

In [170]:
string.count(',')

2

### replace(a,b): 可以用來替換字串，a表示原本的字串，b表示修改後的字串。

In [171]:
string.replace('a', 'd')

'd,b, hello'

In [172]:
string

'a,b, hello'

### 正規表達式(regular expression): re模組。

In [173]:
import re

### \t代表3個空白。

In [174]:
print(f'apple   bread')

apple   bread


In [175]:
print(f'apple\tbread')

apple	bread


In [176]:
text="apple \tbread  \tberry   \ttriangle"

In [177]:
text

'apple \tbread  \tberry   \ttriangle'

### '\s+': 表示tab, 空白或換行。 split('\s+'): 表示文字遇到tab,空白或換行，就用逗號區隔開來。

In [178]:
re.split('\s+', text)

['apple', 'bread', 'berry', 'triangle']

In [179]:
re.compile(text)

re.compile(r'apple \tbread  \tberry   \ttriangle', re.UNICODE)

### re.compile('功能')：可以將一直會寫到的功能放進來，就可以出現一個功能函式物件。

### regex= regular expression 正規表達式

In [180]:
regex=re.compile('\s+')

In [181]:
regex

re.compile(r'\s+', re.UNICODE)

In [182]:
regex.split(text)

['apple', 'bread', 'berry', 'triangle']

### findall()： 可以找到所有對應regex功能的物件，並顯示出來。

In [183]:
regex.findall(text)

[' \t', '  \t', '   \t']

### match(): 對比字串的開頭是否匹配。

In [184]:
text="""Dave dave@google.com
Steve steve@gmail.com
Rob rob@gamil.com
Ryan ryan@yahoo.com"""

### r: 表示字串不會被轉譯，例如\n就會換行。
### [a-zA-Z0-9]: 表示大寫小寫字母與數字都可以。
### . :表示點點之後的文字與符號都可以（但是不包括換行）。
### +: 表示加號之後的文字至少要出現一次。
### \ : 跳脫字元，使後面的文字不要產生功能。
### {m,n}: 表示大括號前的文字僅能重複m~n次。

In [185]:
pattern=r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

### flags=re.IGNORECASE 忽略字母大小寫。

In [186]:
regex=re.compile(pattern,flags=re.IGNORECASE)

### 找出所有符合regex格式的信箱。

In [187]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gamil.com', 'ryan@yahoo.com']

### search()：會找到第一個符合regex的字串。

In [188]:
m=regex.search(text)

In [189]:
m

<re.Match object; span=(5, 20), match='dave@google.com'>

### start(): 顯示字串的開頭。 end(): 顯示字串的結束。

In [190]:
text[m.start():m.end()]

'dave@google.com'

### match:只會檢查字串的開頭是否匹配，如果不匹配，回傳None。

In [191]:
text

'Dave dave@google.com\nSteve steve@gmail.com\nRob rob@gamil.com\nRyan ryan@yahoo.com'

### 由於text第一個字為Dave，並非regex定義的信箱格式，故回傳None，如果符合regex的格式，回傳第一個符合的物件。

In [192]:
print(regex.match(text))

None


In [193]:
text2="""dave@google.com
steve@gmail.com
rob@gmail.com"""

In [194]:
print(regex.match(text2))

<re.Match object; span=(0, 15), match='dave@google.com'>


### sub('取代後的文字', 原本的字串)：sub可以找到符合regex的字串，並取代成新的文字。

In [195]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED


### 在pattern內把想要切斷的文字放上小括號。（）

In [196]:
pattern=r'([A-Z0-0._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

### flags=re.IGNORECASE 忽略字母的大小寫。

In [197]:
regex=re.compile(pattern,flags=re.IGNORECASE)

In [198]:
m=regex.match('wesm@bright.net')

### groups(): 將小括號內的文字提出來，成為一個tuple。

In [199]:
m.groups()

('wesm', 'bright', 'net')

### findall(text): 由於regex已經先分成三個物件，故在取出物件時，也會分成三份。

In [200]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gamil', 'com'),
 ('ryan', 'yahoo', 'com')]

### r: 表示保留原始文字，不做轉譯。
### sub(\1,\2,\3): 透過\1, \2, \3 可用來代表取出來按照排序的文字的物件。

In [201]:
print(regex.sub(r'Username: \1, Domain: \2,  Suffix:\3', text))

Dave Username: dave, Domain: google,  Suffix:com
Steve Username: steve, Domain: gmail,  Suffix:com
Rob Username: rob, Domain: gamil,  Suffix:com
Ryan Username: ryan, Domain: yahoo,  Suffix:com


### pandas中的向量字串函式。

In [202]:
data={'Dave':'dave@google.com', 'Steve': 'steve@gmail.com', 'Rob':'rob@gmail.com', 'Wes':np.nan}

In [203]:
data=pd.Series(data)

In [204]:
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [205]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

### str.contains(): 找到是否包含此字串的物件。

In [206]:
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [207]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

### 如果要對表格每一行執行findall(), match()，記得要在前方加上str.

In [208]:
pattern

'([A-Z0-0._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [209]:
data.str.findall(pattern,flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [210]:
matches=data.str.match(pattern,flags=re.IGNORECASE)

In [211]:
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [212]:
matches.dropna(inplace=True)

In [213]:
matches.str.get(1)

AttributeError: Can only use .str accessor with string values!

In [214]:
matches.str[0]

AttributeError: Can only use .str accessor with string values!

### str[]: 可以對字串做切片。

In [215]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object