# Data Wrangling (cleaning)

## 缺失值
## 重复值
## 异常值

In [2]:
import numpy as np
import pandas as pd

## 缺失值

In [1]:
# Missing values come in two forms:
# - None
# - np.nan (NaN)

# - pandas automatically converts None to NaN when it is stored in numeric data

In [3]:
type(None) # None is a NoneType object; using it in arithmetic raises an error and stops the computation

NoneType

In [6]:
type(np.nan) # NaN is a float, so it can take part in arithmetic

float

## Pandas 处理空值的操作
### （isnull/notnull/any/all/dropna/fillna）

In [7]:
# Demo frame for the missing-value examples: 7 rows x 6 columns of random integers in [0, 100).
df = pd.DataFrame(data=np.random.randint(low=0, high=100, size=(7, 6)))

In [10]:
# Columns 2 and 3 start out as integers, which cannot hold NaN. Cast them to
# float first so the assignments below do not rely on implicit upcasting
# (deprecated since pandas 2.1).
df = df.astype({2: float, 3: float})
# None is stored as NaN in a float column.
df.iloc[1, 2] = None
# np.NaN was removed in NumPy 2.0; np.nan is the supported spelling.
df.iloc[1, 3] = np.nan
df

Unnamed: 0,0,1,2,3,4,5
0,67,18,74.0,59.0,12,63
1,54,55,,,39,37
2,36,53,59.0,67.0,94,86
3,53,25,10.0,71.0,72,57
4,31,98,68.0,80.0,63,65
5,80,7,80.0,67.0,85,89
6,74,80,41.0,79.0,75,36


In [13]:
# Drop every row that contains at least one missing value.
df.dropna(axis="index")

Unnamed: 0,0,1,2,3,4,5
0,67,18,74.0,59.0,12,63
2,36,53,59.0,67.0,94,86
3,53,25,10.0,71.0,72,57
4,31,98,68.0,80.0,63,65
5,80,7,80.0,67.0,85,89
6,74,80,41.0,79.0,75,36


In [14]:
# Drop every column that contains at least one missing value.
df.dropna(axis="columns")

Unnamed: 0,0,1,4,5
0,67,18,12,63
1,54,55,39,37
2,36,53,94,86
3,53,25,72,57
4,31,98,63,65
5,80,7,85,89
6,74,80,75,36


In [21]:
# Which rows contain at least one NaN?
# Build the row-wise boolean mask once, then use it to select the matching rows.
has_nan_in_row = df.isnull().any(axis=1)
df.loc[has_nan_in_row]

Unnamed: 0,0,1,2,3,4,5
1,54,55,,,39,37


In [25]:
# Which columns contain at least one NaN?
# Build the column-wise boolean mask once, then use it to select the matching columns.
has_nan_in_col = df.isnull().any(axis=0)
df.loc[:, has_nan_in_col]

Unnamed: 0,2,3
0,74.0,59.0
1,,
2,59.0,67.0
3,10.0,71.0
4,68.0,80.0
5,80.0,67.0
6,41.0,79.0


## fillna

In [29]:
# Replace every missing value with a fixed constant.
df.fillna(777)

Unnamed: 0,0,1,2,3,4,5
0,67,18,74.0,59.0,12,63
1,54,55,777.0,777.0,39,37
2,36,53,59.0,67.0,94,86
3,53,25,10.0,71.0,72,57
4,31,98,68.0,80.0,63,65
5,80,7,80.0,67.0,85,89
6,74,80,41.0,79.0,75,36


In [31]:
# Forward fill down each column: a missing value takes the value from the row above.
# DataFrame.ffill replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
df.ffill(axis=0)

Unnamed: 0,0,1,2,3,4,5
0,67,18,74.0,59.0,12,63
1,54,55,74.0,59.0,39,37
2,36,53,59.0,67.0,94,86
3,53,25,10.0,71.0,72,57
4,31,98,68.0,80.0,63,65
5,80,7,80.0,67.0,85,89
6,74,80,41.0,79.0,75,36


In [32]:
# Forward fill along each row: a missing value takes the value from the column to its left.
# DataFrame.ffill replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
df.ffill(axis=1)

Unnamed: 0,0,1,2,3,4,5
0,67.0,18.0,74.0,59.0,12.0,63.0
1,54.0,55.0,55.0,55.0,39.0,37.0
2,36.0,53.0,59.0,67.0,94.0,86.0
3,53.0,25.0,10.0,71.0,72.0,57.0
4,31.0,98.0,68.0,80.0,63.0,65.0
5,80.0,7.0,80.0,67.0,85.0,89.0
6,74.0,80.0,41.0,79.0,75.0,36.0


In [33]:
# Backward fill up each column: a missing value takes the value from the row below.
# DataFrame.bfill replaces fillna(method='bfill'), which is deprecated since pandas 2.1.
df.bfill(axis=0)

Unnamed: 0,0,1,2,3,4,5
0,67,18,74.0,59.0,12,63
1,54,55,59.0,67.0,39,37
2,36,53,59.0,67.0,94,86
3,53,25,10.0,71.0,72,57
4,31,98,68.0,80.0,63,65
5,80,7,80.0,67.0,85,89
6,74,80,41.0,79.0,75,36


## drop columns / rows

In [39]:
# Return a copy without columns 0 and 1; df itself is left unchanged.
df.drop(columns=[0, 1])

Unnamed: 0,2,3,4,5
0,74.0,59.0,12,63
1,,,39,37
2,59.0,67.0,94,86
3,10.0,71.0,72,57
4,68.0,80.0,63,65
5,80.0,67.0,85,89
6,41.0,79.0,75,36


In [43]:
# Return a copy without the rows labelled 1 and 4; df itself is left unchanged.
df.drop(labels=[1, 4], axis="index")

Unnamed: 0,0,1,2,3,4,5
0,67,18,74.0,59.0,12,63
2,36,53,59.0,67.0,94,86
3,53,25,10.0,71.0,72,57
5,80,7,80.0,67.0,85,89
6,74,80,41.0,79.0,75,36


In [59]:
# Keep only columns 1 through 4 as a new frame.
df_new = df.loc[:, [1, 2, 3, 4]]
df_new

Unnamed: 0,1,2,3,4
0,18,74.0,59.0,12
1,55,,,39
2,53,59.0,67.0,94
3,25,10.0,71.0,72
4,98,68.0,80.0,63
5,7,80.0,67.0,85
6,80,41.0,79.0,75


In [62]:
# Forward fill down the columns, then backward fill to cover any NaN left in the
# first rows, which have no earlier value to copy.
# ffill/bfill replace fillna(method=...), which is deprecated since pandas 2.1.
df_n = df_new.ffill(axis=0).bfill(axis=0)

In [65]:
# Per column: does any missing value remain after filling?
df_n.isna().any(axis=0)

1    False
2    False
3    False
4    False
dtype: bool

## duplicated values

In [67]:
# Demo frame for the duplicate-row examples: 7 rows x 4 columns of random integers in [0, 10).
df = pd.DataFrame(np.random.randint(low=0, high=10, size=(7, 4)))

In [71]:
# Overwrite rows 2, 4 and 6 with zeros so the frame contains three identical rows.
df.iloc[[2, 4, 6], :] = 0

In [76]:
# Remove every row that has a duplicate. keep=False marks all copies as duplicates;
# keep='first' or keep='last' would retain one of them.
df[~df.duplicated(keep=False)]

Unnamed: 0,0,1,2,3
0,0,8,7,9
1,1,6,9,0
3,8,7,1,2
5,6,6,6,0


## 异常值

In [79]:
# Outlier demo: 1000 rows x 3 columns ('A', 'B', 'C') of random floats in [0, 1).
df = pd.DataFrame(np.random.random(size=(1000, 3)), columns=list("ABC"))

# Threshold: two standard deviations of column 'C'.
twice_std = 2 * df["C"].std()

# Keep only the rows whose 'C' value is strictly below the threshold.
# NOTE(review): this compares the raw value, not its distance from the mean;
# confirm this is the intended outlier rule.
df[df["C"] < twice_std]

Unnamed: 0,A,B,C
1,0.334678,0.657539,0.002794
3,0.842848,0.751797,0.221977
5,0.165169,0.108490,0.224812
6,0.576522,0.211173,0.082260
8,0.963837,0.252891,0.149935
...,...,...,...
989,0.283358,0.769549,0.102190
993,0.676610,0.947145,0.500159
994,0.146807,0.413681,0.501733
995,0.914084,0.698974,0.322870
