# Data Cleaning and Preparation

---

Author: Jiacheng 

Date: 2019-05-15

---

In [1]:
import numpy as np
import pandas as pd

---

## 1. Handling Missing Data
## 处理缺失数据

In [2]:
# Small Series of strings with one missing entry (NaN) to experiment on.
string_data = pd.Series(
    data=['dark', 'artichoke', np.nan, 'avocat'],
)
string_data

0         dark
1    artichoke
2          NaN
3       avocat
dtype: object

In [3]:
# Overwrite the first entry with Python's None; pandas treats None as a
# missing value too, as the null check in the next cell shows.
string_data[0] = None

In [4]:
# Boolean mask that is True wherever an entry is missing
# (both the None and the NaN entry are flagged).
string_data.isna()

0     True
1    False
2     True
3    False
dtype: bool

* ### 1. 滤除缺失数据

In [5]:
# Return a new Series with the missing entries removed; the original index
# labels of the surviving entries are kept.
string_data.dropna()

1    artichoke
3       avocat
dtype: object

In [9]:
# 4x3 frame covering the interesting cases: a complete row, a partially
# missing row, a fully missing row, and a row missing only its first value.
data = pd.DataFrame(
    data=[
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [11]:
# Drop every row that contains at least one NaN
# (the defaults axis=0, how='any' are spelled out for clarity).
cleaned = data.dropna(axis=0, how='any')
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [12]:
# Drop only the rows in which *every* value is NaN.
# This call works on rows; pass axis=1 / axis='columns' to do the same for columns.
clean2 = data.dropna(axis='index', how='all')
clean2

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [13]:
# Add a new column labelled 4 that is NaN in every row, so there is an
# all-missing column to drop in the next step.
data[4] = np.nan
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [16]:
# Drop only the columns that are entirely NaN; rows are left untouched.
clean3 = data.dropna(how='all', axis='columns')
clean3

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
# 7x3 frame of uniform random numbers in [0, 1).
# A locally seeded generator is used so that Restart & Run All reproduces the
# same values every time; the seed (42) is arbitrary. The global NumPy random
# state is deliberately left untouched.
df = pd.DataFrame(np.random.RandomState(42).rand(7, 3))
df

Unnamed: 0,0,1,2
0,0.838163,0.466194,0.118949
1,0.312286,0.17468,0.867592
2,0.831147,0.789635,0.259428
3,0.101773,0.312501,0.548222
4,0.160799,0.355009,0.743956
5,0.280305,0.394925,0.142269
6,0.112486,0.684926,0.395195


In [18]:
# Blank out part of the frame by position (this mutates df in place):
# the first four rows of the second column ...
df.iloc[:4, 1] = np.nan
# ... and the first two rows of the third column.
df.iloc[:2, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.838163,,
1,0.312286,,
2,0.831147,,0.259428
3,0.101773,,0.548222
4,0.160799,0.355009,0.743956
5,0.280305,0.394925,0.142269
6,0.112486,0.684926,0.395195


In [23]:
# Keep only the rows that have at least two non-NaN values.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.831147,,0.259428
3,0.101773,,0.548222
4,0.160799,0.355009,0.743956
5,0.280305,0.394925,0.142269
6,0.112486,0.684926,0.395195


* ### 2. 填充缺失数据

In [24]:
# Replace every missing value with the constant 0.
# Without inplace=True this returns a new DataFrame and leaves df unchanged.
df.fillna(0)

Unnamed: 0,0,1,2
0,0.838163,0.0,0.0
1,0.312286,0.0,0.0
2,0.831147,0.0,0.259428
3,0.101773,0.0,0.548222
4,0.160799,0.355009,0.743956
5,0.280305,0.394925,0.142269
6,0.112486,0.684926,0.395195


In [28]:
# Use a different fill value per column: the dict maps column label -> fill value
# (column 0 is not listed and is left as it is).
df.fillna({1: 0.6, 2: 100})

Unnamed: 0,0,1,2
0,0.838163,0.6,100.0
1,0.312286,0.6,100.0
2,0.831147,0.6,0.259428
3,0.101773,0.6,0.548222
4,0.160799,0.355009,0.743956
5,0.280305,0.394925,0.142269
6,0.112486,0.684926,0.395195


In [35]:
# Fill in place: inplace=True mutates df itself instead of returning a new frame.
# NOTE(review): the saved output of this cell shows 0.0 in column 2 for rows 4-6,
# although those entries were non-null in the frame displayed earlier. It looks
# like the cell was last executed out of order (the execution counts are not
# sequential) -- re-run the notebook top to bottom to refresh the output.
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.838163,0.0,0.0
1,0.312286,0.0,0.0
2,0.831147,0.0,0.259428
3,0.101773,0.0,0.548222
4,0.160799,0.355009,0.0
5,0.280305,0.394925,0.0
6,0.112486,0.684926,0.0


In [36]:
# Re-introduce gaps (in place) for the fill examples that follow:
# rows 1-2 of the second column ...
df.iloc[1:3, 1] = np.nan
# ... and rows 4 onward of the third column.
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.838163,0.0,0.0
1,0.312286,,0.0
2,0.831147,,0.259428
3,0.101773,0.0,0.548222
4,0.160799,0.355009,
5,0.280305,0.394925,
6,0.112486,0.684926,


In [37]:
# Forward fill: propagate the last valid value in each column downward into
# the missing entries that follow it.
# DataFrame.ffill() is used because the `method=` argument of fillna() is
# deprecated in recent pandas releases; the result is the same.
df.ffill()

Unnamed: 0,0,1,2
0,0.838163,0.0,0.0
1,0.312286,0.0,0.0
2,0.831147,0.0,0.259428
3,0.101773,0.0,0.548222
4,0.160799,0.355009,0.548222
5,0.280305,0.394925,0.548222
6,0.112486,0.684926,0.548222


In [38]:
# Forward fill, but fill at most two consecutive missing values per gap;
# anything beyond that stays NaN.
# DataFrame.ffill() is used because the `method=` argument of fillna() is
# deprecated in recent pandas releases; the result is the same.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.838163,0.0,0.0
1,0.312286,0.0,0.0
2,0.831147,0.0,0.259428
3,0.101773,0.0,0.548222
4,0.160799,0.355009,0.548222
5,0.280305,0.394925,0.548222
6,0.112486,0.684926,


In [39]:
# Numeric Series with two gaps, used to demonstrate filling with a statistic.
# Note that this rebinds `data`, which previously held a DataFrame.
data = pd.Series(
    data=[1.0, np.nan, 3.5, np.nan, 7.0],
)
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [40]:
# Fill the gaps with the mean of the observed values. mean() skips NaN by
# default, so the fill value here is (1.0 + 3.5 + 7.0) / 3.
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

--- 

## 2. Data Transformation
## 数据转换