### 6-3-1 遺漏資料的處理方法

In [1]:
import numpy as np
from numpy import nan as NA
import pandas as pd

df = pd.DataFrame(np.random.rand(10,4))

# 設定為NA
df.iloc[1,0] = NA
df.iloc[2:3,2] = NA
df.iloc[5:,3] = NA

In [2]:
df

Unnamed: 0,0,1,2,3
0,0.301974,0.987429,0.723113,0.730064
1,,0.998495,0.519317,0.683654
2,0.296273,0.435256,,0.131207
3,0.394902,0.256377,0.241552,0.168446
4,0.09759,0.439073,0.767179,0.396987
5,0.138265,0.159345,0.462462,
6,0.458835,0.546847,0.919949,
7,0.514916,0.05626,0.46539,
8,0.945792,0.600119,0.800126,
9,0.450496,0.360723,0.952125,


In [3]:
# 成批刪除，將存在NaN的列全部刪除
df.dropna()

Unnamed: 0,0,1,2,3
0,0.301974,0.987429,0.723113,0.730064
3,0.394902,0.256377,0.241552,0.168446
4,0.09759,0.439073,0.767179,0.396987


In [4]:
# 逐對刪除，刪除指定的列
df[[0,1]].dropna()

Unnamed: 0,0,1
0,0.301974,0.987429
2,0.296273,0.435256
3,0.394902,0.256377
4,0.09759,0.439073
5,0.138265,0.159345
6,0.458835,0.546847
7,0.514916,0.05626
8,0.945792,0.600119
9,0.450496,0.360723


In [6]:
# 填補，將NaN的位置上補上指定的值，這裡示範填上0
df.fillna(0)

Unnamed: 0,0,1,2,3
0,0.301974,0.987429,0.723113,0.730064
1,0.0,0.998495,0.519317,0.683654
2,0.296273,0.435256,0.0,0.131207
3,0.394902,0.256377,0.241552,0.168446
4,0.09759,0.439073,0.767179,0.396987
5,0.138265,0.159345,0.462462,0.0
6,0.458835,0.546847,0.919949,0.0
7,0.514916,0.05626,0.46539,0.0
8,0.945792,0.600119,0.800126,0.0
9,0.450496,0.360723,0.952125,0.0


In [11]:
# 以同列的上一個值來填補
df.fillna(method = 'ffill')

Unnamed: 0,0,1,2,3
0,0.301974,0.987429,0.723113,0.730064
1,0.301974,0.998495,0.519317,0.683654
2,0.296273,0.435256,0.519317,0.131207
3,0.394902,0.256377,0.241552,0.168446
4,0.09759,0.439073,0.767179,0.396987
5,0.138265,0.159345,0.462462,0.396987
6,0.458835,0.546847,0.919949,0.396987
7,0.514916,0.05626,0.46539,0.396987
8,0.945792,0.600119,0.800126,0.396987
9,0.450496,0.360723,0.952125,0.396987


In [12]:
# 各行的平均值（確認用）
df.mean()

0    0.399894
1    0.483993
2    0.650135
3    0.422072
dtype: float64

In [13]:
# 以平均值填補
df.fillna(df.mean())

Unnamed: 0,0,1,2,3
0,0.301974,0.987429,0.723113,0.730064
1,0.399894,0.998495,0.519317,0.683654
2,0.296273,0.435256,0.650135,0.131207
3,0.394902,0.256377,0.241552,0.168446
4,0.09759,0.439073,0.767179,0.396987
5,0.138265,0.159345,0.462462,0.422072
6,0.458835,0.546847,0.919949,0.422072
7,0.514916,0.05626,0.46539,0.422072
8,0.945792,0.600119,0.800126,0.422072
9,0.450496,0.360723,0.952125,0.422072


#### 練習問題6-13

In [15]:
import numpy as np
from numpy import nan as NA
import pandas as pd

df2 = pd.DataFrame(np.random.rand(15,6))

# 設定為NA
df2.iloc[2,0] = NA
df2.iloc[5:8,2] = NA
df2.iloc[7:9,3] = NA
df2.iloc[10,5] = NA
df2

Unnamed: 0,0,1,2,3,4,5
0,0.280873,0.361174,0.831726,0.000527,0.332562,0.496813
1,0.434767,0.467769,0.042339,0.426126,0.508405,0.663182
2,,0.973699,0.162277,0.043975,0.062317,0.948551
3,0.085847,0.485289,0.527748,0.409363,0.671392,0.760615
4,0.443084,0.254999,0.591717,0.21153,0.55649,0.602048
5,0.70582,0.146579,,0.680726,0.760529,0.359586
6,0.407835,0.22152,,0.825697,0.558914,0.870867
7,0.948557,0.669749,,,0.506716,0.119724
8,0.202862,0.169435,0.74561,,0.981722,0.462112
9,0.835567,0.63617,0.443324,0.685062,0.081836,0.457096


In [16]:
df2.dropna()

Unnamed: 0,0,1,2,3,4,5
0,0.280873,0.361174,0.831726,0.000527,0.332562,0.496813
1,0.434767,0.467769,0.042339,0.426126,0.508405,0.663182
3,0.085847,0.485289,0.527748,0.409363,0.671392,0.760615
4,0.443084,0.254999,0.591717,0.21153,0.55649,0.602048
9,0.835567,0.63617,0.443324,0.685062,0.081836,0.457096
11,0.774464,0.037664,0.314031,0.9158,0.133826,0.096108
12,0.792987,0.66905,0.656524,0.005008,0.978051,0.701769
13,0.07152,0.100934,0.15647,0.456634,0.23846,0.786167
14,0.871131,0.623069,0.750385,0.759984,0.113319,0.70163


#### 練習問題6-14

In [17]:
df2.fillna(0)

Unnamed: 0,0,1,2,3,4,5
0,0.280873,0.361174,0.831726,0.000527,0.332562,0.496813
1,0.434767,0.467769,0.042339,0.426126,0.508405,0.663182
2,0.0,0.973699,0.162277,0.043975,0.062317,0.948551
3,0.085847,0.485289,0.527748,0.409363,0.671392,0.760615
4,0.443084,0.254999,0.591717,0.21153,0.55649,0.602048
5,0.70582,0.146579,0.0,0.680726,0.760529,0.359586
6,0.407835,0.22152,0.0,0.825697,0.558914,0.870867
7,0.948557,0.669749,0.0,0.0,0.506716,0.119724
8,0.202862,0.169435,0.74561,0.0,0.981722,0.462112
9,0.835567,0.63617,0.443324,0.685062,0.081836,0.457096


In [18]:
df.fillna(df.mean())

Unnamed: 0,0,1,2,3
0,0.301974,0.987429,0.723113,0.730064
1,0.399894,0.998495,0.519317,0.683654
2,0.296273,0.435256,0.650135,0.131207
3,0.394902,0.256377,0.241552,0.168446
4,0.09759,0.439073,0.767179,0.396987
5,0.138265,0.159345,0.462462,0.422072
6,0.458835,0.546847,0.919949,0.422072
7,0.514916,0.05626,0.46539,0.422072
8,0.945792,0.600119,0.800126,0.422072
9,0.450496,0.360723,0.952125,0.422072
