## <b>处理丢失数据</b>

In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib as mpl

有两种丢失数据：
* None
* np.nan(NaN)

In [2]:
type(None)

NoneType

In [4]:
# NaN 不能参与运算，不是一个数值只是一个展示符号
type(np.nan)

float

In [5]:
# np.nan和任何书运算，得到结果还是nan
np.nan+1

nan

In [8]:
# 类型强制统一  numpy
np.array([1,2,3,None])

array([1, 2, 3, None], dtype=object)

In [10]:
%timeit np.arange(1e5, dtype=object).sum()

7.17 ms ± 321 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
%timeit np.arange(1e5, dtype=float).sum()

161 μs ± 7.13 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [16]:
s = Series(data=np.random.randint(0,10,size=(5)))
s

0    5
1    1
2    5
3    3
4    5
dtype: int32

In [19]:
# pandas对象会把None自动优化成np.nan
s.loc[1]=None

In [18]:
s

0    5.0
1    NaN
2    5.0
3    3.0
4    5.0
dtype: float64

In [24]:
df = DataFrame(data=np.random.randint(0,10,size=(3,3)),columns=list("ABC"))
df.loc[1,"B"]=None

In [25]:
df

Unnamed: 0,A,B,C
0,6,7.0,6
1,3,,6
2,5,4.0,8


In [27]:
# 查看DataFrame的每一列的数据元素的类型
df.dtypes

A      int32
B    float64
C      int32
dtype: object

In [31]:
# 查看哪些列存在空值
df.isnull().any()

A    False
B     True
C    False
dtype: bool

In [32]:
# 查看哪些行存在空值
df.isnull().any(axis=1)

0    False
1     True
2    False
dtype: bool

In [34]:
df.notnull().all()

A     True
B    False
C     True
dtype: bool

In [39]:
# 默认删除的是行
# axis空值删除的方向
# how
df.dropna(axis=1,how='all')

Unnamed: 0,A,B,C
0,6,7.0,6
1,3,,6
2,5,4.0,8


In [41]:
df.loc[0,"B"] = np.nan
df.loc[2,"B"] = np.nan
df

Unnamed: 0,A,B,C
0,6,,6
1,3,,6
2,5,,8


In [42]:
df.dropna(axis=1,how='all')

Unnamed: 0,A,C
0,6,6
1,3,6
2,5,8


In [44]:
# 使用一个数填充
df.fillna(value=10)

Unnamed: 0,A,B,C
0,6,10.0,6
1,3,10.0,6
2,5,10.0,8


In [46]:
user_df = DataFrame(data={
    "high":[187,165,190,178],
    "weight":[74,49,80,67],
    "age":[21,23,42,19]
},index=["wood","max","luca","lila"])
user_df.loc["wood","weight"] = np.nan
user_df.loc["max","high"] = np.nan
user_df.loc["lila","age"] = np.nan
user_df

Unnamed: 0,high,weight,age
wood,187.0,,21.0
max,,49.0,23.0
luca,190.0,80.0,42.0
lila,178.0,67.0,


In [47]:
user_df.fillna(value=22)

Unnamed: 0,high,weight,age
wood,187.0,22.0,21.0
max,22.0,49.0,23.0
luca,190.0,80.0,42.0
lila,178.0,67.0,22.0


In [50]:
# 通常的办法就是使用每一列的某一个聚合指标来对每一列进行填充
col_mean= user_df.mean()

In [53]:
# 可以简写为user_df.fillna(value=user_df.mean())
user_df.fillna(value=col_mean)

Unnamed: 0,high,weight,age
wood,187.0,65.333333,21.0
max,185.0,49.0,23.0
luca,190.0,80.0,42.0
lila,178.0,67.0,28.666667


In [57]:
user_df.fillna(method='ffill')

  user_df.fillna(method='ffill')


Unnamed: 0,high,weight,age
wood,187.0,,21.0
max,187.0,49.0,23.0
luca,190.0,80.0,42.0
lila,178.0,67.0,42.0


In [60]:
columns = ["语文","数学","英语"]
index=["张三","李四"]
data = np.random.randint(0,150,size=(2,3))
ddd3 = DataFrame(data =data,index=index,columns=columns)
ddd3

Unnamed: 0,语文,数学,英语
张三,1,63,25
李四,124,42,62


In [61]:
ddd3.iloc[0,2]=np.nan
ddd3

Unnamed: 0,语文,数学,英语
张三,1,63,
李四,124,42,62.0


In [62]:
ddd3.fillna(method='ffill',axis=1)

  ddd3.fillna(method='ffill',axis=1)


Unnamed: 0,语文,数学,英语
张三,1.0,63.0,63.0
李四,124.0,42.0,62.0


In [63]:
ddd3.fillna(method='bfill')

  ddd3.fillna(method='bfill')


Unnamed: 0,语文,数学,英语
张三,1,63,62.0
李四,124,42,62.0


In [None]:
查看是否保存