# 处理丢失数据

In [None]:
"""
有两种丢失数据：
   None
   np.nan(NaN)
"""

In [4]:
import numpy as np

In [5]:
type(None)

NoneType

In [6]:
type(np.nan)

float

# 1. None
None是Python自带的，其类型为python object。因此，None不能参与到任何计算中。

object类型的运算要比int类型的运算慢得多
计算不同数据类型求和时间
%timeit np.arange(1e5,dtype=xxx).sum()

In [8]:
1e6

1000000.0

In [9]:
%timeit np.arange(1e6,dtype=int).sum()

3.33 ms ± 51.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
%timeit np.arange(1e6,dtype=float).sum()

6.87 ms ± 50.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
%timeit np.arange(1e6,dtype=object).sum()

81.1 ms ± 3.04 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# 2. np.nan（NaN）
np.nan是浮点类型，能参与到计算中。但计算的结果总是NaN。

但可以使用np.nan*()函数来计算nan，此时视nan为0。

In [12]:
nd = np.array([10,20,30,np.nan])
nd.sum()

nan

In [13]:
np.nansum(nd)

60.0

In [15]:
nd = np.array([10,20,30,np.nan,None])
nd

array([10, 20, 30, nan, None], dtype=object)

In [16]:
nd.sum()

TypeError: unsupported operand type(s) for +: 'float' and 'NoneType'

# 3. pandas中的None与NaN
### 1) pandas中None与np.nan都视作np.nan¶

创建DataFrame

In [18]:
import pandas as pd
from pandas import Series, DataFrame

In [26]:
df = DataFrame([10,20,30,None,np.nan],index=["Tom","Jerry","Spicke","Lilei","Mark"],columns=["python"])
df

Unnamed: 0,python
Tom,10.0
Jerry,20.0
Spicke,30.0
Lilei,
Mark,


In [27]:
df.sum()

python    60.0
dtype: float64

#### 关于Nan   numpy 和pandas 是不一样的

In [30]:
df = DataFrame([[10,20,30,None,np.nan],
               [22,33,44,12,None],
                [np.nan, 1,2,3,4]],
               index=["Tom","Jerry","Spicke"],
               columns=["python","Java","C++","Ruby","Html"])
df

Unnamed: 0,python,Java,C++,Ruby,Html
Tom,10.0,20,30,,
Jerry,22.0,33,44,12.0,
Spicke,,1,2,3.0,4.0


In [31]:
df.sum(axis=1)

Tom        60.0
Jerry     111.0
Spicke     10.0
dtype: float64

使用DataFrame行索引与列索引修改DataFrame数据

In [32]:
df["Java"]["Spicke"] = 60

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [33]:
df

Unnamed: 0,python,Java,C++,Ruby,Html
Tom,10.0,20,30,,
Jerry,22.0,33,44,12.0,
Spicke,,60,2,3.0,4.0


#### 2) pandas中None与np.nan的操作

In [34]:
df

Unnamed: 0,python,Java,C++,Ruby,Html
Tom,10.0,20,30,,
Jerry,22.0,33,44,12.0,
Spicke,,60,2,3.0,4.0


In [40]:
is_null = df.isnull()
is_null

Unnamed: 0,python,Java,C++,Ruby,Html
Tom,False,False,False,True,True
Jerry,False,False,False,False,True
Spicke,True,False,False,False,False


In [41]:
# 专门用来判断空值的
is_null = is_null.any(axis=1)
is_null

Tom       True
Jerry     True
Spicke    True
dtype: bool

##### 总结：一般 isnull()和any() 连用

In [45]:
df2 = DataFrame([[60,45,90,88,37],
               [22,33,44,12,56],
                [67, 1,2,3,4]],
               index=["Hei","Cao","Nini"],
               columns=["python","Java","C++","Ruby","Html"])
df2

Unnamed: 0,python,Java,C++,Ruby,Html
Hei,60,45,90,88,37
Cao,22,33,44,12,56
Nini,67,1,2,3,4


In [46]:
df3 = df.add(df2,fill_value=0)
df3

Unnamed: 0,python,Java,C++,Ruby,Html
Cao,22.0,33.0,44.0,12.0,56.0
Hei,60.0,45.0,90.0,88.0,37.0
Jerry,22.0,33.0,44.0,12.0,
Nini,67.0,1.0,2.0,3.0,4.0
Spicke,,60.0,2.0,3.0,4.0
Tom,10.0,20.0,30.0,,


In [47]:
df3_isnull = df3.isnull().any(axis=1)
df3_isnull

Cao       False
Hei       False
Jerry      True
Nini      False
Spicke     True
Tom        True
dtype: bool

In [48]:
df3[df3_isnull]  #根据条件进行过滤，过滤出来都是带有空值的数据

Unnamed: 0,python,Java,C++,Ruby,Html
Jerry,22.0,33.0,44.0,12.0,
Spicke,,60.0,2.0,3.0,4.0
Tom,10.0,20.0,30.0,,


##### (1)判断函数

In [49]:
df3

Unnamed: 0,python,Java,C++,Ruby,Html
Cao,22.0,33.0,44.0,12.0,56.0
Hei,60.0,45.0,90.0,88.0,37.0
Jerry,22.0,33.0,44.0,12.0,
Nini,67.0,1.0,2.0,3.0,4.0
Spicke,,60.0,2.0,3.0,4.0
Tom,10.0,20.0,30.0,,


In [50]:
df3_notnull = df3.notnull().all(axis=1)
df3_notnull

Cao        True
Hei        True
Jerry     False
Nini       True
Spicke    False
Tom       False
dtype: bool

In [52]:
df3[df3_notnull]  #把带有空值的过滤掉， 只剩下全部有值的数据

Unnamed: 0,python,Java,C++,Ruby,Html
Cao,22.0,33.0,44.0,12.0,56.0
Hei,60.0,45.0,90.0,88.0,37.0
Nini,67.0,1.0,2.0,3.0,4.0


In [54]:
df3

Unnamed: 0,python,Java,C++,Ruby,Html
Cao,22.0,33.0,44.0,12.0,56.0
Hei,60.0,45.0,90.0,88.0,37.0
Jerry,22.0,33.0,44.0,12.0,
Nini,67.0,1.0,2.0,3.0,4.0
Spicke,,60.0,2.0,3.0,4.0
Tom,10.0,20.0,30.0,,


In [56]:
# 通过判断条件来进行过滤
cond = (df3 >= 30).all(axis=1)
cond

Cao       False
Hei        True
Jerry     False
Nini      False
Spicke    False
Tom       False
dtype: bool

In [57]:
df3[cond]

Unnamed: 0,python,Java,C++,Ruby,Html
Hei,60.0,45.0,90.0,88.0,37.0


##### (2) 过滤函数

In [58]:
df3

Unnamed: 0,python,Java,C++,Ruby,Html
Cao,22.0,33.0,44.0,12.0,56.0
Hei,60.0,45.0,90.0,88.0,37.0
Jerry,22.0,33.0,44.0,12.0,
Nini,67.0,1.0,2.0,3.0,4.0
Spicke,,60.0,2.0,3.0,4.0
Tom,10.0,20.0,30.0,,


In [59]:
df3["H5"] = np.nan
df3

Unnamed: 0,python,Java,C++,Ruby,Html,H5
Cao,22.0,33.0,44.0,12.0,56.0,
Hei,60.0,45.0,90.0,88.0,37.0,
Jerry,22.0,33.0,44.0,12.0,,
Nini,67.0,1.0,2.0,3.0,4.0,
Spicke,,60.0,2.0,3.0,4.0,
Tom,10.0,20.0,30.0,,,


In [60]:
df3.dropna(axis=1)  # 通过axis = 0,或者是1  可以改变过滤掉空值的轴  看是行还是列

Unnamed: 0,Java,C++
Cao,33.0,44.0
Hei,45.0,90.0
Jerry,33.0,44.0
Nini,1.0,2.0
Spicke,60.0,2.0
Tom,20.0,30.0


In [61]:
df3.dropna(axis=1,how="all")

Unnamed: 0,python,Java,C++,Ruby,Html
Cao,22.0,33.0,44.0,12.0,56.0
Hei,60.0,45.0,90.0,88.0,37.0
Jerry,22.0,33.0,44.0,12.0,
Nini,67.0,1.0,2.0,3.0,4.0
Spicke,,60.0,2.0,3.0,4.0
Tom,10.0,20.0,30.0,,


##### (3) 填充函数 Series/DataFrame

fillna()

In [62]:
df3

Unnamed: 0,python,Java,C++,Ruby,Html,H5
Cao,22.0,33.0,44.0,12.0,56.0,
Hei,60.0,45.0,90.0,88.0,37.0,
Jerry,22.0,33.0,44.0,12.0,,
Nini,67.0,1.0,2.0,3.0,4.0,
Spicke,,60.0,2.0,3.0,4.0,
Tom,10.0,20.0,30.0,,,


In [63]:
df3.fillna(-1)

Unnamed: 0,python,Java,C++,Ruby,Html,H5
Cao,22.0,33.0,44.0,12.0,56.0,-1.0
Hei,60.0,45.0,90.0,88.0,37.0,-1.0
Jerry,22.0,33.0,44.0,12.0,-1.0,-1.0
Nini,67.0,1.0,2.0,3.0,4.0,-1.0
Spicke,-1.0,60.0,2.0,3.0,4.0,-1.0
Tom,10.0,20.0,30.0,-1.0,-1.0,-1.0


In [64]:
df3.fillna(method="backfill")   # 'backfill', 'bfill', 'pad', 'ffill', None

Unnamed: 0,python,Java,C++,Ruby,Html,H5
Cao,22.0,33.0,44.0,12.0,56.0,
Hei,60.0,45.0,90.0,88.0,37.0,
Jerry,22.0,33.0,44.0,12.0,4.0,
Nini,67.0,1.0,2.0,3.0,4.0,
Spicke,10.0,60.0,2.0,3.0,4.0,
Tom,10.0,20.0,30.0,,,


In [66]:
df3.fillna(method="ffill") #'backfill', 'bfill', 'pad', 'ffill', None
#forward

Unnamed: 0,python,Java,C++,Ruby,Html,H5
Cao,22.0,33.0,44.0,12.0,56.0,
Hei,60.0,45.0,90.0,88.0,37.0,
Jerry,22.0,33.0,44.0,12.0,37.0,
Nini,67.0,1.0,2.0,3.0,4.0,
Spicke,67.0,60.0,2.0,3.0,4.0,
Tom,10.0,20.0,30.0,3.0,4.0,


In [67]:
df3.fillna(method="bfill",axis = 1)

Unnamed: 0,python,Java,C++,Ruby,Html,H5
Cao,22.0,33.0,44.0,12.0,56.0,
Hei,60.0,45.0,90.0,88.0,37.0,
Jerry,22.0,33.0,44.0,12.0,,
Nini,67.0,1.0,2.0,3.0,4.0,
Spicke,60.0,60.0,2.0,3.0,4.0,
Tom,10.0,20.0,30.0,,,
