#### 处理丢失数据
- 有两种丢失数据：
  - None
  - np.nan(NaN)

In [19]:
import numpy as np
# None is Python's null object; its type is NoneType, an object type rather than a number.
type(None)

NoneType

In [20]:
# np.nan is an ordinary float value, so it fits into numeric (floating-point) data.
type(np.nan)

float

- 为什么在数据分析中需要用到的是浮点类型的空值（NaN），而不是对象类型的空值（None）？
  - 数据分析中常常需要对原始数据进行各种运算，如果原始数据中的空值是NaN的形式，则不会干扰或者中断运算
  - NaN可以参与运算（结果仍为NaN）
  - None不可以参与运算（例如 None + 1 会抛出 TypeError）

- 在pandas中如果数值型数据里遇到了None形式的空值，pandas会自动将其转换成NaN形式

In [21]:
# Arithmetic involving NaN does not raise; the result is simply NaN.
np.nan + 1

nan

#### pandas处理空值的操作
- isnull
- notnull
- any
- all
- dropna
- fillna

In [22]:
import pandas as pd
from pandas import DataFrame, Series

In [23]:
# Build an 8x6 frame of random integers and inject three missing values.
df = DataFrame(data=np.random.randint(0, 100, size=(8, 6)))
# Columns 2, 3 and 4 are about to receive missing values. Cast them to float
# explicitly first: writing None/NaN into an integer column relies on silent
# upcasting, which newer pandas releases deprecate.
df = df.astype({2: float, 3: float, 4: float})
# None is stored as NaN in a float column, exactly like np.nan.
df.iloc[2, 3] = None
df.iloc[4, 4] = np.nan
df.iloc[5, 2] = None
df

Unnamed: 0,0,1,2,3,4,5
0,16,71,17.0,87.0,68.0,2
1,12,80,50.0,67.0,16.0,91
2,59,85,13.0,,62.0,98
3,71,29,55.0,25.0,36.0,47
4,53,27,67.0,17.0,,25
5,18,45,,69.0,77.0,23
6,27,51,12.0,26.0,77.0,68
7,74,78,94.0,63.0,25.0,63


In [24]:
# Element-wise mask: True where a value is present, False where it is missing.
df.notnull()

Unnamed: 0,0,1,2,3,4,5
0,True,True,True,True,True,True
1,True,True,True,True,True,True
2,True,True,True,False,True,True
3,True,True,True,True,True,True
4,True,True,True,True,False,True
5,True,True,False,True,True,True
6,True,True,True,True,True,True
7,True,True,True,True,True,True


In [25]:
# isnull() marks missing cells; any(axis=1) collapses each row to a single
# flag that is True when the row holds at least one missing value.
# Indexing df.index with that flag series yields the labels of those rows.
drop_index = df.index[df.isnull().any(axis=1)]
# Drop the flagged rows; the cleaned frame is returned rather than stored in df.
df.drop(index=drop_index)

Unnamed: 0,0,1,2,3,4,5
0,16,71,17.0,87.0,68.0,2
1,12,80,50.0,67.0,16.0,91
3,71,29,55.0,25.0,36.0,47
6,27,51,12.0,26.0,77.0,68
7,74,78,94.0,63.0,25.0,63


In [26]:
# notnull() marks present cells; all(axis=1) is True only for rows in which
# every value is present.
complete_rows = df.notnull().all(axis=1)
# Select just the complete rows with the boolean mask.
df.loc[complete_rows]

Unnamed: 0,0,1,2,3,4,5
0,16,71,17.0,87.0,68.0,2
1,12,80,50.0,67.0,16.0,91
3,71,29,55.0,25.0,36.0,47
6,27,51,12.0,26.0,77.0,68
7,74,78,94.0,63.0,25.0,63


- 规律
  - isnull 搭配 any：检测行或列中是否存在空值
  - notnull 搭配 all：检测行或列中是否全部为非空值

- dropna：可以直接将含有缺失值的行或者列删除

In [27]:
# Drop every row (axis=0) that contains at least one missing value; the result
# is returned and df itself is left unchanged.
df.dropna(axis=0)

Unnamed: 0,0,1,2,3,4,5
0,16,71,17.0,87.0,68.0,2
1,12,80,50.0,67.0,16.0,91
3,71,29,55.0,25.0,36.0,47
6,27,51,12.0,26.0,77.0,68
7,74,78,94.0,63.0,25.0,63


In [28]:
# Backward fill along axis=0: within each column, a missing value is replaced
# by the next valid value found in the rows below it.
df.bfill(axis=0)

Unnamed: 0,0,1,2,3,4,5
0,16,71,17.0,87.0,68.0,2
1,12,80,50.0,67.0,16.0,91
2,59,85,13.0,25.0,62.0,98
3,71,29,55.0,25.0,36.0,47
4,53,27,67.0,17.0,77.0,25
5,18,45,12.0,69.0,77.0,23
6,27,51,12.0,26.0,77.0,68
7,74,78,94.0,63.0,25.0,63


In [31]:
# Build an 8x4 frame of random integers, then overwrite rows 2, 4 and 5 with
# the same all-zero row so the frame contains exact duplicate rows.
df = DataFrame(data=np.random.randint(0, 100, size=(8, 4)))
zero_row = np.array([0, 0, 0, 0])
for row_pos in (2, 4, 5):
    df.iloc[row_pos] = zero_row
df


Unnamed: 0,0,1,2,3
0,26,62,9,50
1,36,25,8,49
2,0,0,0,0
3,83,16,95,96
4,0,0,0,0
5,0,0,0,0
6,50,11,40,50
7,89,63,95,99


In [None]:
# Remove duplicated rows, keeping only the first occurrence of each.
df.drop_duplicates(keep='first')

Unnamed: 0,0,1,2,3
0,26,62,9,50
1,36,25,8,49
2,0,0,0,0
3,83,16,95,96
6,50,11,40,50
7,89,63,95,99


In [None]:
# 1000 rows of random floats in three columns A, B and C.
df = DataFrame(data=np.random.random(size=(1000, 3)), columns=['A', 'B', 'C'])
# Outlier rule used in this exercise: a value in column C is an outlier when
# it is greater than twice the standard deviation of column C.
twice_std = 2 * df['C'].std()
is_outlier = df['C'] > twice_std
# Keep only the rows that were not flagged as outliers.
df.loc[~is_outlier]

Unnamed: 0,A,B,C
0,0.648051,0.526650,0.303466
1,0.111493,0.258675,0.277644
2,0.372196,0.386195,0.558934
4,0.348275,0.535242,0.266804
5,0.025267,0.373926,0.336796
...,...,...,...
991,0.711376,0.899220,0.391017
992,0.050938,0.822067,0.477247
994,0.172264,0.746677,0.335286
997,0.565415,0.187625,0.175747
