In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 高级处理-缺失值处理

## 判断缺失值是否存在 pd.isnull(a) pd.notnull(a)

In [3]:
# 读取电影数据
movie = pd.read_csv("../data/IMDB-Movie-Data.csv")
# iloc切片: 行,列
movie.shape

(1000, 12)

In [4]:
movie.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [5]:
pd.notnull(movie)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
995,True,True,True,True,True,True,True,True,True,True,False,True
996,True,True,True,True,True,True,True,True,True,True,True,True
997,True,True,True,True,True,True,True,True,True,True,True,True
998,True,True,True,True,True,True,True,True,True,True,False,True


In [6]:
pd.isnull(movie)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False,False,False,True,False
996,False,False,False,False,False,False,False,False,False,False,False,False
997,False,False,False,False,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,False,False,False,False,True,False


In [7]:
# 有一个为null就为True
np.any(pd.isnull(movie))

True

In [8]:
# 有一个为null就为False
np.all(pd.notnull(movie))

False

## 存在缺失值nan,并且是np.nan

### a.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

参数：

- axis：轴。0或'index'，表示按行删除；1或'columns'，表示按列删除。
- how：筛选方式。‘any’，表示该行/列只要有一个以上的空值，就删除该行/列；‘all’，表示该行/列全部都为空值，就删除该行/列。
- thresh：非空元素最低数量。int型，默认为None。如果该行/列中，非空元素数量小于这个值，就删除该行/列。
- subset：子集。列表，元素为行或者列的索引。如果axis=0或者‘index’，subset中元素为列的索引；如果axis=1或者‘column’，subset中元素为行的索引。由subset限制的子区域，是判断是否删除该行/列的条件判断区域。
- inplace：是否原地替换。布尔值，默认为False。如果为True，则在原DataFrame上进行操作，返回值为None。

In [11]:
movie.shape

(1000, 12)

In [18]:
movie_dropna = movie.dropna(axis=0, inplace=False)
movie_dropna.shape

(838, 12)

In [19]:
np.any(movie_dropna.isnull())

False

### 替换缺失值 a.fillna(替换值, inplace=False)

In [22]:
np.any(movie["Revenue (Millions)"].isnull())

True

In [24]:
movie_fillna = movie["Revenue (Millions)"].fillna(
    movie["Revenue (Millions)"].mean(), inplace=False
)
movie_fillna.shape

(1000,)

In [25]:
np.any(movie["Revenue (Millions)"].isnull())

True

#### 检查所有列

In [26]:
# 分别查看每一列
for i in movie.columns:
    # 只要一列有一个空值就为True
    if np.any(movie[i].isnull()):
        print(i)
        # 使用中位数替换空值
        movie[i].fillna(movie[i].mean(), inplace=True)

Revenue (Millions)
Metascore


In [27]:
np.any(pd.isnull(movie[i]))

False

# 不是缺失值nan，有默认标记的 "?"

In [28]:
wis = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
)
wis.head()

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


In [29]:
# 虽然不为空,但是有?
np.any(wis.isnull())

False

In [30]:
wis.replace(to_replace="?", value=np.nan, inplace=True)

In [31]:
# 有null了
np.any(wis.isnull())

True

In [33]:
wis.dropna(axis=0, inplace=True)

In [34]:
np.any(wis.isnull())

False