In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 高级处理-缺失值处理

## 判断缺失值是否存在    pd.isnull(a)    pd.notnull(a)

In [2]:
# 读取电影数据
movie = pd.read_csv("../data/IMDB-Movie-Data.csv")
# iloc切片: 行,列
movie.iloc[:5, :5]

Unnamed: 0,Rank,Title,Genre,Description,Director
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer


In [3]:
pd.notnull(movie)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
995,True,True,True,True,True,True,True,True,True,True,False,True
996,True,True,True,True,True,True,True,True,True,True,True,True
997,True,True,True,True,True,True,True,True,True,True,True,True
998,True,True,True,True,True,True,True,True,True,True,False,True


In [4]:
# 是不是全都不为空
np.all(pd.notnull(movie))

False

In [5]:
# 有空的吗
np.any(pd.isnull(movie))

True

## 存在缺失值nan,并且是np.nan

### 删除    a.dropna()    默认不修改原数据
参数:
- axis: 0代表row, 1代表col
- inplace: True 修改原值

pandas删除缺失值，使用dropna的前提是:缺失值的类型必须是np.nan

In [6]:
movie.shape

(1000, 12)

In [7]:
# 不修改原数据
data = movie.dropna(axis=0)

In [8]:
np.any(data.isnull())

False

In [9]:
data.shape

(838, 12)

### 替换缺失值	a.fillna(替换值, inplace=True)

In [10]:
movie['Revenue (Millions)']

0      333.13
1      126.46
2      138.12
3      270.32
4      325.02
        ...  
995       NaN
996     17.54
997     58.01
998       NaN
999     19.64
Name: Revenue (Millions), Length: 1000, dtype: float64

In [11]:
# 使用中位数填充NaN
movie['Revenue (Millions)'].fillna(movie['Revenue (Millions)'].mean(), inplace=True)

In [12]:
movie['Revenue (Millions)']

0      333.130000
1      126.460000
2      138.120000
3      270.320000
4      325.020000
          ...    
995     82.956376
996     17.540000
997     58.010000
998     82.956376
999     19.640000
Name: Revenue (Millions), Length: 1000, dtype: float64

In [13]:
# 分别查看每一列
for i in movie.columns:
    # 只要一列有一个空值就为True
    if np.any(pd.isnull(movie[i])) == True:
        print(i)
        # 使用中位数替换空值
        movie[i].fillna(movie[i].mean(), inplace=True)

Metascore


In [14]:
np.any(pd.isnull(movie[i]))
# False 说明全不为空了

False

 ## 不是缺失值nan，有默认标记的 入 ?

In [15]:
wis = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data")
wis.head()

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


In [16]:
# 
np.any(pd.isnull(wis))

False

In [17]:
# 1、先替换"?"为 np.nan
wis = wis.replace(to_replace="?", value=np.nan)

In [18]:
# 2、再进行缺失值的处理
wis = wis.dropna()
wis[:5]

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4
