In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Seed NumPy's global random generator so the random examples in later cells are repeatable.
np.random.seed(42)

### 处理缺失值 --> 删除缺失值

In [2]:
# pandas uses the floating-point value NaN to mark missing data.
animals = Series(['cat', 'dog', np.nan, 'duck'])
# Python's built-in None is also treated as NA (not available) in an object array.
animals.loc[0] = None
animals.isna()

0     True
1    False
2     True
3    False
dtype: bool

In [3]:
# dropna on a Series returns all non-null values together with their index labels.
# Equivalent to animals[animals.notnull()].
animals.dropna()

1     dog
3    duck
dtype: object

In [4]:
# On a DataFrame, dropna() removes every row that contains a missing value by default;
# passing how='all' removes only the rows whose values are all NA.
data = DataFrame(
    [
        [1, 6.5, 3],
        [1, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3],
    ]
)
data.dropna(axis=0, how='any')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [5]:
# Add a column labelled 4 that contains nothing but NaN.
data[4] = np.nan
# Drop columns instead of rows: only columns whose values are all NA are removed.
data.dropna(how='all', axis='columns')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


过滤DataFrame行的相关方法往往涉及时间序列数据。若只想保留至少含有一定数量非缺失观察值的行，可以使用thresh参数:

In [6]:
# 5 x 3 frame of standard-normal draws with missing values punched in:
# rows 0-3 of column 1 and rows 0-1 of column 2 (label slices are inclusive).
df = DataFrame(np.random.randn(5, 3))
df.loc[:3, 1] = np.nan
df.loc[:1, 2] = np.nan
df

Unnamed: 0,0,1,2
0,0.496714,,
1,1.52303,,
2,1.579213,,-0.469474
3,0.54256,,-0.46573
4,0.241962,-1.91328,-1.724918


In [7]:
# thresh=2 keeps only the rows that have at least two non-NA values;
# rows 0 and 1 hold a single non-NA value each, so they are dropped.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,1.579213,,-0.469474
3,0.54256,,-0.46573
4,0.241962,-1.91328,-1.724918


### 处理缺失值 --> 补全缺失值

In [8]:
# Replace every missing value with 0; pass inplace=True to modify the original DataFrame directly.
df.fillna(0)

Unnamed: 0,0,1,2
0,0.496714,0.0,0.0
1,1.52303,0.0,0.0
2,1.579213,0.0,-0.469474
3,0.54256,0.0,-0.46573
4,0.241962,-1.91328,-1.724918


In [9]:
# Per-column fill values: missing entries in column 1 become 0, those in column 2 become 1.
df.fillna({1 : 0, 2 : 1})

Unnamed: 0,0,1,2
0,0.496714,0.0,1.0
1,1.52303,0.0,1.0
2,1.579213,0.0,-0.469474
3,0.54256,0.0,-0.46573
4,0.241962,-1.91328,-1.724918


重建索引(reindex)时可用的插值方法(如ffill、bfill)同样可以用于填充缺失值:

In [10]:
# Backward fill: each missing value takes the next valid value below it in the same column.
# DataFrame.bfill() replaces fillna(method='bfill'), which is deprecated since pandas 2.1.
df.bfill()

Unnamed: 0,0,1,2
0,0.496714,-1.91328,-0.469474
1,1.52303,-1.91328,-0.469474
2,1.579213,-1.91328,-0.469474
3,0.54256,-1.91328,-0.46573
4,0.241962,-1.91328,-1.724918


In [11]:
# Backward fill with limit=2: at most two consecutive missing values are filled in each gap,
# so rows 0 and 1 of column 1 stay NaN.
# DataFrame.bfill() replaces fillna(method='bfill'), which is deprecated since pandas 2.1.
df.bfill(limit=2)

Unnamed: 0,0,1,2
0,0.496714,,-0.469474
1,1.52303,,-0.469474
2,1.579213,-1.91328,-0.469474
3,0.54256,-1.91328,-0.46573
4,0.241962,-1.91328,-1.724918


In [12]:
# Fill each column's missing values with that column's mean (computed from its non-missing values).
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,0.496714,-1.91328,-0.886707
1,1.52303,-1.91328,-0.886707
2,1.579213,-1.91328,-0.469474
3,0.54256,-1.91328,-0.46573
4,0.241962,-1.91328,-1.724918


### 数据转换 --> 删除重复值

In [13]:
# Small frame whose last row repeats the row before it in both columns.
data = DataFrame(
    {
        'k1': ['one', 'two', 'one', 'two', 'two'],
        'k2': [1, 2, 3, 4, 4],
    }
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,2
2,one,3
3,two,4
4,two,4


In [14]:
# A row is flagged as a duplicate only when both k1 and k2 match an earlier row.
data.duplicated()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [15]:
# duplicated and drop_duplicates keep the first occurrence by default;
# pass keep='last' to keep the last one instead.
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,2
2,one,3
3,two,4


In [16]:
# Judge duplicates on the k1 column alone.
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2
0,one,1
1,two,2


### 使用函数或映射进行数据转换

In [17]:
# Lookup table keyed by title-cased city name.
city_of_province = {
    'Hefei': 'Anhui',
    'Shanghai': 'Shanghai',
    'Suzhou': 'Jiangsu',
    'Nanjing': 'Jiangsu',
}
cities = DataFrame({'city': ['hefei', 'shanghai', 'hefei', 'Suzhou', 'Suzhou']})
# Title-case the city names first so that they match the keys of the lookup table.
# Equivalent to cities['province'] = cities['city'].map(lambda x: city_of_province[x.title()]).
titles = cities['city'].str.title()
cities['province'] = titles.map(city_of_province)
cities

Unnamed: 0,city,province
0,hefei,Anhui
1,shanghai,Shanghai
2,hefei,Anhui
3,Suzhou,Jiangsu
4,Suzhou,Jiangsu


### 替代值

In [18]:
# Replace a single value: every -999 becomes NaN, other values are left alone.
data = Series([-999.0, -999.0, -1000])
data.replace(to_replace=-999, value=np.nan)

0       NaN
1       NaN
2   -1000.0
dtype: float64

In [19]:
# Pass a list to replace several different values with the same replacement in one call.
data.replace([-999, -1000], np.nan)

0   NaN
1   NaN
2   NaN
dtype: float64

In [20]:
# Pass a dict to map each value to its own replacement.
data.replace({-999: np.nan, -1000: 0})

0    NaN
1    NaN
2    0.0
dtype: float64

### 重命名轴索引

In [21]:
data = DataFrame(
    np.arange(12).reshape(3, 4),
    index=['SH', 'SZ', 'BJ'],
    columns=['one', 'two', 'three', 'four'],
)
# Index.map applies the function to every label and returns a new Index,
# which is assigned back to lower-case the row labels in place.
data.index = data.index.map(str.lower)
data

Unnamed: 0,one,two,three,four
sh,0,1,2,3
sz,4,5,6,7
bj,8,9,10,11


如果想要创建数据集转换后的版本，并且不修改原有的数据集，一个有用的方法是rename:

In [22]:
# rename returns a transformed copy (index labels upper-cased, column labels title-cased);
# data itself is left unchanged.
data.rename(index=str.upper, columns=str.title)

Unnamed: 0,One,Two,Three,Four
SH,0,1,2,3
SZ,4,5,6,7
BJ,8,9,10,11


rename可以结合字典型对象使用，为轴标签的子集提供新的值:

In [23]:
# Replace index label 'bj' with 'hf' and column label 'three' with 3.
data.rename(index={'bj':'hf'}, columns={'three':3})

Unnamed: 0,one,two,3,four
sh,0,1,2,3
sz,4,5,6,7
hf,8,9,10,11


In [24]:
# Pass inplace=True to modify the original dataset directly instead of returning a copy.
data.rename(index={'bj':'hf'}, columns={'three':3}, inplace=True)
data

Unnamed: 0,one,two,3,four
sh,0,1,2,3
sz,4,5,6,7
hf,8,9,10,11


### 检测和过滤异常值

In [25]:
# 1000 x 4 frame of standard-normal draws; describe() shows per-column summary statistics.
data = DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.026699,0.026885,0.027965,-0.011037
std,1.006796,0.965712,1.0102,1.006654
min,-2.991136,-3.019512,-2.896255,-3.241267
25%,-0.673146,-0.622188,-0.670083,-0.675299
50%,0.02031,0.050559,0.028459,-0.007509
75%,0.690511,0.664881,0.693881,0.638627
max,3.926238,3.243093,3.852731,3.152057


In [26]:
# Find the values in column 2 whose absolute value is greater than 3.
col = data[2]
col[col.abs() > 3]

48     3.852731
485    3.137749
572    3.109919
Name: 2, dtype: float64

In [27]:
# Select every row that has at least one value whose absolute value exceeds 3,
# by calling any() row-wise on the boolean DataFrame.
# The axis is passed by keyword because pandas 2.0 no longer accepts it positionally.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
48,0.513786,0.515048,3.852731,0.570891
61,0.774634,-0.92693,-0.059525,-3.241267
115,-0.249036,0.576557,0.31125,3.078881
400,3.193108,0.298753,-0.751791,-0.426358
485,1.374438,-2.135674,3.137749,1.056057
502,-1.30882,-3.019512,0.18385,1.800511
572,0.2502,1.995667,3.109919,0.606723
720,3.926238,-2.084113,1.724697,-0.287448
925,-0.158154,3.243093,2.307916,-0.181449
991,-1.049655,1.362563,1.640615,3.152057


In [28]:
# np.sign maps each value to 1 or -1 according to its sign (0 for zero); show the first five rows.
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,-1.0,1.0,-1.0
1,-1.0,1.0,-1.0,1.0
2,-1.0,-1.0,1.0,-1.0
3,1.0,-1.0,-1.0,-1.0
4,1.0,-1.0,-1.0,1.0


In [29]:
# Cap the outliers: every value whose absolute value exceeds 3 is replaced by 3 carrying its own sign.
data[data.abs() > 3] = 3 * np.sign(data)
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.025579,0.026662,0.026865,-0.011026
std,1.003035,0.964871,1.006577,1.005201
min,-2.991136,-3.0,-2.896255,-3.0
25%,-0.673146,-0.622188,-0.670083,-0.675299
50%,0.02031,0.050559,0.028459,-0.007509
75%,0.690511,0.664881,0.693881,0.638627
max,3.0,3.0,3.0,3.0


### 置换和随机抽样
以想要置换的轴的长度为参数调用np.random.permutation，可以得到一个表示新顺序的整数数组：

In [30]:
# 5 x 4 frame holding 0..19, plus a random ordering of the row positions 0..4.
df = DataFrame(np.arange(20).reshape(5, 4))
sampler = np.random.permutation(5)
sampler

array([2, 0, 3, 1, 4])

In [31]:
# take selects rows by integer position, in the order given by sampler.
df.take(sampler)

Unnamed: 0,0,1,2,3
2,8,9,10,11
0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
4,16,17,18,19


要选出一个无放回(不重复抽取)的随机子集，可以使用sample方法:

In [32]:
# Draw three rows at random without replacement (sample's default).
df.sample(3)

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
1,4,5,6,7


要生成一个有放回的样本(允许重复选择)，传入replace=True:

In [33]:
# Ten draws from a five-element Series are only possible with replacement,
# so index labels repeat in the result.
choices = Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws

2   -1
2   -1
1    7
4    4
1    7
4    4
3    6
4    4
1    7
3    6
dtype: int64

### 计算指标/虚拟变量
将分类变量转换为"虚拟"或"指标"矩阵是另一种用于统计建模或机器学习的转换操作。如果DataFrame中有一列有K个不同的值，则可以衍生一个K列的值为1或0的矩阵或DataFrame：

In [34]:
# Categorical 'key' column with three distinct values, next to a running counter.
df = DataFrame(
    {
        'key': ['b', 'b', 'a', 'c', 'a', 'b'],
        'data1': range(0, 6),
    }
)
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [35]:
# Feature extraction: one indicator column per distinct value of 'key'.
# dtype=int keeps the indicators as 0/1; pandas >= 2.0 returns booleans by default.
pd.get_dummies(df['key'], dtype=int)

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


在某些情况下，需要在指标DataFrame的列上加上前缀，然后与其他数据合并:

In [36]:
# prefix='key' names the indicator columns key_a, key_b, key_c.
# dtype=int keeps the indicators as 0/1; pandas >= 2.0 returns booleans by default.
dummies = pd.get_dummies(df['key'], prefix='key', dtype=int)
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [37]:
# Keep data1 as a one-column DataFrame and join the indicator columns onto it by index.
df_with_dummies = df[['data1']].join(dummies)
df_with_dummies

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


将get_dummies与cut等离散化函数结合使用是统计应用的一个有用方法:

In [38]:
# Ten uniform random values in [0, 1).
values = np.random.rand(10)
values

array([0.9932652 , 0.86855703, 0.56463986, 0.51662759, 0.28460038,
       0.51720943, 0.8481815 , 0.68517451, 0.29004403, 0.20146392])

In [39]:
# Bucket the values into five equal-width intervals, then build one indicator column per interval.
# dtype=int keeps the indicators as 0/1; pandas >= 2.0 returns booleans by default.
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins), dtype=int)

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,1,0,0
3,0,0,1,0,0
4,0,1,0,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,1,0,0,0
9,0,1,0,0,0
