In [74]:
import numpy as np
import pandas as pd
from numpy import nan as NA  # important: NA is the missing-value marker used below
# Compact construction: rows shorter than the widest row are padded with NaN,
# so the short row, the empty row and the explicit NA all become missing cells.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1.],
    [],
    [NA, 6.5, 3.],
])

In [75]:
# Show the frame: the short and empty input rows were padded with NaN.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [76]:
# Default dropna spelled out: drop every row (axis=0) that contains any NaN.
cleaned = data.dropna(axis=0, how='any')

In [77]:
# Only row 0 survives; every other row had at least one missing value.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


### how='all'将只丢弃全为 NA 的那些行

In [78]:
# Keep any row with at least one observed value; only all-NaN rows are dropped.
data.dropna(axis='index', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


### 删除全为 NA 的列(axis=1, how='all')

In [79]:
# Add a column labelled 4 that is missing in every row, to demonstrate
# column-wise dropping below. Note this mutates `data` in place.
data[4] = NA

In [80]:
# Column 4 now exists and holds only missing values.
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [81]:
# Same rule applied along columns: drop a column only if every entry is NaN.
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [82]:
# 7x3 frame of standard-normal draws from NumPy's global (unseeded) generator,
# so the values differ on every run.
df = pd.DataFrame(np.random.standard_normal((7, 3)))

In [83]:
# Knock out some values by position (mutates df in place):
df.iloc[:4, 1] = NA  # first four rows of column 1
df.iloc[:2, 2] = NA  # first two rows of column 2

In [84]:
# Column 1 is missing in rows 0-3 and column 2 in rows 0-1.
df

Unnamed: 0,0,1,2
0,0.01529,,
1,-0.167192,,
2,-0.01024,,-1.227654
3,0.938164,,-0.601011
4,1.330992,1.435171,1.094025
5,-0.428823,-1.665424,-0.238817
6,-1.803603,0.23758,-0.449573


In [85]:
# Default behaviour written explicitly: a single NaN is enough to drop a row.
df.dropna(axis=0, how='any')

Unnamed: 0,0,1,2
4,1.330992,1.435171,1.094025
5,-0.428823,-1.665424,-0.238817
6,-1.803603,0.23758,-0.449573


### 只保留至少有x个非NAN值的行，thresh参数

In [86]:
# thresh=2: a row is kept only if it has at least two non-NaN values.
df.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2
2,-0.01024,,-1.227654
3,0.938164,,-0.601011
4,1.330992,1.435171,1.094025
5,-0.428823,-1.665424,-0.238817
6,-1.803603,0.23758,-0.449573


### 填充缺失值

In [87]:
# A dict gives a separate fill value per column: 0.5 for column 1, 0 for column 2.
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.01529,0.5,0.0
1,-0.167192,0.5,0.0
2,-0.01024,0.5,-1.227654
3,0.938164,0.5,-0.601011
4,1.330992,1.435171,1.094025
5,-0.428823,-1.665424,-0.238817
6,-1.803603,0.23758,-0.449573


fillna 默认会返回新对象,但也可以对现有对象进行就地修改

In [88]:
# Fill every missing cell with 0 directly in df (no new object is produced).
# The result is bound to `_` only so the cell shows no output.
_ = df.fillna(value=0, inplace=True)

In [89]:
# df itself was modified: the formerly missing cells now hold 0.
df

Unnamed: 0,0,1,2
0,0.01529,0.0,0.0
1,-0.167192,0.0,0.0
2,-0.01024,0.0,-1.227654
3,0.938164,0.0,-0.601011
4,1.330992,1.435171,1.094025
5,-0.428823,-1.665424,-0.238817
6,-1.803603,0.23758,-0.449573


### method='ffill'

In [90]:
# Rebind df to a fresh 6x3 frame of standard-normal draws (unseeded, so the
# values change from run to run).
df = pd.DataFrame(np.random.standard_normal((6, 3)))

In [91]:
# Fresh random frame; nothing is missing yet.
df

Unnamed: 0,0,1,2
0,-0.090755,-0.969227,1.395343
1,0.254748,-0.158391,1.324646
2,-0.226522,-0.307543,-1.082089
3,0.458022,-0.85747,-0.293071
4,-0.894539,-0.141055,1.51859
5,-0.30773,-0.43431,1.118812


In [92]:
# Blank out the tail of two columns so there is something to forward-fill
# (mutates df in place):
df.iloc[2:, 1] = NA  # column 1 from row 2 onwards
df.iloc[4:, 2] = NA  # column 2 from row 4 onwards

In [93]:
# Column 1 is missing from row 2 on, column 2 from row 4 on.
df

Unnamed: 0,0,1,2
0,-0.090755,-0.969227,1.395343
1,0.254748,-0.158391,1.324646
2,-0.226522,,-1.082089
3,0.458022,,-0.293071
4,-0.894539,,
5,-0.30773,,


In [94]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,-0.090755,-0.969227,1.395343
1,0.254748,-0.158391,1.324646
2,-0.226522,-0.158391,-1.082089
3,0.458022,-0.158391,-0.293071
4,-0.894539,-0.158391,-0.293071
5,-0.30773,-0.158391,-0.293071


ffill 会用当前列中 NaN 之前最近的一个有效值向后连续填充;可以通过 limit 参数限制连续填充的最大个数。

In [95]:
# Forward fill, but propagate a value across at most two consecutive NaNs;
# anything beyond that stays missing. DataFrame.ffill(limit=...) replaces
# fillna(method='ffill', limit=...), whose `method` keyword is deprecated
# since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.090755,-0.969227,1.395343
1,0.254748,-0.158391,1.324646
2,-0.226522,-0.158391,-1.082089
3,0.458022,-0.158391,-0.293071
4,-0.894539,,-0.293071
5,-0.30773,,-0.293071


只要有些创新,你就可以利用 fillna 实现许多别的功能。比如说,你可以传入
Series 的平均值或中位数:

In [96]:
# Fill the gaps with the Series mean; as the output shows, the mean is taken
# over the three observed values only.
data = pd.Series([1.0, NA, 3.5, NA, 7.0])
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

### 移除重复数据

In [97]:
# Small frame whose last two rows are identical, used to demonstrate
# duplicate detection and removal.
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

In [98]:
# Rows 5 and 6 are identical ('two', 4).
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


DataFrame 的 duplicated 方法返回一个布尔型 Series,表示各行是否是重复行
(前面出现过的行):

In [99]:
# keep='first' (the default): the first occurrence is not flagged, later
# repeats of the same row are.
data.duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [100]:
# Drop repeated rows, keeping the first occurrence of each (the default).
data.drop_duplicates(keep='first')

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [101]:
# drop_duplicates returned a new frame; data still has all seven rows.
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [102]:
# Add a value column 0..6 so the rows can be told apart when de-duplicating
# on a subset of columns (mutates data in place).
data['v1'] = range(7)

In [103]:
# Same frame with the extra v1 column holding 0..6.
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [105]:
# De-duplicate on column k1 only: the first row seen for each k1 value is kept.
data.drop_duplicates(subset=['k1'], keep='first')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [108]:
# De-duplicate on the (k1, k2) pair and keep the last occurrence of each pair
# instead of the first.
data.drop_duplicates(subset=['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### 利用函数或映射进行数据转换

In [109]:
# Food names with inconsistent capitalisation plus a weight in ounces.
# NOTE(review): 'novlox' has no matching key in meat_to_animal (which uses
# 'nova lox') - likely a typo; left as-is because the shown outputs depend on it.
data = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon',
        'Pastrami', 'corned beef', 'Bacon',
        'pastrami', 'honey ham', 'novlox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})

In [111]:
# Note the mixed capitalisation in the food column ('bacon'/'Bacon', 'Pastrami'/'pastrami').
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,novlox,6.0


In [113]:
# Lookup table from (lower-case) food name to the animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [114]:
# Normalise the food names to lower case so they can match the dict keys.
lowercased = data.loc[:, 'food'].str.lower()

In [115]:
# Lower-cased food names, ready to be looked up in meat_to_animal.
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8         novlox
Name: food, dtype: object

In [116]:
# Translate each lower-cased food through the dict and store the result as a
# new column (mutates data). Names without a key in the dict come out missing.
data['animal'] = lowercased.map(meat_to_animal)

In [117]:
# 'novlox' has no key in meat_to_animal, so its animal is missing.
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,novlox,6.0,


In [None]:
# Same conversion done with a function instead of passing the dict to map.
# dict.get with an NA default keeps foods that have no entry in
# meat_to_animal (here 'novlox') as missing; indexing with [] raised KeyError
# on that row and aborted the whole cell.
data['food'].map(lambda x: meat_to_animal.get(x.lower(), NA))

### 替换值

In [120]:
# Series in which -999 and -1000 act as sentinel codes for missing data.
data = pd.Series([
    1.0,
    -999.0,
    2.0,
    -999.0,
    -1000.0,
    3.0,
])

In [121]:
# Turn the -999 sentinel into NaN; a new Series is returned.
data.replace(to_replace=-999, value=np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [122]:
# Replace several sentinel values at once with the same replacement.
data.replace(to_replace=[-999, -1000], value=np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [124]:
# replace returned a new Series each time; data still holds the sentinel values.
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [126]:
# For a different replacement per value, pass two lists that pair up by
# position: -999 -> NaN, -1000 -> 0.
data.replace(to_replace=[-999, -1000], value=[np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [127]:
# The same old -> new pairs can also be given as a single dict.
data.replace(to_replace={-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

**`data.replace` 方法与 `data.str.replace` 不同,后者做的是字符串的元素级替换。**

### 离散化和面元划分

In [128]:
# Continuous data is often discretised, i.e. split into "bins", for analysis.
# Sample ages to be binned below.
ages = [
    20, 22, 25, 27, 21, 23,
    37, 31, 61, 45, 41, 32,
]

接下来将这些数据划分为“18 到 25”、“26 到 35”、“36 到 60”以及“60
以上”几个面元。要实现该功能,你需要使用 pandas 的 cut 函数:

In [129]:
# Bin edges: five edges define the four intervals passed to pd.cut below.
bins = [18, 25, 35, 60, 100]

In [130]:
# Assign each age to its interval; right=True (the default) makes every
# interval closed on the right, e.g. (18, 25].
cats = pd.cut(x=ages, bins=bins, right=True)

In [131]:
# Categorical whose categories are the four right-closed intervals built from bins.
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [132]:
# Integer code of the bin each age fell into (its position in cats.categories).
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [133]:
# The interval labels themselves, in bin order.
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [134]:
# Count how many ages landed in each bin, most frequent first.
# Wrapping the Categorical in a Series replaces the top-level
# pd.value_counts(cats) helper, which is deprecated since pandas 2.1.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

`pd.value_counts(cats)`是 `pandas.cut`结果的面元计数。

跟“区间”的数学符号一样,圆括号表示开端,而方括号则表示闭端(包
括)。哪边是闭端可以通过 right=False 进行修改:

In [135]:
# right=False flips the closed side: intervals become [left, right), so the
# edges are shifted by one to describe the same age groups.
pd.cut(x=ages, bins=[18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

你可以通过传递一个列表或数组到 labels,设置自己的面元名称:

In [137]:
# One label per bin, in bin order; the result shows these names instead of the
# interval notation.
group_names = [
    'Youth',
    'YoungAdult',
    'MiddleAged',
    'Senior',
]
pd.cut(x=ages, bins=bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

### 计算指标/哑变量

另一种常用于统计建模或机器学习的转换方式是:将分类变量(categorical
variable)转换为“哑变量”或“指标矩阵”。

In [139]:
# Frame with a categorical-style 'key' column and a numeric 'data1' column,
# used to demonstrate indicator (dummy) variables.
df = pd.DataFrame({
    'key': list('bbacab'),
    'data1': range(6),
})

In [140]:
# Six rows: 'key' takes the values a, b and c; 'data1' counts 0..5.
df

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [141]:
# One indicator column per distinct value of 'key'.
# dtype=int keeps the indicators as 0/1 integers; since pandas 2.0 the
# default dtype is bool, which would display True/False instead.
pd.get_dummies(df['key'], dtype=int)

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


有时候,你可能想给指标 DataFrame 的列加上一个前缀,以便能够跟其他数据
进行合并。get_dummies 的 prefix 参数可以实现该功能

In [142]:
# Indicator columns named key_a, key_b, key_c so they stay identifiable after
# being joined to other data.
# dtype=int keeps the indicators as 0/1 integers; since pandas 2.0 the
# default dtype is bool.
dummies = pd.get_dummies(df['key'], prefix='key', dtype=int)

In [143]:
# Keep data1 as a one-column frame (double brackets) and attach the indicator
# columns by index; how='left' is join's default.
df_with_dummy = df[['data1']].join(dummies, how='left')

In [144]:
# data1 alongside the prefixed indicator columns key_a, key_b and key_c.
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0
