# 第 7 章 数据清洗和准备

## 7.1 处理缺失数据

In [1]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
string_data=pd.Series(['aardvark','artichoke',np.nan,'avocado'])

In [5]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [6]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
string_data[0]=None

In [8]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### 滤除缺失数据

In [9]:
from numpy import nan as NA

In [10]:
data=pd.Series([1,NA,3.5,NA,7])

In [11]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [15]:
# Four-row frame used to demonstrate dropna: one complete row, two partially
# missing rows, and one row that is entirely NA.
data=pd.DataFrame([
    [1.,6.5,3.],
    [1.,NA,NA],
    [NA,NA,NA],
    [NA,6.5,3.],
])

In [16]:
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [17]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [20]:
# Drop only the columns whose values are all NA. `axis` is given by keyword
# because DataFrame.dropna accepts keyword arguments only since pandas 2.0;
# the positional form `dropna(1, ...)` raises TypeError there.
# No column of `data` is entirely NA, so the result equals `data`.
data.dropna(axis=1,how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [21]:
df=pd.DataFrame(np.random.randn(7,3))

In [22]:
df

Unnamed: 0,0,1,2
0,-0.533019,0.731368,-0.400742
1,-1.806063,1.632134,0.664484
2,0.573573,1.333208,0.325039
3,-0.775205,0.961142,1.033067
4,-0.646464,0.520725,-0.929506
5,1.53338,-0.044533,-1.248943
6,-0.624205,-0.209803,-0.098941


In [24]:
df.iloc[:4,1]=NA

In [25]:
df.iloc[:2,2]=NA

In [26]:
df

Unnamed: 0,0,1,2
0,-0.533019,,
1,-1.806063,,
2,0.573573,,0.325039
3,-0.775205,,1.033067
4,-0.646464,0.520725,-0.929506
5,1.53338,-0.044533,-1.248943
6,-0.624205,-0.209803,-0.098941


In [28]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.646464,0.520725,-0.929506
5,1.53338,-0.044533,-1.248943
6,-0.624205,-0.209803,-0.098941


In [34]:
df.dropna(thresh=4)

# thresh=N keeps only the rows that hold at least N non-NA values. df has just 3 columns, so no row can reach 4 and the result is empty; thresh=2 would keep rows 2-6. (Author's original note: not fully understood; never used an equivalent feature in SAS.)

Unnamed: 0,0,1,2


### 填充缺失数据

In [35]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.533019,0.0,0.0
1,-1.806063,0.0,0.0
2,0.573573,0.0,0.325039
3,-0.775205,0.0,1.033067
4,-0.646464,0.520725,-0.929506
5,1.53338,-0.044533,-1.248943
6,-0.624205,-0.209803,-0.098941


In [36]:
df.fillna({1:0.5,2:0})
# Note: the dict keys are column labels, so each column gets its own fill value

Unnamed: 0,0,1,2
0,-0.533019,0.5,0.0
1,-1.806063,0.5,0.0
2,0.573573,0.5,0.325039
3,-0.775205,0.5,1.033067
4,-0.646464,0.520725,-0.929506
5,1.53338,-0.044533,-1.248943
6,-0.624205,-0.209803,-0.098941


In [37]:
# Replace every NA in df with 0 by rebinding the name instead of mutating in
# place: fillna(inplace=True) returns None, and assigning that to `_` would
# overwrite IPython's last-output variable. Re-running this cell is harmless.
df=df.fillna(0)

In [38]:
df

Unnamed: 0,0,1,2
0,-0.533019,0.0,0.0
1,-1.806063,0.0,0.0
2,0.573573,0.0,0.325039
3,-0.775205,0.0,1.033067
4,-0.646464,0.520725,-0.929506
5,1.53338,-0.044533,-1.248943
6,-0.624205,-0.209803,-0.098941


In [39]:
df=pd.DataFrame(np.random.randn(6,3))

In [40]:
df.iloc[2:,1]=NA

In [41]:
df.iloc[4:,2]=NA

In [42]:
df

Unnamed: 0,0,1,2
0,0.740356,-0.354509,2.030864
1,0.014972,-0.372163,1.598933
2,-0.812585,,1.134376
3,0.514329,,-1.430178
4,0.242766,,
5,0.272862,,


In [43]:
# Forward-fill: propagate the last valid value down each column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,0.740356,-0.354509,2.030864
1,0.014972,-0.372163,1.598933
2,-0.812585,-0.372163,1.134376
3,0.514329,-0.372163,-1.430178
4,0.242766,-0.372163,-1.430178
5,0.272862,-0.372163,-1.430178


In [44]:
# Forward-fill, but fill at most 2 consecutive NA values per column.
# DataFrame.ffill(limit=...) replaces fillna(method='ffill', limit=...),
# whose `method` argument is deprecated since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.740356,-0.354509,2.030864
1,0.014972,-0.372163,1.598933
2,-0.812585,-0.372163,1.134376
3,0.514329,-0.372163,-1.430178
4,0.242766,,-1.430178
5,0.272862,,-1.430178


In [45]:
data=pd.Series([1.,NA,3.5,NA,7])

In [46]:
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [47]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## 7.2 数据转换

In [48]:
# Seven-row frame whose last row repeats the row before it; used to
# demonstrate duplicated() and drop_duplicates().
data=pd.DataFrame(
    {
        'k1':['one','two','one','two','one','two','two'],
        'k2':[1,1,2,3,3,4,4],
    }
)

In [49]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [50]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [51]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [52]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [53]:
data['v1']=range(7)

In [54]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [55]:
data.drop_duplicates(['k1'])
# Author's note: SAS sort can only de-duplicate as part of sorting, i.e. it removes only adjacent duplicate rows

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [56]:
data.drop_duplicates(['k1','k2'],keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### 利用函数或映射进行数据转换

In [60]:
# Food names with inconsistent capitalisation plus their weights in ounces.
# NOTE: the 'pulled prok' spelling is intentional here -- it has to match the
# key used in the meat_to_animal mapping.
data=pd.DataFrame(
    {
        'food':[
            'bacon','pulled prok','bacon',
            'Pastrami','corned beef','Bacon',
            'pastrami','honey ham','nova lox',
        ],
        'ounces':[4,3,12,6,7.5,8,3,5,6],
    }
)

In [61]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled prok,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [62]:
# Lookup from lower-cased food name to the animal it comes from. Keys are
# matched exactly, so the spelling here must stay in sync with the food column.
meat_to_animal=dict([
    ('bacon','pig'),
    ('pulled prok','pig'),
    ('pastrami','cow'),
    ('corned beef','cow'),
    ('honey ham','pig'),
    ('nova lox','salmon'),
])

In [63]:
meat_to_animal

{'bacon': 'pig',
 'pulled prok': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'}

In [64]:
lowercased=data['food'].str.lower()

In [66]:
lowercased

0          bacon
1    pulled prok
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [67]:
data['animal']=lowercased.map(meat_to_animal)

In [68]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled prok,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [69]:
data['food'].map(lambda x:meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### 替换值

In [70]:
data=pd.Series([1.,-999.,2.,-999.,-1000.,3.])

In [71]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [72]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [73]:
data.replace([-999,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [74]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [75]:
data.replace([-999,-1000],[np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [76]:
data.replace({-999:np.nan,-1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### 重命名轴索引

In [81]:
# 3x4 integer frame (values 0..11) with string row and column labels; used to
# demonstrate renaming of axis indexes.
data=pd.DataFrame(
    np.arange(12).reshape(3,4),
    columns=['one','two','three','four'],
    index=['Ohio','Colorado','New York'],
)

In [82]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [83]:
def transform(x):
    """Return the first four characters of ``x`` in upper case."""
    return x[:4].upper()

In [84]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [85]:
data.index=data.index.map(transform)

In [86]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [87]:
data.rename(index=str.title,columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [88]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [89]:
data.rename(index={'OHIO':'INDIANA'},
           columns={'three':'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [90]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [91]:
data.rename(index={'OHIO':'INDIANA'},inplace=True)

In [92]:
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### 离散化和面元划分

In [93]:
ages=[20,22,23,27,21,23,37,31,61,45,41,32]

In [94]:
bins=[18,25,35,60,100]

In [95]:
cats=pd.cut(ages,bins)

In [96]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [97]:
cats.dtype

CategoricalDtype(categories=[(18, 25], (25, 35], (35, 60], (60, 100]],
              ordered=True)

In [98]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [99]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [100]:
# Count how many values fall into each bin, most frequent first.
# The top-level pd.value_counts() is deprecated since pandas 2.1; wrapping the
# Categorical in a Series and calling .value_counts() is the documented replacement.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [101]:
# Same counts keyed by the integer bin code instead of the interval label.
# Uses Series.value_counts() because the top-level pd.value_counts() is
# deprecated since pandas 2.1.
pd.Series(cats.codes).value_counts()

0    5
2    3
1    3
3    1
dtype: int64

In [106]:
pd.cut(ages,[18,26,36,61,100],right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [107]:
group_names=['Youth','YoungAdult','MiddleAged','Senior']

In [108]:
pd.cut(ages,bins,labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [109]:
data=np.random.rand(20)

In [110]:
pd.cut(data,4,precision=2)

[(0.3, 0.52], (0.079, 0.3], (0.079, 0.3], (0.52, 0.74], (0.74, 0.95], ..., (0.079, 0.3], (0.079, 0.3], (0.74, 0.95], (0.3, 0.52], (0.079, 0.3]]
Length: 20
Categories (4, interval[float64]): [(0.079, 0.3] < (0.3, 0.52] < (0.52, 0.74] < (0.74, 0.95]]

In [111]:
data=np.random.randn(1000)

In [112]:
cats=pd.qcut(data,4)

In [113]:
cats

[(-3.588, -0.681], (-0.681, -0.00166], (-0.00166, 0.664], (-0.681, -0.00166], (0.664, 3.674], ..., (-0.681, -0.00166], (-0.00166, 0.664], (-3.588, -0.681], (-3.588, -0.681], (-0.681, -0.00166]]
Length: 1000
Categories (4, interval[float64]): [(-3.588, -0.681] < (-0.681, -0.00166] < (-0.00166, 0.664] < (0.664, 3.674]]

In [114]:
# Count the members of each quantile bin produced by qcut.
# Uses Series.value_counts() because the top-level pd.value_counts() is
# deprecated since pandas 2.1.
pd.Series(cats).value_counts()

(0.664, 3.674]        250
(-0.00166, 0.664]     250
(-0.681, -0.00166]    250
(-3.588, -0.681]      250
dtype: int64

In [115]:
pd.qcut(data,[0,0.1,0.5,0.9,1.])

[(-3.588, -1.291], (-1.291, -0.00166], (-0.00166, 1.235], (-1.291, -0.00166], (1.235, 3.674], ..., (-1.291, -0.00166], (-0.00166, 1.235], (-3.588, -1.291], (-1.291, -0.00166], (-1.291, -0.00166]]
Length: 1000
Categories (4, interval[float64]): [(-3.588, -1.291] < (-1.291, -0.00166] < (-0.00166, 1.235] < (1.235, 3.674]]

### 检测和过滤异常值

In [116]:
data=pd.DataFrame(np.random.randn(1000,4))

In [117]:
data

Unnamed: 0,0,1,2,3
0,1.548430,0.664895,-1.117014,0.658278
1,0.025574,0.225542,0.303272,-1.276002
2,-2.838849,-1.112452,1.516122,-1.403846
3,1.019895,-1.391299,0.185053,-0.653178
4,1.329594,1.358687,-0.113200,-0.457669
5,0.761064,-0.409904,1.447699,1.297155
6,1.298495,0.109527,1.188055,-1.152247
7,0.235641,-0.387440,1.123303,2.306749
8,0.098030,-0.885424,-0.698936,0.821462
9,-0.923802,0.632798,0.662294,1.513434


In [118]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.043338,-0.046569,0.059763,-0.040793
std,1.003985,1.003182,1.047356,0.996212
min,-3.107891,-3.419657,-3.068613,-3.275162
25%,-0.594628,-0.707062,-0.669422,-0.721335
50%,0.087918,-0.012116,0.054648,-0.057793
75%,0.696509,0.669956,0.725869,0.628866
max,3.442114,2.7778,3.117698,2.987172


In [119]:
col=data[2]

In [120]:
col

0     -1.117014
1      0.303272
2      1.516122
3      0.185053
4     -0.113200
5      1.447699
6      1.188055
7      1.123303
8     -0.698936
9      0.662294
10     1.872513
11    -0.016464
12    -0.941917
13     0.094182
14    -0.268087
15     0.139556
16    -1.371743
17    -0.895422
18    -0.712535
19     1.006214
20     0.299022
21     0.211419
22    -0.662846
23    -1.524922
24    -1.324901
25    -0.436680
26    -0.297049
27     0.413208
28    -0.029410
29    -2.983613
         ...   
970    0.371032
971   -3.068613
972   -1.306242
973   -0.130147
974    1.108401
975   -0.600126
976   -0.835970
977   -0.107293
978   -1.803043
979    1.419386
980   -0.917108
981   -0.348105
982    0.933966
983   -0.422775
984   -1.389555
985    0.848588
986   -0.742672
987   -0.142342
988   -0.647344
989   -2.158089
990    0.643050
991    0.499646
992   -0.825386
993   -0.512765
994   -1.242435
995    0.647956
996   -1.557562
997    1.469910
998   -0.949180
999   -1.370573
Name: 2, Length: 1000, dtype: float64

In [121]:
col[np.abs(col)>3]

502    3.117698
898    3.011543
971   -3.068613
Name: 2, dtype: float64

In [122]:
# Select every row that has at least one value whose magnitude exceeds 3.
# `axis=1` is given by keyword: DataFrame.any accepts keyword arguments only
# since pandas 2.0, so the positional form `.any(1)` raises TypeError there.
data[(np.abs(data)>3).any(axis=1)]

Unnamed: 0,0,1,2,3
25,3.442114,-1.440826,-0.43668,-0.689691
50,3.07365,-0.545734,0.810607,-1.424161
86,-0.96102,-3.27588,-1.453093,1.769205
105,-3.030861,-0.022167,1.558225,-0.132183
355,1.586879,-2.171898,2.810674,-3.275162
383,-3.107891,0.540117,-1.044296,0.732998
394,1.20525,-3.157571,-0.523278,-0.101943
502,0.346049,-1.740843,3.117698,0.543815
641,3.230372,-2.658964,-0.89262,-0.449142
820,1.063156,-3.419657,0.520998,-0.566643


In [123]:
np.sign(data)*3

Unnamed: 0,0,1,2,3
0,3.0,3.0,-3.0,3.0
1,3.0,3.0,3.0,-3.0
2,-3.0,-3.0,3.0,-3.0
3,3.0,-3.0,3.0,-3.0
4,3.0,3.0,-3.0,-3.0
5,3.0,-3.0,3.0,3.0
6,3.0,3.0,3.0,-3.0
7,3.0,-3.0,3.0,3.0
8,3.0,-3.0,-3.0,3.0
9,-3.0,3.0,3.0,3.0


In [124]:
data[np.abs(data)>3]=np.sign(data)*3

In [125]:
data

Unnamed: 0,0,1,2,3
0,1.548430,0.664895,-1.117014,0.658278
1,0.025574,0.225542,0.303272,-1.276002
2,-2.838849,-1.112452,1.516122,-1.403846
3,1.019895,-1.391299,0.185053,-0.653178
4,1.329594,1.358687,-0.113200,-0.457669
5,0.761064,-0.409904,1.447699,1.297155
6,1.298495,0.109527,1.188055,-1.152247
7,0.235641,-0.387440,1.123303,2.306749
8,0.098030,-0.885424,-0.698936,0.821462
9,-0.923802,0.632798,0.662294,1.513434


In [126]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.042731,-0.045716,0.059702,-0.040518
std,1.001228,1.000526,1.046783,0.995355
min,-3.0,-3.0,-3.0,-3.0
25%,-0.594628,-0.707062,-0.669422,-0.721335
50%,0.087918,-0.012116,0.054648,-0.057793
75%,0.696509,0.669956,0.725869,0.628866
max,3.0,2.7778,3.0,2.987172


### 排列和随机采样

In [127]:
df=pd.DataFrame(np.arange(5*4).reshape((5,4)))

In [128]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [129]:
sample=np.random.permutation(5)

In [130]:
sample

array([2, 1, 0, 3, 4])

In [131]:
# Typo: `tale` should be `take`, so this call raises the AttributeError shown below; the next cell repeats it with the correct method name.
df.tale(sample)

AttributeError: 'DataFrame' object has no attribute 'tale'

In [132]:
df.take(sample)

Unnamed: 0,0,1,2,3
2,8,9,10,11
1,4,5,6,7
0,0,1,2,3
3,12,13,14,15
4,16,17,18,19


### 计算指标/哑变量

In [133]:
# Small frame with a categorical-like 'key' column; used to demonstrate
# get_dummies().
df=pd.DataFrame(
    {
        'key':['b','b','a','c','a','b'],
        'data1':range(6),
    }
)

In [134]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [135]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [136]:
dummies=pd.get_dummies(df['key'],prefix='key')

In [137]:
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [138]:
df_with_dummay=df[['data1']].join(dummies)

In [139]:
df_with_dummay

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


## 7.3 字符串操作

In [140]:
val='a,b,  guido'

In [141]:
val.split(',')

['a', 'b', '  guido']

In [142]:
pieces=[x.strip() for x in val.split(',')]

In [143]:
pieces

['a', 'b', 'guido']

In [144]:
first,second,third=pieces

In [145]:
first+'::'+second+'::'+third

'a::b::guido'

In [146]:
'::'.join(pieces)

'a::b::guido'

In [148]:
'guido' in val

True

In [149]:
val.index(',')

1

In [150]:
val.find(':')

-1

In [151]:
val.count(',')

2

In [152]:
val.replace(',','::')

'a::b::  guido'

In [153]:
val.replace(',','')

'ab  guido'

### 正则表达式

In [154]:
# (placeholder cell: no regular-expression examples were written for this section)

### pandas的矢量化字符串函数

In [155]:
# String operations (placeholder cell: no examples were written for this section)