In [1]:
import pandas as pd
import numpy as np

# 重复值处理

In [2]:
# Small sample frame that contains repeated rows, used by the
# de-duplication examples that follow.
data = pd.DataFrame(
    {
        'k1': ['one', 'one', 'one', 'two', 'two'],
        'k2': [1, 1, 2, 3, 3],
    }
)

In [3]:
# Show the sample frame: rows 0/1 and rows 3/4 are identical pairs.
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3


In [5]:
# Count the rows that repeat an earlier row (each True in the mask adds 1).
data.duplicated().sum()

2

In [6]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# Returns a new frame; `data` itself is left untouched.
data.drop_duplicates(keep='first')

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3


In [8]:
# De-duplicate on column k1 only: one row (the first seen) per distinct k1 value.
data.drop_duplicates(subset='k1')

Unnamed: 0,k1,k2
0,one,1
3,two,3


In [9]:
# Add a constant column k3 (the scalar 1 is broadcast to every row).
data['k3'] = 1

In [10]:
# Show the frame with the new k3 column; all five rows are still present
# because the earlier drop_duplicates calls returned new frames.
data

Unnamed: 0,k1,k2,k3
0,one,1,1
1,one,1,1
2,one,2,1
3,two,3,1
4,two,3,1


In [11]:
# De-duplicate on the (k1, k3) pair: rows count as duplicates when both columns match.
data.drop_duplicates(subset=['k1', 'k3'])

Unnamed: 0,k1,k2,k3
0,one,1,1
3,two,3,1


# 数值替换

In [13]:
# Series-level replace: in column k1, swap 'two' for 'three'.
# The result is a new Series; the frame is not modified.
data['k1'].replace(to_replace='two', value='three')

0      one
1      one
2      one
3    three
4    three
Name: k1, dtype: object

In [14]:
# k1 still reads 'two' here: the Series replace above did not write back into the frame.
data

Unnamed: 0,k1,k2,k3
0,one,1,1
1,one,1,1
2,one,2,1
3,two,3,1
4,two,3,1


In [15]:
# DataFrame-level replace: every cell equal to 1, in any column, becomes 100.
data.replace(to_replace=1, value=100)

Unnamed: 0,k1,k2,k3
0,one,100,100
1,one,100,100
2,one,2,100
3,two,3,100
4,two,3,100


In [16]:
# Persist the replacement (1 -> 100) by rebinding `data` to the returned frame.
# This avoids `inplace=True`, which brings no performance benefit and silently
# mutates a frame that earlier cells have already displayed.
data = data.replace(1, 100)

In [17]:
# `data` now holds the replaced values: every former 1 in k2 and k3 is 100.
data

Unnamed: 0,k1,k2,k3
0,one,100,100
1,one,100,100
2,one,2,100
3,two,3,100
4,two,3,100


# 过滤缺失值

In [18]:
# Float Series with missing entries at positions 1 and 3.
# Note: this rebinds `data`, which was a DataFrame in the cells above.
raw_values = [2, np.nan, 4, np.nan, 8.5]
data = pd.Series(raw_values)

In [19]:
# Show the Series; the integer inputs were stored as float64 alongside the NaN entries.
data

0    2.0
1    NaN
2    4.0
3    NaN
4    8.5
dtype: float64

In [22]:
# Count the missing entries: sum the boolean NaN mask.
(data.isna()).sum()

2

In [24]:
# Count the non-missing entries.
data.notna().sum()  # wrapping the mask in parentheses, as in the previous cell, is optional

3

In [25]:
# Boolean-mask selection: keep only the entries that are not NaN.
data.loc[data.notna()]

0    2.0
2    4.0
4    8.5
dtype: float64

In [27]:
# Discard the NaN entries and keep the result by rebinding `data`.
# Rebinding replaces `inplace=True`, which mutates the object behind the
# reader's back and cannot be used in a method chain.
data = data.dropna()

In [28]:
# Only the three non-missing entries remain; their original index labels (0, 2, 4) are kept.
data

0    2.0
2    4.0
4    8.5
dtype: float64

In [30]:
# Build a 4x3 frame of standard-normal draws, then blank out cells to create
# several missing-value patterns. A fixed-seed generator is used so that
# Restart & Run All reproduces exactly the same numbers every time.
data = pd.DataFrame(
    np.random.RandomState(0).randn(4, 3),
    index=list('abcd'),
    columns=['aa', 'bb', 'cc'],
)
data.iloc[1:, :2] = np.nan  # rows b, c, d lose their aa and bb values
data.iloc[1, 2] = np.nan    # row b also loses cc, so it is entirely NaN

In [31]:
# Show the frame: row a is complete, row b is entirely NaN, rows c and d keep only cc.
data

Unnamed: 0,aa,bb,cc
a,-0.474847,-0.398699,-0.43281
b,,,
c,,,2.835789
d,,,-0.557477


In [32]:
# Element-wise missing-value mask: True wherever the cell is NaN.
data.isna()

Unnamed: 0,aa,bb,cc
a,False,False,False
b,True,True,True
c,True,True,False
d,True,True,False


In [33]:
# Default dropna spelled out: drop every row that contains at least one NaN.
data.dropna(axis=0, how='any')

Unnamed: 0,aa,bb,cc
a,-0.474847,-0.398699,-0.43281


In [34]:
# Drop a row only when every one of its values is NaN.
data.dropna(axis=0, how='all')

Unnamed: 0,aa,bb,cc
a,-0.474847,-0.398699,-0.43281
c,,,2.835789
d,,,-0.557477


In [35]:
# `data` still has all four rows: the dropna calls above returned new frames.
data

Unnamed: 0,aa,bb,cc
a,-0.474847,-0.398699,-0.43281
b,,,
c,,,2.835789
d,,,-0.557477


In [36]:
# Blank out the top-left cell (row a, column aa) so that column aa becomes entirely NaN.
data.iat[0, 0] = np.nan

In [37]:
# Show the frame after the edit: column aa no longer has any value.
data

Unnamed: 0,aa,bb,cc
a,,-0.398699,-0.43281
b,,,
c,,,2.835789
d,,,-0.557477


In [38]:
# Row-wise check (axis 0 / 'index'): a row is dropped only if all of its values are NaN.
data.dropna(axis='index', how='all')

Unnamed: 0,aa,bb,cc
a,,-0.398699,-0.43281
c,,,2.835789
d,,,-0.557477


In [39]:
# Column-wise check (axis 1 / 'columns'): a column is dropped only if all of its values are NaN.
data.dropna(axis='columns', how='all')

Unnamed: 0,bb,cc
a,-0.398699,-0.43281
b,,
c,,2.835789
d,,-0.557477


# 填充缺失值

In [40]:
# Starting point for the fill examples: the frame with its NaN cells intact.
data

Unnamed: 0,aa,bb,cc
a,,-0.398699,-0.43281
b,,,
c,,,2.835789
d,,,-0.557477


In [41]:
# Fill every NaN cell with 0.
data.fillna(value=0)

Unnamed: 0,aa,bb,cc
a,0.0,-0.398699,-0.43281
b,0.0,0.0,0.0
c,0.0,0.0,2.835789
d,0.0,0.0,-0.557477


In [44]:
# Per-column fill values: aa gets 1, bb gets 100, and cc (not listed) keeps its NaN.
data.fillna(value={'aa': 1, 'bb': 100})

Unnamed: 0,aa,bb,cc
a,1.0,-0.398699,-0.43281
b,1.0,100.0,
c,1.0,100.0,2.835789
d,1.0,100.0,-0.557477


In [45]:
# Rebuild the float Series with missing entries at positions 1 and 3.
# Note: this rebinds `data`, replacing the aa/bb/cc frame used above.
raw_values = [2, np.nan, 4, np.nan, 8.5]
data = pd.Series(raw_values)

In [46]:
# Show the Series before filling; positions 1 and 3 are NaN.
data

0    2.0
1    NaN
2    4.0
3    NaN
4    8.5
dtype: float64

In [47]:
# Fill the gaps with the mean of the values that are present.
mean_value = data.mean()
data.fillna(mean_value)

0    2.000000
1    4.833333
2    4.000000
3    4.833333
4    8.500000
dtype: float64