In [2]:
import pandas as pd
import numpy as np
from pandas import Series,DataFrame

In [3]:
data = DataFrame({'k1':['one']*3+['two']*4,
                  'k2':[1,1,2,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [4]:
print("############################################")
#移除重复数据

############################################


In [5]:
data.duplicated()
#duplicated会返回一个布尔型的Series 表示各行是否重复

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [6]:
data.drop_duplicates()
#drop_duplicates去重

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [7]:
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [8]:
data.drop_duplicates(['k1'])
#使用参数可以显示定义对哪一行进行去重

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


In [9]:
#使用函数或者映射进行数据转换
#map方法

In [10]:
data1 = DataFrame({'food':['bacon','pulled pork','bacon','Pastrami',
                   'corned beef','Bacon','pastrami','honey ham','nova lox'],
                   'ounces':[4,3,12,6,7.5,8,3,5,6]})
data1

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [11]:
meat_to_animal = {
    'bacon':'pig',
    'pulled pork':'pig',
    'pastrami':'cow',
    'corned beef':'cow',
    'honey ham':'pig',
    'nova lox':'salmon'
}

In [12]:
data1['animal'] = data1['food'].map(str.lower).map(meat_to_animal)
data1
#map是实现元素级别的数据清理工作的快捷方式 通过映射关系（字典或者lambda表达式）进行映射

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [13]:
data1['food'].map(lambda x:meat_to_animal[x.lower()])
#使用lambda表达式效果同字典

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [14]:
print("######################################################")
#替换值 replace方法

data2 = Series([1.,-999,2.,-999,-1000.,3.])
data2

######################################################


0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [15]:
#如果要将-999表示为缺失数据值或不可达到的值
data2.replace(-999,np.nan,inplace=True)
data2
#第一个参数为被替换者 第二个参数为替换内容 如果不设置inplace=True是不会修改原对象的

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [16]:
replaced_data2 = data2.replace([-999,-1000],np.nan)
replaced_data2

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [17]:
replaced2_data2 = data2.replace([-999,-1000],[np.nan,0])
replaced2_data2
#使用两个等长数组表示映射关系 （直觉上不是很合适）

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [18]:
replaced3_data2 = data2.replace({-999:np.nan,-1000:None})
replaced3_data2
#可以使用字典进行替换

0       1
1     NaN
2       2
3     NaN
4    None
5       3
dtype: object

In [19]:
data2.map({-999:np.nan,-1000:None,1:1,2:2,3:3})
#效果类似吧 但是不替换的部分也得写出来 否则就是NaN 没有replace方便

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [20]:
print("#####################################################")
#重命名轴/重命名索引

#####################################################


In [21]:
df = DataFrame(np.arange(12).reshape(3,4),
               index=['Ohio','Colorado','New York'],
               columns=['one','two','three','four'])
df

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [22]:
#和Series一样，轴标签也有map方法
print(df.index)

df.index.map(str.upper)

Index(['Ohio', 'Colorado', 'New York'], dtype='object')


Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

In [23]:
df.index = df.index.map(str.upper)
df

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [24]:
#同理column也是一样
df.columns.map(str.upper)

Index(['ONE', 'TWO', 'THREE', 'FOUR'], dtype='object')

In [25]:
#可以使用rename更便捷地实现
df.rename(columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [26]:
#非常显然地是 可以使用字典
df.rename(index={'Ohio':'Indiana'},
          columns={'three':'peekaboo'},inplace=True)
df
#同样是使用inplace=True进行原值的修改

Unnamed: 0,one,two,peekaboo,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [27]:
print("########################################################")
#离散化

########################################################


In [28]:
ages = [20,22,25,27,21,23,37,31,61,45,41,32]
#将离散值进行更加离散化，即区间化
bins = [18,25,35,60,100]
cat = pd.cut(ages,bins)
cat
#pd.cut方法 将每个值分散到由bins定义的区间上去
#bins参数中是区间节点
#返回值会确定每个值位于区间的哪个部分

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [34]:
print(cat.codes)
#codes参数可以输出所属区间的顺序
print(cat.categories)
#categories可以输出区间
pd.value_counts(cat)
#高级value_counts更为有用 可以输出每个区间的个数

[0 0 0 1 0 0 2 1 3 2 2 1]
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')


(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [35]:
cat2 = pd.cut(ages,bins,labels=['Youth','YoungAdult','MiddleAged','Senior'])
cat2 #使用labels参数可以命名区间

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [36]:
data3 = np.random.rand(20)
data3

array([0.88412145, 0.55637974, 0.21508789, 0.76391896, 0.38825048,
       0.85726492, 0.26610345, 0.35605452, 0.04231609, 0.4178597 ,
       0.18316613, 0.88025454, 0.22937081, 0.76214866, 0.89154829,
       0.06401121, 0.15763062, 0.33161823, 0.8009405 , 0.34533644])

In [37]:
pd.cut(data3,4,precision=2)
#此时传入的bins并不是区间 而是数字 则会使用最大值最小值的等长四个区间进行切分

[(0.68, 0.89], (0.47, 0.68], (0.041, 0.25], (0.68, 0.89], (0.25, 0.47], ..., (0.041, 0.25], (0.041, 0.25], (0.25, 0.47], (0.68, 0.89], (0.25, 0.47]]
Length: 20
Categories (4, interval[float64]): [(0.041, 0.25] < (0.25, 0.47] < (0.47, 0.68] < (0.68, 0.89]]

In [38]:
data4 = np.random.randn(1000)
cat4 = pd.qcut(data4,4)
cat4

[(-0.00639, 0.629], (0.629, 3.567], (0.629, 3.567], (0.629, 3.567], (-0.00639, 0.629], ..., (-3.576, -0.646], (-3.576, -0.646], (-0.646, -0.00639], (-3.576, -0.646], (0.629, 3.567]]
Length: 1000
Categories (4, interval[float64]): [(-3.576, -0.646] < (-0.646, -0.00639] < (-0.00639, 0.629] < (0.629, 3.567]]

In [39]:
pd.value_counts(cat4)
#使用qcut传入参数时，使用的是下侧分位数法，如果只传入一个数字参数表示分几份，其会均等地分

(0.629, 3.567]        250
(-0.00639, 0.629]     250
(-0.646, -0.00639]    250
(-3.576, -0.646]      250
dtype: int64

In [40]:
cat5 = pd.qcut(data4,[0,0.1,0.5,0.9,1])
cat5

[(-0.00639, 1.218], (1.218, 3.567], (1.218, 3.567], (-0.00639, 1.218], (-0.00639, 1.218], ..., (-1.29, -0.00639], (-1.29, -0.00639], (-1.29, -0.00639], (-1.29, -0.00639], (1.218, 3.567]]
Length: 1000
Categories (4, interval[float64]): [(-3.576, -1.29] < (-1.29, -0.00639] < (-0.00639, 1.218] < (1.218, 3.567]]

In [41]:
pd.value_counts(cat5) #使用0到1闭区间的数值点 也可以显示定义分位数

(-0.00639, 1.218]    400
(-1.29, -0.00639]    400
(1.218, 3.567]       100
(-3.576, -1.29]      100
dtype: int64

In [43]:
print('####################################################################################')

#检测和过滤异常值
np.random.seed(42)
data5 = DataFrame(np.random.randn(1000,4))
data5.describe()

####################################################################################


Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.030624,0.024828,-0.008255,0.030086
std,0.963919,1.011884,1.006075,1.006964
min,-3.019512,-2.896255,-3.241267,-2.991136
25%,-0.612942,-0.677037,-0.675299,-0.670871
50%,0.056187,0.02021,-0.007509,0.021158
75%,0.664881,0.693881,0.642282,0.695878
max,3.243093,3.852731,3.152057,3.926238


In [45]:
col = data5[3]
col[np.abs(col)>3]
#要获取对应条件的部分 直接用布尔类型的Series

403    3.193108
723    3.926238
Name: 3, dtype: float64

In [54]:
data5[(np.abs(data5)>3)].dropna(how='all').replace(np.nan,0)
#同样地要获取对应条件的部分 使用布尔类型的DataFrame获取

Unnamed: 0,0,1,2,3
52,0.0,3.852731,0.0,0.0
65,0.0,0.0,-3.241267,0.0
119,0.0,0.0,3.078881,0.0
403,0.0,0.0,0.0,3.193108
489,0.0,3.137749,0.0,0.0
506,-3.019512,0.0,0.0,0.0
576,0.0,3.109919,0.0,0.0
723,0.0,0.0,0.0,3.926238
929,3.243093,0.0,0.0,0.0
995,0.0,0.0,3.152057,0.0


In [55]:
data5[np.abs(data5)>3]=np.sign(data5)*3
data5.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.0304,0.023728,-0.008245,0.028967
std,0.963077,1.008264,1.004621,1.003207
min,-3.0,-2.896255,-3.0,-2.991136
25%,-0.612942,-0.677037,-0.675299,-0.670871
50%,0.056187,0.02021,-0.007509,0.021158
75%,0.664881,0.693881,0.642282,0.695878
max,3.0,3.0,3.0,3.0


In [56]:
#排序和随机采样

In [65]:
df2 = DataFrame(np.arange(20).reshape(5,4))
sampler = np.random.permutation(5)
#使用np.random.permutation作用于Series和DataFrame上 可以达到重排效果
#返回的是重排后的index序列 需要后续take 或者 loc操作
print(df2)
print(df2.loc[sampler])
print(df2.take(sampler))

    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
3  12  13  14  15
2   8   9  10  11
4  16  17  18  19
    0   1   2   3
0   0   1   2   3
1   4   5   6   7
3  12  13  14  15
2   8   9  10  11
4  16  17  18  19


In [None]:
#计算指标/哑变量

In [66]:
df3 = DataFrame({'key':['b','b','a','c','a','b'],
                 'data1':range(6)})
df3

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [68]:
pd.get_dummies(df3['key'])
#get_dummies变为哑变量矩阵（完全没见过没听说过）
#意思大概就是把data的每个值取出来作为column 形成0-1矩阵

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [69]:
pd.get_dummies(df3)

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [73]:
names = ['movie_id','title','genres']
movies = pd.read_csv('../data/movielens/movies.dat',header=None,names=names,sep='::')
movies.head()

  


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [74]:
genre_iter = (set(x.split('|')) for x in movies.genres)
genre_iter

<generator object <genexpr> at 0x00000205AD14E3C8>

In [77]:
genres = sorted(set.union(*genre_iter))
genres

TypeError: descriptor 'union' of 'set' object needs an argument

In [78]:
genres

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [109]:
print(len(movies))
print(len(genres))
zero = np.zeros((3883,18))
dummies = DataFrame(zero,columns=genres)
dummies


3883
18


Unnamed: 0,Action,Documentary,Adventure,Crime,Horror,Thriller,Film-Noir,Western,Sci-Fi,War,Fantasy,Romance,Drama,Musical,Children's,Comedy,Animation,Mystery
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
for i,gen in enumerate(movies.genres):
    for j in gen.split('|'):
        # print(i)
        # print(j)
        # print([i,j])
        # print(dummies.loc[i,j])
        dummies.loc[i,j]=1
dummies
#使用比较笨的办法进行哑变量实现

Unnamed: 0,Action,Documentary,Adventure,Crime,Horror,Thriller,Film-Noir,Western,Sci-Fi,War,Fantasy,Romance,Drama,Musical,Children's,Comedy,Animation,Mystery
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [114]:
#cut和dummies联合使用更合理
values = np.random.rand(10)
values

array([0.94377811, 0.25089032, 0.10056273, 0.39326149, 0.61671271,
       0.86079894, 0.34619689, 0.65378675, 0.35019868, 0.61721958])

In [115]:
bins = [0,0.2,0.4,0.6,0.8,1]
pd.get_dummies(pd.cut(values,bins))
#非常合理 哑变量矩阵 出现的区间

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,0,1,0
5,0,0,0,0,1
6,0,1,0,0,0
7,0,0,0,1,0
8,0,1,0,0,0
9,0,0,0,1,0
