In [1]:
import pandas as pd
import numpy as np
NA = np.nan  # short alias for the missing-value marker used by the cells below
data = pd.DataFrame([[1.,6.5,3.],[1.,NA,NA],[NA,NA,NA],[NA,6.5,3.]])
cleaned = data.dropna()    # by default dropna() removes every row that contains at least one NA
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [None]:
#dropna     根据每个标签的值是否是缺失数据来筛选轴标签，并根据允许丢失的数据量来确定阈值
#fillna     用某些值填充缺失的数据或使用插值方法（如'ffill'或'bfill')
#isnull     返回表明哪些值是缺失值的布尔值
#notnull    isnull的反函数

In [5]:
data.dropna(how='all')    # how='all' drops only the rows whose values are all NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [6]:
data[4]=NA    # add a new column (label 4) that is entirely NA
data.dropna(axis=1,how='all')    # axis=1 applies the rule to columns: drop only the columns that are all NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [9]:
df = pd.DataFrame(np.random.randn(7,3))    # random values; no seed is set in the visible cells above, so they differ between runs
df.iloc[:4,1] = NA      # iloc takes [rows, columns]: rows 0-3 of column 1 become NA
df.iloc[:2,2] = NA      # rows 0-1 of column 2 become NA
df

Unnamed: 0,0,1,2
0,0.702837,,
1,0.127051,,
2,0.442757,,-1.683236
3,2.012081,,-0.166313
4,0.725165,0.170911,0.246186
5,-0.461747,0.612682,1.917369
6,0.677238,0.804056,1.44159


In [10]:
df.dropna()    # keep only the rows that contain no NA at all

Unnamed: 0,0,1,2
4,0.725165,0.170911,0.246186
5,-0.461747,0.612682,1.917369
6,0.677238,0.804056,1.44159


In [11]:
df.dropna(thresh=2)    # thresh=2 keeps only the rows with at least 2 non-NA values; with 3 columns that drops the rows holding 2 or more NAs

Unnamed: 0,0,1,2
2,0.442757,,-1.683236
3,2.012081,,-0.166313
4,0.725165,0.170911,0.246186
5,-0.461747,0.612682,1.917369
6,0.677238,0.804056,1.44159


In [12]:
df.fillna(0)    # replace every missing value with 0 in the returned frame (df itself is not modified)

Unnamed: 0,0,1,2
0,0.702837,0.0,0.0
1,0.127051,0.0,0.0
2,0.442757,0.0,-1.683236
3,2.012081,0.0,-0.166313
4,0.725165,0.170911,0.246186
5,-0.461747,0.612682,1.917369
6,0.677238,0.804056,1.44159


In [13]:
df.fillna({1:0.5,2:0})    # a dict gives one fill value per column label: 0.5 for the column labelled 1, 0 for the column labelled 2

Unnamed: 0,0,1,2
0,0.702837,0.5,0.0
1,0.127051,0.5,0.0
2,0.442757,0.5,-1.683236
3,2.012081,0.5,-0.166313
4,0.725165,0.170911,0.246186
5,-0.461747,0.612682,1.917369
6,0.677238,0.804056,1.44159


In [15]:
_ = df.fillna(0,inplace=True) # inplace=True modifies df itself instead of producing a filled copy;
                              # after this call every missing value in df has been replaced by 0.
df

Unnamed: 0,0,1,2
0,0.702837,0.0,0.0
1,0.127051,0.0,0.0
2,0.442757,0.0,-1.683236
3,2.012081,0.0,-0.166313
4,0.725165,0.170911,0.246186
5,-0.461747,0.612682,1.917369
6,0.677238,0.804056,1.44159


In [16]:
df = pd.DataFrame(np.random.randn(6,3))
df.iloc[4:,1] = NA      # rows 4-5 of column 1 become NA
df.iloc[2:,2] = NA      # rows 2-5 of column 2 become NA
df

Unnamed: 0,0,1,2
0,0.530701,1.517117,0.38607
1,-0.642905,-0.861938,0.382286
2,0.782977,-0.569735,
3,2.046811,0.039911,
4,-0.736097,,
5,0.920433,,


In [18]:
df.ffill()    # forward fill: every NA takes the last valid value above it in the same column
              # (DataFrame.ffill is used because fillna(method='ffill') is deprecated in newer pandas)

Unnamed: 0,0,1,2
0,0.530701,1.517117,0.38607
1,-0.642905,-0.861938,0.382286
2,0.782977,-0.569735,0.382286
3,2.046811,0.039911,0.382286
4,-0.736097,0.039911,0.382286
5,0.920433,0.039911,0.382286


In [19]:
df.ffill(limit=2)    # forward fill, but one valid value fills at most 2 consecutive NAs below it
                     # (DataFrame.ffill is used because fillna(method='ffill') is deprecated in newer pandas)

Unnamed: 0,0,1,2
0,0.530701,1.517117,0.38607
1,-0.642905,-0.861938,0.382286
2,0.782977,-0.569735,0.382286
3,2.046811,0.039911,0.382286
4,-0.736097,0.039911,
5,0.920433,0.039911,


In [20]:
data = pd.Series([1.,NA,3.5,NA,7])
data.fillna(data.mean())    # fill the NAs with the mean of the non-missing values

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [22]:
df.fillna(df.mean())   # replace each missing value with the mean of its own column

Unnamed: 0,0,1,2
0,0.530701,1.517117,0.38607
1,-0.642905,-0.861938,0.382286
2,0.782977,-0.569735,0.384178
3,2.046811,0.039911,0.384178
4,-0.736097,0.031339,0.384178
5,0.920433,0.031339,0.384178


In [None]:
#fillna函数参数
#value      标量值或字典型对象用于填充缺失值
#method     填充方法，可选'ffill'（前向填充）或'bfill'（后向填充）；默认为None，此时需要通过value提供填充值
#axis       需要填充的轴，默认 axis=0
#inplace    修改被调用的对象，而不是生成一个备份
#limit      用于前向或后向填充时最大的填充范围

In [23]:
# Seven rows; the last one repeats the row before it ('two', 4).
data = pd.DataFrame(
    dict(
        k1=['one', 'two', 'one', 'two', 'one', 'two', 'two'],
        k2=[1, 1, 2, 3, 3, 4, 4],
    )
)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [24]:
data.duplicated()   # boolean Series: True for every row that repeats an earlier row

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [25]:
data.drop_duplicates()  # drop the repeated rows, keeping the first occurrence of each

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [26]:
data['v1'] = range(7)
data.drop_duplicates(['k1'])  # deduplicate on column k1 only; by default the first row seen for each k1 value ('one', 'two') is kept

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [27]:
data.drop_duplicates(['k1','k2'],keep='last')   # keep='last' keeps the last row seen for each (k1, k2) combination instead of the first

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [29]:
# Nine food items with their weight in ounces.
data = pd.DataFrame(
    dict(
        food=[
            'bacon', 'pulled pork', 'bacon',
            'pastrami', 'corned beef', 'bacon',
            'pastrami', 'honey ham', 'nova lox',
        ],
        ounces=[4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    )
)
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [31]:
# Lookup table: kind of meat -> animal it comes from.
meat_to_animal={
    'bacon':'pig',
    'pulled pork':'pig',
    'pastrami':'cow',
    'corned beef':'cow',
    'honey ham':'pig',
    'nova lox':'salmon'
}
data['animal'] = data['food'].map(meat_to_animal)   # Series.map works on a single column: each food is looked up in the dict to get its animal
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [32]:
# 3x4 frame of the integers 0-11 with string row and column labels.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['ohio', 'colorado', 'new york'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7
new york,8,9,10,11


In [33]:
def transform(x):
    """Return the first four characters of ``x`` converted to upper case."""
    return x[:4].upper()
data.index.map(transform)    # apply transform to every index label and return the resulting Index

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [34]:
data.index=data.index.map(transform)    # assign the mapped labels back so that data itself is relabelled
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [35]:
data.rename(index=str.title, columns=str.upper)    # rename returns a transformed version of the frame and leaves the original data untouched

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [36]:
data.rename(index={'OHIO':'INDIANA'},columns={'three':'peekaboo'})    # dicts rename only the listed labels: index 'OHIO' -> 'INDIANA', column 'three' -> 'peekaboo'

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [37]:
data.rename(index={'OHIO':'INDIANA'},inplace=True)    # inplace=True changes data itself rather than returning a renamed copy
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [38]:
ages =[20,22,25,27,21,23,37,31,61,45,41,32]    # raw, ungrouped ages
bins=[18,25,35,60,100]                     # bin edges, giving the bins (18, 25], (25, 35], (35, 60], (60, 100]
cats = pd.cut(ages,bins)                   # cut assigns every age to its bin
cats                                       # "(" marks an open (excluded) edge and "]" a closed (included) one, so 25 lands in (18, 25];
                                           # the "<" in the Categories line shows the ordering of the bins

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [39]:
cats.codes    # for every age, the 0-based position of the bin it was assigned to

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [42]:
cats.categories    # the distinct bins (intervals) of the result

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [43]:
pd.Series(cats).value_counts()    # number of values that fell into each bin produced by pandas.cut
                                  # (Series.value_counts is used because the top-level pd.value_counts is deprecated)

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [44]:
pd.cut(ages, [18,26,36,61,100],right=False) # a square bracket marks the closed side; right=False closes the left edge and opens the right one

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [45]:
group_names = ['youth','youngadult','middleaged','senior']
pd.cut(ages,bins,labels=group_names)     # labels= names the bins, so the result shows the labels instead of the intervals

['youth', 'youth', 'youth', 'youngadult', 'youth', ..., 'youngadult', 'senior', 'middleaged', 'middleaged', 'youngadult']
Length: 12
Categories (4, object): ['youth' < 'youngadult' < 'middleaged' < 'senior']

In [46]:
data = np.random.rand(20)
pd.cut(data,4,precision=2)    # an integer asks for that many equal-width bins; precision=2 limits the bin edges to two decimal digits

[(0.77, 0.98], (0.56, 0.77], (0.35, 0.56], (0.35, 0.56], (0.56, 0.77], ..., (0.13, 0.35], (0.77, 0.98], (0.13, 0.35], (0.56, 0.77], (0.56, 0.77]]
Length: 20
Categories (4, interval[float64, right]): [(0.13, 0.35] < (0.35, 0.56] < (0.56, 0.77] < (0.77, 0.98]]

In [48]:
data = np.random.randn(1000)
cats = pd.qcut(data,4)    # qcut cuts at the sample quantiles: the 4 bins hold equal numbers of points (they are not equal in width)
pd.Series(cats).value_counts()    # count per bin; Series.value_counts is used because the top-level pd.value_counts is deprecated

(-2.592, -0.688]     250
(-0.688, -0.0373]    250
(-0.0373, 0.707]     250
(0.707, 3.231]       250
dtype: int64

In [49]:
pd.qcut(data,[0,0.1,0.5,0.9,1.])    # explicit quantiles (numbers from 0 to 1) can be passed instead of a bin count

[(-1.199, -0.0373], (-1.199, -0.0373], (-1.199, -0.0373], (-1.199, -0.0373], (-0.0373, 1.338], ..., (-1.199, -0.0373], (-1.199, -0.0373], (-2.592, -1.199], (-1.199, -0.0373], (-0.0373, 1.338]]
Length: 1000
Categories (4, interval[float64, right]): [(-2.592, -1.199] < (-1.199, -0.0373] < (-0.0373, 1.338] < (1.338, 3.231]]

In [50]:
data = pd.DataFrame(np.random.randn(1000,4))
data.describe()   # per-column summary statistics: count, mean, standard deviation (std), minimum (min),
                  # lower quartile (25%), median (50%), upper quartile (75%) and maximum (max).

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.009083,0.037611,0.037355,-0.005275
std,1.018214,1.015944,1.024593,1.011826
min,-3.414717,-3.726565,-2.847262,-3.273658
25%,-0.703033,-0.654794,-0.61474,-0.725039
50%,-0.023075,0.059486,0.021018,0.001921
75%,0.699898,0.701831,0.720876,0.692171
max,3.251481,3.337307,3.452981,3.235815


In [51]:
col = data[2]
col[np.abs(col)>3]    # values of the column labelled 2 whose absolute value exceeds 3

5      3.452981
287    3.178064
Name: 2, dtype: float64

In [52]:
data[(np.abs(data)>3).any(axis=1)]   # select every row that holds at least one value whose absolute value exceeds 3
                                # any(axis=1) checks each row: True when any element of the row is True, otherwise False
                                # (the axis is passed by keyword; passing it positionally is deprecated and rejected by newer pandas)

  data[(np.abs(data)>3).any(1)]


Unnamed: 0,0,1,2,3
5,-1.629759,0.369375,3.452981,-0.44382
38,0.607048,-3.726565,-0.504624,1.240844
65,1.588876,0.209996,-0.401538,3.166984
242,-0.77028,3.337307,0.072213,0.368412
287,0.741842,-1.037651,3.178064,2.526651
309,-0.610601,-0.047746,0.435188,-3.273658
395,0.343518,-3.303903,-1.398031,-1.033578
501,-3.414717,0.321898,-0.563943,0.605043
620,0.11341,0.17168,-1.207674,3.235815
912,3.251481,-1.346929,-0.781564,0.039058


In [53]:
data[np.abs(data)>3] = np.sign(data)*3      # np.abs(data)>3 is a boolean DataFrame: True where the magnitude exceeds 3, False elsewhere.
                                # np.sign(data) has the same shape as data: 1 for positive, -1 for negative and 0 for zero values.
       # The assignment therefore caps the outliers: values above 3 become 3, values below -3 become -3,
                           # and everything inside [-3, 3] is left unchanged.
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.009247,0.038304,0.036724,-0.005404
std,1.01613,1.011503,1.022648,1.009737
min,-3.0,-3.0,-2.847262,-3.0
25%,-0.703033,-0.654794,-0.61474,-0.725039
50%,-0.023075,0.059486,0.021018,0.001921
75%,0.699898,0.701831,0.720876,0.692171
max,3.0,3.0,3.0,3.0


In [54]:
np.sign(data).head()    # sign of each value: -1, 0 or 1

Unnamed: 0,0,1,2,3
0,1.0,1.0,-1.0,1.0
1,-1.0,1.0,1.0,1.0
2,1.0,-1.0,1.0,-1.0
3,1.0,-1.0,-1.0,1.0
4,-1.0,-1.0,-1.0,1.0


In [55]:
df = pd.DataFrame(np.arange(5*4).reshape((5,4)))
sampler = np.random.permutation(5)    # random permutation of the integers 0-4
sampler

array([0, 4, 2, 3, 1])

In [56]:
df    # the frame in its original row order, for comparison with the reordered version

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [57]:
df.take(sampler)   # reorder the rows by the positions listed in sampler

Unnamed: 0,0,1,2,3
0,0,1,2,3
4,16,17,18,19
2,8,9,10,11
3,12,13,14,15
1,4,5,6,7


In [60]:
df.sample(n=3)    # sample picks n rows at random

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
4,16,17,18,19


In [61]:
choices = pd.Series([5,7,-1,6,4])
draws = choices.sample(n=10,replace=True) # draw 10 values; replace=True samples with replacement, so the same value can be drawn repeatedly
draws

4    4
4    4
1    7
3    6
4    4
1    7
3    6
1    7
1    7
0    5
dtype: int64

In [67]:
df = pd.DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
pd.get_dummies(df['key'])    # turn the categorical column into one indicator (dummy) column per distinct value
                             # NOTE(review): the recorded output shows 0/1 integers; newer pandas returns booleans unless dtype= is given - confirm the target version

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [68]:
dummies = pd.get_dummies(df['key'],prefix='key')    # one-hot encode the 'key' column; prefix='key' puts "key_" in front of the new column names
df_with_dummy = df[['data1']].join(dummies)    # frame holding 'data1' together with the indicator columns
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [69]:
mnames = ['movie_id','title','genres']
# '::' is a multi-character separator, which only the Python parsing engine supports;
# naming the engine explicitly avoids the ParserWarning about falling back from the C engine.
movies = pd.read_table('examples/movies.dat',sep='::',header=None,names=mnames,engine='python')
movies[:10]    # first ten movies

  movies = pd.read_table('examples/movies.dat',sep='::',header=None,names=mnames)


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [70]:
all_genres = []
for x in movies.genres:
    all_genres.extend(x.split('|'))   # each movie stores its genres as one '|'-separated string; append them one by one
# Distinct genres, each recorded once, in order of first appearance.
# The list is wrapped in a Series because passing a plain list to pd.unique is deprecated.
genres = pd.Series(all_genres).unique()
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [71]:
zero_matrix = np.zeros((len(movies), len(genres)))  # all-zero matrix with one row per movie and one column per genre (filled in later)
dummies = pd.DataFrame(zero_matrix,columns=genres)
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
gen = movies.genres[0]
gen.split('|')     # genres of the first movie in the dataset

['Animation', "Children's", 'Comedy']

In [73]:
dummies.columns.get_indexer(gen.split('|'))   # column positions of the first movie's genres within the zero matrix

array([0, 1, 2], dtype=int64)

In [74]:
for i ,gen in enumerate(movies.genres):        # walk the movies by position i together with their genre string
    indices = dummies.columns.get_indexer(gen.split('|'))  # column positions of this movie's genres within dummies
    dummies.iloc[i, indices] = 1                           # set those cells to 1 (the genre is present)
movies_windic = movies.join(dummies.add_prefix('Genre_'))   # prefix the indicator columns with 'Genre_' to tell them apart from the other columns,
                                                            # then join them column-wise onto the original movies frame
movies_windic.iloc[0]                           # first movie: only its three genres are flagged

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                              1.0
Genre_Children's                             1.0
Genre_Comedy                                 1.0
Genre_Adventure                              0.0
Genre_Fantasy                                0.0
Genre_Romance                                0.0
Genre_Drama                                  0.0
Genre_Action                                 0.0
Genre_Crime                                  0.0
Genre_Thriller                               0.0
Genre_Horror                                 0.0
Genre_Sci-Fi                                 0.0
Genre_Documentary                            0.0
Genre_War                                    0.0
Genre_Musical                                0.0
Genre_Mystery                                0.0
Genre_Film-Noir                              0.0
Genre_Western       

In [75]:
movies_windic    # the movies frame joined with its Genre_ indicator columns

Unnamed: 0,movie_id,title,genres,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Adventure,Genre_Fantasy,Genre_Romance,Genre_Drama,...,Genre_Crime,Genre_Thriller,Genre_Horror,Genre_Sci-Fi,Genre_Documentary,Genre_War,Genre_Musical,Genre_Mystery,Genre_Film-Noir,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3879,3949,Requiem for a Dream (2000),Drama,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3880,3950,Tigerland (2000),Drama,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3881,3951,Two Family House (2000),Drama,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
np.random.seed(12345)    # fix the seed of NumPy's global random generator before drawing
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [77]:
bins = [0,0.2,0.4,0.6,0.8,1]
pd.get_dummies(pd.cut(values,bins))    # bin the values, then build one indicator column per bin

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


In [2]:
val = 'a,b,    guido'
val.split(',')   # split on the comma separator; the whitespace around the pieces is kept

['a', 'b', '    guido']

In [3]:
# Split on commas, then trim the surrounding whitespace from every piece.
pieces = list(map(str.strip, val.split(',')))
pieces

['a', 'b', 'guido']

In [4]:
first, second, third = pieces   # unpack the three pieces by position
first+ '::'+ second+ '::'+third    # concatenate them by hand with '::' in between

'a::b::guido'

In [5]:
"::".join(pieces)   #一个符号join会替代原字符串的分隔符

'a::b::guido'

In [6]:
val.find(':')    # find returns -1 when the substring is absent, whereas index raises an exception

-1

In [None]:
#count       返回子字符串在字符串中的非重叠出现次数
#endswith      如果字符串以后缀结尾则返回True
#startswith     如果字符串以前缀开始则返回True
#join     使用字符串作为间隔符，用于粘合其他字符串的序列
#index     如果在字符串中找到，则返回子字符串中第一个字符的位置；如果找不到则引发ValueError
#find     返回子字符串在字符串中第一次出现时第一个字符的位置；类似index,但如果没有找到则返回-1
#rfind    返回子字符串在字符串中最后一次出现时第一个字符的位置；如果没有找到，则返回-1
#replace    使用一个字符串替代另一个字符串
#strip，rstrip，lstrip      修剪空白，包括换行符；相当于对每个元素进行x.strip()（以及rstrip, lstrip）。
#split      使用分隔符将字符串拆分为子字符串的列表
#lower      将大写字母转换为小写字母
#upper      将小写字母转换为大写字母
#casefold   将字符转换为小写，并将任何特定于区域的变量字符组合转换为常见的可比较形式
#ljust,rjust    左对齐或右对齐；用空格（或其他一些字符）填充字符串的相反侧以返回具有最小宽度的字符串

In [3]:
import re
text = "foo    bar\t baz   \tqux"
re.split(r'\s+',text)    # split text on runs of whitespace and return the pieces as a list
                        # r'\s+' matches one or more whitespace characters (spaces, tabs, newlines, ...);
                        # the raw-string prefix keeps \s from being read as an (invalid) string escape sequence

['foo', 'bar', 'baz', 'qux']

In [4]:
regex = re.compile(r'\s+')    # compile the pattern r'\s+' (one or more whitespace characters) into a reusable regex object;
                             # the raw-string prefix keeps \s from being read as an (invalid) string escape sequence
regex.split(text)         # same result as re.split; compiling once saves CPU cycles when the pattern is applied many times

['foo', 'bar', 'baz', 'qux']

In [5]:
regex.findall(text)    # every run of whitespace found in text, returned as a list of substrings

['    ', '\t ', '   \t']

In [5]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'   # [A-Z0-9._%+-]+ : one or more letters, digits, '.', '_', '%', '+' or '-'
                             # @[A-Z0-9.-]+\.[A-Z]{2,4} : an '@', then one or more letters, digits, '.' or '-', a literal dot and 2 to 4 letters
regex = re.compile(pattern, flags=re.IGNORECASE)  # compile the pattern into a regex object; IGNORECASE lets the A-Z classes match lowercase too
regex.findall(text)    # list of every match found in text

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [6]:
m = regex.search(text)    # search scans the text with the compiled pattern and returns a match object for the first match only
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [7]:
text[m.start():m.end()]    # start() and end() give the position of the match inside text

'dave@google.com'

In [8]:
print(regex.match(text))   # match only succeeds when the pattern occurs at the very start of the string; otherwise it returns None

None


In [9]:
print(regex.sub('REDACTED', text))   # sub returns a new string in which every match of the pattern is replaced by 'REDACTED'

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [10]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'   # three parenthesised groups: the part before '@', the part between '@' and the last dot, and the suffix
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()   # tuple with the text captured by each parenthesised group

('wesm', 'bright', 'net')

In [11]:
regex.findall(text)   # when the pattern has groups, findall returns a list of tuples holding the captured groups

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [12]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))  # in the replacement, \1, \2, ... refer to the groups of each match (\1 is the first group)

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



In [None]:
#正则表达式方法
#findall      将字符串中所有的非重叠匹配模式以列表形式返回
#finditer     与findall类似，但返回的是迭代器
#match        在字符串起始位置匹配模式，也可以将模式组件匹配到分组中；如果模式匹配上了，返回一个匹配对象，否则返回None
#search       扫描字符串的匹配模式，如果扫描到了返回匹配对象
              #与match方法不同的是，search方法的匹配可以是字符串的任意位置，而不仅仅是字符串的起始位置
#split        根据模式，将字符串拆分为多个部分
#sub, subn    用替换表达式替换字符串中的匹配项（默认替换全部，可用count参数限制为前n个）；subn与sub相同，但返回(新字符串, 替换次数)的元组；在替换字符串中使用符号\1, \2......来引用匹配分组

In [13]:
# Name -> e-mail address, with one missing entry (Wes).
data = pd.Series({
    'Dave': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
})
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [14]:
data.isnull()    # True where the value is missing

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [15]:
data.str.contains('gmail')   # the .str accessor applies string operations element-wise and passes NaN through instead of failing on it

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [16]:
pattern    # the e-mail pattern with three capture groups, reused by the cells below

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [17]:
data.str.findall(pattern, flags=re.IGNORECASE)    # per element: list of the captured-group tuples; NaN stays NaN

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [27]:
matches = data.str.match(pattern, flags=re.IGNORECASE)   # element-wise: tests every string (not only the first one) for a match at its start; NaN stays NaN
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [29]:
data.str[:5]  # slice the first five characters of every string

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

In [None]:
#cat          根据可选的分隔符按元素黏合字符串
#contains     返回是否含有某个模式/正则表达式的布尔值数组
#count        模式出现次数的计数
#extract      使用正则表达式从字符串Series中分组抽取一个或多个字符串；返回的结果是每个分组形成一列的DataFrame
#endswith     等价于对每个元素使用x.endswith（模式)
#startswith   等价于对每个元素使用x.startswith（模式)
#findall      找出字符串中所有的模式/正则表达式匹配项，以列表返回
#get          对每个元素进行索引（获得第i个元素)
#isalnum      等价于内建的str.isalnum
#isalpha      等价于内建的str.isalpha
#isdecimal    等价于内建的str.isdecimal
#isdigit      等价于内建的str.isdigit
#islower      等价于内建的str.islower
#isnumeric    等价于内建的str.isnumeric
#isupper      等价于内建的str.isupper
#join         根据传递的分隔符，将Series中的字符串联合
#len          计算每个字符串的长度
#lower，upper 转换大小写，等价于对每个元素进行x.lower()或x.upper()
#match        使用re.match将正则表达式应用到每个元素上，返回表示每个元素开头是否匹配的布尔值（缺失值保持为NaN）
#pad          将空白加到字符串的左边、右边或两边
#center       等价于 pad (side='both')
#repeat       重复值（例如s.str.repeat(3)等价于对每个字符串进行x*3）
#replace      以其他字符串替代模式/正则表达式的匹配项
#slice        对Series中的字符串进行切片
#split        以分隔符或正则表达式对字符串进行拆分
#strip        对字符串两侧的空白进行消除，包括换行符
#rstrip       消除字符串右边的空白
#lstrip       消除字符串左边的空白