In [1]:
import numpy as np
import pandas as pd

## 处理缺失数据
+ `s/df.isnull()`寻找缺失值, 返回mask

In [2]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### 过滤缺失数据

In [5]:
data = pd.Series([1,np.nan, 3.5, np.nan, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [6]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

等价于

In [7]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

DataFrame的处理

In [8]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                      [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])

In [9]:
cleaned = data.dropna()
display(data, cleaned)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0


传入要删除的方式
+ `how='any'`:只要有一个
+ `how='all'`:所有的都为nan才删除

In [10]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [11]:
data[4] = np.nan

指定轴`axis`

In [12]:
display(data, data.dropna(axis=1, how='all'))


Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


 保留一部分观测数据
 `thresh`

In [13]:
df = pd.DataFrame(np.random.randn(7,3))
df.iloc[:4,1] = np.nan
df.iloc[:2,2] = np.nan

In [14]:
display(df, df.dropna(), df.dropna(thresh=2))

Unnamed: 0,0,1,2
0,-0.562251,,
1,0.475496,,
2,-0.378481,,-0.013537
3,0.39197,,-1.560434
4,-1.690767,0.684848,0.46741
5,-0.472548,-0.245485,0.072568
6,0.71815,-1.927105,-0.434763


Unnamed: 0,0,1,2
4,-1.690767,0.684848,0.46741
5,-0.472548,-0.245485,0.072568
6,0.71815,-1.927105,-0.434763


Unnamed: 0,0,1,2
2,-0.378481,,-0.013537
3,0.39197,,-1.560434
4,-1.690767,0.684848,0.46741
5,-0.472548,-0.245485,0.072568
6,0.71815,-1.927105,-0.434763


### 填充数据

`df.fillna()`

In [15]:
display(df, df.fillna(0))

Unnamed: 0,0,1,2
0,-0.562251,,
1,0.475496,,
2,-0.378481,,-0.013537
3,0.39197,,-1.560434
4,-1.690767,0.684848,0.46741
5,-0.472548,-0.245485,0.072568
6,0.71815,-1.927105,-0.434763


Unnamed: 0,0,1,2
0,-0.562251,0.0,0.0
1,0.475496,0.0,0.0
2,-0.378481,0.0,-0.013537
3,0.39197,0.0,-1.560434
4,-1.690767,0.684848,0.46741
5,-0.472548,-0.245485,0.072568
6,0.71815,-1.927105,-0.434763


通过字典填充


In [16]:
df.fillna({1:0.5, 2:0})

Unnamed: 0,0,1,2
0,-0.562251,0.5,0.0
1,0.475496,0.5,0.0
2,-0.378481,0.5,-0.013537
3,0.39197,0.5,-1.560434
4,-1.690767,0.684848,0.46741
5,-0.472548,-0.245485,0.072568
6,0.71815,-1.927105,-0.434763


原地修改

In [17]:
_ = df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,-0.562251,0.0,0.0
1,0.475496,0.0,0.0
2,-0.378481,0.0,-0.013537
3,0.39197,0.0,-1.560434
4,-1.690767,0.684848,0.46741
5,-0.472548,-0.245485,0.072568
6,0.71815,-1.927105,-0.434763


In [18]:
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan
df

Unnamed: 0,0,1,2
0,-1.092374,-3.089034,0.669464
1,0.250267,-0.74759,-1.002245
2,0.633423,,0.528709
3,1.548153,,-0.170117
4,0.271981,,
5,-0.988104,,


指定填充方法
+ `df.fillna(method='ffill')`:forword fill

In [19]:
display(df.fillna(method='ffill'), df.fillna(method='ffill', limit=2))

Unnamed: 0,0,1,2
0,-1.092374,-3.089034,0.669464
1,0.250267,-0.74759,-1.002245
2,0.633423,-0.74759,0.528709
3,1.548153,-0.74759,-0.170117
4,0.271981,-0.74759,-0.170117
5,-0.988104,-0.74759,-0.170117


Unnamed: 0,0,1,2
0,-1.092374,-3.089034,0.669464
1,0.250267,-0.74759,-1.002245
2,0.633423,-0.74759,0.528709
3,1.548153,-0.74759,-0.170117
4,0.271981,,-0.170117
5,-0.988104,,-0.170117


In [20]:
# 填充中位数
data = pd.Series([2., np.nan, 3, np.nan, 7])
data.fillna(data.mean())

0    2.0
1    4.0
2    3.0
3    4.0
4    7.0
dtype: float64

## 数据转换

### 移除重复数据
`df.duplicated()`: 返回mask，判断是否为重复值(默认后面有重复的标记为True)

In [21]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
_ = pd.DataFrame({'k1':['one'], 'k2':[1]})
data = data.append(_,ignore_index=True)
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4
7,one,1


In [22]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
dtype: bool

删除重复项

In [23]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


指定判断依据

In [24]:
data['v1'] = range(8)
display(data, data.drop_duplicates(['k1']))

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6
7,one,1,7


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


保留最后一个

In [25]:
data.drop_duplicates(['k1','k2'], keep='last')

Unnamed: 0,k1,k2,v1
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6
7,one,1,7


### 利用函数或者映射进行数据转换

In [26]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                               'Pastrami', 'corned beef', 'Bacon',
                               'pastrami', 'honey ham', 'nova lox'],
                               'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})


In [27]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [28]:
# 添加映射
meat_to_animal = {
    'bacon':'pig',
    'pulled pork':'pig',
    'pastrami':'cow',
    'corned beef':'cow',
    'honey ham':'pig',
    'nova lox':'salmon'
}

In [29]:
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [30]:
data['animal'] = lowercased.map(meat_to_animal)

In [31]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [32]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### 替换值

In [33]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [34]:
data.replace(-999., np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [35]:
# 一次替换多个值
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

让每个值有不同的替换值

In [36]:
data.replace([-999, -1000], [9, 10])

0     1.0
1     9.0
2     2.0
3     9.0
4    10.0
5     3.0
dtype: float64

In [37]:
# 传入字典替换数据
data.replace({-999:9, -1000:10})

0     1.0
1     9.0
2     2.0
3     9.0
4    10.0
5     3.0
dtype: float64


## 重命名轴索引

In [38]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                     index=['Ohio', 'Colorado', 'New York'],
                     columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [39]:
transform = lambda x: x[:4].upper()

In [40]:
data.index = data.index.map(transform)

In [41]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


创建数据集的转换版(不修改原始数据), 可以通过`rename`实现

In [42]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


传入字典，实现对部分轴标签的更新

In [43]:
data.rename(index={'OHIO':'oooo'}, columns={'one':1,'two':2})

Unnamed: 0,1,2,three,four
oooo,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


实现就地更改

In [44]:
data.rename(index={'OHIO':'oooo'},
            columns={'one':1,'two':2}, inplace=True)
data

Unnamed: 0,1,2,three,four
oooo,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


## 离散化和bin划分

In [45]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

将数据分为18到25，26到35，35到60，以及60以上

In [46]:
bins = [18,25,35,60,100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [47]:
# 返回数据的组代号
cats.codes


array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [48]:
# 返回数据的分类依据
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [49]:
# 统计分组后，每组的数据
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [50]:
# 给分完的组其别名
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)


[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

### 根据分位数划分
`qcut()`

In [51]:
data = np.random.randn(1000)
cats = pd.qcut(data, 4)  # cut into quartiles
cats

[(-0.665, 0.0324], (0.0324, 0.655], (-3.5829999999999997, -0.665], (-3.5829999999999997, -0.665], (-3.5829999999999997, -0.665], ..., (0.655, 3.613], (0.655, 3.613], (0.0324, 0.655], (0.0324, 0.655], (0.655, 3.613]]
Length: 1000
Categories (4, interval[float64]): [(-3.5829999999999997, -0.665] < (-0.665, 0.0324] < (0.0324, 0.655] < (0.655, 3.613]]

In [52]:
pd.value_counts(cats)

(0.655, 3.613]                   250
(0.0324, 0.655]                  250
(-0.665, 0.0324]                 250
(-3.5829999999999997, -0.665]    250
dtype: int64

自定义分位数


In [53]:
pd.qcut(data, [0, 0.1,0.4, 0.7,1])


[(-0.244, 0.558], (-0.244, 0.558], (-1.198, -0.244], (-3.5829999999999997, -1.198], (-3.5829999999999997, -1.198], ..., (0.558, 3.613], (0.558, 3.613], (-0.244, 0.558], (-0.244, 0.558], (0.558, 3.613]]
Length: 1000
Categories (4, interval[float64]): [(-3.5829999999999997, -1.198] < (-1.198, -0.244] < (-0.244, 0.558] < (0.558, 3.613]]

## 检测和过滤异常值

In [54]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.037162,0.025776,0.013886,-0.050455
std,1.03469,0.996606,1.017653,1.015424
min,-3.701776,-3.23471,-3.104973,-3.22886
25%,-0.733503,-0.618878,-0.654649,-0.734322
50%,-0.050854,0.035223,0.034756,-0.014726
75%,0.612402,0.680077,0.73092,0.623813
max,2.894073,3.433994,3.095846,3.55509


找出绝对值大于3的值

In [55]:
col = data[2]
col[np.abs(col) > 3]

40    -3.104973
378    3.095846
Name: 2, dtype: float64

找出含数据的绝对值大于3的行

In [56]:
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
40,-0.862641,-1.467541,-3.104973,1.830106
107,1.515711,-1.509069,-1.23532,3.55509
150,0.240019,-0.651424,-0.841377,3.228745
378,-0.686443,-0.957752,3.095846,-0.642075
450,-3.291976,-0.224063,1.448892,-1.030725
461,-0.500551,-3.023852,1.344609,0.212107
617,-0.879689,3.433994,-0.231457,0.16542
669,0.05807,0.693203,0.168167,-3.22886
858,1.083887,-3.1122,0.171219,-0.172551
953,-0.762221,-3.23471,0.007704,0.455805


In [57]:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.036168,0.025712,0.013895,-0.05101
std,1.031557,0.994051,1.017051,1.012193
min,-3.0,-3.0,-3.0,-3.0
25%,-0.733503,-0.618878,-0.654649,-0.734322
50%,-0.050854,0.035223,0.034756,-0.014726
75%,0.612402,0.680077,0.73092,0.623813
max,2.894073,3.0,3.0,3.0


## 排列和随机采样

利用`numpy.random.permutation`函数可以轻松实现对`Series`或`DataFrame`的列的排列工作（`permuting`，随机重排序）。通过需要排列的轴的长度调用`permutation`，可产生一个表示新顺序的整数数组：

In [58]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [59]:
sampler = np.random.permutation(5)  # 对序列进行随机排序
sampler  

array([0, 3, 1, 4, 2])

In [60]:
# 可以用iloc或者take可以对样本重新排列
df.take(sampler)

Unnamed: 0,0,1,2,3
0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
4,16,17,18,19
2,8,9,10,11


Series和DataFrame封装了`sample()`方法，可以实现采样

In [61]:
df.sample(n=2, axis=1)

Unnamed: 0,0,2
0,0,2
1,4,6
2,8,10
3,12,14
4,16,18


通过替换的方式产生样本(允许重复选择),replace=True

In [62]:
choice = pd.Series([5,7,-1,2,5])
draws = choice.sample(n=8, replace=True)
draws

1    7
0    5
1    7
3    2
1    7
0    5
1    7
1    7
dtype: int64

## 计算指标

常用于统计建模或机器学习的转换方式是：将分类变量（categorical variable）转换为“哑变量”或“指标矩阵”。

In [63]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [64]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


你可能想给指标DataFrame的列加上一个前缀，以便能够跟其他数据进行合并。get_dummies的prefix参数可以实现该功能：

In [65]:
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummies = df[['data1']].join(dummies)
df_with_dummies

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


示例

In [66]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_csv('datasets/movielens/movies.dat', sep='::',
                        header=None, names=mnames)

  This is separate from the ipykernel package so we can avoid doing imports until


In [67]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


首先从数据集中抽取不同的genre值

In [68]:
all_genres = []
for x in movies['genres']:
    all_genres.extend(x.split('|'))
genres= pd.unique(all_genres)
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

从零构造`指标DataFrame`

In [69]:
zero_matrix = np.zeros((len(movies), len(genres)))
dummies = pd.DataFrame(zero_matrix, columns=genres)
dummies.head()

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
gen = movies.genres[0]
gen.split('|')

['Animation', "Children's", 'Comedy']

In [71]:
dummies.columns.get_indexer(gen.split('|'))

array([0, 1, 2], dtype=int64)

In [72]:
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1


In [73]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

In [74]:
np.random.seed(12345)
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [75]:
bins = np.linspace(0, 1, 6)
pd.get_dummies(pd.cut(values, bins))

   (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
0           0           0           0           0           1
1           0           1           0           0           0
2           1           0           0           0           0
3           0           1           0           0           0
4           0           0           1           0           0
5           0           0           1           0           0
6           0           0           0           0           1
7           0           0           0           1           0
8           0           0           0           1           0
9           0           0           0           1           0

In [76]:
genres_temp = movies['genres']
dummies_temp = []
for i in genres_temp:
    dummies_temp.extend(i.split('|'))
genres = pd.unique(dummies_temp)
dummies = pd.DataFrame(np.zeros((len(movies), len(genres)), dtype=np.int8),
                       columns=genres)
dummies.head()

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [77]:
for i, gen in enumerate(movies['genres']):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1
dummies.head()

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [78]:
movies.join(dummies.add_prefix('genres_')).head()

Unnamed: 0,movie_id,title,genres,genres_Animation,genres_Children's,genres_Comedy,genres_Adventure,genres_Fantasy,genres_Romance,genres_Drama,...,genres_Crime,genres_Thriller,genres_Horror,genres_Sci-Fi,genres_Documentary,genres_War,genres_Musical,genres_Mystery,genres_Film-Noir,genres_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 字符串操作

### 字符串对象方法

![](https://camo.githubusercontent.com/2b2d428ec2ffa80d923afb9d9c0285844feecd22/687474703a2f2f75706c6f61642d696d616765732e6a69616e7368752e696f2f75706c6f61645f696d616765732f373137383639312d303837666536376266366462303730312e706e673f696d6167654d6f6772322f6175746f2d6f7269656e742f7374726970253743696d61676556696577322f322f772f31323430)

![](https://camo.githubusercontent.com/321a7e42325294bd5aba12ab4e5e41a1d3ca07d0/687474703a2f2f75706c6f61642d696d616765732e6a69616e7368752e696f2f75706c6f61645f696d616765732f373137383639312d643166306434656433653839353031362e706e673f696d6167654d6f6772322f6175746f2d6f7269656e742f7374726970253743696d61676556696577322f322f772f31323430)

### 正则表达式

In [79]:
import re

#### 替换|

In [80]:
text = 'foo  bar \tbaz \tuqx'
re.split('\s+', text)

['foo', 'bar', 'baz', 'uqx']

In [81]:
regex = re.compile('\s+')
regex.split(text)

['foo', 'bar', 'baz', 'uqx']

#### 匹配全部

In [82]:
regex.findall(text)

['  ', ' \t', ' \t']

#### 查找与匹配

In [83]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""

In [84]:
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, flags=re.I)

In [85]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [88]:
regex.search(text).group()

'dave@google.com'

In [92]:
text[regex.search(text).start():regex.search(text).end()]

'dave@google.com'

In [93]:
print(regex.match(text))

None


#### 替换

In [95]:
print(regex.sub('redacted', text))

Dave redacted
Steve redacted
Rob redacted
Ryan redacted



#### 分组


In [97]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, re.I)

In [100]:
m = regex.match('west@bright.net')
m.groups()

('west', 'bright', 'net')

In [101]:
regex.findall(text)


[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

通过数字进行分组

In [104]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



### Pandas 矢量化字符串函数

In [106]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
         'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [107]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

`map（）`可以对字符串进行操作，但是遇到nan值就会报错

In [119]:
data[:-1].map(lambda x:x.split('.'))
# data.map(lambda x: x.lower()) AttributeError

Dave     [dave@google, com]
Steve    [steve@gmail, com]
Rob        [rob@gmail, com]
dtype: object

通过`df/s.str`可以跳过nan值

In [124]:
# 可以传正则表达式
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [126]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [127]:
data.str.findall(pattern, flags=re.I)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

矢量化元素获取

In [129]:
matches = data.str.match(pattern, flags=re.I)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [137]:
matches.str.get(1)

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [138]:
matches.str[0]

Dave    NaN
Steve   NaN
Rob     NaN
Wes     NaN
dtype: float64

In [139]:
data.str[:-1]

Dave     dave@google.co
Steve    steve@gmail.co
Rob        rob@gmail.co
Wes                 NaN
dtype: object

![](https://camo.githubusercontent.com/695c497df2e177fdd535567620375f6ee5f00881/687474703a2f2f75706c6f61642d696d616765732e6a69616e7368752e696f2f75706c6f61645f696d616765732f373137383639312d613633343336346564366435643563352e706e673f696d6167654d6f6772322f6175746f2d6f7269656e742f7374726970253743696d61676556696577322f322f772f31323430)

In [144]:
data.str.cat(sep='')

'dave@google.comsteve@gmail.comrob@gmail.com'

In [155]:
data.str.pad(20)

Dave          dave@google.com
Steve         steve@gmail.com
Rob             rob@gmail.com
Wes                       NaN
dtype: object