# 字符串操作

In [1]:
# Plain-Python string handling: split on commas, trim the whitespace
# around each piece, then re-join the pieces with a colon.
val = 'a,  b,   chen'
val_list = [piece.strip() for piece in val.split(',')]
print(":".join(val_list))

a:b:chen


In [2]:
import pandas as pd
import numpy as np
# Name -> e-mail address; "Wes" has no address and is stored as NaN.
# The Series is built directly so that `data` only ever refers to one kind
# of object (it used to be bound to a dict first and then rebound).
data = pd.Series(
    {
        "Dave": "dave@gmail.com",
        "chen": 'chenx@github.io',
        "Robin": "rob@google.com",
        "Wes": np.nan,
    }
)
print(data)

Dave      dave@gmail.com
chen     chenx@github.io
Robin     rob@google.com
Wes                  NaN
dtype: object


In [3]:
# Boolean mask: True where the entry of `data` is missing (NaN).
print(pd.isna(data))

Dave     False
chen     False
Robin    False
Wes       True
dtype: bool


In [4]:
# The .str accessor applies string methods element-wise to the Series and
# follows pandas' propagation rule: a missing entry yields NaN in the result
# rather than raising. (Pass na=False to get a pure boolean mask instead.)
data.str.contains(pat="gmail")

Dave      True
chen     False
Robin    False
Wes        NaN
dtype: object

# 分类数据

In [6]:
# Eight entries drawn from only four distinct labels.
values = pd.Series(
    2 * ['apple', 'banana', 'creme', 'dot']
)
# The distinct labels, in order of first appearance.
values.unique()

array(['apple', 'banana', 'creme', 'dot'], dtype=object)

In [8]:
# Encoded representation: keep one small lookup table of labels plus a
# column of integer codes, which uses less memory than repeating the labels.
category = pd.Series(['apple', 'banana'])
fruits = pd.Series(2 * [0, 1, 0, 0])
# Decode for display: take() picks the label stored at each code's position.
category.take(fruits)

0     apple
1    banana
0     apple
0     apple
0     apple
1    banana
0     apple
0     apple
dtype: object

In [19]:
# pandas has built-in support for categorical data: astype('category')
# converts a Series to the category dtype.
# (A throwaway `categories = pd.Series([...])` assignment used to sit here;
# it was overwritten before ever being read, so it has been removed.)
labels = pd.Series(['foo', 'foo', 'bar', 'zoo'] * 200_000)  # 800,000 rows
# Internally pandas maps every value to an integer code, and the resulting
# Series is backed by a Categorical object.
categories = labels.astype('category')
print(categories)
print(labels)
# NOTE: memory_usage() is shallow by default; for the object-dtype `labels`
# it does not count the string payloads themselves. Pass deep=True to
# compare the full footprints.
print(categories.memory_usage())
print(labels.memory_usage())
print(type(categories.array))  # <class 'pandas.core.arrays.categorical.Categorical'>

0         foo
1         foo
2         bar
3         zoo
4         foo
         ... 
799995    zoo
799996    foo
799997    foo
799998    bar
799999    zoo
Length: 800000, dtype: category
Categories (3, object): ['bar', 'foo', 'zoo']
0         foo
1         foo
2         bar
3         zoo
4         foo
         ... 
799995    zoo
799996    foo
799997    foo
799998    bar
799999    zoo
Length: 800000, dtype: object
800260
6400128
<class 'pandas.core.arrays.categorical.Categorical'>


In [21]:
# <class 'pandas.core.arrays.categorical.Categorical'>对象，有两个属性
# The Categorical backing the Series exposes two attributes:
# `categories` (the distinct labels) and `codes` (one integer per element).
c = categories.array
print(c.categories)
print(c.codes)
# Code -> label lookup table.
{code: label for code, label in enumerate(c.categories)}

Index(['bar', 'foo', 'zoo'], dtype='object')
[1 1 0 ... 1 0 2]


{0: 'bar', 1: 'foo', 2: 'zoo'}

In [23]:
# Categorical.from_codes builds a Categorical straight from a sequence of
# integer codes plus the list of labels those codes index into.
cate = ['foor', 'barr', 'zooz']
codes = [0] * 3 + [1] * 3 + [2] * 3
res = pd.Categorical.from_codes(codes=codes, categories=cate)
res

['foor', 'foor', 'foor', 'barr', 'barr', 'barr', 'zooz', 'zooz', 'zooz']
Categories (3, object): ['foor', 'barr', 'zooz']

In [27]:
# Converting to Categorical has an up-front cost.
# 20_0000 == 200000, so the 10-element list below expands to 2,000,000 rows.
# NOTE(review): 'beer' is presumably a typo for 'bear' — left unchanged here.
animal_old = pd.Series(['beer', 'beer', 'elephant', 'bird', 'cat', 'cat', 'dog', 'dog', 'whale', 'lion'] * 20_0000) 
# Uncomment to time the one-off conversion itself:
# %timeit animals = animal_old.astype('category')
animals = animal_old.astype('category')
# That cost is paid only once and can speed up later processing; compare
# value_counts() on the object-dtype Series with the categorical one.
%timeit animal_old.value_counts()
%timeit animals.value_counts()

69.3 ms ± 2.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
9.09 ms ± 92.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
