# pandas加速优化

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)  # 设置查看列不省略
# pd.set_option('display.max_rows', None)  # 设置查看行不省略
import seaborn as sns

In [2]:
# 使用内置数据集
# sns.get_dataset_names()  # ['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'exercise', 'flights', 'fmri', 'gammas', 'geyser', 'iris', 'mpg', 'penguins', 'planets', 'taxis', 'tips', 'titanic']
# sns.get_data_home()  # /home/user01/seaborn-data
# /home/user01/seaborn-data/titanic.csv

In [3]:
# df = sns.load_dataset('iris')  # alternative sample dataset, left disabled
# Load the 'titanic' sample dataset through seaborn; every later cell reads this `df`.
df = sns.load_dataset('titanic')
# Bare last expression so the notebook renders the DataFrame.
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


## for

In [4]:
%%timeit
# Baseline: index-based Python loop that builds "embarked(embark_town)" for every row.
item_list = []
len_df = len(df)
for i in range(len_df):
    # df.iloc[i] is evaluated twice per iteration, once for each column that is read.
    item = f"{df.iloc[i]['embarked']}({df.iloc[i]['embark_town']})"
    item_list.append(item)
# item_list  # uncomment to inspect the result

185 ms ± 1.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
%%timeit
# Same string building as the index-based loop, but iterating with DataFrame.itertuples().
item_list = []
for row in df.itertuples():
    # Columns are read as attributes of the row object yielded by itertuples().
    item = f'{row.embarked}({row.embark_town})'
    item_list.append(item)
# item_list  # uncomment to inspect the result

2.11 ms ± 15.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## 向量计算（字符串相加）

In [6]:
%%timeit
# Vectorized version: concatenate the two columns and the literal parentheses as whole Series.
# NOTE(review): rows with a missing value presumably come out as NaN here, whereas the
# f-string based cells would format them as text -- confirm before comparing results.
df['embarked'] + '(' + df['embark_town'] + ')'

517 µs ± 2.94 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## apply

In [7]:
%%timeit
def apply_func(row):
    """Return the "embarked(embark_town)" label for a single row.

    Args:
        row: One row of the DataFrame, as handed over by
            ``DataFrame.apply(axis=1)``; it is indexed by column name.

    Returns:
        str: The formatted label, e.g. ``"S(Southampton)"``.
    """
    return f"{row['embarked']}({row['embark_town']})"

# apply() collects the per-row return values into a Series; converting it to a
# plain list keeps `item_list` comparable with the lists built by the loop cells.
item_list = df.apply(func=apply_func, axis=1).tolist()
# item_list  # uncomment to inspect the result

9.68 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


## isin

先用 `isin()` 方法生成布尔掩码进行筛选，标注各行所属的类别；再对筛选出的每个类别分别进行向量计算（或调用对应的方法），效率比逐行迭代高。例如：`df.loc[df['embarked'].isin(['S', 'C']), 'fare'] * 2` 会一次性处理所有匹配的行。

## numba

numba 可以将数值计算函数即时编译（JIT）为机器码，从而加速数值计算。

安装：
```
$ pip install numba
```

In [8]:
import numba

@numba.vectorize
def fun_numba(x):
    """Compute x * x + x for one value; decorated with numba.vectorize for array input."""
    squared = x * x
    return squared + x

def fun(x):
    """Plain-Python counterpart of fun_numba: return x squared plus x."""
    squared = x * x
    return squared + x

In [9]:
%%timeit
# apply: call the plain-Python fun() through Series.apply on the 'fare' column
df['fare'].apply(fun)

300 µs ± 4.24 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [10]:
%%timeit
# Vectorized computation: the same x * x + x written as whole-column arithmetic
df['fare'] * df['fare'] + df['fare']

126 µs ± 542 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [11]:
%%timeit
# numba: call the @numba.vectorize-decorated fun_numba() on the column's values
fun_numba(df['fare'].to_numpy())  # must be passed in as a NumPy array

6.23 µs ± 238 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## 参考
[提速千倍！Pandas性能优化方法，让你的pandas飞起来！](https://www.jianshu.com/p/6de7b6fd3790)