In [1]:
import pandas as pd

# apply 自定义函数

## Series 的 apply 方法

##### 语法：series.apply(func)
**作用：将 series 中的每个元素依次作为参数传递给 func 函数，func 函数的各个返回值最终组成一个新的 Series**

In [2]:
# Small demo frame: two integer columns ('a', 'b') with three rows each.
df = pd.DataFrame(
    data={
        'a': [10, 20, 30],
        'b': [20, 30, 40],
    }
)
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [4]:
def my_sq(x):
    """Return ``x`` raised to the second power."""
    squared = pow(x, 2)
    return squared

In [5]:
# Note: pass the function object itself (its name, without parentheses) to Series.apply.
df['a'].apply(my_sq)

0    100
1    400
2    900
Name: a, dtype: int64

In [6]:
def my_add(x, n):
    """Return the sum of ``x`` and ``n``."""
    total = x + n
    return total

# Note: `args` must be a tuple; the 3 here is forwarded to my_add as `n`,
# while each element of the Series is passed as `x`.
df['a'].apply(my_add, args=(3,))

0    13
1    23
2    33
Name: a, dtype: int64

In [7]:
# Keyword arguments given to apply are forwarded to the function,
# so `n` can also be supplied by name.
df['a'].apply(my_add, n=3)

0    13
1    23
2    33
Name: a, dtype: int64

## DataFrame 的 apply 方法

##### 语法：dataframe.apply(func)
**作用：将 dataframe 中的每列数据（一个 Series）依次作为参数传递给 func 函数；若 func 返回 Series，则各返回值最终组成一个新的 DataFrame；若 func 返回标量，则最终组成一个新的 Series**

In [8]:
def sub_one(x):
    """Print the received argument, then return it decreased by one.

    The print is intentional: it shows what ``apply`` / ``applymap``
    actually hands to the function on each call.
    """
    print(x)
    decremented = x - 1
    return decremented

In [9]:
# Column-wise (default axis): sub_one receives each column as a Series,
# as the printed output below shows.
df.apply(sub_one)

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64


Unnamed: 0,a,b
0,9,19
1,19,29
2,29,39


##### 语法：dataframe.apply(func, axis=1)
**作用：将 dataframe 中的每行数据（一个 Series）依次作为参数传递给 func 函数；若 func 返回 Series，则各返回值最终组成一个新的 DataFrame；若 func 返回标量，则最终组成一个新的 Series**

In [10]:
# Row-wise: with axis=1, sub_one receives each row as a Series
# indexed by the column names.
df.apply(sub_one, axis=1)

a    10
b    20
Name: 0, dtype: int64
a    20
b    30
Name: 1, dtype: int64
a    30
b    40
Name: 2, dtype: int64


Unnamed: 0,a,b
0,9,19
1,19,29
2,29,39


##### 语法：dataframe.applymap(func)
**作用：将 dataframe 中的每个元素依次作为参数传递给 func 函数，func 函数的各个返回值最终组成一个新的 DataFrame**

In [11]:
# Element-wise: sub_one is called once per cell value of the frame.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map — switch once the environment's pandas version provides it.
df.applymap(sub_one)

10
20
30
20
30
40


Unnamed: 0,a,b
0,9,19
1,19,29
2,29,39


# apply 使用案例

In [3]:
# Load the Titanic dataset from a path relative to the current working directory,
# then print each column's dtype and non-null count.
titanic = pd.read_csv('./data/titanic.csv')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          714 non-null    float64
 4   sibsp        891 non-null    int64  
 5   parch        891 non-null    int64  
 6   fare         891 non-null    float64
 7   embarked     889 non-null    object 
 8   class        891 non-null    object 
 9   who          891 non-null    object 
 10  adult_male   891 non-null    bool   
 11  deck         203 non-null    object 
 12  embark_town  889 non-null    object 
 13  alive        891 non-null    object 
 14  alone        891 non-null    bool   
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB


In [14]:
# Helper functions for summarising missing data
def count_missing(vec):
    """Return the number of missing entries in ``vec``.

    ``vec`` must provide ``isnull()`` (e.g. a pandas Series).
    """
    missing_mask = vec.isnull()
    return missing_mask.sum()

def prop_missing(vec):
    """Return the share of entries in ``vec`` that are missing."""
    n_missing = count_missing(vec)
    n_total = vec.size
    return n_missing / n_total

def prop_complete(vec):
    """Return the share of entries in ``vec`` that are not missing."""
    missing_share = prop_missing(vec)
    return 1 - missing_share

##### ① 统计每列缺失值的个数


In [15]:
# Column-wise: number of missing values in each column.
titanic.apply(count_missing)

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

##### ② 统计每列缺失值的占比

In [16]:
# Column-wise: fraction of missing values in each column.
titanic.apply(prop_missing)

survived       0.000000
pclass         0.000000
sex            0.000000
age            0.198653
sibsp          0.000000
parch          0.000000
fare           0.000000
embarked       0.002245
class          0.000000
who            0.000000
adult_male     0.000000
deck           0.772166
embark_town    0.002245
alive          0.000000
alone          0.000000
dtype: float64

##### ③ 统计每列非缺失值的占比

In [17]:
# Column-wise: fraction of non-missing values in each column.
titanic.apply(prop_complete)

survived       1.000000
pclass         1.000000
sex            1.000000
age            0.801347
sibsp          1.000000
parch          1.000000
fare           1.000000
embarked       0.997755
class          1.000000
who            1.000000
adult_male     1.000000
deck           0.227834
embark_town    0.997755
alive          1.000000
alone          1.000000
dtype: float64

##### ① 统计每行缺失值的个数

In [18]:
# Row-wise (axis=1): number of missing values in each row.
titanic.apply(count_missing, axis=1)

0      1
1      0
2      1
3      0
4      1
      ..
886    1
887    0
888    2
889    0
890    1
Length: 891, dtype: int64

##### ② 统计每行缺失值的占比

In [19]:
# Row-wise (axis=1): fraction of missing values in each row.
titanic.apply(prop_missing, axis=1)

0      0.066667
1      0.000000
2      0.066667
3      0.000000
4      0.066667
         ...   
886    0.066667
887    0.000000
888    0.133333
889    0.000000
890    0.066667
Length: 891, dtype: float64

##### ③ 统计每行非缺失值的占比

In [20]:
# Row-wise (axis=1): fraction of non-missing values in each row.
titanic.apply(prop_complete, axis=1)

0      0.933333
1      1.000000
2      0.933333
3      1.000000
4      0.933333
         ...   
886    0.933333
887    1.000000
888    0.866667
889    1.000000
890    0.933333
Length: 891, dtype: float64

In [21]:
# Tally how many rows have 0, 1, 2, ... missing values.
titanic.apply(count_missing, axis=1).value_counts()

1    549
0    182
2    160
dtype: int64