In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

data = sns.load_dataset('titanic')
data = data.loc[:, ['who', 'age', 'survived']]
data.head()

Unnamed: 0,who,age,survived
0,man,22.0,0
1,woman,38.0,1
2,woman,26.0,1
3,woman,35.0,1
4,man,35.0,0


## Lambda 簡介

In [2]:
def my_func(x):
    return x * 100

print(my_func(77))

my_func = lambda x: x * 100
print(my_func(77))

7700
7700


In [3]:
arr = list(range(1, 6))
print(arr)
list(map(lambda x: x * 100, arr))

[1, 2, 3, 4, 5]


[100, 200, 300, 400, 500]

## .assign()

In [4]:
# 請算出每個人是否高於所有人的平均年齡，而且需要完整計算流程

## 先算出平均，再算出大於平均
print(data.head())
tmp = data.copy()
tmp['avg_age'] = tmp['age'].mean()
tmp['diff_from_avg_age'] = tmp['age'] - tmp['avg_age']
tmp['if_age_greater_than_avg'] = tmp['diff_from_avg_age'] > 0
tmp.head()

     who   age  survived
0    man  22.0         0
1  woman  38.0         1
2  woman  26.0         1
3  woman  35.0         1
4    man  35.0         0


Unnamed: 0,who,age,survived,avg_age,diff_from_avg_age,if_age_greater_than_avg
0,man,22.0,0,29.699118,-7.699118,False
1,woman,38.0,1,29.699118,8.300882,True
2,woman,26.0,1,29.699118,-3.699118,False
3,woman,35.0,1,29.699118,5.300882,True
4,man,35.0,0,29.699118,5.300882,True


In [5]:
# 用 lambda 來實現漂亮的 assign() 寫法
print(
(data
    .assign(
        avg_age = lambda df: df['age'].mean(),
        diff_from_avg_age = lambda df: df['age'] - df['avg_age'],
        if_age_greater_than_avg = lambda df: df['diff_from_avg_age'] > 0
    )
).head()
)

     who   age  survived    avg_age  diff_from_avg_age  \
0    man  22.0         0  29.699118          -7.699118   
1  woman  38.0         1  29.699118           8.300882   
2  woman  26.0         1  29.699118          -3.699118   
3  woman  35.0         1  29.699118           5.300882   
4    man  35.0         0  29.699118           5.300882   

   if_age_greater_than_avg  
0                    False  
1                     True  
2                    False  
3                     True  
4                     True  


In [6]:
# 大錯特錯的寫法！！！！
(data
    .assign(
        avg_age = data['age'].mean(),
        diff_from_avg_age = data['age'] - data['avg_age'],
        if_age_greater_than_avg = data['diff_from_avg_age'] > 0
    )
)

KeyError: ignored

## .loc[]

In [7]:
# 篩選出年齡高於平均的那些人，並且只留下年齡（'age'）相關的計算過程欄位

## 照抄 .assign() 小節示範的寫法
tmp = (data
    .assign(
        avg_age = lambda df: df['age'].mean(),
        diff_from_avg_age = lambda df: df['age'] - df['avg_age'],
        if_age_greater_than_avg = lambda df: df['diff_from_avg_age'] > 0
    )
)

## 1. 篩選年齡高於平均值的 row
row_condition = tmp['if_age_greater_than_avg']

## 2. 篩選名稱包含 'age' 的 column
col_condition = [c for c in tmp.columns if c.find('age') > 0]

## 3. 把篩選條件都扔進 .loc[]
print(tmp.loc[row_condition, col_condition].head())

      avg_age  diff_from_avg_age  if_age_greater_than_avg
1   29.699118           8.300882                     True
3   29.699118           5.300882                     True
4   29.699118           5.300882                     True
6   29.699118          24.300882                     True
11  29.699118          28.300882                     True


In [8]:
# 在 .loc[] 應用 lambda

## 1. 篩選年齡高於平均值的 row

tmp = (data
    .assign(
        avg_age = lambda df: df['age'].mean(),
        diff_from_avg_age = lambda df: df['age'] - df['avg_age'],
        if_age_greater_than_avg = lambda df: df['diff_from_avg_age'] > 0
    )
    .loc[lambda df: df['if_age_greater_than_avg'], :] # 增加這行程式就好囉
)
print(tmp.head())

## 2. 篩選名稱包含 'age' 的 column

tmp = tmp.loc[:, lambda df: [c for c in df.columns if c.find('age') > 0]]
print(tmp.head())

## 以上兩步驟可以一氣呵成
print(
(data
    .assign(
        avg_age = lambda df: df['age'].mean(),
        diff_from_avg_age = lambda df: df['age'] - df['avg_age'],
        if_age_greater_than_avg = lambda df: df['diff_from_avg_age'] > 0
    )
     # 一行 .loc[] 同時完成 row 與 column 篩選！
    .loc[lambda df: df['if_age_greater_than_avg'], 
         lambda df: [c for c in df.columns if c.find('age') > 0]]
).head()
)

      who   age  survived    avg_age  diff_from_avg_age  \
1   woman  38.0         1  29.699118           8.300882   
3   woman  35.0         1  29.699118           5.300882   
4     man  35.0         0  29.699118           5.300882   
6     man  54.0         0  29.699118          24.300882   
11  woman  58.0         1  29.699118          28.300882   

    if_age_greater_than_avg  
1                      True  
3                      True  
4                      True  
6                      True  
11                     True  
      avg_age  diff_from_avg_age  if_age_greater_than_avg
1   29.699118           8.300882                     True
3   29.699118           5.300882                     True
4   29.699118           5.300882                     True
6   29.699118          24.300882                     True
11  29.699118          28.300882                     True
      avg_age  diff_from_avg_age  if_age_greater_than_avg
1   29.699118           8.300882                     True
3

## .rename()

In [9]:
new_data = (data
    .assign(
        if_cat_person=np.random.randint(2, size=data.shape[0])>0,
        if_dog_person=np.random.randint(2, size=data.shape[0])>0,
        if_allergy=np.random.randint(2, size=data.shape[0])>0,
        if_swimmer=np.random.randint(2, size=data.shape[0])>0,
        if_english_speaker=np.random.randint(2, size=data.shape[0])>0
    )
)
new_data.head()

Unnamed: 0,who,age,survived,if_cat_person,if_dog_person,if_allergy,if_swimmer,if_english_speaker
0,man,22.0,0,True,False,False,False,True
1,woman,38.0,1,True,True,True,False,True
2,woman,26.0,1,False,False,False,True,True
3,woman,35.0,1,False,True,False,False,False
4,man,35.0,0,True,True,True,False,False


In [10]:
print(
(new_data
    .rename(columns={
        'if_cat_person': 'is_cat_person',
        'if_dog_person': 'is_dog_person',
        'if_allergy': 'is_allergy',
        'if_swimner': 'is_swimmer',
        'if_english_speaker': 'is_english_speaker'
    }) 
).head()
)

     who   age  survived  is_cat_person  is_dog_person  is_allergy  \
0    man  22.0         0           True          False       False   
1  woman  38.0         1           True           True        True   
2  woman  26.0         1          False          False       False   
3  woman  35.0         1          False           True       False   
4    man  35.0         0           True           True        True   

   if_swimmer  is_english_speaker  
0       False                True  
1       False                True  
2        True                True  
3       False               False  
4       False               False  


In [11]:
print(
(new_data
    .rename(columns=lambda col: col.replace('if_', 'is_') 
                                if col.startswith('if_') else col) 
).head()
)

     who   age  survived  is_cat_person  is_dog_person  is_allergy  \
0    man  22.0         0           True          False       False   
1  woman  38.0         1           True           True        True   
2  woman  26.0         1          False          False       False   
3  woman  35.0         1          False           True       False   
4    man  35.0         0           True           True        True   

   is_swimmer  is_english_speaker  
0       False                True  
1       False                True  
2        True                True  
3       False               False  
4       False               False  


## .apply()

In [12]:
(data
    .groupby('who')
    .apply(lambda df: print(df.head(), '\n')) 
)

      who   age  survived
7   child   2.0         0
9   child  14.0         1
10  child   4.0         1
14  child  14.0         0
16  child   2.0         0 

    who   age  survived
0   man  22.0         0
4   man  35.0         0
5   man   NaN         0
6   man  54.0         0
12  man  20.0         0 

      who   age  survived
1   woman  38.0         1
2   woman  26.0         1
3   woman  35.0         1
8   woman  27.0         1
11  woman  58.0         1 



In [13]:
(data
    .groupby('who')
    ['survived']
    .mean()
)

who
child    0.590361
man      0.163873
woman    0.756458
Name: survived, dtype: float64

In [14]:
(data
    .groupby('who')
    .apply(lambda df: df.loc[df['age'] <= df['age'].mean(), 'survived'].mean())    
)

who
child    0.702128
man      0.165992
woman    0.738095
dtype: float64

## sort_values()

In [15]:
new_data = pd.DataFrame({
    'col_1': [{'c': 2}, {'a': 4}, {'d': 1}, {'b': 3}],
    'col_2': [7.25, 53.1, 13.0, 8.05]
})
print(new_data.head())

      col_1  col_2
0  {'c': 2}   7.25
1  {'a': 4}  53.10
2  {'d': 1}  13.00
3  {'b': 3}   8.05


In [16]:
# dict 不能排序，排序失敗！
print(new_data.sort_values('col_1'))

TypeError: ignored

In [17]:
# 用 dict 的 key 排序
print(
    new_data.sort_values('col_1', 
                         key=lambda s: s.map(lambda d: list(d.keys())[0]))
)

# 用 dict 的 value 排序
print(
    new_data.sort_values('col_1', 
                         key=lambda s: s.map(lambda d: list(d.values())[0]))
)

      col_1  col_2
1  {'a': 4}  53.10
3  {'b': 3}   8.05
0  {'c': 2}   7.25
2  {'d': 1}  13.00
      col_1  col_2
2  {'d': 1}  13.00
0  {'c': 2}   7.25
3  {'b': 3}   8.05
1  {'a': 4}  53.10
