In [None]:
# Print the notebook's header metadata (file name, authors, creation time,
# contact address and a short description).
print("""
@File         : 08_groupby_operations_split-apply-combine.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-08-10 21:06:56
@Email        : cuixuanstephen@gmail.com
@Description  : 分组操作：分割‑应用‑合并
""")

In [None]:
# IPython magic: move the working directory up one level.
# NOTE(review): presumably so the relative 'data/...' paths used below
# resolve - confirm against the repository layout.
%cd ../

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the tab-separated Gapminder file into a DataFrame.
df = pd.read_csv('data/gapminder.tsv', sep='\t')

In [None]:
# Split the rows by 'year', then average 'lifeExp' within each year.
avg_lief_exp_by_year = (
    df.groupby('year')
    ['lifeExp']
    .mean()
)
avg_lief_exp_by_year

In [None]:
# Distinct values of the 'year' column, in order of first appearance.
year = df['year'].unique()
year

我们可以查看每一年的资料并对数据进行子集化。

In [None]:
# Keep every column, but only the rows recorded for 1952.
y1952 = df.loc[df['year'] == 1952, :]
y1952

In [None]:
# Mean of 'lifeExp' over the 1952 subset only.
y1952_mean = y1952['lifeExp'].mean()
y1952_mean

`.groupby()` 方法本质上是对 `year` 列中的每个唯一值重复此过程（即分割数据）、计算平均值（即应用函数），并方便地将所有结果放在一个数据框中返回（即将所有值组合在一起）。

The table below provides a non-exhaustive list of built-in Pandas methods you can use to aggregate your data.

|Pandas Method|Numpy/Scipy Function|Description|
|---|---|---|
|`.count()`|`np.count_nonzero()`|Frequency count not including NaN values|
|`.size()`||Frequency count with NaN values|
|`.mean()`|`np.mean()`|Mean of the values|
|`.std()`|`np.std(ddof=1)`|Sample standard deviation|
|`.min()`|`np.min()`|Minimum values|
|`.quantile(q=0.25)`|`np.percentile(q=25)`|25th percentile of the values|
|`.quantile(q=0.50)`|`np.percentile(q=50)`|50th percentile of the values|
|`.quantile(q=0.75)`|`np.percentile(q=75)`|75th percentile of the values|
|`.max()`|`np.max()`|Maximum value|
|`.sum()`|`np.sum()`|Sum of the values|
|`.var()`|`np.var(ddof=1)`|Unbiased variance|
|`.sem()`|`scipy.stats.sem()`|Unbiased standard error of the mean|
|`.describe()`|`scipy.stats.describe()`|Count, mean, standard deviation, minimum, 25%, 50%, 75%, and maximum|
|`.first()`||Returns the first row|
|`.last()`||Returns the last row|
|`.nth()`||Returns the nth row (Python starts counting from 0)|

In [None]:
# Summary statistics of 'lifeExp' computed separately for every continent.
continent_describe = (
    df.groupby(['continent'])
    ['lifeExp']
    .describe()
)
continent_describe

除了直接调用聚合方法，您还可以调用 `.agg()` 或 `.aggregate()` 方法，并在其中传递所需的聚合函数。

> `.agg()` 方法是 `.aggregate()` 的别名。Pandas 文档建议你使用别名 `.agg()`，而不是完整拼写的方法。

In [None]:
# Mean life expectancy per continent via .agg().
# The string alias "mean" is used instead of np.mean: pandas 2.1+ emits a
# FutureWarning when a NumPy callable is passed here, because it currently
# swaps in its own grouped mean and will stop doing so in a future release.
cont_le_agg = df.groupby('continent')['lifeExp'].agg('mean')
cont_le_agg

#### Custom User Functions

In [None]:
def my_mean(values):
    """Return the arithmetic mean of *values*.

    The total is accumulated one item at a time and then divided by
    ``len(values)``, so *values* must be a sized iterable.
    """
    count = len(values)

    running_total = 0
    for item in values:
        running_total = running_total + item

    return running_total / count

In [None]:
# Apply the hand-written mean to each year's 'lifeExp' values.
agg_my_mean = (
    df.groupby('year')
    ['lifeExp']
    .agg(my_mean)
)
agg_my_mean

我们可以编写接受多个参数的函数。只要第一个参数接收来自数据框的一组值（Series），其余参数就可以作为关键字参数传给 `.agg()` 或 `.aggregate()`。

In [None]:
def my_mean_diff(values, diff_value):
    """Return the mean of *values* minus *diff_value*.

    The mean is computed with ``my_mean``.
    """
    return my_mean(values) - diff_value

In [None]:
# Mean of 'lifeExp' over the whole DataFrame (no grouping).
global_mean = df['lifeExp'].mean()
global_mean

In [None]:
# Per-year mean of 'lifeExp' minus the overall mean; the extra keyword
# argument is forwarded by .agg() to my_mean_diff.
agg_mean_diff = df.groupby('year')['lifeExp'].agg(
    my_mean_diff,
    diff_value=global_mean,
)
agg_mean_diff

### 多种函数同时实现

In [None]:
# Several aggregations at once: one output column per entry in the list.
# "mean" and "std" are given as string aliases rather than np.mean / np.std:
# pandas 2.1+ emits a FutureWarning for those NumPy callables because it
# currently replaces them with its own grouped implementations. The aliases
# keep the same column names ('mean', 'std') and the same results.
# np.count_nonzero is left as a callable.
gdf = (
    df.groupby('year')['lifeExp']
    .agg([np.count_nonzero, 'mean', 'std'])
)
gdf

In [None]:
# A dict passed to .agg() on a grouped DataFrame maps each column name to
# the aggregation applied to that column.
gdf_dict = (
    df.groupby("year")
    .agg({"lifeExp": "mean", "pop": "median", "gdpPercap": "median"})
)
gdf_dict

In the past, passing a dict into a Series after a `.groupby()` allowed you to directly calculate aggregate statistics as the returned value, with the key of the dict being the new column name. However, this notation is not consistent with the behavior when dicts are passed into grouped DataFrames, so it was deprecated and now raises an error; rename the resulting columns afterwards (or use named aggregation) instead.

## 变换

When we transform data, we pass values from our dataframe into a function. The function then “transforms” the data. Unlike `.agg()`, which can take multiple values and return a single (aggregated) value, `.transform()` takes multiple values and returns a one-to-one transformation of the values. That is, it does not reduce the amount of data.

In [None]:
def my_zscore(x):
    """Return *x* centred on its own mean and divided by its own ``.std()``."""
    centered = x - x.mean()
    return centered / x.std()

In [None]:
# Z-score every 'lifeExp' value relative to the other values of its year.
transform_z = (
    df.groupby('year')
    ['lifeExp']
    .transform(my_zscore)
)
transform_z

In [None]:
# (rows, columns) of the original data, for comparison with the transform.
df.shape

In [None]:
# Shape of the transformed values.
transform_z.shape

In [None]:
from scipy.stats import zscore

In [None]:
# Same grouped transformation, this time using SciPy's zscore function.
sp_z_score = (
    df.groupby('year')
    ['lifeExp']
    .transform(zscore)
)
sp_z_score

In [None]:
# For comparison: z-scores computed over the whole column, without grouping.
zscore(df['lifeExp'])

在某些数据集中，用整列的平均值填充缺失值可能是合理的；而在另一些情况下，按组别分别用各组的均值来填充会更合适。

In [None]:
import seaborn as sns
# Seed NumPy's global random generator so the NumPy-based random steps
# below give the same result on every run.
np.random.seed(42)

In [None]:
# Take 10 randomly chosen rows from seaborn's 'tips' example dataset.
tips_10 = sns.load_dataset('tips').sample(10)
tips_10

In [None]:
# Shuffle the row labels, take the first four, and blank out 'total_bill'
# on those rows to create missing values to fill in later.
tips_10.loc[np.random.permutation(tips_10.index)[:4], 'total_bill'] = np.nan
tips_10

In [None]:
# Number of non-missing values per column within each 'sex' group.
# observed=False states the current default explicitly: pandas 2.1+ warns
# when a categorical key is grouped without it, and the default is due to
# change. (Assumes 'sex' is a categorical column - verify with tips_10.dtypes.)
tips_10.groupby('sex', observed=False).count()

In [None]:
def fill_na_mean(x: pd.Series) -> pd.Series:
    """Return a copy of *x* whose missing entries are replaced by the mean of *x*."""
    return x.fillna(x.mean())

In [None]:
# Add 'fill_total_bill': total_bill with each missing value replaced by the
# mean bill of that row's 'sex' group.
# observed=False states the current default explicitly: pandas 2.1+ warns
# when a categorical key is grouped without it. (Assumes 'sex' is
# categorical - verify with tips_10.dtypes.)
tips_10 = tips_10.assign(
    fill_total_bill=tips_10.groupby('sex', observed=False)
    .total_bill
    .transform(fill_na_mean)
)

In [None]:
# Show the grouping key next to the original and the filled bill columns.
tips_10[['sex', 'total_bill', 'fill_total_bill']]

## 过滤

`.filter()` allows you to split your data by keys, and then perform some kind of boolean subsetting
on the data.

In [None]:
# Load the full 'tips' dataset and check its (rows, columns) size.
tips = sns.load_dataset('tips')
tips.shape

In [None]:
# How many rows there are for each value of 'size'.
tips['size'].value_counts()

In [None]:
# Keep only the rows whose 'size' value has at least 30 non-missing
# entries in its group.
tips_filtered = tips.groupby('size').filter(
    lambda group: group['size'].count() >= 30
)
tips_filtered.shape

In [None]:
# Row counts per 'size' value after filtering.
tips_filtered['size'].value_counts()

## The `pandas.core.groupby.DataFrameGroupBy` object

In [None]:
# Draw a fresh 10-row sample; random_state=42 fixes which rows are drawn.
tips_10 = sns.load_dataset('tips').sample(10, random_state=42)
tips_10

In [None]:
# Build the grouped object without computing anything yet, then print it.
# observed=False states the current default explicitly: pandas 2.1+ warns
# when a categorical key is grouped without it. (Assumes 'sex' is
# categorical - verify with tips_10.dtypes.)
grouped = tips_10.groupby('sex', observed=False)
print(grouped)

In [None]:
# .groups maps each group key to the index labels of the rows in that group.
grouped.groups

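If we specify the calculation we want right after the `.groupby()`, older versions of pandas (before 2.0) perform the calculation on all the columns they can and silently drop the rest. Since pandas 2.0 the non-numeric columns are no longer dropped automatically, so we pass `numeric_only=True` explicitly.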

In [None]:
# Per-group mean, restricted to the numeric columns.
grouped.mean(numeric_only=True)

In [None]:
# All column names of the sample, for comparison with the aggregated output.
tips_10.columns

In [None]:
# Pull out the rows that belong to the 'Female' group.
female = grouped.get_group('Female')
female

In [None]:
# Iterating over the grouped object yields pairs that are unpacked here
# into the group key and that group's data.
for key, group_data in grouped:
    print(key, group_data)

In [None]:
# Look closely at the first item produced by iterating over the grouped
# object: its type, its length and each of its two elements.
for sex_group in grouped:
    print(f'the type is: {type(sex_group)}\n')
    print(f'the length is: {len(sex_group)}\n')

    first_element, second_element = sex_group[0], sex_group[1]

    # first element of the item
    print(f'the first element is: {first_element}\n')
    print(f'it has a type of: {type(first_element)}\n')

    # second element of the item
    print(f'the second element is:\n{second_element}\n')
    print(f'it has a type of: {type(second_element)}\n')

    # the item exactly as the iteration returned it
    print('what we have:')
    print(sex_group)

    # one iteration is enough for this demonstration
    break

In [None]:
# Group on two keys at once and average the numeric columns; the result is
# indexed by the ('sex', 'time') pairs.
# observed=False states the current default explicitly: pandas 2.1+ warns
# when categorical keys are grouped without it, and the default is due to
# change, which would alter which key combinations appear in the output.
# (Assumes 'sex' and 'time' are categorical - verify with tips_10.dtypes.)
bill_sex_time = tips_10.groupby(['sex', 'time'], observed=False)
bill_sex_time.mean(numeric_only=True)

### Flattening the Results (`.reset_index()`)

In [None]:
# reset_index() moves the group keys out of the index into ordinary columns.
bill_sex_time.mean(numeric_only=True).reset_index()

In [None]:
# Alternative way to get a flat result: as_index=False keeps the group keys
# as ordinary columns instead of putting them in the index.
# observed=False states the current default explicitly: pandas 2.1+ warns
# when categorical keys are grouped without it. (Assumes 'sex' and 'time'
# are categorical - verify with tips_10.dtypes.)
tips_10.groupby(['sex', 'time'], as_index=False, observed=False).mean(numeric_only=True)

## Working With a MultiIndex

In [None]:
# Read the CSV stored in the zip archive (read_csv infers the compression
# from the file extension by default).
intv_df = pd.read_csv('data/epi_sim.zip')
intv_df

In [None]:
# Count the non-missing 'ig_type' entries for every
# (rep, intervened, tr) combination.
count_only = intv_df.groupby(['rep', 'intervened', 'tr'])['ig_type'].count()
count_only

In [None]:
# Check what kind of object the grouped count returned.
type(count_only)

In [None]:
# Group again, this time by index level position (0, 1, 2) instead of by
# column name, and take the mean within each group.
count_mean = count_only.groupby(level=[0, 1, 2]).mean()
count_mean

In [None]:
# The same computation as one method chain: count per
# (rep, intervened, tr), then group by the index levels and average.
count_mean = (
    intv_df.groupby(['rep', 'intervened', 'tr'])['ig_type']
    .count()
    .groupby(level=[0, 1, 2])
    .mean()
)
# NOTE(review): the reason for the second groupby is unclear. It groups on
# all three index levels created by the first one, so each group appears to
# hold a single count and .mean() would just return that count as a float.
# Confirm whether a coarser grouping was intended.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Scatter plot of the averaged counts against 'intervened': one panel per
# 'tr' value, points coloured by 'rep', with the regression line turned off.
fig = sns.lmplot(
    data=count_mean.reset_index(),  # flatten the index into columns
    x='intervened',
    y='ig_type',
    col='tr',
    hue='rep',
    palette='viridis',
    fit_reg=False,
)
plt.show()

In [None]:
# Count 'ig_type' per (rep, intervened, tr), then take a running total of
# those counts within each 'rep'. reset_index() turns the index levels back
# into ordinary columns so they can be used for plotting.
cumulative_count = (
    intv_df.groupby(['rep', 'intervened', 'tr'])['ig_type']
    .count()
    .groupby(level=['rep'])
    .cumsum()
    .reset_index()
)

# Scatter plot of the cumulative counts: one panel per 'tr' value, points
# coloured by 'rep'; fit_reg=False turns the regression line off.
fig = sns.lmplot(
    data=cumulative_count,
    x='intervened',
    y='ig_type',
    hue='rep',
    col='tr',
    fit_reg=False,
)
plt.show()