# DataFrame: Aggregation, Melt, and Pivot

In [1]:
import pandas as pd

In [None]:
# Build a small example dataset: nine rows, three per city.
# Note: "Beek" occurs in two different states (Gelderland and Limburg).
df = pd.DataFrame(
    data={
        "id": list("abcdefghi"),
        "city": [city for city in ("Amsterdam", "Utrecht", "Beek") for _ in range(3)],
        "state": (
            3 * ["Noord-Holland"]
            + 3 * ["Utrecht"]
            + 2 * ["Gelderland"]
            + 1 * ["Limburg"]
        ),
        "age": [22, 41, 36, 27, 22, 56, 72, 44, 16],
        "score": [8, 7, 4, 9, 6, 7, 6, 8, 7],
    },
)
df

In [None]:
# groupby() on its own only builds the grouped object; nothing is aggregated yet.
df.groupby(by="city")

In [None]:
# Materialize the groups: every item is a (group key, sub-DataFrame) tuple.
[*df.groupby("city")]

In [None]:
# The grouped object is iterable and yields (group key, sub-DataFrame) pairs.
for city, df_city in df.groupby("city"):
    print("City: ", city)
    # Print the group's rows followed by a separator line.
    print(df_city, "-" * 42, sep="\n")

In [None]:
# Applying an aggregation function reduces every group to a single row.
# Note: The grouping column (city) becomes the index of the result.
# Note: For the string columns (id, state), sum() concatenates the values.
(
    df
    .groupby("city")
    .sum()
)

In [None]:
# Select a single column before aggregating; the result is a Series
# indexed by city.
df.groupby(by="city")["age"].mean()

In [None]:
# agg() accepts a {column: aggregation} mapping, so every column can get
# its own aggregation.
df.groupby("city").agg({"age": "mean", "score": "sum"})

In [None]:
# A callable can be used as the aggregation as well.
# Here: the number of rows per city with an age above 18.
df.groupby("city").agg({"age": lambda ages: ages.gt(18).sum()})

In [None]:
# Map a column to a list of aggregations to compute several statistics at once.
# Note: The result columns become an inconvenient (column, aggregation) MultiIndex...
df.groupby("city").agg(
    {
        "age": ["mean", "std", "count", "min", "max"],
        "score": ["sum", "min", "max"],
    }
)

In [None]:
# Named aggregation: output_column=(source_column, aggregation).
# The keyword names become flat column names, so no column MultiIndex is created.
aggregated = df.groupby("city").agg(
    age_mean=("age", "mean"),
    score_total=("score", "sum"),
    score_min=("score", "min"),
    score_max=("score", "max"),
)
aggregated

In [None]:
# Pass a list of columns to get one group per (city, state) combination.
# Note: The group keys end up as a MultiIndex on the rows :-(
df.groupby(by=["city", "state"]).agg(
    age_mean=("age", "mean"),
    age_std=("age", "std"),
    score_total=("score", "sum"),
    score_min=("score", "min"),
    score_max=("score", "max"),
)

In [None]:
# With as_index=False the group keys (city, state) stay regular columns,
# which avoids the MultiIndex on the rows.
df.groupby(["city", "state"], as_index=False).agg(
    age_mean=("age", "mean"),
    age_std=("age", "std"),
    score_total=("score", "sum"),
    score_min=("score", "min"),
    score_max=("score", "max"),
)

## Melt

In [None]:
# Build a wide example dataset: three measurement columns, one row per day
# for five consecutive days.
df = pd.DataFrame(
    data={
        "Temperature": [11.2, 15.3, 14.8, 12.5, 10.5],
        "Sunshine": [5.5, 7.5, 6.8, 5.6, 4.6],
        "Precipitation": [3.5, 0.5, 0.0, 0.0, 3.4],
    },
    # Daily DatetimeIndex, both end points included.
    index=pd.date_range(start="2022-3-1", end="2022-3-5"),
)
df

In [None]:
# Melt: Transform from wide to long (key - value) format.
# Every original column becomes a set of rows holding the column name and
# the cell value; by default the original index is dropped.
df.melt()

In [None]:
# ignore_index=False keeps the original index on the melted rows.
# Note: Every index value now occurs once per melted column, so the index
# contains duplicates.
(
    df
    .melt(ignore_index=False)
    .sort_index()
)

In [None]:
# var_name / value_name set the names of the two columns produced by melt().
df_long = df.melt(ignore_index=False, var_name="measure", value_name="value")
df_long

## Pivot

In [None]:
# Starting point: the long-format frame with "measure" and "value" columns.
df_long

In [None]:
# Pivot: Go from long back to wide format.
# Every distinct entry in "measure" becomes a column, filled from "value";
# the existing index is used for the rows.
df_long.pivot(columns="measure", values="value")
