# DataFrame: Aggregation

In [None]:
import pandas as pd

In [None]:
# Build a small example table: nine people, three per city.
# Note: the three "Beek" rows are split over two states on purpose
# (two in Gelderland, one in Limburg), which matters when grouping on
# both city and state.
df = pd.DataFrame(
    data={
        "id": list("abcdefghi"),
        "city": [
            city
            for city in ("Amsterdam", "Utrecht", "Beek")
            for _ in range(3)
        ],
        "state": (
            3 * ["Noord-Holland"]
            + 3 * ["Utrecht"]
            + 2 * ["Gelderland"]
            + ["Limburg"]
        ),
        "age": [22, 41, 36, 27, 22, 56, 72, 44, 16],
        "score": [8, 7, 4, 9, 6, 7, 6, 8, 7],
    }
)
df

In [None]:
# groupby() splits the rows by the values in "city". The result is a
# GroupBy object, not a new table; it still needs an aggregation.
df.groupby(by="city")

In [None]:
# Iterating over a GroupBy object yields (group key, sub-DataFrame) pairs.
for city, df_city in df.groupby("city"):
    # Format with an f-string so exactly one space follows the colon
    # (a trailing space in the label plus print's separator would give two).
    print(f"City: {city}")
    print(df_city)
    # Visual separator between consecutive groups.
    print("-" * 42)

In [None]:
# Use an aggregation function to summarize the groups.
# Note: city became the index of the new DataFrame.
# numeric_only=True restricts the sum to the numeric columns (age, score).
# Without it, pandas >= 2.0 also "sums" the string columns (id, state) by
# concatenating their values, which is not a meaningful summary.
df.groupby("city").sum(numeric_only=True)

In [None]:
# agg() accepts a {column: aggregation} mapping, so every column can be
# summarized differently: here the mean age and the total score per city.
df.groupby("city").agg({"age": "mean", "score": "sum"})

In [None]:
# Map a column to a list of aggregation names to compute several
# statistics for it at once.
# Note: the resulting columns form a MultiIndex, which is inconvenient
# to work with.
df.groupby("city").agg(
    {
        "age": ["mean", "std", "count", "min", "max"],
        "score": ["sum", "min", "max"],
    }
)

In [None]:
# A custom aggregation function can be supplied instead of a name: any
# callable that reduces the values of one group to a single result.
def _count_over_18(ages):
    """Return how many values in *ages* are strictly greater than 18."""
    return (ages > 18).sum()


df.groupby("city").agg({"age": _count_over_18})

In [None]:
# Named aggregation: every keyword becomes one flat output column, built
# from a (source column, aggregation) pair, so no column MultiIndex is
# created. pd.NamedAgg is the explicit spelling of that pair.
aggregated = df.groupby("city").agg(
    age_mean=pd.NamedAgg(column="age", aggfunc="mean"),
    age_std=pd.NamedAgg(column="age", aggfunc="std"),
    score_total=pd.NamedAgg(column="score", aggfunc="sum"),
    score_min=pd.NamedAgg(column="score", aggfunc="min"),
    score_max=pd.NamedAgg(column="score", aggfunc="max"),
)
aggregated

In [None]:
# Pass a list of column names to group on each distinct combination of
# their values, here (city, state).
df.groupby(by=["city", "state"]).agg(
    **{
        "age_mean": ("age", "mean"),
        "age_std": ("age", "std"),
        "score_total": ("score", "sum"),
        "score_min": ("score", "min"),
        "score_max": ("score", "max"),
    }
)

In [None]:
# With as_index=False the grouping keys (city, state) stay ordinary
# columns instead of becoming a row MultiIndex.
df.groupby(by=["city", "state"], as_index=False).agg(
    age_mean=("age", "mean"),
    age_std=("age", "std"),
    score_total=("score", "sum"),
    score_min=("score", "min"),
    score_max=("score", "max"),
)

## Melt

In [None]:
# Wide-format example: one column per measure, one row per day, indexed
# by five consecutive dates starting 1 March 2022.
# Note: this rebinds df, replacing the table used in the cells above.
df = pd.DataFrame(
    data={
        "Temperature": [11.2, 15.3, 14.8, 12.5, 10.5],
        "Sunshine": [5.5, 7.5, 6.8, 5.6, 4.6],
        "Precipitation": [3.5, 0.5, 0.0, 0.0, 3.4],
    },
    index=pd.date_range(start="2022-03-01", periods=5, freq="D"),
)
df

In [None]:
# melt() reshapes the wide table into long format: every cell becomes its
# own row holding the originating column name and the cell's value.
pd.melt(df)

In [None]:
# Pass ignore_index=False to keep the original index on the melted rows.
pd.melt(df, ignore_index=False)

In [None]:
# var_name and value_name set the names of the two columns produced by
# melt(); the original index is kept via ignore_index=False.
df_long = pd.melt(df, var_name="measure", value_name="value", ignore_index=False)
df_long

## Pivot

In [None]:
# pivot() turns the long table back into wide format: one column per
# distinct "measure", filled from "value". No index argument is given,
# so the rows are keyed by the table's existing index.
pd.pivot(df_long, columns="measure", values="value")
