## CHAPTER 10
# Data Aggregation and Group Operations

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## 10.1 How to Think About Group Operations

In [2]:
df = pd.DataFrame(
    {
        "key1": ["a", "a", None, "b", "b", "a", None],
        "key2": pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64"),
        "data1": np.random.standard_normal(7),
        "data2": np.random.standard_normal(7),
    }
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,1.425405,0.447358
1,a,2.0,0.000208,-0.684295
2,,1.0,-0.524617,0.458804
3,b,2.0,-0.844421,-0.048752
4,b,1.0,0.495661,0.122676
5,a,,0.580779,0.035878
6,,1.0,0.0045,0.411403


In [3]:
grouped = df["data1"].groupby(df["key1"])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000184FFE29010>

In [4]:
grouped.mean()

key1
a    0.668797
b   -0.174380
Name: data1, dtype: float64

In [5]:
means = df["data1"].groupby([df["key1"], df["key2"]]).mean()
means

key1  key2
a     1       1.425405
      2       0.000208
b     1       0.495661
      2      -0.844421
Name: data1, dtype: float64

In [6]:
means.unstack()

key2,1,2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.425405,0.000208
b,0.495661,-0.844421


In [7]:
state = np.array(["OH", "CA", "CA", "OH", "OH", "CA", "OH"])
years = np.array([2005, 2005, 2006, 2005, 2006, 2005, 2006])
df["data1"].groupby([state, years]).mean()

CA  2005    0.290493
    2006   -0.524617
OH  2005    0.290492
    2006    0.250080
Name: data1, dtype: float64

In [8]:
df.groupby("key1").mean()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1.5,0.668797,-0.067019
b,1.5,-0.17438,0.036962


In [9]:
df.groupby("key2").mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.350237,0.36006
2,-0.422107,-0.366523


In [10]:
df.groupby(["key1", "key2"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,1.425405,0.447358
a,2,0.000208,-0.684295
b,1,0.495661,0.122676
b,2,-0.844421,-0.048752


In [11]:
df.groupby(["key1", "key2"]).size()

key1  key2
a     1       1
      2       1
b     1       1
      2       1
dtype: int64

In [12]:
df.groupby("key1", dropna=False).size()

key1
a      3
b      2
NaN    2
dtype: int64

In [13]:
df.groupby(["key1", "key2"], dropna=False).size()

key1  key2
a     1       1
      2       1
      <NA>    1
b     1       1
      2       1
NaN   1       2
dtype: int64

In [14]:
df.groupby("key1").count()

Unnamed: 0_level_0,key2,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,2,3,3
b,2,2,2


### Iterating over Groups

In [15]:
for name , group in df.groupby("key1"):
    print(name)
    print(group)

a
  key1  key2     data1     data2
0    a     1  1.425405  0.447358
1    a     2  0.000208 -0.684295
5    a  <NA>  0.580779  0.035878
b
  key1  key2     data1     data2
3    b     2 -0.844421 -0.048752
4    b     1  0.495661  0.122676


In [16]:
for (k1, k2), group in df.groupby(["key1", "key2"]):
    print((k1, k2))
    print(group)

('a', np.int64(1))
  key1  key2     data1     data2
0    a     1  1.425405  0.447358
('a', np.int64(2))
  key1  key2     data1     data2
1    a     2  0.000208 -0.684295
('b', np.int64(1))
  key1  key2     data1     data2
4    b     1  0.495661  0.122676
('b', np.int64(2))
  key1  key2     data1     data2
3    b     2 -0.844421 -0.048752


In [17]:
pieces = {name: group for name, group in df.groupby("key1")}
pieces["b"]

Unnamed: 0,key1,key2,data1,data2
3,b,2,-0.844421,-0.048752
4,b,1,0.495661,0.122676


In [18]:
pieces

{'a':   key1  key2     data1     data2
 0    a     1  1.425405  0.447358
 1    a     2  0.000208 -0.684295
 5    a  <NA>  0.580779  0.035878,
 'b':   key1  key2     data1     data2
 3    b     2 -0.844421 -0.048752
 4    b     1  0.495661  0.122676}

In [19]:
grouped = df.groupby({"key1": "key", "key2": "key",
                      "data1": "data", "data2": "data"}, axis="columns")
for group_key, group_values in grouped:
    print(group_key)
    print(group_values)

data
      data1     data2
0  1.425405  0.447358
1  0.000208 -0.684295
2 -0.524617  0.458804
3 -0.844421 -0.048752
4  0.495661  0.122676
5  0.580779  0.035878
6  0.004500  0.411403
key
   key1  key2
0     a     1
1     a     2
2  None     1
3     b     2
4     b     1
5     a  <NA>
6  None     1


  grouped = df.groupby({"key1": "key", "key2": "key",


### Selecting a Column or Subset of Columns

In [21]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,1.0,1.425405,0.447358
1,a,2.0,0.000208,-0.684295
2,,1.0,-0.524617,0.458804
3,b,2.0,-0.844421,-0.048752
4,b,1.0,0.495661,0.122676
5,a,,0.580779,0.035878
6,,1.0,0.0045,0.411403


In [25]:
df.groupby("key1")["data1"]
df.groupby("key1")[["data2"]]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000184FFE11010>

In [26]:
df["data1"].groupby(df["key1"])
df[["data2"]].groupby(df["key1"])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000184FFE11D90>

In [27]:
df.groupby(["key1", "key2"])[["data2"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,1,0.447358
a,2,-0.684295
b,1,0.122676
b,2,-0.048752


In [28]:
s_grouped = df.groupby(["key1", "key2"])["data2"]
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000184FFE54F50>

In [29]:
s_grouped.mean()

key1  key2
a     1       0.447358
      2      -0.684295
b     1       0.122676
      2      -0.048752
Name: data2, dtype: float64