# <b> Aggregation and Grouping</b>

In [104]:
import pandas as pd
import numpy as np
import seaborn as sns

In [105]:
class display(object):
    """Display HTML representation of multiple objects"""

    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}</div>"""

    def __init__(self, *args):
        self.args = args  # Store all arguments (names of variables as strings)

    def _repr_html_(self):
        return '\n'.join(
            self.template.format(a, eval(a)._repr_html_())
            for a in self.args
        )

    def __repr__(self):
        return '\n\n'.join(
            a + '\n' + repr(eval(a))
            for a in self.args
        )

In [106]:
planets = sns.load_dataset('planets')
planets.shape

(1035, 6)

In [107]:
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


## <b> Simple Aggregation in Pandas</b>

In [108]:
#a brief review of aggregations on NumPy arrays:
import numpy as np

arr = np.array([[1, 2, 3],
                [4, 5, 6]])

np.sum(arr)         # 21
np.mean(arr)        # 3.5
np.min(arr, axis=0) # array([1, 2, 3]) → min of each column
np.max(arr, axis=1) # array([3, 6]) → max of each row


array([3, 6])

### Common NumPy Aggregation Functions
```python
arr = np.array([[10, 20, 30],
                [5, 15, 25]])
```

| **Function**     | **Example Code**     | **Result**   | **Explanation**                                               |
|------------------|----------------------|--------------|---------------------------------------------------------------|
| `np.sum()`       | `np.sum(arr)`        | `105`        | Sum of all elements: 10 + 20 + 30 + 5 + 15 + 25               |
| `np.mean()`      | `np.mean(arr)`       | `17.5`       | Mean (average) of all values                                  |
| `np.std()`       | `np.std(arr)`        | `8.5391...`  | Standard deviation (spread of data)                          |
| `np.var()`       | `np.var(arr)`        | `72.25`      | Variance (square of the standard deviation)                  |
| `np.min()`       | `np.min(arr)`        | `5`          | Minimum value in the array                                    |
| `np.max()`       | `np.max(arr)`        | `30`         | Maximum value in the array                                    |
| `np.argmin()`    | `np.argmin(arr)`     | `3`          | Index of the minimum value (5 is at index 3 in flattened array) |
| `np.argmax()`    | `np.argmax(arr)`     | `2`          | Index of the maximum value (30 is at index 2)                 |


### 🧮 How to Calculate Standard Deviation (Sample Case)

Let’s say we have this dataset:

```python
data = [10, 12, 23, 23, 16, 23, 21, 16]
```

---

#### **Step 1: Find the Mean (Average)**

$
\text{mean} = \frac{10 + 12 + 23 + 23 + 16 + 23 + 21 + 16}{8} = \frac{144}{8} = 18
$

---

#### **Step 2: Subtract the Mean and Square the Differences**

$
(10 - 18)^2 = 64 \\
(12 - 18)^2 = 36 \\
(23 - 18)^2 = 25 \\
(23 - 18)^2 = 25 \\
(16 - 18)^2 = 4 \\
(23 - 18)^2 = 25 \\
(21 - 18)^2 = 9 \\
(16 - 18)^2 = 4
$

---

#### **Step 3: Find the Average of the Squared Differences (Variance)**

$
\text{variance} = \frac{64 + 36 + 25 + 25 + 4 + 25 + 9 + 4}{8} = \frac{192}{8} = 24
$

---

#### **Step 4: Take the Square Root (Standard Deviation)**

$
\text{standard deviation} = \sqrt{24} \approx 4.90
$

---

### ✅ Formula Recap (Population Standard Deviation)

$
\sigma = \sqrt{\frac{1}{N} \sum_{i=1}^{N} (x_i - \mu)^2}
$

Where:

- $\mu$: mean of the dataset  
- $x_i$: each individual value  
- $N$: number of values  

🔸 For **sample standard deviation**, divide by $N - 1$ instead of $N$.


We shouldn’t divide by 7 if we’re treating this data as the full population.

We should divide by 7 (instead of 8) if we’re treating this data as a sample drawn from a larger population
.

In [109]:
rng=np.random.RandomState(42)
ser=pd.Series(rng.rand(5))
ser

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64

In [110]:
# rng = np.random.default_rng(42)  # Create a random number generator with a seed of 42
# ser = pd.Series(rng.random(5))  # Generate 5 random numbers
# ser

In [111]:
ser.sum()

2.811925491708157

In [112]:
ser.mean()

0.5623850983416314

In [113]:
df =pd.DataFrame({'A':rng.rand(5),
                  'B': rng.rand(5)})
df

Unnamed: 0,A,B
0,0.155995,0.020584
1,0.058084,0.96991
2,0.866176,0.832443
3,0.601115,0.212339
4,0.708073,0.181825


In [114]:
df.mean()

A    0.477888
B    0.443420
dtype: float64

In [115]:
df.mean(axis = 0)

A    0.477888
B    0.443420
dtype: float64

In [116]:
df.mean(axis = 1)

0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64

In [117]:
planets.isnull().any()

method            False
number            False
orbital_period     True
mass               True
distance           True
year              False
dtype: bool

In [118]:
planets[planets['orbital_period'].isnull()].shape   #(43, 6)
planets[planets['orbital_period'].isnull()].head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
29,Imaging,1,,,45.52,2005
30,Imaging,1,,,165.0,2007
31,Imaging,1,,,140.0,2004
33,Imaging,1,,,,2008
34,Imaging,1,,,145.0,2013


In [119]:
planets[planets['mass'].isnull()].shape   #(522, 6)

(522, 6)

In [120]:
planets[planets['distance'].isnull()].shape   #(227, 6)

(227, 6)

In [121]:
# If you only want to keep rows where both values are present:
s = planets[planets[['orbital_period', 'mass', 'distance']].notna().all(axis=1)]
s.shape    #(498, 6)

(498, 6)

In [122]:
planets.dropna().shape #(498, 6)

(498, 6)

In [123]:
planets.dropna().describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,498.0,498.0,498.0,498.0,498.0
mean,1.73494,835.778671,2.50932,52.068213,2007.37751
std,1.17572,1469.128259,3.636274,46.596041,4.167284
min,1.0,1.3283,0.0036,1.35,1989.0
25%,1.0,38.27225,0.2125,24.4975,2005.0
50%,1.0,357.0,1.245,39.94,2009.0
75%,2.0,999.6,2.8675,59.3325,2011.0
max,6.0,17337.5,25.0,354.0,2014.0


## <b>GroupBy: Split, Apply, Combine</b>

In [124]:
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'], 'data': range(1,7)})
df

Unnamed: 0,key,data
0,A,1
1,B,2
2,C,3
3,A,4
4,B,5
5,C,6


In [125]:
df.groupby('key')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000256CEDCA9F0>

In [126]:
result = df.groupby('key').sum()
result

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,5
B,7
C,9


In [127]:
result.shape

(3, 1)

In [128]:
result.reset_index('key')

Unnamed: 0,key,data
0,A,5
1,B,7
2,C,9


In [129]:
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [130]:
planets.groupby('method')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000256CD799040>

In [131]:
planets.groupby('method')['orbital_period']
planets.groupby('method')['orbital_period'].median()



method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64

In [132]:
planets['method'].nunique()


10

In [133]:
df = pd.DataFrame({
    'A': [1, 2],
    'B': [3, 4]
}, index=['x', 'y'])
df

Unnamed: 0,A,B
x,1,3
y,2,4


In [134]:
df.stack()
# df.stack().shape #(4,)

x  A    1
   B    3
y  A    2
   B    4
dtype: int64

In [135]:
planets.groupby('method')['year'].describe() #This gives you summary statistics (like count, mean, std, min, 25%, 50%, 75%, max) for the 'year' column, grouped by the 'method'.
#planets.groupby('method')['year'].describe().shape (10, 8)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,2.0,2011.5,2.12132,2010.0,2010.75,2011.5,2012.25,2013.0
Eclipse Timing Variations,9.0,2010.0,1.414214,2008.0,2009.0,2010.0,2011.0,2012.0
Imaging,38.0,2009.131579,2.781901,2004.0,2008.0,2009.0,2011.0,2013.0
Microlensing,23.0,2009.782609,2.859697,2004.0,2008.0,2010.0,2012.0,2013.0
Orbital Brightness Modulation,3.0,2011.666667,1.154701,2011.0,2011.0,2011.0,2012.0,2013.0
Pulsar Timing,5.0,1998.4,8.38451,1992.0,1992.0,1994.0,2003.0,2011.0
Pulsation Timing Variations,1.0,2007.0,,2007.0,2007.0,2007.0,2007.0,2007.0
Radial Velocity,553.0,2007.518987,4.249052,1989.0,2005.0,2009.0,2011.0,2014.0
Transit,397.0,2011.236776,2.077867,2002.0,2010.0,2012.0,2013.0,2014.0
Transit Timing Variations,4.0,2012.5,1.290994,2011.0,2011.75,2012.5,2013.25,2014.0


In [136]:
result = planets.groupby('method')['year'].describe()
print(result.index)        # Just method
print(result.columns)      # MultiIndex! (but looks flat)

Index(['Astrometry', 'Eclipse Timing Variations', 'Imaging', 'Microlensing',
       'Orbital Brightness Modulation', 'Pulsar Timing',
       'Pulsation Timing Variations', 'Radial Velocity', 'Transit',
       'Transit Timing Variations'],
      dtype='object', name='method')
Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], dtype='object')


In [137]:
planets.groupby('method')['year'].describe().stack()

method                          
Astrometry                 count       2.00000
                           mean     2011.50000
                           std         2.12132
                           min      2010.00000
                           25%      2010.75000
                                       ...    
Transit Timing Variations  min      2011.00000
                           25%      2011.75000
                           50%      2012.50000
                           75%      2013.25000
                           max      2014.00000
Length: 79, dtype: float64

In [138]:
b  = planets.groupby('method')['year'].describe().unstack()
b

       method                       
count  Astrometry                          2.0
       Eclipse Timing Variations           9.0
       Imaging                            38.0
       Microlensing                       23.0
       Orbital Brightness Modulation       3.0
                                         ...  
max    Pulsar Timing                    2011.0
       Pulsation Timing Variations      2007.0
       Radial Velocity                  2014.0
       Transit                          2014.0
       Transit Timing Variations        2014.0
Length: 80, dtype: float64

In [139]:
b.loc['count']

method
Astrometry                         2.0
Eclipse Timing Variations          9.0
Imaging                           38.0
Microlensing                      23.0
Orbital Brightness Modulation      3.0
Pulsar Timing                      5.0
Pulsation Timing Variations        1.0
Radial Velocity                  553.0
Transit                          397.0
Transit Timing Variations          4.0
dtype: float64

In [140]:
b.loc['min']

method
Astrometry                       2010.0
Eclipse Timing Variations        2008.0
Imaging                          2004.0
Microlensing                     2004.0
Orbital Brightness Modulation    2011.0
Pulsar Timing                    1992.0
Pulsation Timing Variations      2007.0
Radial Velocity                  1989.0
Transit                          2002.0
Transit Timing Variations        2011.0
dtype: float64

In [141]:
b['min', 'Astrometry']

2010.0

In [142]:
result = planets.groupby('method')['year'].describe()
print(result.index)        # Just method
print(result.columns)      # MultiIndex! (but looks flat)


Index(['Astrometry', 'Eclipse Timing Variations', 'Imaging', 'Microlensing',
       'Orbital Brightness Modulation', 'Pulsar Timing',
       'Pulsation Timing Variations', 'Radial Velocity', 'Transit',
       'Transit Timing Variations'],
      dtype='object', name='method')
Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], dtype='object')


In [143]:
result = planets.groupby('method')['year'].describe()
print(result.stack().head())  # You'll see a MultiIndex


method           
Astrometry  count       2.00000
            mean     2011.50000
            std         2.12132
            min      2010.00000
            25%      2010.75000
dtype: float64


## <b>Aggregate, Filter, Transform, Apply</b>

In [144]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
'data1': range(6),
'data2': rng.randint(0, 10, 6)},
columns = ['key', 'data1', 'data2'])
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [145]:
df.shape

(6, 3)

In [146]:
result = df.groupby('key').aggregate(['min', np.median, max])
result

  result = df.groupby('key').aggregate(['min', np.median, max])
  result = df.groupby('key').aggregate(['min', np.median, max])


Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,1.5,3,3,4.0,5
B,1,2.5,4,0,3.5,7
C,2,3.5,5,3,6.0,9


In [147]:
result['data1', 'min']  # This gives you the min of data1 for each group


key
A    0
B    1
C    2
Name: (data1, min), dtype: int64

In [148]:
result.columns

MultiIndex([('data1',    'min'),
            ('data1', 'median'),
            ('data1',    'max'),
            ('data2',    'min'),
            ('data2', 'median'),
            ('data2',    'max')],
           )

In [149]:
result.index

Index(['A', 'B', 'C'], dtype='object', name='key')

In [150]:
result.shape

(3, 6)

In [151]:
def filter_func(x):
    return x['data2'].std() > 4
display('df', "df.groupby('key').std()",
"df.groupby('key').filter(filter_func)")

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.12132,1.414214
B,2.12132,4.949747
C,2.12132,4.242641

Unnamed: 0,key,data1,data2
1,B,1,0
2,C,2,3
4,B,4,7
5,C,5,9


## <b>Specifying the Split Key</b>

In [152]:
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [154]:
L = [0, 1, 0, 1, 2, 0]
df.groupby(L).sum().drop('key', axis = 1)

Unnamed: 0,data1,data2
0,7,17
1,4,3
2,4,7


In [158]:
df.groupby('key').sum()
# .shape    #(3, 2)

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3,8
B,5,7
C,7,12


In [168]:
df3 = df.reset_index().drop('index', axis = 1)
df3


Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [166]:
df2 = df.set_index('key')
df2

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,0
C,2,3
A,3,3
B,4,7
C,5,9


## <b>Grouping Example</b>

In [190]:
planets.head()
#(1035, 6)

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [191]:
planets['number'].unique()

array([1, 2, 3, 5, 4, 6, 7], dtype=int64)

In [195]:
decade = 10 * (planets['year'] // 10)
# decade.shape    #(1035,)    
# print(type(decade)) #<class 'pandas.core.series.Series'>
decade.tail()

1030    2000
1031    2000
1032    2000
1033    2000
1034    2000
Name: year, dtype: int64

In [196]:
planets.groupby(['method', decade])['number'].sum()

method                         year
Astrometry                     2010      2
Eclipse Timing Variations      2000      5
                               2010     10
Imaging                        2000     29
                               2010     21
Microlensing                   2000     12
                               2010     15
Orbital Brightness Modulation  2010      5
Pulsar Timing                  1990      9
                               2000      1
                               2010      1
Pulsation Timing Variations    2000      1
Radial Velocity                1980      1
                               1990     52
                               2000    475
                               2010    424
Transit                        2000     64
                               2010    712
Transit Timing Variations      2010      9
Name: number, dtype: int64

In [197]:
decade = decade.astype(str) + 's'
decade.name = 'decade'
planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)

decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0
