a Series, DataFrame, or otherwise, is split into groups based on one or more keys that you provide.
The splitting is performed on a particular axis of an object. 

For example, a DataFrame can be grouped on its rows (axis=0) or its columns (axis=1). Once this is done, a
function is applied to each group, producing a new value.

Finally, the results of all those function applications are combined into a result object. The form of the resulting object will usually depend on what’s being done to the data.

# a simple group aggregation sample

![Example Image](groupbymechanics.PNG)

In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

In [3]:
# Small example frame: two string key columns plus two columns of random
# normal draws, used by the groupby examples that follow.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.317995,-1.290099
1,a,two,0.577277,-0.978998
2,b,one,-0.012996,0.429269
3,b,two,0.267171,-0.032036
4,a,one,-1.577427,-1.217361


In [5]:
grouped = df['data1'].groupby(df['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000017CAF0F53D0>

In [6]:
grouped.mean()

key1
a   -0.772715
b    0.127088
Name: data1, dtype: float64

In [7]:
means = df['data1'].groupby([df['key1'], df['key2']]).mean()

In [8]:
means

key1  key2
a     one    -1.447711
      two     0.577277
b     one    -0.012996
      two     0.267171
Name: data1, dtype: float64

In [9]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-1.447711,0.577277
b,-0.012996,0.267171


In [10]:
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

df['data1'].groupby([states, years]).mean()


California  2005    0.577277
            2006   -0.012996
Ohio        2005   -0.525412
            2006   -1.577427
Name: data1, dtype: float64

In [12]:
df.groupby('key2').mean()

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
one,-0.969473,-0.692731
two,0.422224,-0.505517


In [13]:
df.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-1.447711,-1.25373
a,two,0.577277,-0.978998
b,one,-0.012996,0.429269
b,two,0.267171,-0.032036


In [14]:
df.groupby(['key1', 'key2']).size()  # number of rows in each (key1, key2) group

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

# Iterating Over Groups

In [22]:
for name, group in df.groupby('key1'):    
    print(name)
    print(group)
    
# The loop prints each key1 value followed by the rows of df that belong to that group.

a
  key1 key2     data1     data2
0    a  one -1.317995 -1.290099
1    a  two  0.577277 -0.978998
4    a  one -1.577427 -1.217361
b
  key1 key2     data1     data2
2    b  one -0.012996  0.429269
3    b  two  0.267171 -0.032036


In [23]:
 for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)
# In the case of multiple keys, the first element in the tuple will be a tuple of key values

('a', 'one')
  key1 key2     data1     data2
0    a  one -1.317995 -1.290099
4    a  one -1.577427 -1.217361
('a', 'two')
  key1 key2     data1     data2
1    a  two  0.577277 -0.978998
('b', 'one')
  key1 key2     data1     data2
2    b  one -0.012996  0.429269
('b', 'two')
  key1 key2     data1     data2
3    b  two  0.267171 -0.032036


In [31]:
pieces = dict(list(df.groupby('key1')))
pieces

{'a':   key1 key2     data1     data2
 0    a  one -1.317995 -1.290099
 1    a  two  0.577277 -0.978998
 4    a  one -1.577427 -1.217361,
 'b':   key1 key2     data1     data2
 2    b  one -0.012996  0.429269
 3    b  two  0.267171 -0.032036}

In [32]:
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [40]:
# Group the columns (axis=1) of df by their dtype instead of grouping rows.
# NOTE(review): the axis=1 argument to groupby is deprecated in recent pandas
# releases — confirm against the pandas version this notebook targets.
grouped = df.groupby(df.dtypes, axis=1)

In [41]:
for dtype, group in grouped:
    print(dtype)
    print(group)
# Separates the columns of df into groups according to their dtype.

float64
      data1     data2
0 -1.317995 -1.290099
1  0.577277 -0.978998
2 -0.012996  0.429269
3  0.267171 -0.032036
4 -1.577427 -1.217361
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


# Selecting a Column or Subset of Columns

In [45]:
df.groupby('key1')['data1'].mean()

key1
a   -0.772715
b    0.127088
Name: data1, dtype: float64

In [74]:
df['data1'].groupby([df['key1'],df['key2']]).mean()

key1  key2
a     one    -1.447711
      two     0.577277
b     one    -0.012996
      two     0.267171
Name: data1, dtype: float64

In [55]:
df.groupby(['key1', 'key2'])[['data2']].mean()  # [['data2']] picks the column(s) to aggregate; passing a list keeps the result a DataFrame

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-1.25373
a,two,-0.978998
b,one,0.429269
b,two,-0.032036


In [75]:
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000017CB1CE2970>

In [76]:
s_grouped.mean()

key1  key2
a     one    -1.253730
      two    -0.978998
b     one     0.429269
      two    -0.032036
Name: data2, dtype: float64

# Grouping with Dicts and Series

In [77]:
people = pd.DataFrame(np.random.randn(5, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])

In [78]:
people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values

In [79]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.974199,-0.880342,1.135269,2.520823,0.796803
Steve,-0.117395,-0.619025,0.433735,-0.102765,-0.309131
Wes,0.740082,,,-0.685116,0.239104
Jim,0.518507,0.014275,1.10844,0.249896,1.998932
Travis,-1.72032,-0.870739,0.660414,0.602178,-0.019016


In [81]:
# suppose I have a group correspondence for the columns and want to sum together the columns by group
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}

In [82]:
by_column = people.groupby(mapping, axis=1)

In [90]:
by_column.sum()

Unnamed: 0,blue,red
Joe,3.656092,0.89066
Steve,0.33097,-1.045551
Wes,-0.685116,0.979186
Jim,1.358336,2.531713
Travis,1.262591,-2.610074


In [86]:
map_series = pd.Series(mapping)

In [87]:
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [88]:
people.groupby(map_series, axis=1).count()

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


# Grouping with Functions

In [94]:
#Suppose you wanted to group by the length of the names;
#while you could compute an array of string lengths, it’s simpler to just pass the len function
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,2.232788,-0.866067,2.243709,2.085603,3.034839
5,-0.117395,-0.619025,0.433735,-0.102765,-0.309131
6,-1.72032,-0.870739,0.660414,0.602178,-0.019016


In [95]:
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,0.740082,-0.880342,1.135269,-0.685116,0.239104
3,two,0.518507,0.014275,1.10844,0.249896,1.998932
5,one,-0.117395,-0.619025,0.433735,-0.102765,-0.309131
6,two,-1.72032,-0.870739,0.660414,0.602178,-0.019016


# Grouping by Index Levels

In [96]:
# Two-level column index: 'cty' is the outer level and 'tenor' the inner one.
columns = pd.MultiIndex.from_arrays(
    [['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'],
)

In [102]:
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)

In [103]:
hier_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,1.725181,-0.660124,1.312349,0.278569,2.86922
1,1.193114,-0.358807,-0.280549,-0.114571,0.237905
2,1.570417,1.337562,-0.338722,0.174109,-0.072535
3,0.700825,-0.527574,0.171084,0.527926,-0.707716


In [104]:
hier_df.groupby(level='cty', axis=1).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


# Data Aggregation

![Example Image](groupbymethods.PNG)

In [105]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-1.317995,-1.290099
1,a,two,0.577277,-0.978998
2,b,one,-0.012996,0.429269
3,b,two,0.267171,-0.032036
4,a,one,-1.577427,-1.217361


In [110]:
grouped = df.groupby('key1')
grouped.mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.772715,-1.162153
b,0.127088,0.198616


In [107]:
grouped['data1'].quantile(0.9)  
# quantile: Return values at the given quantile over requested axis.

key1
a    0.198222
b    0.239154
Name: data1, dtype: float64

In [111]:
# To use your own aggregation function, pass any function that aggregates an array to the aggregate or agg method:

def peak_to_peak(arr):
    """Return the spread of ``arr``: its largest value minus its smallest.

    ``arr`` only needs to provide ``max()`` and ``min()`` methods.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

grouped.agg(peak_to_peak)
# agg applies peak_to_peak to each data column within every key1 group.

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2.154704,0.311101
b,0.280166,0.461305


In [None]:
#316-335 