In [1]:
"""
# Chapter 10 Data Aggregation and Group Operations.
Categorizing a dataset and applying a function to each group, whether an aggreation or transformation,
is a critical componet of a data analysis workflow. 

# GroupBy Mechanics: - Split - Apply - Combine -
In the first stage of the process:
data contained in a pandas object is spit into groups based on one or more keys that you provide.
The splittting is performed on a particular axis of an object,
then after a function is applied to each group, producing a new value.
Resulting object kind if dependent of mutation to data.
"""

%matplotlib notebook
# Financial Time Series 
import numpy as np 
import pandas as pd
from pylab import mpl, plt
plt.style.use('seaborn')
mpl.rcParams['font.family'] = 'serif'
%matplotlib inline
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 2000)
pd.set_option('display.width', 1000)

In [2]:
df  = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                    'key2' : ['One', 'two', 'one', 'two', 'one'],
                    'data1' : np.random.randn(5),
                    'data2' : np.random.randn(5)
                })
display(df)

# TO compute the mean of the data1 column suing the labels from key1. 
grouped = df['data1'].groupby(df['key1'])

display(grouped.std())

means = df['data1'].groupby([df['key1'], df['key2']]).mean()

# Here we grouped the data using two keys, and the resulting series now has a hierarchial index consisting of the unique of keys observed
  
display(means)

display(means.unstack())

# Regardless of the objective in using groupby, a generally useful GroupBy method is size, which reurns a Series containing group sizes:

display(df.groupby(['key1', 'key2']).size())

Unnamed: 0,key1,key2,data1,data2
0,a,One,-0.144112,-0.51252
1,a,two,1.92071,-0.478941
2,b,one,0.805816,2.50769
3,b,two,0.013982,-0.996579
4,a,one,-0.118414,2.367331


key1
a    1.184777
b    0.559911
Name: data1, dtype: float64

key1  key2
a     One    -0.144112
      one    -0.118414
      two     1.920710
b     one     0.805816
      two     0.013982
Name: data1, dtype: float64

key2,One,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,-0.144112,-0.118414,1.92071
b,,0.805816,0.013982


key1  key2
a     One     1
      one     1
      two     1
b     one     1
      two     1
dtype: int64

In [3]:
"""
# Iterating Over Groups: The GroupBy object supports iteration,
genarating a sequence of 2-tuples containing  the group  name along with the chuck of data. 
"""

for name, group in df.groupby('key1'):
    print(name)
    print(group)
    
        
# In the case of multiple keys, the first element in the tuple of key values:

for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)
    
#  Of course,you can choose to do whaetever with thee pieces of data.
# Computing a dict of the data pieces as a one liner:

pieces = dict(list(df.groupby('key1')))

display(pieces['b'])

# By default groupby groups on axis =0, but you can hroup on any of the other axes.

a
  key1 key2     data1     data2
0    a  One -0.144112 -0.512520
1    a  two  1.920710 -0.478941
4    a  one -0.118414  2.367331
b
  key1 key2     data1     data2
2    b  one  0.805816  2.507690
3    b  two  0.013982 -0.996579
('a', 'One')
  key1 key2     data1    data2
0    a  One -0.144112 -0.51252
('a', 'one')
  key1 key2     data1     data2
4    a  one -0.118414  2.367331
('a', 'two')
  key1 key2    data1     data2
1    a  two  1.92071 -0.478941
('b', 'one')
  key1 key2     data1    data2
2    b  one  0.805816  2.50769
('b', 'two')
  key1 key2     data1     data2
3    b  two  0.013982 -0.996579


Unnamed: 0,key1,key2,data1,data2
2,b,one,0.805816,2.50769
3,b,two,0.013982,-0.996579


In [4]:
"""
# Selecting a Column or Subset of Columns:
It may be desirable to aggregate only a fre columns. 

To compute means for just the data2 column and get the reslut as a df 
"""

s_grouped = df.groupby(['key1', 'key2'])['data2']

display(s_grouped.mean())


key1  key2
a     One    -0.512520
      one     2.367331
      two    -0.478941
b     one     2.507690
      two    -0.996579
Name: data2, dtype: float64

In [10]:
"""
# Grouping with Dicts andf Series

Grouping information may exist in a form other than an array. 

"""
people = pd.DataFrame(np.random.randn(5, 5),
                     columns=['a', 'b', 'c', 'd', 'e'],
                     index=['Joe', 'Steven', 'Wes', 'Jim', 'Travis'])

people.iloc[2:3, [1,2]] = np.nan

display(people)

# Suppose a group corrspondence for the columns and want to sum togehter the clumns by group

mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
          'd': 'blue', 'e': 'red', 'f': 'orange'}

by_column = people.groupby(mapping, axis=1)

display(by_column.sum())

# The same fucntionality holds for Serues, which can be viewd as a fixed size-mapping

map_serries = pd.Series(mapping)

display(map_serries)

display(people.groupby(map_serries, axis=1).count())

Unnamed: 0,a,b,c,d,e
Joe,-0.519065,0.741475,-0.574417,-1.433153,0.338756
Steven,-0.448253,-0.924646,1.342309,-0.053818,-1.463022
Wes,-0.001318,,,-0.602676,-0.357464
Jim,1.930604,-1.307524,-0.445912,-0.651534,-0.302983
Travis,-0.635849,1.631111,-0.427774,1.534683,1.532533


Unnamed: 0,blue,red
Joe,-2.00757,0.561166
Steven,1.288491,-2.835921
Wes,-0.602676,-0.358781
Jim,-1.097446,0.320097
Travis,1.106909,2.527794


a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

Unnamed: 0,blue,red
Joe,2,3
Steven,2,3
Wes,1,2
Jim,2,3
Travis,2,3
